Data analysis

Author

Lucy D’Agostino McGowan

library(tidyverse)
library(tidymodels)
library(broom)
library(DALEXtra)
# NOTE(review): load() restores objects written by save() (.RData images).
# If this file was written by saveRDS(), it should instead be read with
# data_nypd <- readRDS("data/data_nypd.rds") — confirm which format it is.
load("data/data_nypd.rds")

Define a binary variable over3h which is 1 if duration is greater than 3 hours. Note that over3h can still be determined for censored durations: a request with no closed date has necessarily been open longer than 3 hours.

# Flag requests that took more than 3 hours to resolve. Requests with a
# missing closed_date are still open, so they also count as over 3 hours.
data_nypd <- data_nypd |>
  mutate(
    over3h = as.factor(
      ifelse(duration_hours > 3 | is.na(closed_date), 1, 0)
    )
  )

# Visualize the class balance of the new outcome variable
ggplot(data_nypd) +
  aes(x = over3h) +
  geom_bar(fill = "pink") +
  theme_minimal()

Build a logistic model to predict over3h using the 311 request data as well as the zip-code-level covariates. If your model has tuning parameters, justify their choices.

# 5-fold cross-validation splits, reused for both models below
data_cv <- vfold_cv(data_nypd, v = 5)

# Model specification: penalized logistic regression (elastic net via
# glmnet), tuning both the penalty (amount of regularization) and the
# mixture (0 = ridge, 1 = lasso)
logistic_spec <- logistic_reg(penalty = tune(), mixture = tune()) |>
  set_engine("glmnet")

# Preprocessing recipe: dummy-code categorical predictors, KNN-impute
# missing values, add natural splines (5 df) for the numeric zip-code
# covariates to allow non-linear effects, then scale all predictors so
# the penalty treats them comparably
rec <- recipe(over3h ~ location_type + community_board + borough + 
                weekday + population + population_density + housing_units + 
                occupied_housing_units + median_home_value + median_household_income,
              data = data_nypd) |>
  step_dummy(all_nominal_predictors()) |>
  step_impute_knn(all_predictors()) |>
  step_ns(population,
          population_density, 
          housing_units, 
          occupied_housing_units, 
          median_home_value,
          median_household_income, 
          deg_free = 5) |>
  step_scale(all_predictors()) 

# Bundle recipe and model into a single workflow
wf <- workflow() |>
  add_recipe(rec) |>
  add_model(logistic_spec)

# Tuning grid.
# NOTE(review): glmnet penalties are typically tuned on a log scale of
# small values (e.g. 10^seq(-4, 0)); on this 0–20 grid every nonzero
# penalty shrinks all coefficients to zero, which is why penalty = 0 is
# selected in every top configuration below. Consider a log-scale grid.
grid <- expand_grid(penalty = seq(0, 20, by = 5),
                    mixture = seq(0, 1, by = 0.2))

# Evaluate every grid point on the 5 CV folds
results <- tune_grid(wf,
                     grid = grid, 
                     resamples = data_cv)

Use appropriate metrics to assess the performance of the model.

show_best(results, "roc_auc")
# A tibble: 5 × 8
  penalty mixture .metric .estimator  mean     n std_err .config              
    <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1       0     0   roc_auc binary     0.802     5 0.00284 Preprocessor1_Model01
2       0     1   roc_auc binary     0.802     5 0.00292 Preprocessor1_Model26
3       0     0.6 roc_auc binary     0.802     5 0.00293 Preprocessor1_Model16
4       0     0.2 roc_auc binary     0.802     5 0.00296 Preprocessor1_Model06
5       0     0.4 roc_auc binary     0.802     5 0.00295 Preprocessor1_Model11
# Pick the penalty/mixture pair with the best cross-validated AUC.
# The metric is passed by name: positional use is deprecated in tune.
best_params <- select_best(results, metric = "roc_auc")
final_wf <- wf |>
  finalize_workflow(best_params)
# Refit the finalized workflow on the full data set
final_fit <- fit(final_wf, data = data_nypd)
# Plot the 20 largest coefficients in absolute value; predictors were
# scaled in the recipe, so magnitudes are roughly comparable.
tidy(final_fit) |>
  arrange(desc(abs(estimate))) |>
  slice(1:20) |>
  ggplot(aes(x = abs(estimate), y = fct_reorder(term, abs(estimate)))) + 
  geom_col(fill = "pink") + 
  labs(
    y = "Term",
    x = "Absolute value of coefficient"
  ) + 
  theme_minimal()

# Build a DALEX explainer for the logistic model. Drop the outcome column
# by name instead of by positional index ([, -73]) so the code does not
# break if columns are reordered or added upstream.
ex_logistic <- explain_tidymodels(final_fit, 
                   data = data_nypd |> select(-over3h), 
                   y = data_nypd$over3h == 1,
                   label = "logistic")
Preparation of a new explainer is initiated
  -> model label       :  logistic 
  -> data              :  21534  rows  72  cols 
  -> data              :  tibble converted into a data.frame 
  -> target variable   :  21534  values 
  -> predict function  :  yhat.workflow  will be used (  default  )
  -> predicted values  :  No value for predict function target column. (  default  )
  -> model_info        :  package tidymodels , ver. 1.0.0 , task classification (  default  ) 
  -> model_info        :  Model info detected classification task but 'y' is a logical . Converted to numeric.  (  NOTE  )
  -> predicted values  :  numerical, min =  0.002851162 , mean =  0.1638339 , max =  0.9725077  
  -> residual function :  difference between y and yhat (  default  )
  -> residuals         :  numerical, min =  -0.8114293 , mean =  -2.713633e-09 , max =  0.9903004  
  A new explainer has been created!  

Repeat the analysis with another model (e.g., random forest; neural network; etc.).

# switch to random forest
rand_spec <- rand_forest(
  mode = "classification",
  mtry = tune()
) |> 
  set_engine("ranger")

# new recipe

rec <- recipe(over3h ~ location_type + community_board + borough + 
                weekday + population + population_density + housing_units + 
                occupied_housing_units + median_home_value + median_household_income,
              data = data_nypd) |>
  step_dummy(all_nominal_predictors()) |>
  step_impute_knn(all_predictors()) |>
  step_scale(all_predictors()) 

# update workflow
rand_wf <- wf |>
  update_recipe(rec) |>
  update_model(rand_spec)

# create grid to tune over
grid <- data.frame(mtry = seq(5, 30, by = 5))

# tune model
results_rf <- tune_grid(rand_wf,
                     grid = grid, 
                     resamples = data_cv)
show_best(results_rf, "roc_auc")
# A tibble: 5 × 7
   mtry .metric .estimator  mean     n std_err .config             
  <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1    20 roc_auc binary     0.830     5 0.00310 Preprocessor1_Model4
2    15 roc_auc binary     0.830     5 0.00295 Preprocessor1_Model3
3    25 roc_auc binary     0.829     5 0.00327 Preprocessor1_Model5
4    30 roc_auc binary     0.828     5 0.00352 Preprocessor1_Model6
5    10 roc_auc binary     0.825     5 0.00260 Preprocessor1_Model2
# Pick the mtry value with the best cross-validated AUC (metric passed
# by name; positional use is deprecated in tune)
best_params <- select_best(results_rf, metric = "roc_auc")

final_rf_wf <- rand_wf |>
  finalize_workflow(best_params)

# Refit the finalized random forest on the full data set
rf_fit <- fit(final_rf_wf, data = data_nypd)
# Build a DALEX explainer for the forest. Drop the outcome column by name
# rather than positional index ([, -73]) for robustness to reordering.
ex_rf <- explain_tidymodels(rf_fit, 
                   data = data_nypd |> select(-over3h), 
                   y = data_nypd$over3h == 1,
                   label = "random forest")
Preparation of a new explainer is initiated
  -> model label       :  random forest 
  -> data              :  21534  rows  72  cols 
  -> data              :  tibble converted into a data.frame 
  -> target variable   :  21534  values 
  -> predict function  :  yhat.workflow  will be used (  default  )
  -> predicted values  :  No value for predict function target column. (  default  )
  -> model_info        :  package tidymodels , ver. 1.0.0 , task classification (  default  ) 
  -> model_info        :  Model info detected classification task but 'y' is a logical . Converted to numeric.  (  NOTE  )
  -> predicted values  :  numerical, min =  4.860093e-05 , mean =  0.1642249 , max =  0.9437438  
  -> residual function :  difference between y and yhat (  default  )
  -> residuals         :  numerical, min =  -0.8639433 , mean =  -0.0003909769 , max =  0.9733889  
  A new explainer has been created!  
# Compute classification performance measures (recall, precision, F1,
# accuracy, AUC) and residual summaries for each explainer
logistic_per <- model_performance(ex_logistic)
rf_per <- model_performance(ex_rf)

logistic_per
Measures for:  classification
recall     : 0.1451247 
precision  : 0.6448363 
f1         : 0.2369273 
accuracy   : 0.8468468 
auc        : 0.8094355

Residuals:
         0%         10%         20%         30%         40%         50% 
-0.81142926 -0.30330020 -0.19036595 -0.14083562 -0.10674678 -0.06560108 
        60%         70%         80%         90%        100% 
-0.03450531 -0.02138780 -0.01117944  0.63415837  0.99030043 
rf_per
Measures for:  classification
recall     : 0.3177438 
precision  : 0.7209003 
f1         : 0.4410781 
accuracy   : 0.8680691 
auc        : 0.8965362

Residuals:
           0%           10%           20%           30%           40% 
-0.8639433230 -0.2944662615 -0.1806810766 -0.1154710980 -0.0708922523 
          50%           60%           70%           80%           90% 
-0.0340832010 -0.0157616149 -0.0051814599 -0.0002548776  0.5396458885 
         100% 
 0.9733888526 
# Compare the two models with precision-recall curves
plot(logistic_per, rf_per, geom = "prc")

# Compare ROC curves; per the measures above, the random forest has the
# higher AUC (0.897 vs 0.809 for the logistic model)
plot(logistic_per, rf_per, geom = "roc")