library(tidyverse)
library(tidymodels)
library(broom)
library(DALEXtra)
Data analysis
load("data/data_nypd.rds")
Define a binary variable over3h which is 1 if duration is greater than 3 hours. Note that it can be obtained even for censored duration.
<- data_nypd |>
data_nypd mutate(over3h = ifelse(duration_hours > 3 | is.na(closed_date),
1, 0) |>
as.factor()
)
ggplot(data_nypd, aes(x = over3h)) +
geom_bar(fill = "pink") +
theme_minimal()
Build a logistic model to predict over3h using the 311 request data as well as those zip code level covariates. If your model has tuning parameters, justify their choices.
<- vfold_cv(data_nypd, v = 5)
data_cv
# write out model specification
<- logistic_reg(penalty = tune(), mixture = tune()) |>
logistic_spec set_engine("glmnet")
# write out recipe
<- recipe(over3h ~ location_type + community_board + borough +
rec + population + population_density + housing_units +
weekday + median_home_value + median_household_income,
occupied_housing_units data = data_nypd) |>
step_dummy(all_nominal_predictors()) |>
step_impute_knn(all_predictors()) |>
step_ns(population,
population_density,
housing_units,
occupied_housing_units,
median_home_value,
median_household_income, deg_free = 5) |>
step_scale(all_predictors())
# put together in workflow
<- workflow() |>
wf add_recipe(rec) |>
add_model(logistic_spec)
# create grid to tune over
<- expand_grid(penalty = seq(0, 20, by = 5),
grid mixture = seq(0, 1, by = 0.2))
# tune model
<- tune_grid(wf,
results grid = grid,
resamples = data_cv)
Use appropriate metrics to assess the performance of the model.
show_best(results, "roc_auc")
# A tibble: 5 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0 0 roc_auc binary 0.802 5 0.00284 Preprocessor1_Model01
2 0 1 roc_auc binary 0.802 5 0.00292 Preprocessor1_Model26
3 0 0.6 roc_auc binary 0.802 5 0.00293 Preprocessor1_Model16
4 0 0.2 roc_auc binary 0.802 5 0.00296 Preprocessor1_Model06
5 0 0.4 roc_auc binary 0.802 5 0.00295 Preprocessor1_Model11
<- select_best(results, "roc_auc")
best_params <- wf |>
final_wf finalize_workflow(best_params)
<- fit(final_wf, data = data_nypd) final_fit
tidy(final_fit) |>
arrange(desc(abs(estimate))) |>
slice(1:20) |>
ggplot(aes(x = abs(estimate), y = fct_reorder(term, abs(estimate)))) +
geom_col(fill = "pink") +
labs(
y = "Term",
x = "Absolute value of coefficient"
+
) theme_minimal()
<- explain_tidymodels(final_fit,
ex_logistic data = data_nypd[,-73],
y = data_nypd$over3h == 1,
label = "logistic")
Preparation of a new explainer is initiated
-> model label : logistic
-> data : 21534 rows 72 cols
-> data : tibble converted into a data.frame
-> target variable : 21534 values
-> predict function : yhat.workflow will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
-> model_info : Model info detected classification task but 'y' is a logical . Converted to numeric. ( NOTE )
-> predicted values : numerical, min = 0.002851162 , mean = 0.1638339 , max = 0.9725077
-> residual function : difference between y and yhat ( default )
-> residuals : numerical, min = -0.8114293 , mean = -2.713633e-09 , max = 0.9903004
A new explainer has been created!
Repeat the analysis with another model (e.g., random forest; neural network; etc.).
# switch to random forest
<- rand_forest(
rand_spec mode = "classification",
mtry = tune()
|>
) set_engine("ranger")
# new recipe
<- recipe(over3h ~ location_type + community_board + borough +
rec + population + population_density + housing_units +
weekday + median_home_value + median_household_income,
occupied_housing_units data = data_nypd) |>
step_dummy(all_nominal_predictors()) |>
step_impute_knn(all_predictors()) |>
step_scale(all_predictors())
# update workflow
<- wf |>
rand_wf update_recipe(rec) |>
update_model(rand_spec)
# create grid to tune over
<- data.frame(mtry = seq(5, 30, by = 5))
grid
# tune model
<- tune_grid(rand_wf,
results_rf grid = grid,
resamples = data_cv)
show_best(results_rf, "roc_auc")
# A tibble: 5 × 7
mtry .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 20 roc_auc binary 0.830 5 0.00310 Preprocessor1_Model4
2 15 roc_auc binary 0.830 5 0.00295 Preprocessor1_Model3
3 25 roc_auc binary 0.829 5 0.00327 Preprocessor1_Model5
4 30 roc_auc binary 0.828 5 0.00352 Preprocessor1_Model6
5 10 roc_auc binary 0.825 5 0.00260 Preprocessor1_Model2
<- select_best(results_rf, "roc_auc")
best_params
<- rand_wf |>
final_rf_wf finalize_workflow(best_params)
<- fit(final_rf_wf, data = data_nypd) rf_fit
<- explain_tidymodels(rf_fit,
ex_rf data = data_nypd[, -73],
y = data_nypd$over3h == 1,
label = "random forest")
Preparation of a new explainer is initiated
-> model label : random forest
-> data : 21534 rows 72 cols
-> data : tibble converted into a data.frame
-> target variable : 21534 values
-> predict function : yhat.workflow will be used ( default )
-> predicted values : No value for predict function target column. ( default )
-> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
-> model_info : Model info detected classification task but 'y' is a logical . Converted to numeric. ( NOTE )
-> predicted values : numerical, min = 4.860093e-05 , mean = 0.1642249 , max = 0.9437438
-> residual function : difference between y and yhat ( default )
-> residuals : numerical, min = -0.8639433 , mean = -0.0003909769 , max = 0.9733889
A new explainer has been created!
<- model_performance(ex_logistic)
logistic_per <- model_performance(ex_rf)
rf_per
logistic_per
Measures for: classification
recall : 0.1451247
precision : 0.6448363
f1 : 0.2369273
accuracy : 0.8468468
auc : 0.8094355
Residuals:
0% 10% 20% 30% 40% 50%
-0.81142926 -0.30330020 -0.19036595 -0.14083562 -0.10674678 -0.06560108
60% 70% 80% 90% 100%
-0.03450531 -0.02138780 -0.01117944 0.63415837 0.99030043
rf_per
Measures for: classification
recall : 0.3177438
precision : 0.7209003
f1 : 0.4410781
accuracy : 0.8680691
auc : 0.8965362
Residuals:
0% 10% 20% 30% 40%
-0.8639433230 -0.2944662615 -0.1806810766 -0.1154710980 -0.0708922523
50% 60% 70% 80% 90%
-0.0340832010 -0.0157616149 -0.0051814599 -0.0002548776 0.5396458885
100%
0.9733888526
plot(logistic_per, rf_per, geom = "prc")
plot(logistic_per, rf_per, geom = "roc")