Machine Learning
Load packages
library(here)
library(tidyverse)
library(rsample) #Data split
library(tidymodels)
library(rpart) #Model fit
library(ranger) #Model fit
library(glmnet) #Model fit
library(rpart.plot) #viz of decision tree
library(vip) #viz of variable importance plots
library(ggpmisc) #for adding linear regression to plots
Load data
data <- readRDS(here("fluanalysis/data/processed_data/symptoms_clean.RDS"))
Data splitting
set.seed(123)
data_split <- initial_split(data,
  prop = 7/10, #70:30 split
  strata = BodyTemp) #stratify for more balanced outcomes across the train/test split
train <- training(data_split)
test <- testing(data_split)
Create recipes for later use
#Training data
bt_rec_train <- recipe(BodyTemp ~ ., data = train) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>% #convert nominal predictors to dummy variables
  step_ordinalscore() %>% #convert ordinal factors to numeric scores
  step_zv(all_predictors()) #drop zero-variance predictors
#Testing data
bt_rec_test <- recipe(BodyTemp ~ ., data = test) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_ordinalscore() %>%
  step_zv(all_predictors())
Null Model
5-Fold Cross Validation
fold_bt_train <- vfold_cv(train, v = 5, repeats = 5, strata = BodyTemp) #5 folds x 5 repeats = 25 resamples
fold_bt_test <- vfold_cv(test, v = 5, repeats = 5, strata = BodyTemp)
Create recipe
#Train Data
rec_train <- recipe(BodyTemp ~ ., data = train) %>% #predict BodyTemp with all other variables
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_ordinalscore() %>%
  step_zv(all_predictors())
#Test Data
rec_test <- recipe(BodyTemp ~ ., data = test) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_ordinalscore() %>%
  step_zv(all_predictors())
Define Model
lm_mod <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")
Create workflows
#training data
null_train_wf <- workflow() %>%
  add_model(lm_mod) %>%
  add_recipe(bt_rec_train)
#testing data
null_test_wf <- workflow() %>%
  add_model(lm_mod) %>%
  add_recipe(bt_rec_test)
Fit models
#training data
null_train_fit <- fit_resamples(null_train_wf, resamples = fold_bt_train)
#testing data
null_test_fit <- fit_resamples(null_test_wf, resamples = fold_bt_test)
Compare train/test metrics
null_train_met <- collect_metrics(null_train_fit)
null_train_met
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 1.17 25 0.0171 Preprocessor1_Model1
2 rsq standard 0.0849 25 0.00815 Preprocessor1_Model1
null_test_met <- collect_metrics(null_test_fit)
null_test_met
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 1.27 25 0.0273 Preprocessor1_Model1
2 rsq standard 0.0146 25 0.00444 Preprocessor1_Model1
Training RMSE: 1.17
Testing RMSE: 1.27
Tree model tuning and fitting
Specify model
tune_spec_dtree <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune()) %>%
  set_engine("rpart") %>%
  set_mode("regression")
tune_spec_dtree
Decision Tree Model Specification (regression)
Main Arguments:
cost_complexity = tune()
tree_depth = tune()
Computational engine: rpart
Create workflow
dtree_wf <- workflow() %>%
  add_model(tune_spec_dtree) %>%
  add_recipe(bt_rec_train)
Create grid for model tuning
tree_grid_dtree <- grid_regular(
  cost_complexity(),
  tree_depth(),
  levels = 5) #5 levels x 2 parameters = 25 candidate combinations
tree_grid_dtree
# A tibble: 25 × 2
cost_complexity tree_depth
<dbl> <int>
1 0.0000000001 1
2 0.0000000178 1
3 0.00000316 1
4 0.000562 1
5 0.1 1
6 0.0000000001 4
7 0.0000000178 4
8 0.00000316 4
9 0.000562 4
10 0.1 4
# … with 15 more rows
Use cross-validation for tuning
dtree_resample <- dtree_wf %>%
  tune_grid(
    resamples = fold_bt_train,
    grid = tree_grid_dtree)
dtree_resample %>%
  collect_metrics()
# A tibble: 50 × 8
cost_complexity tree_depth .metric .estimator mean n std_err .config
<dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 1 rmse standard 1.19 25 0.0170 Prepro…
2 0.0000000001 1 rsq standard 0.0383 25 0.00415 Prepro…
3 0.0000000178 1 rmse standard 1.19 25 0.0170 Prepro…
4 0.0000000178 1 rsq standard 0.0383 25 0.00415 Prepro…
5 0.00000316 1 rmse standard 1.19 25 0.0170 Prepro…
6 0.00000316 1 rsq standard 0.0383 25 0.00415 Prepro…
7 0.000562 1 rmse standard 1.19 25 0.0170 Prepro…
8 0.000562 1 rsq standard 0.0383 25 0.00415 Prepro…
9 0.1 1 rmse standard 1.21 25 0.0169 Prepro…
10 0.1 1 rsq standard NaN 0 NA Prepro…
# … with 40 more rows
Model plotting
dtree_resample %>% collect_metrics() %>%
  mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(cost_complexity, mean, color = tree_depth)) +
geom_line(linewidth = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
Warning: Removed 5 rows containing missing values (`geom_line()`).
Warning: Removed 5 rows containing missing values (`geom_point()`).
Select the best model
The show_best() function shows the top 5 candidate models by default; we set n = 1 to display only the best-performing one.
dtree_resample %>%
  show_best(n=1)
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 1 × 8
cost_complexity tree_depth .metric .estimator mean n std_err .config
<dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 1 rmse standard 1.19 25 0.0170 Preprocesso…
Tree model RMSE: 1.19
A tree depth of 1 gives the best performance metrics.
#Selects best performing model
best_tree <- dtree_resample %>%
  select_best() #pull out the best-performing hyperparameters
best_tree
# A tibble: 1 × 3
cost_complexity tree_depth .config
<dbl> <int> <chr>
1 0.0000000001 1 Preprocessor1_Model01
Create final fit based on best model
Update workflow
dtree_final_wf <- dtree_wf %>%
  finalize_workflow(best_tree) #update workflow with values from select_best()
dtree_final_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps
• step_dummy()
• step_ordinalscore()
• step_zv()
── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)
Main Arguments:
cost_complexity = 1e-10
tree_depth = 1
Computational engine: rpart
Fit model to training data
dtree_final_fit <- dtree_final_wf %>%
  fit(train)
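Since the selected tree has a depth of 1, the final model is a single split. The rpart.plot package loaded at the top is not otherwise used; a minimal sketch of how the fitted tree could be visualized with it, assuming extract_fit_engine() returns the underlying rpart object:
#pull the underlying rpart object out of the fitted workflow and plot it
dtree_final_fit %>%
  extract_fit_engine() %>%
  rpart.plot::rpart.plot(roundint = FALSE) #roundint = FALSE avoids a warning when the training data cannot be retrieved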
Evaluate final fit
Calculating residuals
dtree_residuals <- dtree_final_fit %>%
  augment(train) %>% #use augment() to make predictions on the training data
select(c(.pred, BodyTemp)) %>%
mutate(.resid = BodyTemp - .pred) #calculate residuals
dtree_residuals
# A tibble: 508 × 3
.pred BodyTemp .resid
<dbl> <dbl> <dbl>
1 99.2 97.8 -1.44
2 99.2 98.1 -1.14
3 98.7 98.1 -0.591
4 98.7 98.2 -0.491
5 98.7 97.8 -0.891
6 98.7 98.2 -0.491
7 98.7 98.1 -0.591
8 99.2 98 -1.24
9 99.2 97.7 -1.54
10 99.2 98.2 -1.04
# … with 498 more rows
Plot predictions vs. true value
dtree_pred_plot <- ggplot(dtree_residuals,
                          aes(x = BodyTemp,
y = .pred)) +
geom_point() +
labs(title = "Predictions vs Actual: Decision Tree",
x = "Body Temperature Outcome",
y = "Body Temperature Prediction")
dtree_pred_plot
Plot residuals vs. predictions
dtree_residual_plot <- ggplot(dtree_residuals,
                              aes(y = .resid,
x = .pred)) +
geom_point() +
labs(title = "Predictions vs Residuals: Decision Tree",
x = "Body Temperature Prediction",
y = "Residuals")
plot(dtree_residual_plot) #view plot
LASSO model tuning and fitting
Specify Model
lasso_mod <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
Setting mixture = 1 specifies a pure LASSO penalty, which means the glmnet model can shrink the coefficients of irrelevant predictors to zero and effectively choose a simpler model.
Create Workflow
lasso_wf <- workflow() %>%
  add_model(lasso_mod) %>%
  add_recipe(bt_rec_train)
Creating a Tuning Grid
lasso_grid <- tibble(penalty = 10^seq(-3, 0, length.out = 30))
Use cross-validation for tuning
lasso_resample <- lasso_wf %>%
  tune_grid(resamples = fold_bt_train,
grid = lasso_grid,
control = control_grid(verbose = FALSE, save_pred = TRUE),
metrics = metric_set(rmse))
lasso_resample %>%
  collect_metrics()
# A tibble: 30 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.001 rmse standard 1.17 25 0.0170 Preprocessor1_Model01
2 0.00127 rmse standard 1.17 25 0.0170 Preprocessor1_Model02
3 0.00161 rmse standard 1.17 25 0.0170 Preprocessor1_Model03
4 0.00204 rmse standard 1.17 25 0.0170 Preprocessor1_Model04
5 0.00259 rmse standard 1.17 25 0.0170 Preprocessor1_Model05
6 0.00329 rmse standard 1.17 25 0.0169 Preprocessor1_Model06
7 0.00418 rmse standard 1.17 25 0.0169 Preprocessor1_Model07
8 0.00530 rmse standard 1.17 25 0.0168 Preprocessor1_Model08
9 0.00672 rmse standard 1.16 25 0.0168 Preprocessor1_Model09
10 0.00853 rmse standard 1.16 25 0.0167 Preprocessor1_Model10
# … with 20 more rows
Model plotting
lr_plot <- lasso_resample %>%
  collect_metrics() %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10(labels = scales::label_number())
lr_plot
Select the best model
#Show best model
lasso_resample %>% show_best(n=1)
# A tibble: 1 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0452 rmse standard 1.15 25 0.0162 Preprocessor1_Model17
#Select best model
best_lasso <- lasso_resample %>%
  select_best()
LASSO model RMSE: 1.15 (slightly better than the tree model)
Create final fit based on best model
lasso_final_wf <- lasso_wf %>%
  finalize_workflow(best_lasso) #update workflow with the best model
lasso_final_fit <- lasso_final_wf %>%
  fit(train) #fit the updated workflow to the training data
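To see the variable selection described above (mixture = 1) in action, the coefficients of the finalized fit can be inspected; a minimal sketch, assuming tidy() on the extracted glmnet fit returns coefficients at the selected penalty:
#predictors whose coefficients were not shrunk to zero by the LASSO penalty
lasso_final_fit %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(estimate != 0) %>%
  arrange(desc(abs(estimate)))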
Evaluate final fit
Calculating residuals
lasso_residuals <- lasso_final_fit %>%
  augment(train) %>% #use augment() to make predictions from the training data
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) #calculate residuals in a new column
lasso_residuals
# A tibble: 508 × 3
.pred BodyTemp .resid
<dbl> <dbl> <dbl>
1 98.7 97.8 -0.913
2 98.8 98.1 -0.705
3 98.5 98.1 -0.391
4 98.8 98.2 -0.637
5 98.7 97.8 -0.947
6 98.6 98.2 -0.401
7 98.2 98.1 -0.147
8 99.3 98 -1.28
9 98.8 97.7 -1.11
10 99.0 98.2 -0.790
# … with 498 more rows
Plot predictions vs. true value
lasso_pred_plot <- ggplot(lasso_residuals,
                          aes(x = BodyTemp,
y = .pred)) +
geom_point() +
labs(title = "Predictions vs Actual: LASSO",
x = "Body Temperature Outcome",
y = "Body Temperature Prediction") +
stat_poly_line() +
stat_poly_eq()
lasso_pred_plot
The R-squared is low, but there is still a positive correlation between predictions and actual values (which is good).
Plot residuals vs. predictions
lasso_residual_plot <- ggplot(lasso_residuals,
                              aes(y = .resid,
x = .pred)) +
geom_point() +
labs(title = "Predictions vs Residuals: LASSO",
x = "Body Temperature Prediction",
y = "Residuals") +
stat_poly_line() +
stat_poly_eq()
plot(lasso_residual_plot) #view plot
There should be no relationship here, which there isn’t.
Random Forest model tuning and fitting
Detect cores for RFM computation
cores <- parallel::detectCores()
cores
[1] 4
Specify Model
rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", num.threads = cores) %>%
set_mode("regression")
Create workflow
rf_wf <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(bt_rec_train)
Create grid for model tuning
rf_grid <- expand.grid(mtry = c(3, 4, 5, 6),
                       min_n = c(40, 50, 60),
                       trees = c(500, 1000))
Use cross-validation for tuning
rf_resample <- rf_wf %>%
  tune_grid(fold_bt_train,
            grid = 25, #a space-filling grid of 25 candidates is used here rather than rf_grid above
control = control_grid(save_pred = TRUE),
metrics = metric_set(rmse))
Check CV metrics
rf_resample %>%
  collect_metrics()
# A tibble: 25 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 8 38 rmse standard 1.16 25 0.0160 Preprocessor1_Model01
2 29 14 rmse standard 1.20 25 0.0162 Preprocessor1_Model02
3 24 2 rmse standard 1.22 25 0.0158 Preprocessor1_Model03
4 15 31 rmse standard 1.17 25 0.0160 Preprocessor1_Model04
5 7 32 rmse standard 1.16 25 0.0164 Preprocessor1_Model05
6 20 9 rmse standard 1.20 25 0.0159 Preprocessor1_Model06
7 13 7 rmse standard 1.19 25 0.0163 Preprocessor1_Model07
8 2 15 rmse standard 1.17 25 0.0164 Preprocessor1_Model08
9 23 28 rmse standard 1.18 25 0.0161 Preprocessor1_Model09
10 23 4 rmse standard 1.22 25 0.0158 Preprocessor1_Model10
# … with 15 more rows
Model performance plotting
rf_resample %>% autoplot()
Select the best model
#Show best model
rf_resample %>%
  show_best(n=1)
# A tibble: 1 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 8 38 rmse standard 1.16 25 0.0160 Preprocessor1_Model01
#Select best model
best_rf <- rf_resample %>%
  select_best(metric = "rmse")
Random forest model RMSE: 1.16
Create final fit based on best model
rf_final_wf <- rf_wf %>%
  finalize_workflow(best_rf) #update workflow with the best model
rf_final_fit <- rf_final_wf %>%
  fit(train) #fit the best model to the training data
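The vip package loaded at the top is not otherwise used. A minimal sketch of a variable importance plot for the random forest, under the assumption that the ranger engine is refit with importance = "permutation" (the rf_mod spec above does not request importance scores):
#refit the tuned random forest with permutation importance enabled
rf_imp_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", importance = "permutation", num.threads = cores) %>%
  set_mode("regression")
rf_imp_fit <- workflow() %>%
  add_model(rf_imp_spec) %>%
  add_recipe(bt_rec_train) %>%
  finalize_workflow(best_rf) %>% #reuse the tuned mtry and min_n values
  fit(train)
#plot the top 10 predictors by permutation importance
rf_imp_fit %>%
  extract_fit_engine() %>%
  vip(num_features = 10)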
Evaluate final fit
Calculating residuals
rf_residuals <- rf_final_fit %>%
  augment(train) %>% #use augment() to make predictions from the training data
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) #calculate residuals in a new column
rf_residuals
# A tibble: 508 × 3
.pred BodyTemp .resid
<dbl> <dbl> <dbl>
1 98.7 97.8 -0.854
2 98.6 98.1 -0.513
3 98.7 98.1 -0.643
4 98.7 98.2 -0.535
5 98.8 97.8 -1.02
6 98.5 98.2 -0.336
7 98.3 98.1 -0.218
8 99.1 98 -1.12
9 98.8 97.7 -1.07
10 98.8 98.2 -0.644
# … with 498 more rows
Plot predictions vs. true value
rf_pred_plot <- ggplot(rf_residuals,
                       aes(x = BodyTemp,
y = .pred)) +
geom_point() +
labs(title = "Predictions vs Actual: Random Forest",
x = "Body Temperature Actual",
y = "Body Temperature Prediction") +
stat_poly_line() +
stat_poly_eq()
rf_pred_plot
The correlation between predictions and actual values is stronger here than in the LASSO or tree models.
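A minimal sketch that quantifies this comparison with yardstick's metrics(), assuming the dtree_residuals, lasso_residuals, and rf_residuals tibbles created above; note these are training-set (in-sample) numbers, so they are more optimistic than the cross-validated estimates:
#compare in-sample RMSE and R-squared across the three finalized models
bind_rows(
  dtree_residuals %>% metrics(truth = BodyTemp, estimate = .pred) %>% mutate(model = "Tree"),
  lasso_residuals %>% metrics(truth = BodyTemp, estimate = .pred) %>% mutate(model = "LASSO"),
  rf_residuals %>% metrics(truth = BodyTemp, estimate = .pred) %>% mutate(model = "Random Forest")
) %>%
  filter(.metric %in% c("rmse", "rsq"))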
Plot residuals vs. predictions
rf_residual_plot <- ggplot(rf_residuals,
                           aes(y = .resid,
x = .pred)) +
geom_point() +
labs(title = "Predictions vs Residuals: Random Forest",
x = "Body Temperature Prediction",
y = "Residuals") +
stat_poly_line() +
stat_poly_eq()
plot(rf_residual_plot) #view plot
There is a slight correlation here, which is not ideal.
Model Selection
Model | RMSE | Std_Err |
---|---|---|
Null Train | 1.17 | 0.017 |
Null Test | 1.27 | 0.027 |
Tree | 1.19 | 0.017 |
LASSO | 1.15 | 0.016 |
Random Forest | 1.16 | 0.016 |
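A sketch of how this comparison table could be assembled directly from the tuning results, assuming the resample objects created above are still available:
#pull the best cross-validated RMSE (and its standard error) for each model
bind_rows(
  collect_metrics(null_train_fit) %>% filter(.metric == "rmse") %>% mutate(model = "Null Train"),
  collect_metrics(null_test_fit) %>% filter(.metric == "rmse") %>% mutate(model = "Null Test"),
  show_best(dtree_resample, metric = "rmse", n = 1) %>% mutate(model = "Tree"),
  show_best(lasso_resample, metric = "rmse", n = 1) %>% mutate(model = "LASSO"),
  show_best(rf_resample, metric = "rmse", n = 1) %>% mutate(model = "Random Forest")
) %>%
  select(model, RMSE = mean, Std_Err = std_err)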
Of the three machine learning models we generated, LASSO had the lowest RMSE, though all three were close. The disadvantage of LASSO is the weak correlation between actual and predicted values; its advantage is that there is no correlation between predictions and residuals.
The Random Forest model also performed decently with the next lowest RMSE. The disadvantage with this model is that there is a slight correlation between predictions and residuals (as prediction value increases, so does the residual), but the advantage is that there is a stronger correlation between actual values and predictions than in the LASSO model.
The tree model performed the poorest and was the only one that did not outperform the null model. Its diagnostic plots also did not look promising.
I have decided on the Random Forest model as the final model.
Final Evaluation
With the Random Forest model chosen, I fit it once (and only once) to the test data using last_fit() and report its performance and the same diagnostic plots as above.
rf_last_fit <- rf_final_wf %>%
  last_fit(data_split)
rf_last_fit %>% collect_metrics()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 1.18 Preprocessor1_Model1
2 rsq standard 0.00941 Preprocessor1_Model1
The RMSE on the test data is 1.18.
This performance is slightly better than the null model.
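The diagnostic plots for the test data can be built from the predictions stored by last_fit(); a minimal sketch, assuming collect_predictions() returns .pred alongside BodyTemp:
#predictions and residuals on the test set
rf_test_preds <- rf_last_fit %>%
  collect_predictions() %>%
  mutate(.resid = BodyTemp - .pred)
#predictions vs actual values
ggplot(rf_test_preds, aes(x = BodyTemp, y = .pred)) +
  geom_point() +
  labs(title = "Predictions vs Actual: Random Forest (test data)",
       x = "Body Temperature Actual", y = "Body Temperature Prediction")
#predictions vs residuals
ggplot(rf_test_preds, aes(x = .pred, y = .resid)) +
  geom_point() +
  labs(title = "Predictions vs Residuals: Random Forest (test data)",
       x = "Body Temperature Prediction", y = "Residuals")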