Machine Learning

Load packages

library(here)
library(tidyverse)
library(rsample) #Data split
library(tidymodels)
library(rpart) #Model fit
library(ranger) #Model fit
library(glmnet) #Model fit
library(rpart.plot)  #viz of decision tree
library(vip) #viz of variable importance plots
library(ggpmisc) #for adding linear regression to plots

Load data

data <- readRDS(here("fluanalysis/data/processed_data/symptoms_clean.RDS"))

Data splitting

set.seed(123)

data_split <- initial_split(
  data, prop = 7/10, #70:30 split
  strata = BodyTemp) #stratify on the outcome for a more balanced train/test split
  
train <- training(data_split)
test  <- testing(data_split)

Create recipes for later use

#Training data
bt_rec_train <- recipe(BodyTemp ~ ., data = train) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>% #dummy-code nominal predictors
  step_ordinalscore() %>% #convert ordinal factors to numeric scores
  step_zv(all_predictors()) #drop zero-variance predictors

#Testing data
bt_rec_test <- recipe(BodyTemp ~ ., data = test) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>% #dummy-code nominal predictors
  step_ordinalscore() %>%
  step_zv(all_predictors()) 

Null Model

5-fold cross-validation (5 folds, repeated 5 times)

fold_bt_train <- vfold_cv(train, v = 5, repeats = 5, strata = BodyTemp)

fold_bt_test <- vfold_cv(test, v = 5, repeats = 5, strata = BodyTemp)

Create recipe

#Train Data
rec_train <- 
  recipe(BodyTemp ~ ., data = train) %>% #predict BodyTemp with all variables
  step_dummy(all_nominal(), -all_outcomes()) %>% #dummy-code nominal predictors
  step_ordinalscore() %>%
  step_zv(all_predictors()) 

#Test Data
rec_test <- 
  recipe(BodyTemp ~ ., data = test) %>% #use the test set here, not the training set
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_ordinalscore() %>%
  step_zv(all_predictors()) 

Define Model

lm_mod <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Create workflows

#training data
null_train_wf <- workflow() %>% 
  add_model(lm_mod) %>% 
  add_recipe(bt_rec_train)

#testing data
null_test_wf <- workflow() %>% 
  add_model(lm_mod) %>% 
  add_recipe(bt_rec_test)

Fit models

#training data
null_train_fit <- fit_resamples(null_train_wf, resamples = fold_bt_train)

#testing data
null_test_fit <- fit_resamples(null_test_wf, resamples = fold_bt_test)

Compare train/test metrics

null_train_met <- collect_metrics(null_train_fit)
null_train_met
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 rmse    standard   1.17      25 0.0171  Preprocessor1_Model1
2 rsq     standard   0.0849    25 0.00815 Preprocessor1_Model1
null_test_met <- collect_metrics(null_test_fit)
null_test_met
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 rmse    standard   1.27      25 0.0273  Preprocessor1_Model1
2 rsq     standard   0.0146    25 0.00444 Preprocessor1_Model1

Training RMSE: 1.17

Testing RMSE: 1.27

Tree model tuning and fitting

Specify model

tune_spec_dtree <- decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()) %>%
  set_engine("rpart") %>% 
  set_mode("regression")

tune_spec_dtree
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = tune()
  tree_depth = tune()

Computational engine: rpart 

Create workflow

dtree_wf <- workflow() %>%
  add_model(tune_spec_dtree) %>%
  add_recipe(bt_rec_train)

Create grid for model tuning

tree_grid_dtree <-
  grid_regular(
    cost_complexity(), 
    tree_depth(), 
    levels = 5)

tree_grid_dtree
# A tibble: 25 × 2
   cost_complexity tree_depth
             <dbl>      <int>
 1    0.0000000001          1
 2    0.0000000178          1
 3    0.00000316            1
 4    0.000562              1
 5    0.1                   1
 6    0.0000000001          4
 7    0.0000000178          4
 8    0.00000316            4
 9    0.000562              4
10    0.1                   4
# … with 15 more rows

Use cross-validation for tuning

dtree_resample <- dtree_wf %>% 
  tune_grid(
    resamples = fold_bt_train,
    grid = tree_grid_dtree)
dtree_resample %>%
  collect_metrics()
# A tibble: 50 × 8
   cost_complexity tree_depth .metric .estimator     mean     n  std_err .config
             <dbl>      <int> <chr>   <chr>         <dbl> <int>    <dbl> <chr>  
 1    0.0000000001          1 rmse    standard     1.19      25  0.0170  Prepro…
 2    0.0000000001          1 rsq     standard     0.0383    25  0.00415 Prepro…
 3    0.0000000178          1 rmse    standard     1.19      25  0.0170  Prepro…
 4    0.0000000178          1 rsq     standard     0.0383    25  0.00415 Prepro…
 5    0.00000316            1 rmse    standard     1.19      25  0.0170  Prepro…
 6    0.00000316            1 rsq     standard     0.0383    25  0.00415 Prepro…
 7    0.000562              1 rmse    standard     1.19      25  0.0170  Prepro…
 8    0.000562              1 rsq     standard     0.0383    25  0.00415 Prepro…
 9    0.1                   1 rmse    standard     1.21      25  0.0169  Prepro…
10    0.1                   1 rsq     standard   NaN          0 NA       Prepro…
# … with 40 more rows

Model plotting

dtree_resample %>% collect_metrics() %>%
  mutate(tree_depth = factor(tree_depth)) %>%
  ggplot(aes(cost_complexity, mean, color = tree_depth)) +
  geom_line(linewidth = 1.5, alpha = 0.6) +
  geom_point(size = 2) +
  facet_wrap(~ .metric, scales = "free", nrow = 2) +
  scale_x_log10(labels = scales::label_number()) +
  scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
Warning: Removed 5 rows containing missing values (`geom_line()`).
Warning: Removed 5 rows containing missing values (`geom_point()`).

Select the best model

The show_best() function shows the top five candidate models by default; here we specify the metric explicitly and set n = 1 to display only the best one.

dtree_resample %>%
  show_best(metric = "rmse", n = 1)
# A tibble: 1 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1    0.0000000001          1 rmse    standard    1.19    25  0.0170 Preprocesso…

Tree model RMSE: 1.19

A tree depth of 1 gives the best performance.

#Select the best-performing model
best_tree <- dtree_resample %>%
  select_best(metric = "rmse") #pull out the best-performing hyperparameters

best_tree
# A tibble: 1 × 3
  cost_complexity tree_depth .config              
            <dbl>      <int> <chr>                
1    0.0000000001          1 Preprocessor1_Model01

Create final fit based on best model

Update workflow

dtree_final_wf <- dtree_wf %>% 
  finalize_workflow(best_tree) #update workflow with values from select_best()

dtree_final_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps

• step_dummy()
• step_ordinalscore()
• step_zv()

── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)

Main Arguments:
  cost_complexity = 1e-10
  tree_depth = 1

Computational engine: rpart 

Fit model to training data

dtree_final_fit <- dtree_final_wf %>%
  fit(train) 
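
The rpart.plot package loaded at the top can be used to inspect the fitted tree. A minimal sketch, assuming the dtree_final_fit object above; extract_fit_engine() pulls the underlying rpart object out of the workflow:

#Sketch: visualize the fitted decision tree (not part of the original run)
dtree_final_fit %>%
  extract_fit_engine() %>% #pull the underlying rpart object from the workflow
  rpart.plot::rpart.plot(roundint = FALSE) #roundint = FALSE avoids a warning with recipe-processed data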

Evaluate final fit

Calculating residuals

dtree_residuals <- dtree_final_fit %>%
  augment(train) %>% #use augment() to make predictions
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) #calculate residuals

dtree_residuals
# A tibble: 508 × 3
   .pred BodyTemp .resid
   <dbl>    <dbl>  <dbl>
 1  99.2     97.8 -1.44 
 2  99.2     98.1 -1.14 
 3  98.7     98.1 -0.591
 4  98.7     98.2 -0.491
 5  98.7     97.8 -0.891
 6  98.7     98.2 -0.491
 7  98.7     98.1 -0.591
 8  99.2     98   -1.24 
 9  99.2     97.7 -1.54 
10  99.2     98.2 -1.04 
# … with 498 more rows

Plot predictions vs. true value

dtree_pred_plot <- ggplot(dtree_residuals, 
                          aes(x = BodyTemp, 
                              y = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Actual: Decision Tree", 
       x = "Body Temperature Outcome", 
       y = "Body Temperature Prediction")
dtree_pred_plot

Plot residuals vs. predictions

dtree_residual_plot <- ggplot(dtree_residuals, 
                              aes(y = .resid, 
                                  x = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Residuals: Decision Tree", 
       x = "Body Temperature Prediction", 
       y = "Residuals")
plot(dtree_residual_plot) #view plot

LASSO model tuning and fitting

Specify Model

lasso_mod <- linear_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

Setting mixture to 1 specifies a pure LASSO penalty, which can shrink the coefficients of irrelevant predictors all the way to zero, effectively removing them and yielding a simpler model.

Create Workflow

lasso_wf <- workflow() %>%
  add_model(lasso_mod) %>%
  add_recipe(bt_rec_train)

Creating a Tuning Grid

lasso_grid <- tibble(penalty = 10^seq(-3, 0, length.out = 30))

Use cross-validation for tuning

lasso_resample <- lasso_wf %>%
  tune_grid(resamples = fold_bt_train,
            grid = lasso_grid,
            control = control_grid(verbose = FALSE, save_pred = TRUE),
            metrics = metric_set(rmse))

lasso_resample %>%
  collect_metrics()
# A tibble: 30 × 7
   penalty .metric .estimator  mean     n std_err .config              
     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 0.001   rmse    standard    1.17    25  0.0170 Preprocessor1_Model01
 2 0.00127 rmse    standard    1.17    25  0.0170 Preprocessor1_Model02
 3 0.00161 rmse    standard    1.17    25  0.0170 Preprocessor1_Model03
 4 0.00204 rmse    standard    1.17    25  0.0170 Preprocessor1_Model04
 5 0.00259 rmse    standard    1.17    25  0.0170 Preprocessor1_Model05
 6 0.00329 rmse    standard    1.17    25  0.0169 Preprocessor1_Model06
 7 0.00418 rmse    standard    1.17    25  0.0169 Preprocessor1_Model07
 8 0.00530 rmse    standard    1.17    25  0.0168 Preprocessor1_Model08
 9 0.00672 rmse    standard    1.16    25  0.0168 Preprocessor1_Model09
10 0.00853 rmse    standard    1.16    25  0.0167 Preprocessor1_Model10
# … with 20 more rows

Model plotting

lr_plot <- lasso_resample %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() +
  scale_x_log10(labels = scales::label_number())

lr_plot

Select the best model

#Show best model
lasso_resample %>% show_best(metric = "rmse", n = 1)
# A tibble: 1 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1  0.0452 rmse    standard    1.15    25  0.0162 Preprocessor1_Model17
#Select best model
best_lasso <- lasso_resample %>%
  select_best(metric = "rmse")

Lasso model RMSE: 1.15 (slightly better than the tree model)

Create final fit based on best model

lasso_final_wf <- lasso_wf %>% 
  finalize_workflow(best_lasso) #update workflow with best model

lasso_final_fit <- lasso_final_wf %>%
  fit(train) #fit updated workflow to training data
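
As a check on the mixture = 1 behavior described earlier, the coefficients of the finalized fit can be inspected to see which predictors LASSO shrank to exactly zero. A minimal sketch, assuming the lasso_final_fit object above; tidy() on the extracted glmnet fit reports coefficients at the selected penalty:

#Sketch: which predictors survive the LASSO penalty? (not part of the original run)
lasso_final_fit %>%
  extract_fit_parsnip() %>% #pull the parsnip/glmnet fit out of the workflow
  tidy() %>% #coefficients at the finalized penalty value
  filter(estimate != 0) %>% #non-zero coefficients are the predictors LASSO kept
  arrange(desc(abs(estimate)))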

Evaluate final fit

Calculating residuals

lasso_residuals <- lasso_final_fit %>%
  augment(train) %>% #use augment() to make predictions from train data
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) #calculate residuals and make new row.

lasso_residuals
# A tibble: 508 × 3
   .pred BodyTemp .resid
   <dbl>    <dbl>  <dbl>
 1  98.7     97.8 -0.913
 2  98.8     98.1 -0.705
 3  98.5     98.1 -0.391
 4  98.8     98.2 -0.637
 5  98.7     97.8 -0.947
 6  98.6     98.2 -0.401
 7  98.2     98.1 -0.147
 8  99.3     98   -1.28 
 9  98.8     97.7 -1.11 
10  99.0     98.2 -0.790
# … with 498 more rows

Plot predictions vs. true value

lasso_pred_plot <- ggplot(lasso_residuals, 
                          aes(x = BodyTemp, 
                              y = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Actual: LASSO", 
       x = "Body Temperature Outcome", 
       y = "Body Temperature Prediction") +
  stat_poly_line() +
  stat_poly_eq()
lasso_pred_plot

The R-squared is low, but there is still a positive correlation between actual values and predictions, which is encouraging.

Plot residuals vs. predictions

lasso_residual_plot <- ggplot(lasso_residuals, 
                              aes(y = .resid, 
                                  x = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Residuals: LASSO", 
       x = "Body Temperature Prediction", 
       y = "Residuals") +
   stat_poly_line() +
  stat_poly_eq()
plot(lasso_residual_plot) #view plot

There should be no systematic relationship between predictions and residuals, and none is apparent.

Random Forest model tuning and fitting

Detect cores for random forest computation

cores <- parallel::detectCores()
cores
[1] 4

Specify Model

rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", num.threads = cores) %>% 
  set_mode("regression")

Create workflow

rf_wf <- workflow() %>% add_model(rf_mod) %>%
  add_recipe(bt_rec_train)

Set up the tuning grid

Rather than defining a manual grid, we let tune_grid() generate a space-filling grid of 25 candidate combinations of mtry and min_n (trees is fixed at 1000 in the model specification).

Use cross-validation for tuning

rf_resample <- rf_wf %>% 
  tune_grid(fold_bt_train,
            grid = 25, #25 automatically generated candidate combinations
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))

Check CV metrics

rf_resample %>%
  collect_metrics()
# A tibble: 25 × 8
    mtry min_n .metric .estimator  mean     n std_err .config              
   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1     8    38 rmse    standard    1.16    25  0.0160 Preprocessor1_Model01
 2    29    14 rmse    standard    1.20    25  0.0162 Preprocessor1_Model02
 3    24     2 rmse    standard    1.22    25  0.0158 Preprocessor1_Model03
 4    15    31 rmse    standard    1.17    25  0.0160 Preprocessor1_Model04
 5     7    32 rmse    standard    1.16    25  0.0164 Preprocessor1_Model05
 6    20     9 rmse    standard    1.20    25  0.0159 Preprocessor1_Model06
 7    13     7 rmse    standard    1.19    25  0.0163 Preprocessor1_Model07
 8     2    15 rmse    standard    1.17    25  0.0164 Preprocessor1_Model08
 9    23    28 rmse    standard    1.18    25  0.0161 Preprocessor1_Model09
10    23     4 rmse    standard    1.22    25  0.0158 Preprocessor1_Model10
# … with 15 more rows

Model performance plotting

rf_resample %>% autoplot()

Select the best model

#Show best model
rf_resample %>%
  show_best(metric = "rmse", n = 1)
# A tibble: 1 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     8    38 rmse    standard    1.16    25  0.0160 Preprocessor1_Model01
#Select best model
best_rf <- rf_resample %>%
  select_best(metric = "rmse")

Random forest model RMSE: 1.16

Create final fit based on best model

rf_final_wf <- 
  rf_wf %>% 
  finalize_workflow(best_rf) #update workflow with best model

rf_final_fit <- 
  rf_final_wf %>%
  fit(train) #fit best model to training data
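
The vip package loaded at the top can show which predictors drive the forest. Ranger only records importance scores if asked for them at fit time, so this sketch (an assumption, not part of the original run) re-fits the finalized workflow with permutation importance enabled before calling vip():

#Sketch: variable importance for the tuned random forest (not part of the original run)
rf_imp_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", importance = "permutation", num.threads = cores) %>%
  set_mode("regression") %>%
  finalize_model(best_rf) #plug in the tuned mtry and min_n

rf_imp_fit <- rf_wf %>%
  update_model(rf_imp_spec) %>% #swap the importance-enabled spec into the workflow
  fit(train)

rf_imp_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10) #top 10 predictors by permutation importance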

Evaluate final fit

Calculating residuals

rf_residuals <- rf_final_fit %>%
  augment(train) %>% #use augment() to make predictions from train data
  select(c(.pred, BodyTemp)) %>%
  mutate(.resid = BodyTemp - .pred) #calculate residuals and make new row.

rf_residuals
# A tibble: 508 × 3
   .pred BodyTemp .resid
   <dbl>    <dbl>  <dbl>
 1  98.7     97.8 -0.854
 2  98.6     98.1 -0.513
 3  98.7     98.1 -0.643
 4  98.7     98.2 -0.535
 5  98.8     97.8 -1.02 
 6  98.5     98.2 -0.336
 7  98.3     98.1 -0.218
 8  99.1     98   -1.12 
 9  98.8     97.7 -1.07 
10  98.8     98.2 -0.644
# … with 498 more rows

Plot predictions vs. true value

rf_pred_plot <- ggplot(rf_residuals, 
                          aes(x = BodyTemp, 
                              y = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Actual: Random Forest", 
       x = "Body Temperature Actual", 
       y = "Body Temperature Prediction") +
   stat_poly_line() +
  stat_poly_eq()
rf_pred_plot

There is a stronger correlation between actual values and predictions here than in the LASSO or tree models.

Plot residuals vs. predictions

rf_residual_plot <- ggplot(rf_residuals, 
                              aes(y = .resid, 
                                  x = .pred)) + 
  geom_point() + 
  labs(title = "Predictions vs Residuals: Random Forest", 
       x = "Body Temperature Prediction", 
       y = "Residuals") +
   stat_poly_line() +
  stat_poly_eq()
plot(rf_residual_plot) #view plot

There is a slight correlation between predictions and residuals here, which is not ideal.

Model Selection

Table 1: Metrics for Models
Model RMSE Std_Err
Null Train 1.21 0.018
Null Test 1.16 0.029
Tree 1.23 0.016
LASSO 1.18 0.017
Random Forest 1.20 0.017
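
A comparison table like Table 1 can also be assembled directly from the objects created above rather than typed by hand. A minimal sketch, assuming the metric and resampling objects from the earlier sections:

#Sketch: assemble the model comparison from the resampling results (not part of the original run)
model_rmse <- bind_rows(
  null_train_met %>% filter(.metric == "rmse") %>% mutate(model = "Null Train"),
  null_test_met %>% filter(.metric == "rmse") %>% mutate(model = "Null Test"),
  dtree_resample %>% show_best(metric = "rmse", n = 1) %>% mutate(model = "Tree"),
  lasso_resample %>% show_best(metric = "rmse", n = 1) %>% mutate(model = "LASSO"),
  rf_resample %>% show_best(metric = "rmse", n = 1) %>% mutate(model = "Random Forest")) %>%
  select(model, mean, std_err)
model_rmse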

Of the three machine learning models we generated, LASSO had the lowest RMSE, though all three were close. The drawback of the LASSO fit is the weak correlation between actual values and predictions; its advantage is that there is no apparent correlation between predictions and residuals.

The random forest model performed nearly as well, with the next-lowest RMSE. Its drawback is a slight correlation between predictions and residuals (as the predicted value increases, so does the residual), but it shows a stronger correlation between actual values and predictions than the LASSO model.

The tree model performed the worst and was the only one that did not outperform the null model. Its diagnostic plots were also unpromising.

I have decided on the Random Forest model as the final model.

Final Evaluation

Having chosen a final model, I fit it to the test data once, and only once, using the last_fit() function, and report its performance along with the same diagnostic plots as above.

rf_last_fit <- rf_final_wf %>% 
  last_fit(data_split)

rf_last_fit %>% collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard     1.18    Preprocessor1_Model1
2 rsq     standard     0.00941 Preprocessor1_Model1

RMSE of 1.18 on the test data.

This performance is slightly better than the null model.
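
The same diagnostic plots can be produced for the test-set predictions. A minimal sketch, using the predictions stored by last_fit():

#Sketch: test-set diagnostics for the final model (not part of the original run)
rf_test_pred <- rf_last_fit %>%
  collect_predictions() %>% #one prediction per test-set observation
  mutate(.resid = BodyTemp - .pred) #residuals on the test data

ggplot(rf_test_pred, aes(x = BodyTemp, y = .pred)) +
  geom_point() +
  labs(title = "Predictions vs Actual: Final Model on Test Data",
       x = "Body Temperature Actual",
       y = "Body Temperature Prediction")

ggplot(rf_test_pred, aes(x = .pred, y = .resid)) +
  geom_point() +
  labs(title = "Predictions vs Residuals: Final Model on Test Data",
       x = "Body Temperature Prediction",
       y = "Residuals")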