Machine Learning

We focus on a single outcome: the continuous, numerical value of body temperature (BodyTemp).

We will:

  • Fit and tune several regression models for BodyTemp (a decision tree, a LASSO, and a random forest)
  • Compare their cross-validated RMSE against a null model
  • Evaluate the best-performing model on held-out test data
#Load packages
library(here)        #project-relative file paths
library(tidyverse)   #data wrangling and plotting
library(ggplot2)
library(tidymodels)  #rsample, recipes, parsnip, tune, workflows, yardstick, ...
library(dplyr)
library(ranger)      #random forest engine
library(rpart)       #decision tree engine
library(glmnet)      #LASSO (regularized regression) engine
library(rpart.plot)  #plotting rpart trees
library(vip)         #variable importance plots
library(finetune)    #additional tuning tools
library(kernlab)     #kernel-based models
library(doParallel)  #parallel backend to speed up tuning
data <- readRDS(here('fluanalysis/data/SypAct_clean.rds')) #upload cleaned data
tibble(data) #overview of data
# A tibble: 730 × 26
   SwollenLymphNodes ChestCongestion ChillsSweats NasalCongestion Sneeze Fatigue
   <fct>             <fct>           <fct>        <fct>           <fct>  <fct>  
 1 Yes               No              No           No              No     Yes    
 2 Yes               Yes             No           Yes             No     Yes    
 3 Yes               Yes             Yes          Yes             Yes    Yes    
 4 Yes               Yes             Yes          Yes             Yes    Yes    
 5 Yes               No              Yes          No              No     Yes    
 6 No                No              Yes          No              Yes    Yes    
 7 No                No              Yes          No              No     Yes    
 8 No                Yes             Yes          Yes             Yes    Yes    
 9 Yes               Yes             Yes          Yes             No     Yes    
10 No                Yes             No           Yes             No     Yes    
# ℹ 720 more rows
# ℹ 20 more variables: SubjectiveFever <fct>, Headache <fct>, Weakness <fct>,
#   CoughIntensity <fct>, Myalgia <fct>, RunnyNose <fct>, AbPain <fct>,
#   ChestPain <fct>, Diarrhea <fct>, EyePn <fct>, Insomnia <fct>,
#   ItchyEye <fct>, Nausea <fct>, EarPn <fct>, Pharyngitis <fct>,
#   Breathless <fct>, ToothPn <fct>, Vomit <fct>, Wheeze <fct>, BodyTemp <dbl>
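
Since body temperature is the single continuous outcome we are modeling, it is worth a quick look at its distribution before splitting the data. The lines below are a minimal sketch using only the data object loaded above.

#Summarize and plot the outcome, body temperature
summary(data$BodyTemp)

ggplot(data, aes(x = BodyTemp)) +
  geom_histogram(binwidth = 0.2) +
  labs(x = "Body temperature", y = "Count")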

Data Setup

set.seed(123)
# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 

data_split <- initial_split(data, prop = 2.8/4, strata = BodyTemp) #70% training, 30% testing

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

#5-fold cross-validation, 5 times repeated
fold_data <- vfold_cv(train_data, v = 5, repeats = 5, strata = BodyTemp)

#Create a recipe that predicts BodyTemp from all other variables and dummy-codes the nominal predictors
data_recipe <- recipe(BodyTemp ~ ., data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) 
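
Before fitting anything, a couple of quick sanity checks can confirm that the split and the preprocessing behave as intended. This is an optional sketch using the objects created above; prep() and bake() come from the recipes package and materialize the dummy-coded training data.

#Check the sizes of the training and testing sets, and the number of resamples
nrow(train_data)
nrow(test_data)
nrow(fold_data) #should be 25 rows: 5 folds x 5 repeats

#Prep and bake the recipe to confirm the dummy coding of the nominal predictors
data_recipe %>%
  prep(training = train_data) %>%
  bake(new_data = NULL) %>%
  glimpse()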

Null model performance

null_recipe <- recipe(BodyTemp ~ 1, data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Linear regression model specification
recipe_mod <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

# Model workflow to pair model and recipe 
null_flow <- workflow() %>% 
  add_model(recipe_mod) %>% 
  add_recipe(null_recipe)

#Fit the null model to the folds created from the training data
null_train <- fit_resamples(null_flow, resamples = fold_data)
(The warnings this step produces simply reflect that the null model predicts a single constant value, so R-squared cannot be computed and is returned as NA; the RMSE estimates are unaffected.)
#Collect the cross-validated RMSE for the null model on the training folds
Null_Met <- collect_metrics(null_train)
#RMSE = 1.22
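
As a sanity check on that baseline, the null model's RMSE can also be approximated by hand: it is simply the error from predicting the mean training-set body temperature for every observation. A minimal sketch (computed on the full training set, so it will be close to, but not identical to, the cross-validated value above):

#RMSE of always predicting the mean body temperature
mean_temp <- mean(train_data$BodyTemp)
sqrt(mean((train_data$BodyTemp - mean_temp)^2))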

Model tuning and fitting

Fit a Tree

#TUNING HYPERPARAMETERS 
tune_spec <- 
  decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()
  ) %>% 
  set_engine("rpart") %>% 
  set_mode("regression")

tree_grid <- grid_regular(cost_complexity(),
                          tree_depth(),
                          levels = 5)

tree_grid %>% 
  count(tree_depth)
# A tibble: 5 × 2
  tree_depth     n
       <int> <int>
1          1     5
2          4     5
3          8     5
4         11     5
5         15     5
#Model tuning with a grid
tree_wf <- workflow() %>%
  add_model(tune_spec) %>%
  add_recipe(data_recipe)

tree_res <- #Code will take a hot minute to run
  tree_wf %>% 
  tune_grid(
    resamples = fold_data,
    grid = tree_grid
    )
(As with the null model, the warnings produced during tuning all come from the cost_complexity = 0.1 candidates: trees pruned that aggressively make constant predictions, so R-squared cannot be computed for those combinations and is returned as NA.)
tree_res %>% 
  collect_metrics()
# A tibble: 50 × 8
   cost_complexity tree_depth .metric .estimator     mean     n  std_err .config
             <dbl>      <int> <chr>   <chr>         <dbl> <int>    <dbl> <chr>  
 1    0.0000000001          1 rmse    standard     1.19      25  0.0181  Prepro…
 2    0.0000000001          1 rsq     standard     0.0361    25  0.00422 Prepro…
 3    0.0000000178          1 rmse    standard     1.19      25  0.0181  Prepro…
 4    0.0000000178          1 rsq     standard     0.0361    25  0.00422 Prepro…
 5    0.00000316            1 rmse    standard     1.19      25  0.0181  Prepro…
 6    0.00000316            1 rsq     standard     0.0361    25  0.00422 Prepro…
 7    0.000562              1 rmse    standard     1.19      25  0.0181  Prepro…
 8    0.000562              1 rsq     standard     0.0361    25  0.00422 Prepro…
 9    0.1                   1 rmse    standard     1.21      25  0.0177  Prepro…
10    0.1                   1 rsq     standard   NaN          0 NA       Prepro…
# ℹ 40 more rows
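Before picking a winner, it helps to visualize how RMSE varies across the grid; autoplot() on the tuning results does this directly (a quick sketch):

#Visualize RMSE across cost_complexity and tree_depth
tree_res %>%
  autoplot(metric = "rmse")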
#show and select best
tree_res %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1    0.0000000001          1 rmse    standard    1.19    25  0.0181 Preprocesso…
2    0.0000000178          1 rmse    standard    1.19    25  0.0181 Preprocesso…
3    0.00000316            1 rmse    standard    1.19    25  0.0181 Preprocesso…
4    0.000562              1 rmse    standard    1.19    25  0.0181 Preprocesso…
5    0.0000000001          4 rmse    standard    1.20    25  0.0187 Preprocesso…
#Best cross-validated RMSE ~ 1.19 (tree depth 1), only marginally better than the null model (1.22)

best_tree <- tree_res %>%
  select_best(metric = "rmse")
#Final tuned model
final_wf <- 
  tree_wf %>% 
  finalize_workflow(best_tree)

#Fit the finalized workflow to the training data
final_fit <- 
  final_wf %>%
  fit(train_data) 

#Plot the final fitted tree
rpart.plot(extract_fit_parsnip(final_fit)$fit, roundint = FALSE)
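
The selected tree is very shallow (depth 1), so its predictive ability is limited. One way to see this directly is to plot its predictions against the observed temperatures on the training data; the sketch below assumes augment(), which appends a .pred column to a fitted workflow's predictions.

#Predicted vs. observed body temperature for the tuned tree (training data)
tree_preds <- augment(final_fit, new_data = train_data)

ggplot(tree_preds, aes(x = BodyTemp, y = .pred)) +
  geom_point(alpha = 0.3) +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed body temperature", y = "Predicted body temperature")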

Fit a LASSO

#BUILD THE MODEL
lasso_mod <- 
  linear_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

# Reuse the recipe and create the LASSO workflow
data_recipe
── Recipe ──────────────────────────────────────────────────────────────────────
── Inputs 
Number of variables by role
outcome:    1
predictor: 25
── Operations 
• Dummy variables from: all_nominal(), -all_outcomes()
lasso_wf <- 
  workflow() %>% 
  add_model(lasso_mod) %>% 
  add_recipe(data_recipe)

#Create grid for tuning
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5) # lowest penalty values
Selecting by penalty
# A tibble: 5 × 1
   penalty
     <dbl>
1 0.0001  
2 0.000127
3 0.000161
4 0.000204
5 0.000259
lr_reg_grid %>% top_n(5)  # highest penalty values
Selecting by penalty
# A tibble: 5 × 1
  penalty
    <dbl>
1  0.0386
2  0.0489
3  0.0621
4  0.0788
5  0.1   
#TRAIN AND TUNE THE MODEL
lr_res <- 
  lasso_wf %>% 
  tune_grid(resamples = fold_data,
            grid = lr_reg_grid,
            control = control_grid(verbose = FALSE, save_pred = TRUE),
            metrics = NULL)

lr_res %>%
  collect_metrics()
# A tibble: 60 × 7
    penalty .metric .estimator   mean     n std_err .config              
      <dbl> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
 1 0.0001   rmse    standard   1.18      25 0.0167  Preprocessor1_Model01
 2 0.0001   rsq     standard   0.0819    25 0.00875 Preprocessor1_Model01
 3 0.000127 rmse    standard   1.18      25 0.0167  Preprocessor1_Model02
 4 0.000127 rsq     standard   0.0819    25 0.00875 Preprocessor1_Model02
 5 0.000161 rmse    standard   1.18      25 0.0167  Preprocessor1_Model03
 6 0.000161 rsq     standard   0.0819    25 0.00875 Preprocessor1_Model03
 7 0.000204 rmse    standard   1.18      25 0.0167  Preprocessor1_Model04
 8 0.000204 rsq     standard   0.0819    25 0.00875 Preprocessor1_Model04
 9 0.000259 rmse    standard   1.18      25 0.0167  Preprocessor1_Model05
10 0.000259 rsq     standard   0.0819    25 0.00875 Preprocessor1_Model05
# ℹ 50 more rows
lr_res %>%
  show_best(metric = "rmse")
# A tibble: 5 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1  0.0489 rmse    standard    1.15    25  0.0170 Preprocessor1_Model27
2  0.0621 rmse    standard    1.15    25  0.0170 Preprocessor1_Model28
3  0.0386 rmse    standard    1.16    25  0.0170 Preprocessor1_Model26
4  0.0304 rmse    standard    1.16    25  0.0169 Preprocessor1_Model25
5  0.0788 rmse    standard    1.16    25  0.0172 Preprocessor1_Model29
#Select the best-performing penalty value
best_lasso <- lr_res %>%
  select_best(metric = "rmse")
#Best cross-validated RMSE ~ 1.15 at penalty ~ 0.049, the best so far

#Final Model
lasso_final_wf <- 
  lasso_wf %>% 
  finalize_workflow(best_lasso)

lasso_final_fit <- 
  lasso_final_wf %>%
  fit(train_data) 

#Plot the LASSO coefficient paths against the penalty (lambda)
x <- extract_fit_engine(lasso_final_fit)
plot(x, "lambda")
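
Beyond the coefficient paths, it can be useful to see which predictors the selected penalty actually retains. The sketch below assumes the parsnip tidier for glmnet fits, which returns coefficients at the stored penalty value.

#Coefficients retained by the LASSO at the selected penalty
lasso_final_fit %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(estimate != 0) %>%
  arrange(desc(abs(estimate)))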

Fit a Random Forest

#BUILD THE MODEL AND IMPROVE TRAINING TIME
cores <- parallel::detectCores()
cores
[1] 4
f_mod <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger",importance = "impurity", num.threads = cores) %>% 
  set_mode("regression")

f_wf <- workflow() %>%
  add_model(f_mod) %>%
  add_recipe(data_recipe)

#TRAIN AND TUNE THE MODEL
f_mod
Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  trees = 1000
  min_n = tune()

Engine-Specific Arguments:
  importance = impurity
  num.threads = cores

Computational engine: ranger 
extract_parameter_set_dials(f_mod)
Collection of 2 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[?]
      min_n min_n nparam[+]

Model parameters needing finalization:
   # Randomly Selected Predictors ('mtry')

See `?dials::finalize` or `?dials::update.parameters` for more information.
f_res <- #This code takes a long time to run!
  f_wf %>% 
  tune_grid(fold_data,
            grid = 25,
            control = control_grid(save_pred = TRUE),
            metrics = NULL)
i Creating pre-processing data to finalize unknown parameter: mtry
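As with the tree, autoplot() gives a quick picture of how RMSE varies over the randomly sampled mtry and min_n values before committing to a single combination (a sketch):

#Visualize RMSE across the sampled mtry and min_n values
f_res %>%
  autoplot(metric = "rmse")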
#Show and select the best

f_res %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     5    27 rmse    standard    1.16    25  0.0168 Preprocessor1_Model18
2     3    19 rmse    standard    1.16    25  0.0167 Preprocessor1_Model09
3     8    24 rmse    standard    1.17    25  0.0167 Preprocessor1_Model04
4    11    36 rmse    standard    1.17    25  0.0168 Preprocessor1_Model13
5     6    15 rmse    standard    1.17    25  0.0167 Preprocessor1_Model22
#Best cross-validated RMSE ~ 1.16, essentially on par with the LASSO

f_best <- 
  f_res %>% 
  select_best(metric = "rmse")

f_final_wf <- 
  f_wf %>% 
  finalize_workflow(f_best)

#Final model fit
f_final_fit <- 
  f_final_wf %>%
  fit(train_data) 

f_final_fit %>% 
  extract_fit_parsnip() %>% 
  vip(num_features = 28)

#Plot variable importance from the underlying ranger fit
fx <- extract_fit_engine(f_final_fit)
vip(fx)
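
To make the comparison explicit before the final evaluation, the best cross-validated RMSE from each model can be gathered into one table alongside the null-model baseline. This is a sketch; bind_rows() simply fills the differing hyperparameter columns with NA.

#Best cross-validated RMSE for each model, plus the null-model baseline
bind_rows(
  collect_metrics(null_train) %>% filter(.metric == "rmse") %>% mutate(model = "null model"),
  show_best(tree_res, metric = "rmse", n = 1) %>% mutate(model = "decision tree"),
  show_best(lr_res, metric = "rmse", n = 1) %>% mutate(model = "LASSO"),
  show_best(f_res, metric = "rmse", n = 1) %>% mutate(model = "random forest")
) %>%
  select(model, mean, std_err)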

Final Evaluation

#Based on cross-validated RMSE, the LASSO model appears to be the best
#Fit the final LASSO workflow on the training data and evaluate it on the held-out test set

lasso_final_test <- 
  lasso_final_wf %>%
  last_fit(data_split) 

lasso_final_test %>%
   collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      1.16   Preprocessor1_Model1
2 rsq     standard      0.0299 Preprocessor1_Model1
#Test-set RMSE ~ 1.16, consistent with the cross-validated estimate and only a modest improvement over the null model
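
Finally, collect_predictions() pulls the held-out test-set predictions from the last_fit() result, so the final model's predictions can be plotted against the observed temperatures (a sketch):

#Test-set predictions vs. observed body temperature for the final LASSO
lasso_test_preds <- collect_predictions(lasso_final_test)

ggplot(lasso_test_preds, aes(x = BodyTemp, y = .pred)) +
  geom_point(alpha = 0.4) +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed body temperature", y = "Predicted body temperature")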