Improving Models

Load packages and data

library(here)
here() starts at C:/Users/Raquel/GitHub/MADA/RaquelFrancisco-MADA-portfolio
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.3     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.2
✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
✔ parsnip      1.0.3     ✔ yardstick    1.1.0
✔ recipes      1.0.4     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Dig deeper into tidy modeling with R at https://www.tmwr.org
library(dplyr)

data <- readRDS(here('fluanalysis/data/SypAct_clean.rds')) # load the cleaned data
tibble(data) # overview of the data
# A tibble: 730 × 26
   SwollenLymph…¹ Chest…² Chill…³ Nasal…⁴ Sneeze Fatigue Subje…⁵ Heada…⁶ Weakn…⁷
   <fct>          <fct>   <fct>   <fct>   <fct>  <fct>   <fct>   <fct>   <fct>  
 1 Yes            No      No      No      No     Yes     Yes     Yes     Mild   
 2 Yes            Yes     No      Yes     No     Yes     Yes     Yes     Severe 
 3 Yes            Yes     Yes     Yes     Yes    Yes     Yes     Yes     Severe 
 4 Yes            Yes     Yes     Yes     Yes    Yes     Yes     Yes     Severe 
 5 Yes            No      Yes     No      No     Yes     Yes     Yes     Modera…
 6 No             No      Yes     No      Yes    Yes     Yes     Yes     Modera…
 7 No             No      Yes     No      No     Yes     Yes     No      Mild   
 8 No             Yes     Yes     Yes     Yes    Yes     Yes     Yes     Severe 
 9 Yes            Yes     Yes     Yes     No     Yes     Yes     Yes     Modera…
10 No             Yes     No      Yes     No     Yes     No      Yes     Modera…
# … with 720 more rows, 17 more variables: CoughIntensity <fct>, Myalgia <fct>,
#   RunnyNose <fct>, AbPain <fct>, ChestPain <fct>, Diarrhea <fct>,
#   EyePn <fct>, Insomnia <fct>, ItchyEye <fct>, Nausea <fct>, EarPn <fct>,
#   Pharyngitis <fct>, Breathless <fct>, ToothPn <fct>, Vomit <fct>,
#   Wheeze <fct>, BodyTemp <dbl>, and abbreviated variable names
#   ¹​SwollenLymphNodes, ²​ChestCongestion, ³​ChillsSweats, ⁴​NasalCongestion,
#   ⁵​SubjectiveFever, ⁶​Headache, ⁷​Weakness
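Before splitting, it can help to check how balanced the categorical outcome is; that informs whether a stratified split is worth considering (see the sketch after the split below). A minimal sketch, not run as part of the original analysis:

# Check the class balance of the outcome
data %>% 
  count(Nausea) %>% 
  mutate(prop = n / sum(n))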

Split Data

set.seed(123)
# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 

data_split <- initial_split(data, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)
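If the outcome turns out to be imbalanced, stratifying the split keeps the proportion of Nausea = Yes similar in the training and test sets. A minimal sketch using the strata argument of initial_split(); the object names (data_split_strat, etc.) are just illustrative and the rest of the workflow would be unchanged:

# Hypothetical alternative: stratify the split on the outcome
data_split_strat <- initial_split(data, prop = 3/4, strata = Nausea)
train_strat <- training(data_split_strat)
test_strat  <- testing(data_split_strat)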

Make a recipe and fit a model

# Create a recipe with Nausea as the categorical outcome and all other variables as predictors

data_recipe <- recipe(Nausea ~ ., data = train_data)

# Logistic regression model specification
recipe_mod <- logistic_reg() %>% set_engine("glm")

# Model workflow to pair model and recipe 
mod_flow <- workflow() %>% 
  add_model(recipe_mod) %>% 
  add_recipe(data_recipe)

mod_flow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Computational engine: glm 

Train the model on the training data

data_fit <- 
  mod_flow %>% 
  fit(data = train_data)

data_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 32 × 5
   term                 estimate std.error statistic p.value
   <chr>                   <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)            -2.83      9.07     -0.312  0.755 
 2 SwollenLymphNodesYes   -0.435     0.235    -1.85   0.0649
 3 ChestCongestionYes      0.277     0.252     1.10   0.271 
 4 ChillsSweatsYes         0.333     0.355     0.939  0.348 
 5 NasalCongestionYes      0.233     0.300     0.778  0.437 
 6 SneezeYes               0.115     0.253     0.455  0.649 
 7 FatigueYes              0.288     0.460     0.626  0.531 
 8 SubjectiveFeverYes      0.402     0.271     1.48   0.138 
 9 HeadacheYes             0.644     0.367     1.75   0.0794
10 WeaknessMild           -0.274     0.574    -0.478  0.633 
# … with 22 more rows
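The coefficients above are on the log-odds scale. Exponentiating them gives odds ratios, which are often easier to interpret; a small sketch, not run above:

# Odds ratios with 95% confidence intervals for the logistic model
data_fit %>% 
  extract_fit_parsnip() %>% 
  tidy(exponentiate = TRUE, conf.int = TRUE)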

Use the trained workflow to predict on the test data

predict(data_fit, test_data)
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 No         
 2 Yes        
 3 No         
 4 No         
 5 No         
 6 No         
 7 No         
 8 No         
 9 No         
10 Yes        
# … with 173 more rows
data_aug <- 
  augment(data_fit, test_data)

# The data look like: 
data_aug %>%
  select(Nausea, .pred_No, .pred_Yes)
# A tibble: 183 × 3
   Nausea .pred_No .pred_Yes
   <fct>     <dbl>     <dbl>
 1 No        0.961    0.0394
 2 Yes       0.121    0.879 
 3 Yes       0.769    0.231 
 4 Yes       0.744    0.256 
 5 Yes       0.759    0.241 
 6 No        0.771    0.229 
 7 No        0.544    0.456 
 8 No        0.745    0.255 
 9 No        0.941    0.0585
10 Yes       0.171    0.829 
# … with 173 more rows

ROC Curve

data_aug %>% 
  roc_curve(truth = Nausea, .pred_No) %>% 
  autoplot()

data_aug %>% 
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.709
# ROC AUC > 0.7; the model might be useful. 
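ROC AUC summarizes how well the predicted probabilities rank cases; metrics based on the hard class predictions can complement it. A minimal sketch using the augmented test data:

# Confusion matrix and accuracy based on the predicted classes
data_aug %>% conf_mat(truth = Nausea, estimate = .pred_class)
data_aug %>% accuracy(truth = Nausea, estimate = .pred_class)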

Alternative Model: Main Predictor for the Categorical Outcome

# Create new recipe
data_recipe2 <- recipe(Nausea ~ Myalgia, data = train_data) #%>%
#  step_nzv(all_predictors(), freq_cut = 995/5, unique_cut = 10) %>%
#  step_ordinalscore(all_of(ordered_names))

# Model workflow to pair model and recipe 
mod_flow2 <- workflow() %>% 
  add_model(recipe_mod) %>% 
  add_recipe(data_recipe2)

# Train the model on the training data
data_fit2 <- 
  mod_flow2 %>% 
  fit(data = train_data)

data_aug2 <- 
  augment(data_fit2, test_data)

data_aug2 %>% 
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.633
# ROC AUC < 0.7; this model is not very useful. 
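To compare the full and Myalgia-only models visually, their ROC curves can be overlaid on one plot. A sketch, assuming both augmented test sets (data_aug and data_aug2) are in memory; the model labels are just illustrative:

# Overlay ROC curves for the full model and the Myalgia-only model
bind_rows(
  full    = roc_curve(data_aug,  truth = Nausea, .pred_No),
  myalgia = roc_curve(data_aug2, truth = Nausea, .pred_No),
  .id = "model"
) %>% 
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_path() +
  geom_abline(linetype = "dashed")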

The following was added by SETH LATTNER

Fitting a continuous outcome (BodyTemp)

data_recipe3 <- recipe(BodyTemp ~ ., data = train_data) #%>%
#  step_nzv(all_predictors(), freq_cut = 995/5, unique_cut = 10) %>%
#  step_ordinalscore(all_of(ordered_names))

# Linear regression model specification
recipe_mod3 <- linear_reg() %>% set_engine("lm")

# Model workflow to pair model and recipe 
mod_flow3 <- workflow() %>% 
  add_model(recipe_mod3) %>% 
  add_recipe(data_recipe3)

mod_flow3
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: lm 

Train the model on the training data

data_fit3 <- 
  mod_flow3 %>% 
  fit(data = train_data)

data_fit3 %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 32 × 5
   term                 estimate std.error statistic   p.value
   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
 1 (Intercept)           98.2        0.352   279.    0        
 2 SwollenLymphNodesYes  -0.104      0.108    -0.960 0.337    
 3 ChestCongestionYes     0.0566     0.114     0.496 0.620    
 4 ChillsSweatsYes        0.279      0.151     1.84  0.0664   
 5 NasalCongestionYes    -0.213      0.133    -1.60  0.110    
 6 SneezeYes             -0.400      0.116    -3.44  0.000627 
 7 FatigueYes             0.375      0.187     2.00  0.0461   
 8 SubjectiveFeverYes     0.517      0.122     4.25  0.0000259
 9 HeadacheYes           -0.0614     0.151    -0.406 0.685    
10 WeaknessMild          -0.0473     0.226    -0.209 0.834    
# … with 22 more rows
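Overall fit statistics for the full linear model on the training data (R-squared, AIC, and so on) can be pulled from the underlying lm object with broom::glance(). A small sketch:

# Fit statistics for the underlying lm object
data_fit3 %>% 
  extract_fit_engine() %>% 
  glance()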

Use the trained workflow to predict on the test data

predict(data_fit3, test_data)
# A tibble: 183 × 1
   .pred
   <dbl>
 1  99.5
 2  98.9
 3  99.0
 4  98.7
 5  98.5
 6  99.0
 7  99.5
 8  99.7
 9  98.9
10  99.6
# … with 173 more rows
data_aug3 <- 
  augment(data_fit3, test_data)

# The data look like: 
data_aug3 %>%
  select(BodyTemp, .pred)
# A tibble: 183 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1     98.3  99.5
 2    101.   98.9
 3     98.8  99.0
 4     98.5  98.7
 5     98.1  98.5
 6     98.4  99.0
 7     99.5  99.5
 8     98.8  99.7
 9    102.   98.9
10     99.7  99.6
# … with 173 more rows
#calculate RMSE
yardstick::rmse(data_aug3, BodyTemp, .pred)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.14
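A quick observed-versus-predicted plot helps judge what an RMSE of about 1.1 degrees looks like in practice. A minimal sketch:

# Observed vs. predicted body temperature on the test data
ggplot(data_aug3, aes(x = BodyTemp, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed body temperature", y = "Predicted body temperature")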

Alternative Model: Main Predictor for the Continuous Outcome

# Create new recipe
data_recipe4 <- recipe(BodyTemp ~ RunnyNose, data = train_data) #%>%
#  step_nzv(all_predictors(), freq_cut = 995/5, unique_cut = 10) %>%
# step_ordinalscore(all_of(ordered_names))

# Model workflow to pair model and recipe 
mod_flow4 <- workflow() %>% 
  add_model(recipe_mod3) %>% 
  add_recipe(data_recipe4)

# Train the model on the training data
data_fit4 <- 
  mod_flow4 %>% 
  fit(data = train_data)

data_fit4 %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic p.value
  <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    99.2      0.0969   1024.   0      
2 RunnyNoseYes   -0.362    0.115      -3.16 0.00168
#predict from test data
predict(data_fit4, test_data)
# A tibble: 183 × 1
   .pred
   <dbl>
 1  99.2
 2  98.9
 3  98.9
 4  98.9
 5  98.9
 6  98.9
 7  99.2
 8  99.2
 9  99.2
10  99.2
# … with 173 more rows
#augment test data
data_aug4 <- 
  augment(data_fit4, test_data)

#the data look like:
data_aug4 %>%
  select(BodyTemp, .pred)
# A tibble: 183 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1     98.3  99.2
 2    101.   98.9
 3     98.8  98.9
 4     98.5  98.9
 5     98.1  98.9
 6     98.4  98.9
 7     99.5  99.2
 8     98.8  99.2
 9    102.   99.2
10     99.7  99.2
# … with 173 more rows
#calculate RMSE
yardstick::rmse(data_aug4, BodyTemp, .pred)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        1.12

The RMSE of the univariate model 4 (1.12) was slightly lower than that of the full model 3 (1.14), suggesting a marginally better fit to the test data.
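For a side-by-side comparison, the two test-set RMSE estimates can be collected into a single table. A small sketch; the model labels are just illustrative:

# Collect test-set RMSE for the full and RunnyNose-only models
bind_rows(
  full      = yardstick::rmse(data_aug3, truth = BodyTemp, estimate = .pred),
  runnynose = yardstick::rmse(data_aug4, truth = BodyTemp, estimate = .pred),
  .id = "model"
)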