Homework Unit 5: Resampling
Introduction
This file serves as the answer key for the Unit_05 homework. Unit 5 Resampling Methods for Model Selection and Evaluation in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to select among model configurations using resampling methods. This includes training models with 2 different feature sets (specified in your recipe) and tuning hyperparameters (k in KNN).
Set up
Handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()
Load required packages
library(tidyverse)
library(tidymodels)
library(xfun, include.only = "cache_rds")
Source function scripts (John’s or your own)
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
Specify other global settings
Since we are going to use cache_rds(), we also include rerun_setting <- FALSE in this chunk.
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
rerun_setting <- FALSE
Paths
path_data <- "homework/unit_05"
Set up parallel processing
Note you can type cl into your console to see how many cores your computer has.
cl <- parallel::makePSOCKcluster(parallel::detectCores(logical = FALSE))
doParallel::registerDoParallel(cl)
Read in data
Read in the smoking_ema.csv data file.
Note you can set column types (col_types = cols()) or set show_col_types = FALSE if you don't want the read_csv() output, which is redundant with some of the glimpse() output!
data_all <- read_csv(here::here(path_data, "smoking_ema.csv"),
                     show_col_types = FALSE) |>
  glimpse()
Rows: 5,876
Columns: 13
$ subid <dbl> 11011, 11011, 11011, 11011, 11011, 11011, 11011, 11011…
$ cigs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ craving <dbl> 2, 0, 0, 1, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 4, …
$ anxious <dbl> 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ irritable <dbl> 1.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
$ sad <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ happy <dbl> 0.0, 0.0, 3.0, 2.0, 2.0, 1.0, 4.0, 3.0, 3.0, 2.0, 3.0,…
$ stressful_event <dbl> 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ positive_event <dbl> 0, 0, 2, 1, 0, 1, 4, 0, 0, 1, 3, 0, 0, 2, 0, 1, 0, 1, …
$ txt <chr> "nrt", "nrt", "nrt", "nrt", "nrt", "nrt", "nrt", "nrt"…
$ prior_cigs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ lag <dbl> 6.069444, 3.306389, 4.544444, 11.491111, 6.152222, 3.4…
$ time_since_quit <dbl> 0.6007639, 0.7385301, 0.9278819, 1.4066782, 1.6630208,…
Let's skim the data to check minimums, maximums, and missing values.
Looks good!
data_all |>
  skim_some()
Name | data_all |
Number of rows | 5876 |
Number of columns | 13 |
_______________________ | |
Column type frequency: | |
character | 1 |
numeric | 12 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
txt | 0 | 1 | 3 | 7 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
subid | 0 | 1 | 11011.00 | 22149.00 |
cigs | 0 | 1 | 0.00 | 1.00 |
craving | 0 | 1 | 0.00 | 4.00 |
anxious | 0 | 1 | 0.00 | 4.00 |
irritable | 0 | 1 | 0.00 | 4.00 |
sad | 0 | 1 | 0.00 | 4.00 |
happy | 0 | 1 | 0.00 | 4.00 |
stressful_event | 0 | 1 | 0.00 | 4.00 |
positive_event | 0 | 1 | 0.00 | 4.00 |
prior_cigs | 0 | 1 | 0.00 | 1.00 |
lag | 0 | 1 | 0.09 | 108.00 |
time_since_quit | 0 | 1 | 0.10 | 30.03 |
Next we are going to use the data dictionary to class variables
We are doing this on the whole dataset prior to splitting it, so no need for a function (i.e., we are only using this code once!)
valence_labels <- c("not at all", "mildly", "moderately", "very", "extremely")
event_labels <- c("no", "mild", "moderate", "very", "extreme")

data_all <- data_all |>
  mutate(cigs = factor(cigs, levels = 0:1, labels = c("no", "yes")),
         across(c(craving, anxious, irritable, sad, happy),
                ~ factor(.x, levels = 0:4, labels = valence_labels)),
         across(contains("_event"),
                ~ factor(.x, levels = 0:4, labels = event_labels)),
         txt = factor(txt, levels = c("placebo", "nrt")),
         prior_cigs = factor(prior_cigs, levels = 0:1, labels = c("no", "yes"))) |>
  glimpse()
Rows: 5,876
Columns: 13
$ subid <dbl> 11011, 11011, 11011, 11011, 11011, 11011, 11011, 11011…
$ cigs <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no…
$ craving <fct> moderately, not at all, not at all, mildly, not at all…
$ anxious <fct> not at all, moderately, not at all, not at all, not at…
$ irritable <fct> mildly, not at all, not at all, not at all, very, mild…
$ sad <fct> not at all, not at all, not at all, not at all, not at…
$ happy <fct> not at all, not at all, very, moderately, moderately, …
$ stressful_event <fct> no, moderate, no, no, no, moderate, no, no, no, no, no…
$ positive_event <fct> no, no, moderate, mild, no, mild, extreme, no, no, mil…
$ txt <fct> nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt,…
$ prior_cigs <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no…
$ lag <dbl> 6.069444, 3.306389, 4.544444, 11.491111, 6.152222, 3.4…
$ time_since_quit <dbl> 0.6007639, 0.7385301, 0.9278819, 1.4066782, 1.6630208,…
Let's skim again to check our new variables.
data_all |>
  skim_some()
Name | data_all |
Number of rows | 5876 |
Number of columns | 13 |
_______________________ | |
Column type frequency: | |
factor | 10 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
cigs | 0 | 1.00 | FALSE | 2 | no: 4798, yes: 1078 |
craving | 71 | 0.99 | FALSE | 5 | mil: 1766, not: 1669, mod: 1344, ver: 594 |
anxious | 56 | 0.99 | FALSE | 5 | not: 2913, mil: 1829, mod: 788, ver: 186 |
irritable | 61 | 0.99 | FALSE | 5 | not: 3131, mil: 1679, mod: 649, ver: 234 |
sad | 35 | 0.99 | FALSE | 5 | not: 4176, mil: 1081, mod: 380, ver: 130 |
happy | 79 | 0.99 | FALSE | 5 | mil: 1792, mod: 1779, not: 1203, ver: 827 |
stressful_event | 0 | 1.00 | FALSE | 5 | no: 3776, mil: 1057, mod: 620, ver: 264 |
positive_event | 0 | 1.00 | FALSE | 5 | no: 3108, mil: 1228, mod: 1006, ver: 401 |
txt | 0 | 1.00 | FALSE | 2 | nrt: 2983, pla: 2893 |
prior_cigs | 0 | 1.00 | FALSE | 2 | no: 4783, yes: 1093 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
subid | 0 | 1 | 11011.00 | 22149.00 |
lag | 0 | 1 | 0.09 | 108.00 |
time_since_quit | 0 | 1 | 0.10 | 30.03 |
Hmm, now we have missing data. Let's investigate. Any value that did not match one of the levels we supplied to factor() (0, 1, 2, 3, or 4) was silently converted to NA.
If we re-read in the raw data, we can look at the different values for our factor variables.
data_all <- read_csv(here::here(path_data, "smoking_ema.csv"),
                     show_col_types = FALSE)

data_all |>
  select(craving:positive_event) |>
  map(janitor::tabyl)
$craving
.x[[i]] n percent
0.0000000 1669 0.2840367597
0.5000000 16 0.0027229408
0.6666667 1 0.0001701838
0.7142857 1 0.0001701838
0.7500000 1 0.0001701838
1.0000000 1766 0.3005445882
1.1666667 1 0.0001701838
1.2500000 1 0.0001701838
1.3000000 1 0.0001701838
1.3333333 1 0.0001701838
1.4000000 1 0.0001701838
1.5000000 21 0.0035738598
1.6000000 1 0.0001701838
1.6666667 4 0.0006807352
1.7500000 1 0.0001701838
1.8333333 1 0.0001701838
2.0000000 1344 0.2287270252
2.3333333 1 0.0001701838
2.5000000 12 0.0020422056
2.6666667 1 0.0001701838
3.0000000 594 0.1010891763
3.3333333 2 0.0003403676
3.5000000 3 0.0005105514
4.0000000 432 0.0735194010
$anxious
.x[[i]] n percent
0.0000000 2913 0.4957454050
0.3333333 4 0.0006807352
0.5000000 16 0.0027229408
0.7500000 1 0.0001701838
1.0000000 1829 0.3112661675
1.2000000 1 0.0001701838
1.3333333 2 0.0003403676
1.4000000 1 0.0001701838
1.5000000 22 0.0037440436
1.8333333 1 0.0001701838
2.0000000 788 0.1341048332
2.3333333 1 0.0001701838
2.5000000 4 0.0006807352
3.0000000 186 0.0316541865
3.3333333 1 0.0001701838
3.5000000 2 0.0003403676
4.0000000 104 0.0176991150
$irritable
.x[[i]] n percent
0.0000000 3131 0.5328454731
0.2500000 1 0.0001701838
0.3333333 3 0.0005105514
0.5000000 24 0.0040844112
0.6666667 2 0.0003403676
0.8333333 1 0.0001701838
0.9000000 1 0.0001701838
1.0000000 1679 0.2857385977
1.3333333 1 0.0001701838
1.5000000 15 0.0025527570
1.6000000 1 0.0001701838
1.7500000 1 0.0001701838
2.0000000 649 0.1104492852
2.5000000 7 0.0011912866
2.6666667 1 0.0001701838
3.0000000 234 0.0398230088
3.3333333 1 0.0001701838
3.5000000 2 0.0003403676
4.0000000 122 0.0207624234
$sad
.x[[i]] n percent
0.0000000 4176 0.7106875425
0.1666667 1 0.0001701838
0.5000000 19 0.0032334922
0.6666667 1 0.0001701838
1.0000000 1081 0.1839686862
1.5000000 7 0.0011912866
1.6666667 2 0.0003403676
2.0000000 380 0.0646698434
2.4000000 1 0.0001701838
2.5000000 2 0.0003403676
3.0000000 130 0.0221238938
3.3333333 1 0.0001701838
3.5000000 1 0.0001701838
4.0000000 74 0.0125936011
$happy
.x[[i]] n percent
0.0000000 1203 0.2047311096
0.3333333 1 0.0001701838
0.5000000 18 0.0030633084
0.6666667 2 0.0003403676
0.7500000 1 0.0001701838
1.0000000 1792 0.3049693669
1.3333333 3 0.0005105514
1.4000000 1 0.0001701838
1.5000000 25 0.0042545950
1.6666667 3 0.0005105514
2.0000000 1779 0.3027569775
2.3333333 2 0.0003403676
2.5000000 20 0.0034036760
2.7500000 1 0.0001701838
3.0000000 827 0.1407420014
3.3333333 2 0.0003403676
4.0000000 196 0.0333560245
$stressful_event
.x[[i]] n percent
0 3776 0.64261402
1 1057 0.17988428
2 620 0.10551396
3 264 0.04492852
4 159 0.02705922
$positive_event
.x[[i]] n percent
0 3108 0.52893125
1 1228 0.20898570
2 1006 0.17120490
3 401 0.06824370
4 133 0.02263445
So now we see some of our ordinal variables have levels not in the data dictionary (i.e., they are not 0, 1, 2, 3, or 4).
What should we do? Are these errors that need to be corrected? Well, that's up to you, the data scientist, to figure out! If you had access to the researchers who collected these data, you might start by discussing this with them. Otherwise, you may need to decide on your own how to proceed.
Some possibilities include, but are not limited to:
- Treat the real (non-integer) numbers as errors, set them to NA, and add a step in your recipe to impute them (see the sketch after this list).
- Keep the variables in their raw numeric form.
- Round the variables to the nearest whole number and then convert to factor.
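For the first option, here is a minimal sketch of what that could look like. This is not the approach we take below; step_impute_median() is just one of several imputation steps available in recipes, and the other recipe steps (including classing the outcome) are omitted for brevity.
# Sketch only: set non-integer scores to NA, then impute them inside the recipe
data_na <- data_all |>
  mutate(across(craving:happy, ~ if_else(.x %% 1 == 0, .x, NA_real_)))

rec_impute <- recipe(cigs ~ ., data = data_na) |>
  step_impute_median(craving:happy)   # or step_impute_knn(), etc.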
For the third option (rounding), you could use the following code:
data_round <- data_all |>
  mutate(across(craving:happy, round))

data_round |>
  select(craving:happy) |>
  map(janitor::tabyl)
$craving
.x[[i]] n percent
0 1685 0.28675970
1 1774 0.30190606
2 1385 0.23570456
3 597 0.10159973
4 435 0.07402995
$anxious
.x[[i]] n percent
0 2933 0.49914908
1 1834 0.31211709
2 816 0.13886998
3 187 0.03182437
4 106 0.01803948
$irritable
.x[[i]] n percent
0 3159 0.53761062
1 1684 0.28658952
2 673 0.11453370
3 236 0.04016338
4 124 0.02110279
$sad
.x[[i]] n percent
0 4196 0.71409122
1 1082 0.18413887
2 392 0.06671205
3 131 0.02229408
4 75 0.01276378
$happy
.x[[i]] n percent
0 1222 0.20796460
1 1799 0.30616065
2 1829 0.31126617
3 830 0.14125255
4 196 0.03335602
There might be no right answer in cases like this, and you have to make your best guess. Luckily, we know the people who collected these data. It turns out these scores were averaged across multiple survey responses, which is how the non-integer values were introduced. Since we know these values are valid, we are going to proceed by treating them as numeric variables in our models. We are still going to keep stressful_event and positive_event as ordinal variables, since they have no non-integer values, and we will class our other character variables as before.
data_all <- data_all |>
  mutate(cigs = factor(cigs, levels = 0:1, labels = c("no", "yes")),
         across(contains("_event"),
                ~ factor(.x, levels = 0:4, labels = event_labels)),
         txt = factor(txt, levels = c("placebo", "nrt")),
         prior_cigs = factor(prior_cigs, levels = 0:1, labels = c("no", "yes"))) |>
  glimpse()
Rows: 5,876
Columns: 13
$ subid <dbl> 11011, 11011, 11011, 11011, 11011, 11011, 11011, 11011…
$ cigs <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no…
$ craving <dbl> 2, 0, 0, 1, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 4, …
$ anxious <dbl> 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ irritable <dbl> 1.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
$ sad <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ happy <dbl> 0.0, 0.0, 3.0, 2.0, 2.0, 1.0, 4.0, 3.0, 3.0, 2.0, 3.0,…
$ stressful_event <fct> no, moderate, no, no, no, moderate, no, no, no, no, no…
$ positive_event <fct> no, no, moderate, mild, no, mild, extreme, no, no, mil…
$ txt <fct> nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt, nrt,…
$ prior_cigs <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no…
$ lag <dbl> 6.069444, 3.306389, 4.544444, 11.491111, 6.152222, 3.4…
$ time_since_quit <dbl> 0.6007639, 0.7385301, 0.9278819, 1.4066782, 1.6630208,…
Let's skim the data one more time to be sure everything looks good.
data_all |>
  skim_some()
Name | data_all |
Number of rows | 5876 |
Number of columns | 13 |
_______________________ | |
Column type frequency: | |
factor | 5 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
cigs | 0 | 1 | FALSE | 2 | no: 4798, yes: 1078 |
stressful_event | 0 | 1 | FALSE | 5 | no: 3776, mil: 1057, mod: 620, ver: 264 |
positive_event | 0 | 1 | FALSE | 5 | no: 3108, mil: 1228, mod: 1006, ver: 401 |
txt | 0 | 1 | FALSE | 2 | nrt: 2983, pla: 2893 |
prior_cigs | 0 | 1 | FALSE | 2 | no: 4783, yes: 1093 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
subid | 0 | 1 | 11011.00 | 22149.00 |
craving | 0 | 1 | 0.00 | 4.00 |
anxious | 0 | 1 | 0.00 | 4.00 |
irritable | 0 | 1 | 0.00 | 4.00 |
sad | 0 | 1 | 0.00 | 4.00 |
happy | 0 | 1 | 0.00 | 4.00 |
lag | 0 | 1 | 0.09 | 108.00 |
time_since_quit | 0 | 1 | 0.10 | 30.03 |
Part 1: Selecting among model configurations with k-fold cross-validation
Split data
Split the data into 10 folds, stratified on cigs.
set.seed(102030)
splits_kfold <- data_all |>
  vfold_cv(v = 10, repeats = 1, strata = "cigs")
splits_kfold
# 10-fold cross-validation using stratification
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [5288/588]> Fold01
2 <split [5288/588]> Fold02
3 <split [5288/588]> Fold03
4 <split [5288/588]> Fold04
5 <split [5288/588]> Fold05
6 <split [5288/588]> Fold06
7 <split [5288/588]> Fold07
8 <split [5288/588]> Fold08
9 <split [5290/586]> Fold09
10 <split [5290/586]> Fold10
Feature Set 1
Build a recipe with the following specifications:
- Use cigs as the outcome variable regressed on craving.
- Set up the outcome as a factor with the positive class (“yes”) as the second level.
- Use craving as a feature by simply transforming this ordinal predictor to numeric based on the order of its scores.
Note we already set up our outcome variable as a factor with the positive class as the second level when we classed our data, so we aren't repeating that here. Also, we decided to keep craving as a numeric variable, so we don't have to do any manipulation here either.
rec_kfold_1 <- recipe(cigs ~ craving, data = data_all)
As a double check, we can prep() and bake() our recipe on our full dataset to see if the variables look correct. Looks good!
rec_prep <- rec_kfold_1 |>
  prep(data_all)

feat_all <- rec_prep |>
  bake(data_all)

feat_all |>
  glimpse()
Rows: 5,876
Columns: 2
$ craving <dbl> 2, 0, 0, 1, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 4, 2, 1, 0,…
$ cigs <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no…
Fit the model with k-fold cross-validation. Use logistic regression as your statistical algorithm, rec_kfold_1 as your recipe, and accuracy as your metric.
We are also going to use cache_rds() as we fit our models.
fits_kfold_1 <- cache_rds(
  expr = {
    logistic_reg() |>
      set_engine("glm") |>
      fit_resamples(preprocessor = rec_kfold_1,
                    resamples = splits_kfold,
                    metrics = metric_set(accuracy))
  },
  dir = "cache/",
  file = "fits_kfold_1",
  rerun = rerun_setting)
Examine performance estimates. Use the collect_metrics() function to make a table of the held-out performance estimates from the 10 folds.
metrics_kfold_1 <- collect_metrics(fits_kfold_1, summarize = FALSE)

metrics_kfold_1 |>
  print_kbl()
id | .metric | .estimator | .estimate | .config |
---|---|---|---|---|
Fold01 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold02 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold03 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold04 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold05 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold06 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold07 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold08 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold09 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Fold10 | accuracy | binary | 0.82 | Preprocessor1_Model1 |
Plot a histogram of the performance estimates.
metrics_kfold_1 |>
  plot_hist(".estimate")
Print the average performance over folds with the summarize = TRUE argument.
collect_metrics(fits_kfold_1, summarize = TRUE)
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.817 10 0.000144 Preprocessor1_Model1
Feature Set 2
Build a second recipe with the following specifications:
- Use cigs as the outcome variable regressed on all variables.
- Set up the outcome as a factor with the positive class ("yes") as the second level.
- Set up other variables as factors where needed.
- Apply dummy coding.
- Create an interaction between stressful_event and lag.
- Remove the subid variable with step_rm().

Note we already classed our character variables as factors when we read the data in, so we don't need to do this again.
rec_kfold_2 <- recipe(cigs ~ ., data = data_all) |>
  step_mutate_at(c(all_factor_predictors(), -txt, -prior_cigs), fn = as.numeric) |>
  step_dummy(txt, prior_cigs) |>
  step_interact(~ stressful_event:lag) |>
  step_rm(subid)
Again, let's check that our recipe is creating the features we expect.
rec_prep <- rec_kfold_2 |>
  prep(data_all)

feat_all <- rec_prep |>
  bake(data_all)

feat_all |>
  glimpse()
Rows: 5,876
Columns: 13
$ craving <dbl> 2, 0, 0, 1, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1, 0, …
$ anxious <dbl> 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ irritable <dbl> 1.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0…
$ sad <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ happy <dbl> 0.0, 0.0, 3.0, 2.0, 2.0, 1.0, 4.0, 3.0, 3.0, 2.0…
$ stressful_event <dbl> 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ positive_event <dbl> 1, 1, 3, 2, 1, 2, 5, 1, 1, 2, 4, 1, 1, 3, 1, 2, …
$ lag <dbl> 6.069444, 3.306389, 4.544444, 11.491111, 6.15222…
$ time_since_quit <dbl> 0.6007639, 0.7385301, 0.9278819, 1.4066782, 1.66…
$ cigs <fct> no, no, no, no, no, no, no, no, no, no, no, no, …
$ txt_nrt <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ prior_cigs_yes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stressful_event_x_lag <dbl> 6.069444, 9.919167, 4.544444, 11.491111, 6.15222…
Fit the model with k-fold cross-validation. Use logistic regression as your statistical algorithm, rec_kfold_2 as your recipe, and accuracy as your metric.
fits_kfold_2 <- cache_rds(
  expr = {
    logistic_reg() |>
      set_engine("glm") |>
      fit_resamples(preprocessor = rec_kfold_2,
                    resamples = splits_kfold,
                    metrics = metric_set(accuracy))
  },
  dir = "cache/",
  file = "fits_kfold_2",
  rerun = rerun_setting)
Examine performance estimates. Use the collect_metrics() function to make a table of the held-out performance estimates.
metrics_kfold_2 <- collect_metrics(fits_kfold_2, summarize = FALSE)

metrics_kfold_2 |>
  print_kbl()
id | .metric | .estimator | .estimate | .config |
---|---|---|---|---|
Fold01 | accuracy | binary | 0.88 | Preprocessor1_Model1 |
Fold02 | accuracy | binary | 0.86 | Preprocessor1_Model1 |
Fold03 | accuracy | binary | 0.86 | Preprocessor1_Model1 |
Fold04 | accuracy | binary | 0.85 | Preprocessor1_Model1 |
Fold05 | accuracy | binary | 0.87 | Preprocessor1_Model1 |
Fold06 | accuracy | binary | 0.86 | Preprocessor1_Model1 |
Fold07 | accuracy | binary | 0.87 | Preprocessor1_Model1 |
Fold08 | accuracy | binary | 0.87 | Preprocessor1_Model1 |
Fold09 | accuracy | binary | 0.84 | Preprocessor1_Model1 |
Fold10 | accuracy | binary | 0.86 | Preprocessor1_Model1 |
Plot a histogram of the performance estimates.
metrics_kfold_2 |>
  plot_hist(".estimate")
Print the average performance over folds with the summarize = TRUE argument.
collect_metrics(fits_kfold_2, summarize = TRUE)
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.862 10 0.00292 Preprocessor1_Model1
Selecting the best model configuration
Looking back at our two average performance estimates (one from each feature set), we would select model configuration 2. It has higher average performance across held-out folds.
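If you want to see the two averages side by side before deciding, one optional approach (not required by the assignment, using only functions we have already loaded) is to bind the summarized metrics together:
# Optional: stack the summarized metrics from both configurations for easy comparison
bind_rows(collect_metrics(fits_kfold_1, summarize = TRUE) |>
            mutate(feature_set = 1),
          collect_metrics(fits_kfold_2, summarize = TRUE) |>
            mutate(feature_set = 2))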
Part 2: Tuning hyperparameters with bootstrap resampling
Split data
Split your data using bootstrap resampling. Stratify on cigs and use 100 bootstraps. Don't forget to set a seed!
set.seed(102030)
splits_boot <- data_all |>
  bootstraps(times = 100, strata = "cigs")
splits_boot
# Bootstrap sampling using stratification
# A tibble: 100 × 2
splits id
<list> <chr>
1 <split [5876/2174]> Bootstrap001
2 <split [5876/2155]> Bootstrap002
3 <split [5876/2125]> Bootstrap003
4 <split [5876/2169]> Bootstrap004
5 <split [5876/2202]> Bootstrap005
6 <split [5876/2174]> Bootstrap006
7 <split [5876/2122]> Bootstrap007
8 <split [5876/2130]> Bootstrap008
9 <split [5876/2188]> Bootstrap009
10 <split [5876/2155]> Bootstrap010
11 <split [5876/2167]> Bootstrap011
12 <split [5876/2135]> Bootstrap012
13 <split [5876/2123]> Bootstrap013
14 <split [5876/2169]> Bootstrap014
15 <split [5876/2136]> Bootstrap015
16 <split [5876/2138]> Bootstrap016
17 <split [5876/2091]> Bootstrap017
18 <split [5876/2190]> Bootstrap018
19 <split [5876/2168]> Bootstrap019
20 <split [5876/2159]> Bootstrap020
21 <split [5876/2177]> Bootstrap021
22 <split [5876/2195]> Bootstrap022
23 <split [5876/2170]> Bootstrap023
24 <split [5876/2179]> Bootstrap024
25 <split [5876/2142]> Bootstrap025
26 <split [5876/2208]> Bootstrap026
27 <split [5876/2184]> Bootstrap027
28 <split [5876/2174]> Bootstrap028
29 <split [5876/2117]> Bootstrap029
30 <split [5876/2190]> Bootstrap030
31 <split [5876/2147]> Bootstrap031
32 <split [5876/2185]> Bootstrap032
33 <split [5876/2168]> Bootstrap033
34 <split [5876/2167]> Bootstrap034
35 <split [5876/2167]> Bootstrap035
36 <split [5876/2143]> Bootstrap036
37 <split [5876/2128]> Bootstrap037
38 <split [5876/2186]> Bootstrap038
39 <split [5876/2172]> Bootstrap039
40 <split [5876/2147]> Bootstrap040
41 <split [5876/2181]> Bootstrap041
42 <split [5876/2164]> Bootstrap042
43 <split [5876/2168]> Bootstrap043
44 <split [5876/2153]> Bootstrap044
45 <split [5876/2199]> Bootstrap045
46 <split [5876/2153]> Bootstrap046
47 <split [5876/2188]> Bootstrap047
48 <split [5876/2179]> Bootstrap048
49 <split [5876/2145]> Bootstrap049
50 <split [5876/2167]> Bootstrap050
51 <split [5876/2184]> Bootstrap051
52 <split [5876/2150]> Bootstrap052
53 <split [5876/2152]> Bootstrap053
54 <split [5876/2156]> Bootstrap054
55 <split [5876/2150]> Bootstrap055
56 <split [5876/2146]> Bootstrap056
57 <split [5876/2170]> Bootstrap057
58 <split [5876/2141]> Bootstrap058
59 <split [5876/2109]> Bootstrap059
60 <split [5876/2115]> Bootstrap060
61 <split [5876/2190]> Bootstrap061
62 <split [5876/2172]> Bootstrap062
63 <split [5876/2133]> Bootstrap063
64 <split [5876/2196]> Bootstrap064
65 <split [5876/2168]> Bootstrap065
66 <split [5876/2156]> Bootstrap066
67 <split [5876/2138]> Bootstrap067
68 <split [5876/2137]> Bootstrap068
69 <split [5876/2146]> Bootstrap069
70 <split [5876/2140]> Bootstrap070
71 <split [5876/2182]> Bootstrap071
72 <split [5876/2177]> Bootstrap072
73 <split [5876/2154]> Bootstrap073
74 <split [5876/2148]> Bootstrap074
75 <split [5876/2160]> Bootstrap075
76 <split [5876/2153]> Bootstrap076
77 <split [5876/2166]> Bootstrap077
78 <split [5876/2141]> Bootstrap078
79 <split [5876/2148]> Bootstrap079
80 <split [5876/2180]> Bootstrap080
81 <split [5876/2187]> Bootstrap081
82 <split [5876/2190]> Bootstrap082
83 <split [5876/2144]> Bootstrap083
84 <split [5876/2169]> Bootstrap084
85 <split [5876/2187]> Bootstrap085
86 <split [5876/2109]> Bootstrap086
87 <split [5876/2127]> Bootstrap087
88 <split [5876/2202]> Bootstrap088
89 <split [5876/2165]> Bootstrap089
90 <split [5876/2140]> Bootstrap090
91 <split [5876/2148]> Bootstrap091
92 <split [5876/2133]> Bootstrap092
93 <split [5876/2158]> Bootstrap093
94 <split [5876/2182]> Bootstrap094
95 <split [5876/2156]> Bootstrap095
96 <split [5876/2152]> Bootstrap096
97 <split [5876/2129]> Bootstrap097
98 <split [5876/2184]> Bootstrap098
99 <split [5876/2169]> Bootstrap099
100 <split [5876/2186]> Bootstrap100
Set up hyperparameter grid
Create a tibble with all values of the hyperparameter (k) you will consider. Remember that this dataset has a lot of observations in it (~6,000), so starting with some higher k values than we're used to testing may be helpful.
hyper_grid <- expand.grid(neighbors = seq(5, 500, by = 20))
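As an optional, rough sanity check on this range (a heuristic we are adding here, not something the assignment asks for), some people start their search near the square root of the sample size:
# Rough heuristic only: sqrt(n) is sometimes used as a starting neighborhood for k
sqrt(nrow(data_all))  # about 77 for these ~5,900 observations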
Feature Set 1
Build a recipe with the following specifications:
- Use cigs as the outcome variable regressed on craving.
- Set up the outcome as a factor with the positive class (“yes”) as the second level.
- Use the same numeric transformation of craving as before.
This is the same recipe as earlier, so we are just going to rename it here. Also remember, even though this recipe will be used in a KNN model, we don't have to scale or range-correct when there is only one predictor!
rec_boot_1 <- rec_kfold_1
Tune the model with bootstrapping for cross-validation. Use KNN classification as your statistical algorithm, rec_boot_1 as your recipe, hyper_grid as your tuning grid, and accuracy as your metric.
tune_boot_1 <- cache_rds(
  expr = {
    nearest_neighbor(neighbors = tune()) |>
      set_engine("kknn") |>
      set_mode("classification") |>
      tune_grid(preprocessor = rec_boot_1,
                resamples = splits_boot,
                grid = hyper_grid,
                metrics = metric_set(accuracy))
  },
  dir = "cache/",
  file = "tune_boot_1",
  rerun = rerun_setting)
Examine performance estimates across the OOB held-out sets. Remember that you will now see one estimate per hyperparameter value, averaged across bootstraps.
collect_metrics(tune_boot_1, summarize = TRUE) |>
print_kbl()
neighbors | .metric | .estimator | mean | n | std_err | .config |
---|---|---|---|---|---|---|
5 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model01 |
25 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model02 |
45 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model03 |
65 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model04 |
85 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model05 |
105 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model06 |
125 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model07 |
145 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model08 |
165 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model09 |
185 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model10 |
205 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model11 |
225 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model12 |
245 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model13 |
265 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model14 |
285 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model15 |
305 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model16 |
325 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model17 |
345 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model18 |
365 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model19 |
385 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model20 |
405 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model21 |
425 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model22 |
445 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model23 |
465 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model24 |
485 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model25 |
Plot the average performance by hyperparameter value.
plot_hyperparameters(tune_boot_1, hp1 = "neighbors", metric = "accuracy")
I considered values of k up to 2000, and performance never decreased after about k = 250 (a long plateau). I then backtracked down to a range of 5 to 500 in increments of 20 (the plot you see here). We should note that cigs = no occurs about 81.6% of the time. This performance curve is completely flat because the model does not predict the outcome above its base rate. You will often see flat hyperparameter curves like this with models that are not predicting well.
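If you want to verify that base rate yourself, a quick (optional) check using janitor::tabyl(), which we used earlier, is:
# Base rate of the outcome; "no" makes up roughly 82% of observations
data_all |>
  janitor::tabyl(cigs)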
Print the performance of your best model configuration with the show_best() function.
show_best(tune_boot_1, n = 1)
# A tibble: 1 × 7
neighbors .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 45 accuracy binary 0.816 100 0.000405 Preprocessor1_Model03
Feature Set 2
Build a second recipe with the following specifications:
- Use cigs as the outcome variable regressed on all variables.
- Set up the outcome as a factor with the positive class ("yes") as the second level.
- Set up other variables as factors where needed.
- Apply dummy coding.
- Create an interaction between stressful_event and lag.
- Remove the subid variable with step_rm().
- Take appropriate scaling steps so the features meet KNN requirements.
This is the same recipe as above, except we are scaling the numeric predictors for KNN.
rec_boot_2 <- rec_kfold_2 |>
  step_range(all_numeric_predictors())
Tune the model with bootstrapping for cross-validation. Use KNN classification as your statistical algorithm, rec_boot_2 as your recipe, hyper_grid as your tuning grid, and accuracy as your metric.
tune_boot_2 <- cache_rds(
  expr = {
    nearest_neighbor(neighbors = tune()) |>
      set_engine("kknn") |>
      set_mode("classification") |>
      tune_grid(preprocessor = rec_boot_2,
                resamples = splits_boot,
                grid = hyper_grid,
                metrics = metric_set(accuracy))
  },
  dir = "cache/",
  file = "tune_boot_2",
  rerun = rerun_setting)
Examine performance estimates. Remember that you will now see one estimate per hyperparameter value, averaged across bootstraps.
collect_metrics(tune_boot_2, summarize = TRUE) |>
print_kbl()
neighbors | .metric | .estimator | mean | n | std_err | .config |
---|---|---|---|---|---|---|
5 | accuracy | binary | 0.82 | 100 | 0 | Preprocessor1_Model01 |
25 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model02 |
45 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model03 |
65 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model04 |
85 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model05 |
105 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model06 |
125 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model07 |
145 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model08 |
165 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model09 |
185 | accuracy | binary | 0.86 | 100 | 0 | Preprocessor1_Model10 |
205 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model11 |
225 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model12 |
245 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model13 |
265 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model14 |
285 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model15 |
305 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model16 |
325 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model17 |
345 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model18 |
365 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model19 |
385 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model20 |
405 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model21 |
425 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model22 |
445 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model23 |
465 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model24 |
485 | accuracy | binary | 0.85 | 100 | 0 | Preprocessor1_Model25 |
Plot the average performance by hyperparameter value.
plot_hyperparameters(tune_boot_2, hp1 = "neighbors", metric = "accuracy")
Here we can see a peak and then a slow, continuous decrease in performance. That peak makes us confident that we've considered a sufficient range of k values.
Print the performance of your best model configuration with the show_best() function.
show_best(tune_boot_2, n = 1)
# A tibble: 1 × 7
neighbors .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 65 accuracy binary 0.857 100 0.000494 Preprocessor1_Model04
Part 3: Selecting among model configurations with grouped k-fold cross-validation
In the final part of this assignment, you will use grouped k-fold cross-validation to match the data structure.
Split data
Split your data into 10 folds using the group_vfold_cv() function. Use the grouping argument (group = "subid").
For this example, we asked you not to stratify on cigs. As a note, if you ever want to stratify on your outcome (e.g., cigs) within a grouping variable (e.g., subid), you need to create a new variable capturing the distribution of the outcome within each subject (in this case, the mean of the positive cases of cigs within each subject). This allows the splitting to assign different subid groups to analysis and assessment sets within each fold in combinations that will yield roughly the same proportion of positive cases on the outcome across all folds.
We show you an example of this here:
data_all <- data_all |>
  group_by(subid) |>
  mutate(strata = mean(as.numeric(cigs))) |>  # creates new strata variable
  ungroup()
set.seed(102030)
splits_group_kfold <- group_vfold_cv(data_all,
                                     group = "subid",
                                     v = 10,
                                     repeats = 1,
                                     strata = "strata")
Feature Set 1
Fit the first set of models with grouped k-fold cross-validation. Use logistic regression as your statistical algorithm, rec_kfold_1 as your recipe (it doesn't need to change from above!), and accuracy as your metric.
fits_group_kfold_1 <- cache_rds(
  expr = {
    logistic_reg() |>
      set_engine("glm") |>
      fit_resamples(preprocessor = rec_kfold_1,
                    resamples = splits_group_kfold,
                    metrics = metric_set(accuracy))
  },
  dir = "cache/",
  file = "fits_group_kfold_1",
  rerun = rerun_setting)
Examine performance estimates on held-out samples.
metrics_group_kfold_1 <- collect_metrics(fits_group_kfold_1,
                                         summarize = FALSE)

metrics_group_kfold_1 |>
  print_kbl()
id | .metric | .estimator | .estimate | .config |
---|---|---|---|---|
Resample01 | accuracy | binary | 0.76 | Preprocessor1_Model1 |
Resample02 | accuracy | binary | 0.84 | Preprocessor1_Model1 |
Resample03 | accuracy | binary | 0.79 | Preprocessor1_Model1 |
Resample04 | accuracy | binary | 0.85 | Preprocessor1_Model1 |
Resample05 | accuracy | binary | 0.83 | Preprocessor1_Model1 |
Resample06 | accuracy | binary | 0.89 | Preprocessor1_Model1 |
Resample07 | accuracy | binary | 0.79 | Preprocessor1_Model1 |
Resample08 | accuracy | binary | 0.84 | Preprocessor1_Model1 |
Resample09 | accuracy | binary | 0.65 | Preprocessor1_Model1 |
Resample10 | accuracy | binary | 0.91 | Preprocessor1_Model1 |
Plot a histogram of the performance estimates.
metrics_group_kfold_1 |>
  plot_hist(".estimate")
Print the average performance over folds with the summarize = TRUE argument.
collect_metrics(fits_group_kfold_1, summarize = TRUE)
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.816 10 0.0230 Preprocessor1_Model1
Feature Set 2
Fit the second model with grouped k-fold cross-validation. Use logistic regression as your statistical algorithm, rec_kfold_2 as your recipe (it doesn't need to change from above!), and accuracy as your metric.
fits_group_kfold_2 <- cache_rds(
  expr = {
    logistic_reg() |>
      set_engine("glm") |>
      fit_resamples(preprocessor = rec_kfold_2,
                    resamples = splits_group_kfold,
                    metrics = metric_set(accuracy))
  },
  dir = "cache/",
  file = "fits_group_kfold_2",
  rerun = rerun_setting)
Examine performance estimates on held-out samples.
metrics_group_kfold_2 <- collect_metrics(fits_group_kfold_2,
                                         summarize = FALSE)

metrics_group_kfold_2 |>
  print_kbl()
id | .metric | .estimator | .estimate | .config |
---|---|---|---|---|
Resample01 | accuracy | binary | 0.83 | Preprocessor1_Model1 |
Resample02 | accuracy | binary | 0.86 | Preprocessor1_Model1 |
Resample03 | accuracy | binary | 0.85 | Preprocessor1_Model1 |
Resample04 | accuracy | binary | 0.85 | Preprocessor1_Model1 |
Resample05 | accuracy | binary | 0.87 | Preprocessor1_Model1 |
Resample06 | accuracy | binary | 0.91 | Preprocessor1_Model1 |
Resample07 | accuracy | binary | 0.86 | Preprocessor1_Model1 |
Resample08 | accuracy | binary | 0.80 | Preprocessor1_Model1 |
Resample09 | accuracy | binary | 0.80 | Preprocessor1_Model1 |
Resample10 | accuracy | binary | 0.92 | Preprocessor1_Model1 |
Plot a histogram of the performance estimates.
metrics_group_kfold_2 |>
  plot_hist(".estimate")
Print the average performance over folds with the summarize = TRUE argument.
collect_metrics(fits_group_kfold_2, summarize = TRUE)
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.856 10 0.0130 Preprocessor1_Model1
Selecting the best model configuration
We would select model configuration 2 because it has the higher average performance across folds!
Remember, once your best model is chosen and evaluated, you would then want to refit it once on all the data before deploying it for use!
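For example, a minimal sketch of that final refit for the selected configuration (logistic regression with rec_kfold_2), following the same prep/bake pattern we used above, might look like this:
# Drop the helper strata column we added for grouped CV, then refit the selected
# configuration one final time on all available data
data_final <- data_all |>
  select(-strata)

rec_final <- rec_kfold_2 |>
  prep(data_final)

feat_final <- rec_final |>
  bake(data_final)

fit_final <- logistic_reg() |>
  set_engine("glm") |>
  fit(cigs ~ ., data = feat_final)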