options(conflicts.policy = "depends.ok")
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
devtoolstidymodels_conflictRules()
Unit 4 (Classification): Fit RDA Models
Introduction
This file serves as the answer key for the RDA portion of the Unit_04 homework. Unit 4 Exploratory Introduction to Classification Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to fit multiple configurations of KNN and RDA classification models using data from the Titanic data set. By evaluating these various configurations in our validation set, we select the top performing model in our validation set out of all KNN and RDA models we fit. We use this best model to generate predictions for our held out test set, which we only use ONCE for evaluation of our final best model.
Set up
Handle conflicts
Load required packages
library(tidyverse)
library(tidymodels)
library(discrim, exclude = "smoothness")
Source function scripts (John’s or your own)
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true") devtools
Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
Paths
<- "homework/unit_04" path_data
Load data
Load the cleaned training, validation, and test data files
Use here::here()
and relative path for your data. Make sure your iaml project is open
<- read_csv(here::here(path_data, "titanic_train_cln.csv"),
data_trn col_types = cols()) |>
glimpse()
Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <chr> "female", "male", "male", "male", "male", "male", "male…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <chr> "113781", "113781", "PC 17757", "PC 17318", "PC 17558",…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <chr> "C22 C26", "C22 C26", "C62 C64", NA, "B58 B60", NA, "A2…
$ embarked <chr> "s", "s", "c", "s", "c", "c", "s", "c", "c", "s", "s", …
<- read_csv(here::here(path_data, "titanic_val_cln.csv"),
data_val col_types = cols()) |>
glimpse()
Rows: 246
Columns: 12
$ passenger_id <dbl> 2, 12, 18, 20, 21, 31, 42, 51, 52, 84, 85, 97, 101, 103…
$ survived <chr> "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Master. Hudson Trevor", "Astor, Mrs. John Jac…
$ sex <chr> "male", "female", "female", "male", "male", "male", "fe…
$ age <dbl> 0.9167, 18.0000, 50.0000, 36.0000, 37.0000, 45.0000, 44…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 3, 0…
$ n_parent_child <dbl> 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0…
$ ticket <chr> "113781", "PC 17757", "PC 17558", "13050", "11751", "11…
$ fare <dbl> 151.5500, 227.5250, 247.5208, 75.2417, 52.5542, 35.5000…
$ cabin <chr> "C22 C26", "C62 C64", "B58 B60", "C6", "D35", "T", "B4"…
$ embarked <chr> "s", "c", "c", "c", "s", "s", "c", "c", "s", "s", "c", …
<- read_csv(here::here(path_data, "titanic_test_cln_no_label.csv"),
data_test col_types = cols()) |>
glimpse()
Rows: 328
Columns: 12
$ passenger_id <dbl> 2, 3, 6, 13, 15, 20, 22, 25, 30, 31, 39, 42, 48, 54, 58…
$ survived <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Master. Hudson Trevor", "Allison, Miss. Helen…
$ sex <chr> "male", "female", "male", "female", "male", "male", "fe…
$ age <dbl> 0.9167, 2.0000, 48.0000, 24.0000, 80.0000, 36.0000, 47.…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1…
$ ticket <chr> "113781", "113781", "19952", "PC 17477", "27042", "1305…
$ fare <dbl> 151.5500, 151.5500, 26.5500, 69.3000, 30.0000, 75.2417,…
$ cabin <chr> "C22 C26", "C22 C26", "E12", "B35", "A23", "C6", "D35",…
$ embarked <chr> "s", "s", "s", "c", "s", "c", "s", "s", "s", "s", "s", …
Set appropriate variable classes:
- Nominal and ordinal variables are set to factors
- interval and ratio variables are set to numeric
Remember ordinal variables like pclass
should have the factor levels explicitly stated to retain the order
It also helps (removes warning) to class survived as a factor because otherwise it will be read as logical in the test set where the scores are all NA.
- Lets write a classing function
- Probably not needed here since classing is so quick but still good practice!
- Handle order for the factors we care about (that have order or are outcome)
- Handle the rest of the factors with
across()
<- function(d){
class_titantic |>
d mutate(survived = factor(survived, levels = c("no", "yes")),
pclass = factor(pclass, levels = c("1", "2", "3")),
across(where(is.character), factor))
}
- Now lets use it
<- data_trn |>
data_trn class_titantic() |>
glimpse()
Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <fct> female, male, male, male, male, male, male, male, male,…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
<- data_val |>
data_val class_titantic() |>
glimpse()
Rows: 246
Columns: 12
$ passenger_id <dbl> 2, 12, 18, 20, 21, 31, 42, 51, 52, 84, 85, 97, 101, 103…
$ survived <fct> yes, yes, yes, no, yes, no, yes, yes, no, yes, no, no, …
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Astor, Mrs. John Jac…
$ sex <fct> male, female, female, male, male, male, female, female,…
$ age <dbl> 0.9167, 18.0000, 50.0000, 36.0000, 37.0000, 45.0000, 44…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 3, 0…
$ n_parent_child <dbl> 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0…
$ ticket <fct> 113781, PC 17757, PC 17558, 13050, 11751, 113784, PC 17…
$ fare <dbl> 151.5500, 227.5250, 247.5208, 75.2417, 52.5542, 35.5000…
$ cabin <fct> C22 C26, C62 C64, B58 B60, C6, D35, T, B4, B51 B53 B55,…
$ embarked <fct> s, c, c, c, s, s, c, c, s, s, c, c, c, c, c, c, c, s, s…
<- data_test |>
data_test class_titantic() |>
glimpse()
Rows: 328
Columns: 12
$ passenger_id <dbl> 2, 3, 6, 13, 15, 20, 22, 25, 30, 31, 39, 42, 48, 54, 58…
$ survived <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Allison, Miss. Helen…
$ sex <fct> male, female, male, female, male, male, female, female,…
$ age <dbl> 0.9167, 2.0000, 48.0000, 24.0000, 80.0000, 36.0000, 47.…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1…
$ ticket <fct> 113781, 113781, 19952, PC 17477, 27042, 13050, 11751, P…
$ fare <dbl> 151.5500, 151.5500, 26.5500, 69.3000, 30.0000, 75.2417,…
$ cabin <fct> C22 C26, C22 C26, E12, B35, A23, C6, D35, C97, C52, T, …
$ embarked <fct> s, s, s, c, s, c, s, s, s, s, s, c, s, s, s, s, c, s, s…
Here, we make some additional predictors (title
and group_size
) that might help in predicting survived
- This could have been done in recipe or here because it doesn’t use any calculations from the dataset at this point
- CRITICALLY, we decided to do this using modeling EDA only based on training data
<- function(d){
add_predictors |>
d mutate(group_size = n_parent_child + n_sib_spouse) |>
mutate(title = str_to_lower(str_extract(name,
"(?<=[:punct:][:space:])[:alpha:]{2,}(?=[:punct:])"))) |>
mutate(sparse_title = case_when(
== "master" ~ "boy",
title %in% c("miss", "ms") ~ "young_woman",
title %in% c("dona", "lady", "mrs") ~ "adult_woman",
title %in% c("mr", "sir", "don", "jonkheer") ~ "adult_man",
title %in% c("col", "dr", "major", "rev") ~ "professional",
title TRUE ~ NA_character_)) |>
mutate(sparse_title = case_when(
is.na(sparse_title) & age < 12 & sex == "male" ~ "boy",
is.na(sparse_title) & age < 18 & sex == "female" ~ "young_woman",
is.na(sparse_title) & sex == "male" ~ "adult_man",
is.na(sparse_title) & sex == "female" ~ "adult_woman",
!is.na(sparse_title) ~ as.character(sparse_title),
TRUE ~ NA_character_)) |>
mutate(n_cabins = str_count(cabin, "[:alpha:]"))
}
- Now lets use this function with all three of our sets
<- data_trn |>
data_trn add_predictors() |>
glimpse()
Rows: 736
Columns: 16
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <fct> female, male, male, male, male, male, male, male, male,…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
$ group_size <dbl> 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0…
$ title <chr> "miss", "mr", "col", "mr", "mr", "mr", "mr", "mr", "dr"…
$ sparse_title <chr> "young_woman", "adult_man", "professional", "adult_man"…
$ n_cabins <int> 2, 2, 2, NA, 2, NA, 1, 1, NA, 1, NA, NA, NA, 1, 1, 1, 1…
|>
data_trn skim_some()
Name | data_trn |
Number of rows | 736 |
Number of columns | 16 |
_______________________ | |
Column type frequency: | |
character | 2 |
factor | 7 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
title | 1 | 1 | 2 | 8 | 0 | 13 | 0 |
sparse_title | 0 | 1 | 3 | 12 | 0 | 5 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
survived | 0 | 1.00 | FALSE | 2 | no: 455, yes: 281 |
pclass | 0 | 1.00 | FALSE | 3 | 3: 387, 2: 175, 1: 174 |
name | 0 | 1.00 | FALSE | 736 | Abb: 1, Abb: 1, Abb: 1, Abe: 1 |
sex | 0 | 1.00 | FALSE | 2 | mal: 479, fem: 257 |
ticket | 0 | 1.00 | FALSE | 588 | CA.: 6, S.O: 6, 113: 4, 160: 4 |
cabin | 579 | 0.21 | FALSE | 118 | F4: 4, B57: 3, C23: 3, D: 3 |
embarked | 0 | 1.00 | FALSE | 3 | s: 524, c: 144, q: 68 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
passenger_id | 0 | 1.00 | 3.00 | 1308.00 |
age | 146 | 0.80 | 0.17 | 76.00 |
n_sib_spouse | 0 | 1.00 | 0.00 | 8.00 |
n_parent_child | 0 | 1.00 | 0.00 | 9.00 |
fare | 0 | 1.00 | 0.00 | 512.33 |
group_size | 0 | 1.00 | 0.00 | 10.00 |
n_cabins | 579 | 0.21 | 1.00 | 4.00 |
# create a "title" variable for each name in data_val
<- data_val %>%
data_val add_predictors() |>
glimpse()
Rows: 246
Columns: 16
$ passenger_id <dbl> 2, 12, 18, 20, 21, 31, 42, 51, 52, 84, 85, 97, 101, 103…
$ survived <fct> yes, yes, yes, no, yes, no, yes, yes, no, yes, no, no, …
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Astor, Mrs. John Jac…
$ sex <fct> male, female, female, male, male, male, female, female,…
$ age <dbl> 0.9167, 18.0000, 50.0000, 36.0000, 37.0000, 45.0000, 44…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 3, 0…
$ n_parent_child <dbl> 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0…
$ ticket <fct> 113781, PC 17757, PC 17558, 13050, 11751, 113784, PC 17…
$ fare <dbl> 151.5500, 227.5250, 247.5208, 75.2417, 52.5542, 35.5000…
$ cabin <fct> C22 C26, C62 C64, B58 B60, C6, D35, T, B4, B51 B53 B55,…
$ embarked <fct> s, c, c, c, s, s, c, c, s, s, c, c, c, c, c, c, c, s, s…
$ group_size <dbl> 3, 1, 1, 0, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 1, 0, 0, 5, 0…
$ title <chr> "master", "mrs", "mrs", "mr", "mr", "mr", "mrs", "mrs",…
$ sparse_title <chr> "boy", "adult_woman", "adult_woman", "adult_man", "adul…
$ n_cabins <int> 2, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1…
# create a "title" variable for each name in data_test
<- data_test %>%
data_test add_predictors() |>
glimpse()
Rows: 328
Columns: 16
$ passenger_id <dbl> 2, 3, 6, 13, 15, 20, 22, 25, 30, 31, 39, 42, 48, 54, 58…
$ survived <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Allison, Miss. Helen…
$ sex <fct> male, female, male, female, male, male, female, female,…
$ age <dbl> 0.9167, 2.0000, 48.0000, 24.0000, 80.0000, 36.0000, 47.…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1…
$ ticket <fct> 113781, 113781, 19952, PC 17477, 27042, 13050, 11751, P…
$ fare <dbl> 151.5500, 151.5500, 26.5500, 69.3000, 30.0000, 75.2417,…
$ cabin <fct> C22 C26, C22 C26, E12, B35, A23, C6, D35, C97, C52, T, …
$ embarked <fct> s, s, s, c, s, c, s, s, s, s, s, c, s, s, s, s, c, s, s…
$ group_size <dbl> 3, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 1…
$ title <chr> "master", "miss", "mr", "mme", "mr", "mr", "mrs", "miss…
$ sparse_title <chr> "boy", "young_woman", "adult_man", "adult_woman", "adul…
$ n_cabins <int> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, 2, 1, 1, 1, …
Create tracking tibble
Create an empty tracking tibble to track the validation error across the various model configurations you will fit.
<- tibble(model = character(),
error_val acc_val = numeric()) |>
glimpse()
Rows: 0
Columns: 2
$ model <chr>
$ acc_val <dbl>
NOTE: See the key for fitting KNN models to see a demonstration of accuracy & what it means in this particular dataset.
RDA model configuration 1
We start by feature engineering a variable title
(see above and the key for modeling EDA for more information). We extract the passengers’ titles from their names (e.g., “Mr” or “Miss”). This variable represents a combination of age and sex because titles differ by both. That also means I can use age and sex variables to help fill in values when title
is missing. Because there are many unique titles, we collapse it down into a sparse version (sparse_title
). This isn’t required for for the homework but we show it to you here for learning purposes
Set up recipe
# only using sparse_title in the model
<-
rec_1 recipe(survived ~ sparse_title, data = data_trn) |>
step_dummy(sparse_title)
Training feature matrix
prep recipe
<- rec_1 |>
rec_prep prep(training = data_trn)
bake recipe to get feature set
<- rec_prep |>
feat_trn bake(new_data = data_trn)
Fit Model
Fit first model configuration with rda
<- discrim_regularized(frac_common_cov = 0,
fit_rda_1 frac_identity = 0) |>
set_engine("klaR") |>
fit(survived ~ ., data = feat_trn)
Validation feature matrix
Use bake()
to generate the feature matrix of the validation data that we will use to assess your model.
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess your model
Use accuracy_vec()
to calculate the validation error (accuracy) of your model. Add this value to your validation error tracking tibble
accuracy_vec(feat_val$survived, predict(fit_rda_1, feat_val, type = "class")$.pred_class)
[1] 0.7886179
<- add_row(error_val,
error_val model = "rda_1",
acc_val = accuracy_vec(feat_val$survived,
predict(fit_rda_1,
$.pred_class))
feat_val)
error_val
# A tibble: 1 × 2
model acc_val
<chr> <dbl>
1 rda_1 0.789
RDA model configuration 2
We’re now going to use sparse title again but for imputation. Age has a lot of missing values, but we know that age differs by sparse_title (see modeling EDA). We’ll use sparse_title as part of a knn imputation for age. And then maybe Age will be a better predictor that it would be with lots of missing data.
This meets the following homework requirements: Variable with missingness
Set up recipe
Informed by EDA from your modeling EDA, create a recipe for your first model
<-
rec_2 recipe(survived ~ sparse_title + age + sex, data = data_trn) |>
step_dummy(sex, sparse_title) |>
step_impute_knn(age, impute_with = imp_vars(starts_with("sparse_title"))) |>
step_rm(starts_with("sparse_title"))
Training feature matrix
Use the prep()
function to prep your recipe
<- rec_2 |>
rec_prep_2 prep(training = data_trn)
Use the bake()
function to generate the feature matrix of the training data.
<- rec_prep_2 |>
feat_trn_2 bake(new_data = data_trn)
Fit your model
Fit a rda
model predicting survived
from your training feature matrix
<- discrim_regularized(frac_common_cov = 0,
fit_rda_2 frac_identity = 0) %>%
set_engine("klaR") %>%
fit(survived ~ ., data = feat_trn_2)
Validation feature matrix
Use bake()
to generate the feature matrix of the validation data that we will use to assess your model.
<- rec_prep_2 |>
feat_val_2 bake(new_data = data_val)
Assess your model
Use accuracy_vec()
to calculate the validation error (accuracy) of your model. Add this value to your validation error tracking tibble
accuracy_vec(feat_val_2$survived, predict(fit_rda_2, feat_val_2)$.pred_class)
[1] 0.7723577
<- add_row(error_val,
error_val model = "rda_2",
acc_val = accuracy_vec(feat_val_2$survived,
predict(fit_rda_2,
$.pred_class))
feat_val_2)
error_val
# A tibble: 2 × 2
model acc_val
<chr> <dbl>
1 rda_1 0.789
2 rda_2 0.772
RDA Configuration 3
KNN imputation for age and then a sex X age interaction performed comparably. Let’s incorporate some other information.
Let’s use step_pca() to make an “expense” variable. This will contain information from fare
(YJ-transformed), pclass
(kept as numeric), cabins
(see modeling EDA key for more information on extracting the number of cabins from the cabin variable; created then YJ-transformed), and group_size
(created then YJ-transformed).
This meets the following homework requirements: Variable with missingness Transformed numeric
Set up recipe
Informed by EDA from your modeling EDA, create a recipe for your first model
<- recipe(survived ~ sparse_title + age + sex + group_size + n_cabins
rec_3 + fare, data = data_trn) |>
step_dummy(sex, sparse_title) |>
step_impute_knn(age, impute_with = imp_vars(starts_with("sparse_title"))) |>
step_YeoJohnson(fare, group_size, n_cabins) |>
step_impute_median(fare, n_cabins) |>
step_pca(fare, group_size, n_cabins, group_size, prefix = "expense_") |>
step_rm(starts_with("sparse_title"))
Training feature matrix
Use the prep()
function to prep your recipe
<- rec_3 |>
rec_prep_3 prep(training = data_trn)
Use the bake()
function to generate the feature matrix of the training data.
<- rec_prep_3 |>
feat_trn_3 bake(new_data = data_trn)
Fit your model
Fit a rda
model predicting survived
from your training feature matrix
<- discrim_regularized(frac_common_cov = 0,
fit_rda_3 frac_identity = 0) |>
set_engine("klaR") |>
fit(survived ~ ., data = feat_trn_3)
Validation feature matrix
Use bake()
to generate the feature matrix of the validation data that we will use to assess your model.
<- rec_prep_3 |>
feat_val_3 bake(new_data = data_val)
Assess your model
Use accuracy_vec()
to calculate the validation error (accuracy) of your model. Add this value to your validation error tracking tibble
accuracy_vec(feat_val_3$survived, predict(fit_rda_3, feat_val_3)$.pred_class)
[1] 0.7804878
<- add_row(error_val,
error_val model = "rda_3",
acc_val = accuracy_vec(feat_val_3$survived,
predict(fit_rda_3,
$.pred_class))
feat_val_3)
error_val
# A tibble: 3 × 2
model acc_val
<chr> <dbl>
1 rda_1 0.789
2 rda_2 0.772
3 rda_3 0.780
Generate predictions
My best model is a KNN, so this script is done! See key_unit_04_fit_knn.html
for solution code for generating predictions.