Homework Unit 4 (Classification): Fit KNN Models
Introduction
This file serves as the answer key for the KNN portion of the Unit_04 homework. Unit 4 Exploratory Introduction to Classification Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to fit multiple configurations of KNN and RDA classification models using data from the Titanic data set. We evaluate these configurations in our validation set and select the top-performing model out of all the KNN and RDA models we fit. We then use this best model to generate predictions for our held-out test set, which we use only ONCE to evaluate our final best model.
Set up
Handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()
Load required packages
library(tidyverse)
library(tidymodels)
Source function scripts (John’s or your own)
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
Paths
path_data <- "homework/unit_04"
Load data
Load the cleaned training, validation, and test data files
Use here::here() and a relative path for your data. Make sure your iaml project is open.
data_trn <- read_csv(here::here(path_data, "titanic_train_cln.csv"),
                     col_types = cols()) |>
  glimpse()
Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <chr> "female", "male", "male", "male", "male", "male", "male…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <chr> "113781", "113781", "PC 17757", "PC 17318", "PC 17558",…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <chr> "C22 C26", "C22 C26", "C62 C64", NA, "B58 B60", NA, "A2…
$ embarked <chr> "s", "s", "c", "s", "c", "c", "s", "c", "c", "s", "s", …
data_val <- read_csv(here::here(path_data, "titanic_val_cln.csv"),
                     col_types = cols()) |>
  glimpse()
Rows: 246
Columns: 12
$ passenger_id <dbl> 2, 12, 18, 20, 21, 31, 42, 51, 52, 84, 85, 97, 101, 103…
$ survived <chr> "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Master. Hudson Trevor", "Astor, Mrs. John Jac…
$ sex <chr> "male", "female", "female", "male", "male", "male", "fe…
$ age <dbl> 0.9167, 18.0000, 50.0000, 36.0000, 37.0000, 45.0000, 44…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 3, 0…
$ n_parent_child <dbl> 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0…
$ ticket <chr> "113781", "PC 17757", "PC 17558", "13050", "11751", "11…
$ fare <dbl> 151.5500, 227.5250, 247.5208, 75.2417, 52.5542, 35.5000…
$ cabin <chr> "C22 C26", "C62 C64", "B58 B60", "C6", "D35", "T", "B4"…
$ embarked <chr> "s", "c", "c", "c", "s", "s", "c", "c", "s", "s", "c", …
data_test <- read_csv(here::here(path_data, "titanic_test_cln_no_label.csv"),
                      col_types = cols()) |>
  glimpse()
Rows: 328
Columns: 12
$ passenger_id <dbl> 2, 3, 6, 13, 15, 20, 22, 25, 30, 31, 39, 42, 48, 54, 58…
$ survived <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Master. Hudson Trevor", "Allison, Miss. Helen…
$ sex <chr> "male", "female", "male", "female", "male", "male", "fe…
$ age <dbl> 0.9167, 2.0000, 48.0000, 24.0000, 80.0000, 36.0000, 47.…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1…
$ ticket <chr> "113781", "113781", "19952", "PC 17477", "27042", "1305…
$ fare <dbl> 151.5500, 151.5500, 26.5500, 69.3000, 30.0000, 75.2417,…
$ cabin <chr> "C22 C26", "C22 C26", "E12", "B35", "A23", "C6", "D35",…
$ embarked <chr> "s", "s", "s", "c", "s", "c", "s", "s", "s", "s", "s", …
Set appropriate variable classes:
- Nominal and ordinal variables are set to factors
- Interval and ratio variables are set to numeric

Remember that ordinal variables like pclass should have their factor levels explicitly stated to retain the order. It also helps (removes a warning) to class survived as a factor because otherwise it will be read as logical in the test set, where the scores are all NA.
- Let's write a classing function
- Probably not needed here since classing is so quick, but still good practice!
- Handle order for the factors we care about (those that have order or are the outcome)
- Handle the rest of the factors with across()
class_titantic <- function(d){
  d |>
    mutate(survived = factor(survived, levels = c("no", "yes")),
           pclass = factor(pclass, levels = c("1", "2", "3")),
           across(where(is.character), factor))
}
- Now let's use it
data_trn <- data_trn |>
  class_titantic() |>
  glimpse()
Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <fct> female, male, male, male, male, male, male, male, male,…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
data_val <- data_val |>
  class_titantic() |>
  glimpse()
Rows: 246
Columns: 12
$ passenger_id <dbl> 2, 12, 18, 20, 21, 31, 42, 51, 52, 84, 85, 97, 101, 103…
$ survived <fct> yes, yes, yes, no, yes, no, yes, yes, no, yes, no, no, …
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Astor, Mrs. John Jac…
$ sex <fct> male, female, female, male, male, male, female, female,…
$ age <dbl> 0.9167, 18.0000, 50.0000, 36.0000, 37.0000, 45.0000, 44…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 3, 0…
$ n_parent_child <dbl> 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0…
$ ticket <fct> 113781, PC 17757, PC 17558, 13050, 11751, 113784, PC 17…
$ fare <dbl> 151.5500, 227.5250, 247.5208, 75.2417, 52.5542, 35.5000…
$ cabin <fct> C22 C26, C62 C64, B58 B60, C6, D35, T, B4, B51 B53 B55,…
$ embarked <fct> s, c, c, c, s, s, c, c, s, s, c, c, c, c, c, c, c, s, s…
data_test <- data_test |>
  class_titantic() |>
  glimpse()
Rows: 328
Columns: 12
$ passenger_id <dbl> 2, 3, 6, 13, 15, 20, 22, 25, 30, 31, 39, 42, 48, 54, 58…
$ survived <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Allison, Miss. Helen…
$ sex <fct> male, female, male, female, male, male, female, female,…
$ age <dbl> 0.9167, 2.0000, 48.0000, 24.0000, 80.0000, 36.0000, 47.…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1…
$ ticket <fct> 113781, 113781, 19952, PC 17477, 27042, 13050, 11751, P…
$ fare <dbl> 151.5500, 151.5500, 26.5500, 69.3000, 30.0000, 75.2417,…
$ cabin <fct> C22 C26, C22 C26, E12, B35, A23, C6, D35, C97, C52, T, …
$ embarked <fct> s, s, s, c, s, c, s, s, s, s, s, c, s, s, s, s, c, s, s…
Here, we make some additional predictors (title and group_size) that might help in predicting survived.
- This could have been done in a recipe or here, because it doesn't use any calculations from the dataset at this point
- CRITICALLY, we decided to add these predictors based on modeling EDA of the training data only
add_predictors <- function(d){
  d |>
    mutate(group_size = n_parent_child + n_sib_spouse) |>
    mutate(title = str_to_lower(str_extract(name,
                                            "(?<=[:punct:][:space:])[:alpha:]{2,}(?=[:punct:])"))) |>
    mutate(sparse_title = case_when(
      title == "master" ~ "boy",
      title %in% c("miss", "ms") ~ "young_woman",
      title %in% c("dona", "lady", "mrs") ~ "adult_woman",
      title %in% c("mr", "sir", "don", "jonkheer") ~ "adult_man",
      title %in% c("col", "dr", "major", "rev") ~ "professional",
      TRUE ~ NA_character_)) |>
    mutate(sparse_title = case_when(
      is.na(sparse_title) & age < 12 & sex == "male" ~ "boy",
      is.na(sparse_title) & age < 18 & sex == "female" ~ "young_woman",
      is.na(sparse_title) & sex == "male" ~ "adult_man",
      is.na(sparse_title) & sex == "female" ~ "adult_woman",
      !is.na(sparse_title) ~ as.character(sparse_title),
      TRUE ~ NA_character_)) |>
    mutate(across(where(is.character), factor))
}
- Now let's use this function with all three of our sets
data_trn <- data_trn |>
  add_predictors() |>
  glimpse()
Rows: 736
Columns: 15
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <fct> female, male, male, male, male, male, male, male, male,…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
$ group_size <dbl> 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0…
$ title <fct> miss, mr, col, mr, mr, mr, mr, mr, dr, major, mr, mr, m…
$ sparse_title <fct> young_woman, adult_man, professional, adult_man, adult_…
data_trn |>
  skim_some()
Name | data_trn |
Number of rows | 736 |
Number of columns | 15 |
_______________________ | |
Column type frequency: | |
factor | 9 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
survived | 0 | 1.00 | FALSE | 2 | no: 455, yes: 281 |
pclass | 0 | 1.00 | FALSE | 3 | 3: 387, 2: 175, 1: 174 |
name | 0 | 1.00 | FALSE | 736 | Abb: 1, Abb: 1, Abb: 1, Abe: 1 |
sex | 0 | 1.00 | FALSE | 2 | mal: 479, fem: 257 |
ticket | 0 | 1.00 | FALSE | 588 | CA.: 6, S.O: 6, 113: 4, 160: 4 |
cabin | 579 | 0.21 | FALSE | 118 | F4: 4, B57: 3, C23: 3, D: 3 |
embarked | 0 | 1.00 | FALSE | 3 | s: 524, c: 144, q: 68 |
title | 1 | 1.00 | FALSE | 13 | mr: 435, mis: 149, mrs: 102, mas: 30 |
sparse_title | 0 | 1.00 | FALSE | 5 | adu: 436, you: 150, adu: 106, boy: 30 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
passenger_id | 0 | 1.0 | 3.00 | 1308.00 |
age | 146 | 0.8 | 0.17 | 76.00 |
n_sib_spouse | 0 | 1.0 | 0.00 | 8.00 |
n_parent_child | 0 | 1.0 | 0.00 | 9.00 |
fare | 0 | 1.0 | 0.00 | 512.33 |
group_size | 0 | 1.0 | 0.00 | 10.00 |
# create a "title" variable for each name in data_val
data_val <- data_val |>
  add_predictors() |>
  glimpse()
Rows: 246
Columns: 15
$ passenger_id <dbl> 2, 12, 18, 20, 21, 31, 42, 51, 52, 84, 85, 97, 101, 103…
$ survived <fct> yes, yes, yes, no, yes, no, yes, yes, no, yes, no, no, …
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Astor, Mrs. John Jac…
$ sex <fct> male, female, female, male, male, male, female, female,…
$ age <dbl> 0.9167, 18.0000, 50.0000, 36.0000, 37.0000, 45.0000, 44…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 3, 0…
$ n_parent_child <dbl> 2, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0…
$ ticket <fct> 113781, PC 17757, PC 17558, 13050, 11751, 113784, PC 17…
$ fare <dbl> 151.5500, 227.5250, 247.5208, 75.2417, 52.5542, 35.5000…
$ cabin <fct> C22 C26, C62 C64, B58 B60, C6, D35, T, B4, B51 B53 B55,…
$ embarked <fct> s, c, c, c, s, s, c, c, s, s, c, c, c, c, c, c, c, s, s…
$ group_size <dbl> 3, 1, 1, 0, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 1, 0, 0, 5, 0…
$ title <fct> master, mrs, mrs, mr, mr, mr, mrs, mrs, mr, mrs, mr, mr…
$ sparse_title <fct> boy, adult_woman, adult_woman, adult_man, adult_man, ad…
# create a "title" variable for each name in data_test
data_test <- data_test |>
  add_predictors() |>
  glimpse()
Rows: 328
Columns: 15
$ passenger_id <dbl> 2, 3, 6, 13, 15, 20, 22, 25, 30, 31, 39, 42, 48, 54, 58…
$ survived <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Master. Hudson Trevor", "Allison, Miss. Helen…
$ sex <fct> male, female, male, female, male, male, female, female,…
$ age <dbl> 0.9167, 2.0000, 48.0000, 24.0000, 80.0000, 36.0000, 47.…
$ n_sib_spouse <dbl> 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1…
$ ticket <fct> 113781, 113781, 19952, PC 17477, 27042, 13050, 11751, P…
$ fare <dbl> 151.5500, 151.5500, 26.5500, 69.3000, 30.0000, 75.2417,…
$ cabin <fct> C22 C26, C22 C26, E12, B35, A23, C6, D35, C97, C52, T, …
$ embarked <fct> s, s, s, c, s, c, s, s, s, s, s, c, s, s, s, s, c, s, s…
$ group_size <dbl> 3, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 1…
$ title <fct> master, miss, mr, mme, mr, mr, mrs, miss, mr, mr, mr, m…
$ sparse_title <fct> boy, young_woman, adult_man, adult_woman, adult_man, ad…
Create tracking tibble
Create an empty tracking tibble to track the validation error across the various model configurations you will fit.
error_val <- tibble(model = character(),
                    acc_val = numeric()) |>
  glimpse()
Rows: 0
Columns: 2
$ model <chr>
$ acc_val <dbl>
Understanding Accuracy (Example)
Before we build any models, let’s explain a bit about accuracy. When outcomes are imbalanced, accuracy is not our best performance metric: accuracy is simply the proportion of correctly predicted observations (including both true positives and true negatives), so a model that always predicts the majority class can score deceptively well. We’ll learn about more advanced performance metrics in Unit 6, but for now let’s demonstrate why accuracy can be problematic.
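As a quick arithmetic check, using the training counts from the skim output above (no: 455, yes: 281), a model that always predicts the majority class earns this accuracy without using any predictors at all:

# majority-class ("no") accuracy in the training set
455 / (455 + 281) # ~0.618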
Look at the distribution of our outcome in train:
data_trn |>
  plot_bar("survived") # it is not a balanced distribution
If I build a “model” that just regresses survived on the intercept, it will predict the more probable class (death) for everyone in the training set. This is the logistic regression “equivalent” of predicting the mean when we don’t have any other data from which to make predictions.
rec_example <- recipe(survived ~ 1, data_trn)

rec_prep_example <- rec_example |>
  prep(training = data_trn)

feat_trn_example <- rec_prep_example |>
  bake(new_data = data_trn)

feat_val_example <- rec_prep_example |>
  bake(new_data = data_val)

fit_knn_example <-
  nearest_neighbor(neighbors = 20) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_example)
predict(fit_knn_example, feat_val_example)$.pred_class
[1] no no no no no no no no no no no no no no no no no no no no no no no no no
[26] no no no no no no no no no no no no no no no no no no no no no no no no no
[51] no no no no no no no no no no no no no no no no no no no no no no no no no
[76] no no no no no no no no no no no no no no no no no no no no no no no no no
[101] no no no no no no no no no no no no no no no no no no no no no no no no no
[126] no no no no no no no no no no no no no no no no no no no no no no no no no
[151] no no no no no no no no no no no no no no no no no no no no no no no no no
[176] no no no no no no no no no no no no no no no no no no no no no no no no no
[201] no no no no no no no no no no no no no no no no no no no no no no no no no
[226] no no no no no no no no no no no no no no no no no no no no no
Levels: no yes
When we get the accuracy for this model…
accuracy_vec(feat_val_example$survived,
             predict(fit_knn_example, feat_val_example)$.pred_class)
[1] 0.6178862
… we see that we can get almost 62% accuracy with no other information! This example is merely to illustrate that a) accuracy is not the best performance metric, especially for imbalanced outcomes, and b) knowing your outcome distribution is important for assessing how well your model is actually doing. For example, if we got accuracies in the low- to mid-60s, we could probably safely assume that we are not in fact doing that well because our no-information model can do almost that well on its own.
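Note that we could have computed this no-information baseline directly, with no model at all: because the intercept-only KNN predicts “no” for every validation case, its accuracy equals the proportion of “no” in the validation outcome. A one-line check using the objects already loaded above:

# proportion of "no" in the validation set = accuracy of predicting "no" for everyone
mean(data_val$survived == "no") # 0.6178862, matching the example above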
KNN Model 1
We start by feature engineering a variable title (see above and the key for modeling EDA for more information). We extract the passengers’ titles from their names (e.g., “Mr” or “Miss”). This variable represents a combination of age and sex because titles differ by both. That also means I can use the age and sex variables to help fill in values when title is missing. Because there are many unique titles, we collapse it down into a sparse version (sparse_title). This isn’t required for the homework, but we show it to you here for learning purposes.
This meets the following homework requirements: Categorical > 2 levels; Modified categorical; More creative feature engineering.
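To see what the title-extraction regex in add_predictors() is doing, here is a quick, illustrative check on two names taken from the training data above; the expected output is shown as a comment:

# the title sits between ", " (punctuation + space) and "." (punctuation)
str_to_lower(str_extract(c("Allison, Miss. Helen Loraine",
                           "Allison, Mr. Hudson Joshua Creighton"),
                         "(?<=[:punct:][:space:])[:alpha:]{2,}(?=[:punct:])"))
# [1] "miss" "mr"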
Set up recipe
# only using sparse_title in the model
rec_1 <-
  recipe(survived ~ sparse_title, data = data_trn) |>
  step_dummy(sparse_title)
Training feature matrix
prep recipe
rec_prep <- rec_1 |>
  prep(training = data_trn)
bake recipe to get feature set
feat_trn <- rec_prep |>
  bake(new_data = data_trn)
Fit Model
Fit first model configuration with knn
fit_knn_1_20nn <-
  nearest_neighbor(neighbors = 20) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn)
Validation feature matrix
Use bake() to generate the feature matrix of the validation data that we will use to assess your model.
feat_val <- rec_prep |>
  bake(new_data = data_val)
Assess your model
Use accuracy_vec() to calculate the validation error (accuracy) of your model. Add this value to your validation error tracking tibble.
# accuracy in validation set
accuracy_vec(feat_val$survived, predict(fit_knn_1_20nn, feat_val, type = "class")$.pred_class)
[1] 0.6178862
error_val <- add_row(error_val,
                     model = "knn_1_20nn",
                     acc_val = accuracy_vec(feat_val$survived,
                                            predict(fit_knn_1_20nn, feat_val)$.pred_class))
error_val
# A tibble: 1 × 2
model acc_val
<chr> <dbl>
1 knn_1_20nn 0.618
KNN model configuration 2
We build off recipe #1 by adding two numeric features: fare, and a new variable called group_size that combines n_sib_spouse and n_parent_child. We Yeo-Johnson transform both (see distributions from modeling EDA) and then range-correct (needed for KNN). Because we list only the predictors we want in the recipe formula, the remaining columns never enter the feature matrix; an alternative would have been to keep all columns in the formula and remove the unwanted ones with step_rm().
This model meets the following homework requirements: 2+ numeric predictors; Categorical > 2 levels; Modified categorical; More creative feature engineering.
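As a quick intuition for why the range correction matters for KNN (illustrative numbers only, not computed from the data): fare spans roughly 0-512 in train while group_size spans 0-10, so unscaled Euclidean distances are dominated almost entirely by fare.

# two passengers differing by 200 in fare and 1 in group_size
sqrt(200^2 + 1^2) # ~200; the group_size difference is invisible
# after step_range() maps both to [0, 1], the same differences are comparable
sqrt((200 / 512)^2 + (1 / 10)^2) # ~0.40; both features now contribute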
Set up recipe
rec_2 <- recipe(survived ~ sparse_title + fare + group_size, data = data_trn) |>
  step_dummy(sparse_title) |>
  step_impute_median(fare) |> # one missing value needs imputation
  step_YeoJohnson(c(group_size, fare)) |> # deal with high skew
  step_range(c(group_size, fare))
Training feature matrix
Use the prep() function to prep your recipe
rec_prep <- rec_2 |>
  prep(training = data_trn)
Use the bake() function to generate the feature matrix of the training data.
feat_trn <- rec_prep |>
  bake(new_data = data_trn)
Fit your model
Fit a knn model predicting survived from your training feature matrix
fit_knn_20nn_2 <-
  nearest_neighbor(neighbors = 20) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn)
Validation feature matrix
Use bake() to generate the feature matrix of the validation data that we will use to assess your model.
feat_val <- rec_prep |>
  bake(new_data = data_val)
Assess your model
Use accuracy_vec() to calculate the validation error (accuracy) of your model. Add this value to your validation error tracking tibble.
accuracy_vec(feat_val$survived, predict(fit_knn_20nn_2, feat_val, type = "class")$.pred_class)
[1] 0.7804878
error_val <- add_row(error_val,
                     model = "fit_knn_20nn_2",
                     acc_val = accuracy_vec(feat_val$survived,
                                            predict(fit_knn_20nn_2, feat_val)$.pred_class))
error_val
# A tibble: 2 × 2
model acc_val
<chr> <dbl>
1 knn_1_20nn 0.618
2 fit_knn_20nn_2 0.780
KNN model configuration 3
For the final model configuration, we add in pclass using dummy features. Note that given the monotonic relationship we identified in modeling EDA, we could have kept pclass as a numeric variable to reduce the number of parameters. We tried that first - it didn’t predict as well, so we switched to using dummy codes instead. This is the great part about having a validation set - we didn’t know what the right choice was, but we were able to test multiple ways of feature engineering pclass (i.e., multiple model configurations) and let out-of-sample performance tell us what to do!
We also iterate through a few values of k for this configuration & show you how we found a good value.
This model meets the following homework requirements: 2+ numeric predictors; Categorical > 2 levels; Modified categorical; 3+ values of k; More creative feature engineering.
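For reference, the numeric-pclass alternative we tried first might have looked something like this (a sketch; rec_3_alt is a hypothetical name, and the exact steps we used are not shown in this key):

# sketch: keep pclass as a single numeric feature instead of dummy codes
rec_3_alt <- recipe(survived ~ sparse_title + fare + group_size + pclass,
                    data = data_trn) |>
  step_mutate(pclass = as.numeric(pclass)) |> # levels 1/2/3 become 1, 2, 3
  step_dummy(sparse_title) |>
  step_impute_median(fare) |>
  step_YeoJohnson(c(group_size, fare)) |>
  step_range(c(group_size, fare, pclass)) # KNN needs pclass on the same scale too

This configuration scored worse on the validation set, so we use dummy codes below.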
Set up recipe
Informed by your modeling EDA, create a recipe for your third model
rec_3 <- recipe(survived ~ sparse_title + fare + group_size + pclass,
                data = data_trn) |>
  step_dummy(c(sparse_title, pclass)) |>
  step_impute_median(fare) |> # impute missing value in fare
  step_YeoJohnson(c(group_size, fare)) |> # deal with high skew
  step_range(c(group_size, fare))
Training feature matrix
Use the prep() function to prep your recipe
rec_prep_3 <- rec_3 |>
  prep(training = data_trn)
Use the bake() function to generate the feature matrix of the training data.
feat_trn_3 <- rec_prep_3 |>
  bake(new_data = data_trn)
Fit your model
Fit a knn model predicting survived from your training feature matrix
fit_knn_10nn_3 <- nearest_neighbor(neighbors = 10) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_3)

fit_knn_15nn_3 <- nearest_neighbor(neighbors = 15) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_3)

fit_knn_20nn_3 <- nearest_neighbor(neighbors = 20) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_3)

fit_knn_25nn_3 <- nearest_neighbor(neighbors = 25) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_3)

fit_knn_30nn_3 <- nearest_neighbor(neighbors = 30) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_3)

fit_knn_35nn_3 <- nearest_neighbor(neighbors = 35) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(survived ~ ., data = feat_trn_3)
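As an aside, the six fits above differ only in neighbors, so they could be produced in one pass with purrr (a sketch; ks and fits_knn_3 are hypothetical names, and purrr loads with the tidyverse):

# fit one KNN per value of k; fits_knn_3 becomes a named list of fitted models
ks <- c(10, 15, 20, 25, 30, 35)
fits_knn_3 <- ks |>
  set_names() |>
  map(\(k) nearest_neighbor(neighbors = k) |>
        set_engine("kknn") |>
        set_mode("classification") |>
        fit(survived ~ ., data = feat_trn_3))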
Validation feature matrix
Use bake() to generate the feature matrix of the validation data that we will use to assess your model.
feat_val_3 <- rec_prep_3 |>
  bake(new_data = data_val)
Assess your model
Use accuracy_vec() to calculate the validation error (accuracy) of your model. Add this value to your validation error tracking tibble.
accuracy_vec(feat_val_3$survived, predict(fit_knn_10nn_3, feat_val_3)$.pred_class)
[1] 0.7804878
accuracy_vec(feat_val_3$survived, predict(fit_knn_15nn_3, feat_val_3)$.pred_class)
[1] 0.7804878
accuracy_vec(feat_val_3$survived, predict(fit_knn_20nn_3, feat_val_3)$.pred_class)
[1] 0.796748
accuracy_vec(feat_val_3$survived, predict(fit_knn_25nn_3, feat_val_3)$.pred_class)
[1] 0.796748
accuracy_vec(feat_val_3$survived, predict(fit_knn_30nn_3, feat_val_3)$.pred_class)
[1] 0.800813
accuracy_vec(feat_val_3$survived, predict(fit_knn_35nn_3, feat_val_3)$.pred_class)
[1] 0.7926829
error_val <- add_row(error_val,
                     model = "fit_knn_10nn_3",
                     acc_val = accuracy_vec(feat_val_3$survived,
                                            predict(fit_knn_10nn_3, feat_val_3)$.pred_class))

error_val <- add_row(error_val,
                     model = "fit_knn_15nn_3",
                     acc_val = accuracy_vec(feat_val_3$survived,
                                            predict(fit_knn_15nn_3, feat_val_3)$.pred_class))

error_val <- add_row(error_val,
                     model = "fit_knn_20nn_3",
                     acc_val = accuracy_vec(feat_val_3$survived,
                                            predict(fit_knn_20nn_3, feat_val_3)$.pred_class))

error_val <- add_row(error_val,
                     model = "fit_knn_25nn_3",
                     acc_val = accuracy_vec(feat_val_3$survived,
                                            predict(fit_knn_25nn_3, feat_val_3)$.pred_class))

error_val <- add_row(error_val,
                     model = "fit_knn_30nn_3",
                     acc_val = accuracy_vec(feat_val_3$survived,
                                            predict(fit_knn_30nn_3, feat_val_3)$.pred_class))

error_val <- add_row(error_val,
                     model = "fit_knn_35nn_3",
                     acc_val = accuracy_vec(feat_val_3$survived,
                                            predict(fit_knn_35nn_3, feat_val_3)$.pred_class))
error_val
# A tibble: 8 × 2
model acc_val
<chr> <dbl>
1 knn_1_20nn 0.618
2 fit_knn_20nn_2 0.780
3 fit_knn_10nn_3 0.780
4 fit_knn_15nn_3 0.780
5 fit_knn_20nn_3 0.797
6 fit_knn_25nn_3 0.797
7 fit_knn_30nn_3 0.801
8 fit_knn_35nn_3 0.793
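Continuing the purrr sketch from above, the six add_row() calls could likewise be collapsed into one step that computes each model's validation accuracy and appends all the rows at once (again using the hypothetical fits_knn_3 list; this would produce the same six rows as above, not additional ones):

# one validation accuracy per fitted model in the hypothetical fits_knn_3 list
error_val_3 <- tibble(
  model = str_c("fit_knn_", names(fits_knn_3), "nn_3"),
  acc_val = map_dbl(fits_knn_3,
                    \(f) accuracy_vec(feat_val_3$survived,
                                      predict(f, feat_val_3)$.pred_class)))
error_val <- bind_rows(error_val, error_val_3)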
Example: Visualizing performance by hyperparameter value
Once we are tuning over hyperparameter values (see Units 5 & 6!), we'll be able to visualize performance by hyperparameter value more easily. For now, we create the plot manually.
tibble(neighbors = c(10, 15, 20, 25, 30, 35),
       accuracy = error_val$acc_val[3:8]) |>
  ggplot(aes(x = neighbors, y = accuracy)) +
  geom_line()
There’s a clear peak at k = 30.
Generate predictions
Our best model is a 30NN with recipe 3, so I’ll generate predictions here!
best_model <- fit_knn_30nn_3
best_rec <- rec_3 # best recipe
last_name <- "TA"
save best model predictions
# Make test set features with the best recipe
rec_prep <- best_rec |>
  prep(training = bind_rows(data_trn, data_val))

feat_test <- rec_prep |>
  bake(new_data = data_test)

# Generate predictions made by the best model
test_preds <- data_test |>
  mutate(survived = predict(best_model, feat_test)$.pred_class) |>
  select(passenger_id, survived) # passenger_id is the id variable to match predictions
# Save predictions as a csv file with your last name in the file name
test_preds |>
  write_csv(here::here(path_data, str_c("test_preds_", last_name, ".csv")))