library(tidyverse)
library(tidymodels)
options(conflicts.policy = "depends.ok")Homework Unit 3: KNN
Introduction
This file serves as the answer key for the KNN portion of the Unit_03 homework. Unit 3 Exploratory Introduction to Regression Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to fit multiple configurations of LM and KNN regression models using data from the ames housing data set. By evaluating these various configurations in our validation set, we select the top performing model in our validation set out of all LM and KNN models we fit. We use this best model to generate predictions for our held out test set, which we only use ONCE for evaluation of our final best model.
Set up
Load 2 Very Important Packages (and set conflicts)
Source function scripts (John’s or your own)
source("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")# define a data general cleaning/engineering function for all data
class_ames <- function(df){
levels <- c("none", "po", "fa", "ta", "gd", "ex")
df |>
mutate(across(where(is.character), factor)) |>
mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |>
mutate(garage_qual = fct_relevel(garage_qual, levels)) |>
mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |>
mutate(neighborhood = factor(neighborhood,
levels = c("blmngtn", "blueste", "br_dale", "brk_side",
"clear_cr", "collg_cr", "crawfor", "edwards",
"gilbert", "greens", "grn_hill", "idotrr",
"landmrk", "meadow_v", "mitchel", "n_ames",
"no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
"old_town", "sawyer", "sawyer_w", "somerst",
"stone_br", "swisu", "timber", "veenker")),
ms_sub_class = factor(ms_sub_class,
levels = c("020", "030", "040", "045", "050", "060",
"070", "075", "080", "085", "090", "120",
"150", "160", "180", "190"))) |>
suppressWarnings()
}Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)Paths
path_data <- "application_assignments/unit_03"Load data
Load the cleaned training, validation, and test data files
Use here::here() and relative path for your data. Make sure your iaml project is open
data_trn <- read_csv(here::here(path_data,"ames_train_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 1,466
Columns: 19
$ sale_price <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood <chr> "n_ames", "n_ames", "gilbert", "stone_br", "stone_br",…
$ ms_sub_class <dbl> 20, 20, 60, 120, 120, 120, 60, 60, 20, 120, 60, 50, 20…
$ total_bsmt_sf <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu <chr> "none", "none", "ta", "none", "none", "ta", "ta", "gd"…
$ gr_liv_area <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "inside", "corner", "inside", "inside", "inside", "ins…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "twhs_ext…
data_val <- read_csv(here::here(path_data,"ames_val_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 486
Columns: 19
$ sale_price <dbl> 215000, 175900, 115000, 127500, 275000, 224000, 192000…
$ garage_area <dbl> 528, 440, 0, 440, 730, 484, 430, 440, 400, 676, 264, 5…
$ neighborhood <chr> "n_ames", "gilbert", "n_ames", "n_pk_vill", "nridg_ht"…
$ ms_sub_class <dbl> 20, 60, 20, 120, 20, 120, 120, 60, 80, 20, 120, 85, 20…
$ total_bsmt_sf <dbl> 1080, 763, 864, 1069, 1698, 1358, 1256, 860, 384, 1218…
$ bsmt_qual <chr> "ta", "gd", "ta", "gd", "ex", "gd", "gd", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 7, 7, 5, 4, 7, 6, 6, 8, 7, 4, 5, 6, 6, 4, 5, 6, 9, 5, …
$ fireplaces <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fireplace_qu <chr> "gd", "ta", "po", "fa", "gd", "gd", "ta", "ta", "ta", …
$ gr_liv_area <dbl> 1656, 1655, 864, 1069, 1698, 1358, 1269, 1960, 1430, 1…
$ lot_area <dbl> 31770, 10000, 10500, 4043, 11520, 6371, 3182, 7851, 77…
$ year_built <dbl> 1960, 1993, 1971, 1977, 2005, 2009, 2004, 2002, 2000, …
$ overall_qual <dbl> 6, 6, 4, 6, 9, 7, 8, 6, 7, 7, 6, 5, 5, 7, 6, 7, 6, 6, …
$ garage_cars <dbl> 2, 2, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "none", "ta", "ta", "ta", "ta", "ta", "ta"…
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "corner", "corner", "fr2", "inside", "inside", "inside…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "one_fam"…
data_test <- read_csv(here::here(path_data,"ames_test_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 973
Columns: 20
$ pid <dbl> 526353030, 527105030, 527165230, 527358200, 527402250,…
$ garage_area <dbl> 522, 470, 420, 528, 500, 304, 511, 264, 264, 751, 532,…
$ neighborhood <chr> "n_ames", "gilbert", "gilbert", "nw_ames", "n_ames", "…
$ ms_sub_class <dbl> 20, 60, 20, 85, 20, 20, 120, 160, 160, 60, 20, 20, 20,…
$ total_bsmt_sf <dbl> 2110, 926, 1168, 1053, 1078, 1056, 1405, 483, 525, 159…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "ta", "ta", "gd", "ta", "ta", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 8, 7, 6, 6, 6, 6, 5, 5, 6, 10, 7, 11, 10, 6, 7, 7, 7, …
$ fireplaces <dbl> 2, 1, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 1, 1, 0, 0, …
$ fireplace_qu <chr> "ta", "gd", "none", "ta", "fa", "fa", "fa", "none", "n…
$ gr_liv_area <dbl> 2110, 1604, 1187, 1173, 1078, 1056, 1337, 987, 1092, 2…
$ lot_area <dbl> 11160, 9978, 7980, 10625, 12537, 8450, 5858, 1680, 168…
$ year_built <dbl> 1968, 1998, 1992, 1974, 1971, 1968, 1999, 1971, 1971, …
$ overall_qual <dbl> 7, 6, 6, 7, 5, 5, 7, 6, 6, 9, 7, 9, 9, 6, 7, 7, 7, 8, …
$ garage_cars <dbl> 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rh", "rm", "rm", …
$ lot_config <chr> "corner", "inside", "inside", "inside", "cul_d_sac", "…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "one_fam", "one_fam",…
$ sale_price <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Set appropriate variable classes:
- Nominal and ordinal variables are set to factors
- interval and ration variables are set to numeric
# general engineering for the full data
data_trn <- data_trn |>
class_ames()
data_val <- data_val |>
class_ames()
data_test <- data_test |>
class_ames()Create tracking tibble
Create a tibble to track the validation errors across various model configurations.
error_val <- tibble(model = character(), rmse_val = numeric()) |>
glimpse()Rows: 0
Columns: 2
$ model <chr>
$ rmse_val <dbl>
KNN 1
First we will try a fully numeric model, selecting down to just the top correlated predictors we found in modeling EDA. Remember you always need to range correct numeric predictors when you have more than one numeric predictor in a KNN model. This model includes the following homework requirements:
- 2+ numeric predictors
- Varied
k
Set up recipe
rec_1 <-
recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_range(gr_liv_area, garage_area, garage_cars, overall_qual, total_bsmt_sf,
year_built, tot_rms_abv_grd) Training feature matrix
prep recipe
rec_prep <- rec_1 |>
prep(training = data_trn)bake recipe to get feature set. Use new_data = NULL to get previously saved training features
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit model
fit_knn_1_5nn <-
nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual + total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn) Validation feature matrix
Use the bake() to generate the feature matrix of the validation data that we will use to assess your model.
feat_val <- rec_prep |>
bake(new_data = data_val)Assess model
Use rmse_vec() to calculate the validation error (RMSE) of the model.
error_val <- bind_rows(error_val,
tibble(model = "knn_1_5nn",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_knn_1_5nn, feat_val)$.pred)))
error_val# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 32044.
Lets also try with k = 10:
fit_knn_1_10nn <-
nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual + total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn)Assess validation error
error_val <- bind_rows(error_val,
tibble(model = "knn_1_10nn",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_knn_1_10nn, feat_val)$.pred)))
error_val# A tibble: 2 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 32044.
2 knn_1_10nn 30848.
Visualize performance
Visualize the relationship between raw and predicted sale price in your validation set using the plot_truth() function.
knn_1_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_5nn, feat_val)$.pred)
knn_1_10_nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_10nn, feat_val)$.pred)
cowplot::plot_grid(knn_1_5nn, knn_1_10_nn,
labels = list("k = 5NN", "k = 10nn"))KNN 2
Next, we build on this model by adding in categorical variables that looked like they may have promising relationships with sale_price during modeling EDA. We collapse the neighborhood variable down to categories based on the relationship with sale_price to reduce the number of levels in a way that is unifying. Note that we only have 27/28 neighborhoods in our training data so we will have to impute this new variable in case a new level shows up! We use step_impute_mode(). We do not need to range correct the dummy coded variables, but we do need to range correct the quality factors that we converted to numeric variables with step_mutate_at(). This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels
- Varied
k
2.1 Set up recipe
rec_2 <-
recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_range(all_numeric_predictors()) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_dummy(neighborhood) |>
step_rm(ms_sub_class, central_air, fireplaces, lot_area, ms_zoning, lot_config, bldg_type) Training feature matrix
prep recipe
rec_prep <- rec_2 |>
prep(training = data_trn)bake recipe to get feature set
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit your models
fit_knn_2_3nn <-
nearest_neighbor(neighbors = 3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
fit_knn_2_5nn <-
nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
fit_knn_2_10nn <-
nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
feat_val <- rec_prep |>
bake(new_data = data_val)Assess your models
error_val <- bind_rows(error_val,
tibble(model = c("knn_2_3nn","knn_2_5nn","knn_2_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_2_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_2_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_2_10nn, feat_val)$.pred))))
error_val# A tibble: 5 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 32044.
2 knn_1_10nn 30848.
3 knn_2_3nn 34715.
4 knn_2_5nn 32508.
5 knn_2_10nn 31425.
Visualize performance
knn_2_3nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_3nn, feat_val)$.pred)
knn_2_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_5nn, feat_val)$.pred)
knn_2_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_10nn, feat_val)$.pred)
cowplot::plot_grid(knn_2_3nn, knn_2_5nn, knn_2_10nn,
labels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)KNN 3
In this KNN, we will add in more numeric and categorical variables to see what an increase in predictors does. Some categorical variables, like neighborhood and ms_sub_class, we manually collapse based on frequencies observed during EDA. Others are handled more simply by using step_other() to automatically bin low frequency categories into their own other label. This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels’
- Modified categorical
Set up recipe
rec_3 <-
recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(all_numeric_predictors()) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_range(all_numeric_predictors()) |>
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_other(all_nominal_predictors()) |>
step_rm(central_air) |>
step_unknown(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) |>
step_nzv(all_predictors())Training feature matrix
prep recipe
rec_prep <- rec_3 |>
prep(training = data_trn)bake recipe
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit your models
fit_knn_3_3nn <-
nearest_neighbor(neighbors=3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
fit_knn_3_5nn <-
nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
fit_knn_3_10nn <-
nearest_neighbor(neighbors=10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
feat_val <- rec_prep |>
bake(new_data = data_val)Assess your models
error_val <- bind_rows(error_val,
tibble(model = c("knn_3_3nn", "knn_3_5nn","knn_3_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_3_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_3_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_3_10nn, feat_val)$.pred))))
error_val# A tibble: 8 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 32044.
2 knn_1_10nn 30848.
3 knn_2_3nn 34715.
4 knn_2_5nn 32508.
5 knn_2_10nn 31425.
6 knn_3_3nn 36902.
7 knn_3_5nn 35101.
8 knn_3_10nn 34105.
Visualize performance
knn_3_3nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_3nn, feat_val)$.pred)
knn_3_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_5nn, feat_val)$.pred)
knn_3_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_10nn, feat_val)$.pred)
cowplot::plot_grid(knn_3_3nn, knn_3_5nn, knn_3_10nn,
labels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)Additional configurations
In this next configuration, we add more complexity to the model. We manually created a few variables by combining other related variables (e.g. combining all variables that measure the size of different parts of the house into one size variable, resulting in a predictor that is highly correlated with sale_price). We also allow knn to consider all possible interactions of numeric variables (since it can handle high complexity). Because we add many terms, we also add step_nzv() to the end of the recipe here to remove any variables that are highly sparse/ unbalanced (i.e., that will not be that useful for prediction). This ended up improving things quite a bit!
This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels’
- Modified categorical
- Multiple
k
Create recipe
rec_4 <-
recipe(sale_price ~ ., data = data_trn) |>
# kind of meta knn but should help impute a reasonable value for missing data
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
# apply a function to pointed cols
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
# self-defined features
step_mutate(all_size = gr_liv_area + total_bsmt_sf + garage_area,
all_qual = overall_qual + fireplace_qu + bsmt_qual + garage_qual,
all_garage = garage_qual*garage_area*garage_cars,
all_fireplace = fireplaces*fireplace_qu,
all_bsmt = bsmt_qual*total_bsmt_sf,
avg_room = gr_liv_area/tot_rms_abv_grd) |>
# normalization
step_range(all_numeric_predictors()) |>
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_other(all_nominal_predictors(), threshold = .1) |>
# remove features not being used
step_rm(ms_zoning, bldg_type,lot_config) |>
step_unknown(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) |>
# remove not features without too much difference across items
step_nzv(all_predictors())Training feature matrix
prep recipe
rec_prep <- rec_4 |>
prep(training = data_trn)bake recipe
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit models
fit_knn_4_3nn <-
nearest_neighbor(neighbors = 3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
fit_knn_4_5nn <-
nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
fit_knn_4_10nn <-
nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
feat_val <- rec_prep |>
bake(new_data = data_val)Assess models
error_val <- bind_rows(error_val,
tibble(model = c("knn_4_3nn", "knn_4_5nn","knn_4_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_4_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_4_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_4_10nn, feat_val)$.pred))))
error_val # A tibble: 11 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 32044.
2 knn_1_10nn 30848.
3 knn_2_3nn 34715.
4 knn_2_5nn 32508.
5 knn_2_10nn 31425.
6 knn_3_3nn 36902.
7 knn_3_5nn 35101.
8 knn_3_10nn 34105.
9 knn_4_3nn 32279.
10 knn_4_5nn 31371.
11 knn_4_10nn 30744.
Visualize performance
knn_4_3nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_3nn, feat_val)$.pred)
knn_4_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_5nn, feat_val)$.pred)
knn_4_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_10nn, feat_val)$.pred)
cowplot::plot_grid(knn_4_3nn, knn_4_5nn, knn_4_10nn,
labels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)Lets now check what our best configuration for knn
error_val |>
arrange(rmse_val) |>
slice(1)# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 knn_4_10nn 30744.