Homework Unit 3: KNN

Introduction

This file serves as the answer key for the KNN portion of the Unit_03 homework. Unit 3 (Exploratory Introduction to Regression Models) in the course web book contains all materials required for this assignment.

In this assignment, we demonstrate how to fit multiple configurations of LM and KNN regression models using data from the Ames housing data set. We evaluate these configurations in our validation set and select the top-performing model out of all the LM and KNN models we fit. We then use this best model to generate predictions for our held-out test set, which we use only ONCE, to evaluate our final best model.

Set up

Handle conflicts

options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()
Load required packages
library(tidyverse)
library(tidymodels)
Source function scripts (John’s or your own)
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true") devtools
# define a general data cleaning/engineering function for all data
class_ames <- function(df){
  levels <- c("none", "po", "fa", "ta", "gd", "ex")
  df |>
    mutate(across(where(is.character), factor)) |>
    mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |>
    mutate(garage_qual = fct_relevel(garage_qual, levels)) |>
    mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |>
    mutate(neighborhood = factor(neighborhood,
                                 levels = c("blmngtn", "blueste", "br_dale", "brk_side",
                                            "clear_cr", "collg_cr", "crawfor", "edwards",
                                            "gilbert", "greens", "grn_hill", "idotrr",
                                            "landmrk", "meadow_v", "mitchel", "n_ames",
                                            "no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
                                            "old_town", "sawyer", "sawyer_w", "somerst",
                                            "stone_br", "swisu", "timber", "veenker")),
           ms_sub_class = factor(ms_sub_class,
                                 levels = c("020", "030", "040", "045", "050", "060",
                                            "070", "075", "080", "085", "090", "120",
                                            "150", "160", "180", "190"))) |>
    suppressWarnings()
}
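Why does the pipe end in suppressWarnings()? fct_relevel() warns whenever one of the requested quality levels (e.g., "po") does not occur in a given column. A quick toy check (made-up vector, not from the Ames data):

f <- factor(c("gd", "ta", "ex", "none"))
fct_relevel(f, c("none", "po", "fa", "ta", "gd", "ex"))
# warns that "po" and "fa" are unknown levels here, but still orders the levels we do have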
Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max = Inf)
Paths
<- "application_assignments/unit_03" path_data
Load data
Load the cleaned training, validation, and test data files
Use here::here() and a relative path for your data. Make sure your iaml project is open.
<- read_csv(here::here(path_data,"ames_train_cln.csv"),
data_trn col_types = cols()) |>
glimpse()
Rows: 1,467
Columns: 19
$ sale_price <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood <chr> "n_ames", "n_ames", "gilbert", "stone_br", "stone_br",…
$ ms_sub_class <chr> "020", "020", "060", "120", "120", "120", "060", "060"…
$ total_bsmt_sf <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu <chr> "none", "none", "ta", "none", "none", "ta", "ta", "gd"…
$ gr_liv_area <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "inside", "corner", "inside", "inside", "inside", "ins…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "twhs_ext…
<- read_csv(here::here(path_data,"ames_val_cln.csv"),
data_val col_types = cols()) |>
glimpse()
Rows: 488
Columns: 19
$ sale_price <dbl> 215000, 175900, 115000, 127500, 275000, 224000, 192000…
$ garage_area <dbl> 528, 440, 0, 440, 730, 484, 430, 440, 400, 676, 264, 5…
$ neighborhood <chr> "n_ames", "gilbert", "n_ames", "n_pk_vill", "nridg_ht"…
$ ms_sub_class <chr> "020", "060", "020", "120", "020", "120", "120", "060"…
$ total_bsmt_sf <dbl> 1080, 763, 864, 1069, 1698, 1358, 1256, 860, 384, 1218…
$ bsmt_qual <chr> "ta", "gd", "ta", "gd", "ex", "gd", "gd", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 7, 7, 5, 4, 7, 6, 6, 8, 7, 4, 5, 6, 6, 4, 5, 6, 9, 5, …
$ fireplaces <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fireplace_qu <chr> "gd", "ta", "po", "fa", "gd", "gd", "ta", "ta", "ta", …
$ gr_liv_area <dbl> 1656, 1655, 864, 1069, 1698, 1358, 1269, 1960, 1430, 1…
$ lot_area <dbl> 31770, 10000, 10500, 4043, 11520, 6371, 3182, 7851, 77…
$ year_built <dbl> 1960, 1993, 1971, 1977, 2005, 2009, 2004, 2002, 2000, …
$ overall_qual <dbl> 6, 6, 4, 6, 9, 7, 8, 6, 7, 7, 6, 5, 5, 7, 6, 7, 6, 6, …
$ garage_cars <dbl> 2, 2, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "none", "ta", "ta", "ta", "ta", "ta", "ta"…
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "corner", "corner", "fr2", "inside", "inside", "inside…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "one_fam"…
<- read_csv(here::here(path_data,"ames_test_cln.csv"),
data_test col_types = cols()) |>
glimpse()
Rows: 975
Columns: 20
$ pid <chr> "0526353030", "0527105030", "0527165230", "0527358200"…
$ garage_area <dbl> 522, 470, 420, 528, 500, 304, 511, 264, 264, 751, 532,…
$ neighborhood <chr> "n_ames", "gilbert", "gilbert", "nw_ames", "n_ames", "…
$ ms_sub_class <chr> "020", "060", "020", "085", "020", "020", "120", "160"…
$ total_bsmt_sf <dbl> 2110, 926, 1168, 1053, 1078, 1056, 1405, 483, 525, 159…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "ta", "ta", "gd", "ta", "ta", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 8, 7, 6, 6, 6, 6, 5, 5, 6, 10, 7, 11, 10, 6, 7, 7, 7, …
$ fireplaces <dbl> 2, 1, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 1, 1, 0, 0, …
$ fireplace_qu <chr> "ta", "gd", "none", "ta", "fa", "fa", "fa", "none", "n…
$ gr_liv_area <dbl> 2110, 1604, 1187, 1173, 1078, 1056, 1337, 987, 1092, 2…
$ lot_area <dbl> 11160, 9978, 7980, 10625, 12537, 8450, 5858, 1680, 168…
$ year_built <dbl> 1968, 1998, 1992, 1974, 1971, 1968, 1999, 1971, 1971, …
$ overall_qual <dbl> 7, 6, 6, 7, 5, 5, 7, 6, 6, 9, 7, 9, 9, 6, 7, 7, 7, 8, …
$ garage_cars <dbl> 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rh", "rm", "rm", …
$ lot_config <chr> "corner", "inside", "inside", "inside", "cul_d_sac", "…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "one_fam", "one_fam",…
$ sale_price <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Set appropriate variable classes:
- Nominal and ordinal variables are set to factors
- Interval and ratio variables are set to numeric
# general engineering for the full data
data_trn <- data_trn |>
  class_ames()

data_val <- data_val |>
  class_ames()

data_test <- data_test |>
  class_ames()
Create tracking tibble
Create a tibble to track the validation errors across various model configurations.
<- tibble(model = character(), rmse_val = numeric()) |>
error_val glimpse()
Rows: 0
Columns: 2
$ model <chr>
$ rmse_val <dbl>
KNN 1
First we will try a fully numeric model, selecting down to just the top correlated predictors we found in modeling EDA. Remember that you always need to range-correct numeric predictors when you have more than one numeric predictor in a KNN model (see the short sketch after this list for why). This model includes the following homework requirements:
- 2+ numeric predictors
- Varied k
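To see why range correction matters, here is a minimal sketch with made-up numbers (not Ames rows): on the raw scale, gr_liv_area (thousands of square feet) swamps overall_qual (a 1-10 score) in the Euclidean distance, so quality has almost no say in which homes count as neighbors.

homes <- tribble(
  ~gr_liv_area, ~overall_qual,
  1500,         5,
  1520,         9,   # nearly the same size, very different quality
  2000,         5    # much larger, same quality
)

dist(homes)  # raw scale: home 2 looks far "closer" to home 1 than home 3 does

# after range correction both predictors live on [0, 1] and quality carries real weight
homes_rng <- homes |>
  mutate(across(everything(), ~ (.x - min(.x)) / (max(.x) - min(.x))))
dist(homes_rng)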
Set up recipe
<-
rec_1 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_range(gr_liv_area, garage_area, garage_cars, overall_qual, total_bsmt_sf,
year_built, tot_rms_abv_grd)
Training feature matrix
prep recipe
<- rec_1 |>
rec_prep prep(training = data_trn)
bake recipe to get feature set. Use new_data = NULL to get previously saved training features
<- rec_prep |>
feat_trn bake(new_data = NULL)
Fit model
<-
fit_knn_1_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual + total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn)
Validation feature matrix
Use bake() to generate the feature matrix of the validation data that we will use to assess our model.
<- rec_prep |>
feat_val bake(new_data = data_val)
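One subtlety worth knowing: because the recipe was prepped on data_trn, bake() rescales validation rows using the training min/max, and step_range() truncates new values that fall outside the training range (its documented default behavior, if we recall the recipes docs correctly). A toy sketch with made-up data:

trn <- tibble(y = 1:3, x = c(0, 50, 100))
new <- tibble(y = 1L,  x = 150)  # beyond the training max

recipe(y ~ x, data = trn) |>
  step_range(x) |>
  prep(training = trn) |>
  bake(new_data = new)  # x comes back as 1: truncated at the training max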
Assess model
Use rmse_vec() to calculate the validation error (RMSE) of the model.
<- bind_rows(error_val,
error_val tibble(model = "knn_1_5nn",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_knn_1_5nn, feat_val)$.pred)))
error_val
# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
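As a sanity check on what rmse_vec() reports: RMSE is just the square root of the mean squared error. A quick sketch with hypothetical prices:

truth    <- c(100000, 150000, 200000)
estimate <- c(110000, 140000, 195000)

sqrt(mean((truth - estimate)^2))  # by hand
rmse_vec(truth, estimate)         # same value from yardstick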
Let's also try k = 10:
<-
fit_knn_1_10nn nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual + total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn)
Assess validation error
<- bind_rows(error_val,
error_val tibble(model = "knn_1_10nn",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_knn_1_10nn, feat_val)$.pred)))
error_val
# A tibble: 2 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
Visualize performance
Visualize the relationship between raw and predicted sale price in your validation set using the plot_truth() function.
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_5nn, feat_val)$.pred)
knn_1_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_10nn, feat_val)$.pred)
knn_1_10_nn
::plot_grid(knn_1_5nn, knn_1_10_nn,
cowplotlabels = list("k = 5NN", "k = 10nn"))
KNN 2
Next, we build on this model by adding categorical variables that looked like they might have promising relationships with sale_price during modeling EDA. We collapse the neighborhood variable down to categories based on its relationship with sale_price, reducing the number of levels in a principled way. Note that only 27 of the 28 neighborhoods appear in our training data, so we impute this new variable in case an unseen level shows up, using step_impute_mode() (see the toy sketch after this list). We do not need to range-correct the dummy-coded variables, but we do need to range-correct the quality factors that we converted to numeric variables with step_mutate_at(). This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels
- Varied k
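Here is a toy sketch (made-up vector, not the real recipe) of the collapse-then-impute idea: fct_collapse() pools raw neighborhoods into price bands, and a level that never showed up in training would surface as NA after baking, which step_impute_mode() then fills with the most frequent band.

nbhd <- factor(c("meadow_v", "old_town", "stone_br", "old_town"))
fct_collapse(nbhd,
             "low_price"    = c("meadow_v", "idotrr", "br_dale"),
             "mod_lo_price" = c("old_town", "blueste"),
             "high_price"   = c("no_ridge", "stone_br", "nridg_ht"))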
Set up recipe
<-
rec_2 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_range(all_numeric_predictors()) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_dummy(neighborhood) |>
step_rm(ms_sub_class, central_air, fireplaces, lot_area, ms_zoning, lot_config, bldg_type)
Training feature matrix
prep recipe
<- rec_2 |>
rec_prep prep(training = data_trn)
bake recipe to get feature set
<- rec_prep |>
feat_trn bake(new_data = NULL)
Fit your models
<-
fit_knn_2_3nn nearest_neighbor(neighbors = 3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_2_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_2_10nn nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
Validation feature matrix
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess your models
<- bind_rows(error_val,
error_val tibble(model = c("knn_2_3nn","knn_2_5nn","knn_2_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_2_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_2_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_2_10nn, feat_val)$.pred))))
error_val
# A tibble: 5 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
3 knn_2_3nn 35891.
4 knn_2_5nn 34864.
5 knn_2_10nn 34795.
Visualize performance
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_3nn, feat_val)$.pred)
knn_2_3nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_5nn, feat_val)$.pred)
knn_2_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_10nn, feat_val)$.pred)
knn_2_10nn
::plot_grid(knn_2_3nn, knn_2_5nn, knn_2_10nn,
cowplotlabels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)
KNN 3
In this KNN, we add more numeric and categorical variables to see what an increase in predictors does. Some categorical variables, like neighborhood and ms_sub_class, we manually collapse based on frequencies observed during EDA. Others are handled more simply with step_other(), which automatically bins low-frequency categories into a single "other" level (see the toy sketch after this list). This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels
- Modified categorical
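A minimal sketch of step_other() on made-up data (toy column names, not from the assignment): any level below the frequency threshold is pooled into "other".

toy <- tibble(y   = rnorm(20),
              cat = factor(c(rep("common", 18), "rare1", "rare2")))

recipe(y ~ ., data = toy) |>
  step_other(cat, threshold = .1) |>
  prep() |>
  bake(new_data = NULL) |>
  count(cat)  # rare1 and rare2 are pooled into "other"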
Set up recipe
<-
rec_3 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(all_numeric_predictors()) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_range(all_numeric_predictors()) |>
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_other(all_nominal_predictors()) |>
step_rm(central_air) |>
step_dummy(all_nominal_predictors()) |>
step_nzv(all_predictors())
Training feature matrix
prep recipe
<- rec_3 |>
rec_prep prep(training = data_trn)
bake recipe
<- rec_prep |>
feat_trn bake(new_data = NULL)
Fit your models
<-
fit_knn_3_3nn nearest_neighbor(neighbors=3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_3_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_3_10nn nearest_neighbor(neighbors=10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
Validation feature matrix
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess your models
<- bind_rows(error_val,
error_val tibble(model = c("knn_3_3nn", "knn_3_5nn","knn_3_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_3_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_3_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_3_10nn, feat_val)$.pred))))
error_val
# A tibble: 8 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
3 knn_2_3nn 35891.
4 knn_2_5nn 34864.
5 knn_2_10nn 34795.
6 knn_3_3nn 39689.
7 knn_3_5nn 38615.
8 knn_3_10nn 38787.
Visualize performance
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_3nn, feat_val)$.pred)
knn_3_3nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_5nn, feat_val)$.pred)
knn_3_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_10nn, feat_val)$.pred)
knn_3_10nn
::plot_grid(knn_3_3nn, knn_3_5nn, knn_3_10nn,
cowplotlabels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)
Additional configurations
In this next configuration, we add more complexity to the model. We manually create a few features by combining related variables (e.g., combining all the variables that measure the size of different parts of the house into one overall size variable, resulting in a predictor that is highly correlated with sale_price). We also build several multiplicative, interaction-style features from related numeric variables, since KNN can handle high complexity. Because we add many terms, we also add step_nzv() at the end of the recipe to remove any features that are highly sparse or unbalanced (i.e., that will not be very useful for prediction; see the toy sketch after this list). This ended up improving things quite a bit!
This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels
- Modified categorical
- Multiple k
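As a quick illustration of step_nzv() (toy data, hypothetical column names): a column that is zero for nearly every home carries almost no information for distance calculations, and the step drops it.

toy <- tibble(y      = rnorm(100),
              useful = rnorm(100),
              sparse = c(1, rep(0, 99)))  # near-zero variance

recipe(y ~ ., data = toy) |>
  step_nzv(all_predictors()) |>
  prep() |>
  bake(new_data = NULL) |>
  names()  # only "useful" and "y" remain; "sparse" is removed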
Create recipe
<-
rec_4 recipe(sale_price ~ ., data = data_trn) |>
# kind of meta knn but should help impute a reasonable value for missing data
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
# apply a function to pointed cols
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
# self-defined features
step_mutate(all_size = gr_liv_area + total_bsmt_sf + garage_area,
all_qual = overall_qual + fireplace_qu + bsmt_qual + garage_qual,
all_garage = garage_qual*garage_area*garage_cars,
all_fireplace = fireplaces*fireplace_qu,
all_bsmt = bsmt_qual*total_bsmt_sf,
avg_room = gr_liv_area/tot_rms_abv_grd) |>
# normalization
step_range(all_numeric_predictors()) |>
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_other(all_nominal_predictors(), threshold = .1) |>
# remove features not being used
step_rm(ms_zoning, bldg_type,lot_config) |>
step_dummy(all_nominal_predictors()) |>
# remove not features without too much difference across items
step_nzv(all_predictors())
Training feature matrix
prep recipe
<- rec_4 |>
rec_prep prep(training = data_trn)
bake recipe
<- rec_prep |>
feat_trn bake(new_data = NULL)
Fit models
<-
fit_knn_4_3nn nearest_neighbor(neighbors = 3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_4_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_4_10nn nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
Validation feature matrix
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess models
<- bind_rows(error_val,
error_val tibble(model = c("knn_4_3nn", "knn_4_5nn","knn_4_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_4_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_4_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_4_10nn, feat_val)$.pred))))
error_val
# A tibble: 11 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
3 knn_2_3nn 35891.
4 knn_2_5nn 34864.
5 knn_2_10nn 34795.
6 knn_3_3nn 39689.
7 knn_3_5nn 38615.
8 knn_3_10nn 38787.
9 knn_4_3nn 33547.
10 knn_4_5nn 33882.
11 knn_4_10nn 34825.
Visualize performance
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_3nn, feat_val)$.pred)
knn_4_3nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_5nn, feat_val)$.pred)
knn_4_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_10nn, feat_val)$.pred)
knn_4_10nn
::plot_grid(knn_4_3nn, knn_4_5nn, knn_4_10nn,
cowplotlabels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)
Let's now check which knn configuration performed best:

error_val |>
  arrange(rmse_val) |>
  slice(1)
# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 knn_4_3nn 33547.
Predictions
Since knn_4_3nn is the best model configuration we've fit, we will use it to generate predictions for the held-out test set. Remember, we only make predictions on the test set once (important!).
One new step: when you are preparing to make predictions on your held-out test set, your recipe should be prepped using the combination of your training and validation data in order to get the best estimates possible for the held-out data. For the competition, we didn't ask you to do this, but we show you what it looks like here.
Our best model is a KNN (i.e., from this KNN script), so we work through the process of re-fitting with training and validation data below. If your best model were an LM, this process would instead apply to that script in the assignment, and you could skip to Save & Knit at the end of this document.
- Prep best recipe now with “held-in” train and val data
<- data_trn |>
data_in bind_rows(data_val) |>
glimpse()
Rows: 1,955
Columns: 19
$ sale_price <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood <fct> n_ames, n_ames, gilbert, stone_br, stone_br, stone_br,…
$ ms_sub_class <fct> 020, 020, 060, 120, 120, 120, 060, 060, 020, 120, 060,…
$ total_bsmt_sf <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual <fct> ta, ta, gd, gd, gd, gd, ta, gd, gd, gd, gd, gd, ex, ta…
$ central_air <fct> y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, …
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu <fct> none, none, ta, none, none, ta, ta, gd, po, none, gd, …
$ gr_liv_area <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ ms_zoning <fct> rh, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl…
$ lot_config <fct> inside, corner, inside, inside, inside, inside, inside…
$ bldg_type <fct> one_fam, one_fam, one_fam, twhs_ext, twhs_ext, twhs_ex…
<- rec_4 |> # rec_4 is our recipe for our BEST model configuration
rec_prep_best prep(training = data_in) # prepping this recipe with ALL held in data (trn and val)
- Bake features using held-in data from train and val
<- rec_prep_best |>
feat_in bake(new_data = data_in) |>
glimpse()
Rows: 1,955
Columns: 26
$ garage_area <dbl> 0.4905914, 0.2096774, 0.3239247, 0.3911290, …
$ total_bsmt_sf <dbl> 0.14435352, 0.21751227, 0.15188216, 0.218985…
$ bsmt_qual <dbl> 0.50, 0.50, 0.75, 0.75, 0.75, 0.75, 0.50, 0.…
$ tot_rms_abv_grd <dbl> 0.18181818, 0.27272727, 0.27272727, 0.272727…
$ fireplaces <dbl> 0.0000000, 0.0000000, 0.3333333, 0.0000000, …
$ fireplace_qu <dbl> 0.0, 0.0, 0.6, 0.0, 0.0, 0.6, 0.6, 0.8, 0.2,…
$ gr_liv_area <dbl> 0.08800922, 0.17121445, 0.22886241, 0.172943…
$ lot_area <dbl> 0.0474624478, 0.0598356170, 0.0577913542, 0.…
$ year_built <dbl> 0.6370370, 0.6148148, 0.9037037, 0.9333333, …
$ overall_qual <dbl> 0.4444444, 0.5555556, 0.4444444, 0.7777778, …
$ garage_cars <dbl> 0.25, 0.25, 0.50, 0.50, 0.50, 0.50, 0.50, 0.…
$ garage_qual <dbl> 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6,…
$ sale_price <dbl> 105000, 172000, 189900, 213500, 191500, 2365…
$ all_size <dbl> 0.13246542, 0.17005696, 0.17567128, 0.193490…
$ all_qual <dbl> 0.40, 0.45, 0.60, 0.60, 0.60, 0.75, 0.65, 0.…
$ all_garage <dbl> 0.13888889, 0.05936073, 0.18340944, 0.221461…
$ all_fireplace <dbl> 0.0000000, 0.0000000, 0.2222222, 0.0000000, …
$ all_bsmt <dbl> 0.08661211, 0.13050736, 0.12150573, 0.175188…
$ avg_room <dbl> 0.1345455, 0.2306818, 0.3443182, 0.2340909, …
$ neighborhood_mod_lo_price <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ neighborhood_mod_hi_price <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ neighborhood_other <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,…
$ ms_sub_class_two_st <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,…
$ ms_sub_class_multi <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ ms_sub_class_pud <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
$ central_air_other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
- Train the best model configuration using these held-in features
<-
fit_knn_best nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_in)
- Now generate features for the held-out (test) data using the same prepped recipe
<- rec_prep_best |>
feat_test bake(new_data = data_test)
- Generate predictions made by the best model
<- data_test |>
preds_test select(pid) |> # keep only pid
mutate(sale_price = predict(fit_knn_best, feat_test)$.pred)
# IF you did any transformation of sale price (e.g., log), you will need a final
# mutate to untransform sale_price back to raw dollars in the pipe above
Save out the best model's predictions of raw sale_price in the held-out test set. Name the file with your name.
|>
preds_test write_csv(here::here(path_data, "test_preds_TA.csv"))