options(conflicts.policy = "depends.ok")
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
devtoolstidymodels_conflictRules()
Homework Unit 3: KNN
Introduction
This file serves as the answer key for the KNN portion of the Unit_03 homework. Unit 3 Exploratory Introduction to Regression Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to fit multiple configurations of LM and KNN regression models using data from the ames housing data set. By evaluating these various configurations in our validation set, we select the top performing model in our validation set out of all LM and KNN models we fit. We use this best model to generate predictions for our held out test set, which we only use ONCE for evaluation of our final best model.
Set up
Handle conflicts
Load required packages
library(tidyverse)
library(tidymodels)
Source function scripts (John’s or your own)
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true") devtools
Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
Paths
<- "homework/unit_03" path_data
Load data
Load the cleaned training, validation, and test data files
Use here::here()
and relative path for your data. Make sure your iaml project is open
<- read_csv(here::here(path_data,"ames_train_cln.csv"),
data_trn col_types = cols()) |>
glimpse()
Rows: 1,467
Columns: 19
$ sale_price <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood <chr> "n_ames", "n_ames", "gilbert", "stone_br", "stone_br",…
$ ms_sub_class <chr> "020", "020", "060", "120", "120", "120", "060", "060"…
$ total_bsmt_sf <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu <chr> "none", "none", "ta", "none", "none", "ta", "ta", "gd"…
$ gr_liv_area <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "inside", "corner", "inside", "inside", "inside", "ins…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "twhs_ext…
<- read_csv(here::here(path_data,"ames_val_cln.csv"),
data_val col_types = cols()) |>
glimpse()
Rows: 488
Columns: 19
$ sale_price <dbl> 215000, 175900, 115000, 127500, 275000, 224000, 192000…
$ garage_area <dbl> 528, 440, 0, 440, 730, 484, 430, 440, 400, 676, 264, 5…
$ neighborhood <chr> "n_ames", "gilbert", "n_ames", "n_pk_vill", "nridg_ht"…
$ ms_sub_class <chr> "020", "060", "020", "120", "020", "120", "120", "060"…
$ total_bsmt_sf <dbl> 1080, 763, 864, 1069, 1698, 1358, 1256, 860, 384, 1218…
$ bsmt_qual <chr> "ta", "gd", "ta", "gd", "ex", "gd", "gd", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 7, 7, 5, 4, 7, 6, 6, 8, 7, 4, 5, 6, 6, 4, 5, 6, 9, 5, …
$ fireplaces <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fireplace_qu <chr> "gd", "ta", "po", "fa", "gd", "gd", "ta", "ta", "ta", …
$ gr_liv_area <dbl> 1656, 1655, 864, 1069, 1698, 1358, 1269, 1960, 1430, 1…
$ lot_area <dbl> 31770, 10000, 10500, 4043, 11520, 6371, 3182, 7851, 77…
$ year_built <dbl> 1960, 1993, 1971, 1977, 2005, 2009, 2004, 2002, 2000, …
$ overall_qual <dbl> 6, 6, 4, 6, 9, 7, 8, 6, 7, 7, 6, 5, 5, 7, 6, 7, 6, 6, …
$ garage_cars <dbl> 2, 2, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "none", "ta", "ta", "ta", "ta", "ta", "ta"…
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "corner", "corner", "fr2", "inside", "inside", "inside…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "one_fam"…
<- read_csv(here::here(path_data,"ames_test_cln.csv"),
data_test col_types = cols()) |>
glimpse()
Rows: 975
Columns: 19
$ pid <chr> "0526353030", "0527105030", "0527165230", "0527358200"…
$ garage_area <dbl> 522, 470, 420, 528, 500, 304, 511, 264, 264, 751, 532,…
$ neighborhood <chr> "n_ames", "gilbert", "gilbert", "nw_ames", "n_ames", "…
$ ms_sub_class <chr> "020", "060", "020", "085", "020", "020", "120", "160"…
$ total_bsmt_sf <dbl> 2110, 926, 1168, 1053, 1078, 1056, 1405, 483, 525, 159…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "ta", "ta", "gd", "ta", "ta", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 8, 7, 6, 6, 6, 6, 5, 5, 6, 10, 7, 11, 10, 6, 7, 7, 7, …
$ fireplaces <dbl> 2, 1, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 1, 1, 0, 0, …
$ fireplace_qu <chr> "ta", "gd", "none", "ta", "fa", "fa", "fa", "none", "n…
$ gr_liv_area <dbl> 2110, 1604, 1187, 1173, 1078, 1056, 1337, 987, 1092, 2…
$ lot_area <dbl> 11160, 9978, 7980, 10625, 12537, 8450, 5858, 1680, 168…
$ year_built <dbl> 1968, 1998, 1992, 1974, 1971, 1968, 1999, 1971, 1971, …
$ overall_qual <dbl> 7, 6, 6, 7, 5, 5, 7, 6, 6, 9, 7, 9, 9, 6, 7, 7, 7, 8, …
$ garage_cars <dbl> 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rh", "rm", "rm", …
$ lot_config <chr> "corner", "inside", "inside", "inside", "cul_d_sac", "…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "one_fam", "one_fam",…
Set appropriate variable classes:
- Nominal and ordinal variables are set to factors
- interval and ration variables are set to numeric
Remember ordinal variables should have the factor levels explicitly stated to retain the order (you will see why when we get to recipe 3!)
Also note you will get a warning stating there is an unknown level in the factor - this is because there are no basement rated as having “po” quality in data_trn or data_val and there is no garage rated as “po” in data_val. Since we investigated this warning we are suppressing warning messages using suppressWarnings()
to prevent the warning from coming up again.
<- c("none", "po", "fa", "ta", "gd", "ex")
levels <- data_trn |>
data_trn mutate(across(where(is.character), factor)) |>
mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |>
mutate(garage_qual = fct_relevel(garage_qual, levels)) |>
mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |>
suppressWarnings()
<- data_val |>
data_val mutate(across(where(is.character), factor)) |>
mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |>
mutate(garage_qual = fct_relevel(garage_qual, levels)) |>
mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |>
suppressWarnings()
<- data_test |>
data_test mutate(across(where(is.character), factor)) |>
mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |>
mutate(garage_qual = fct_relevel(garage_qual, levels)) |>
mutate(fireplace_qu = fct_relevel(fireplace_qu, levels))
Additionally, from EDA we know two variables have a lot of factor levels and some with very few observations.
Even though these are not ordinal variables we are still going to set these levels explicitly (based on data dictionary) so we don’t need to use step_novel()
later when new factor levels show up in validation or test sets.
<- data_trn |>
data_trn mutate(neighborhood = factor(neighborhood,
levels = c("blmngtn", "blueste", "br_dale", "brk_side",
"clear_cr", "collg_cr", "crawfor", "edwards",
"gilbert", "greens", "grn_hill", "idotrr",
"landmrk", "meadow_v", "mitchel", "n_ames",
"no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
"old_town", "sawyer", "sawyer_w", "somerst",
"stone_br", "swisu", "timber", "veenker")),
ms_sub_class = factor(ms_sub_class,
levels = c("020", "030", "040", "045", "050", "060",
"070", "075", "080", "085", "090", "120",
"150", "160", "180", "190")))
<- data_val |>
data_val mutate(neighborhood = factor(neighborhood,
levels = c("blmngtn", "blueste", "br_dale", "brk_side",
"clear_cr", "collg_cr", "crawfor", "edwards",
"gilbert", "greens", "grn_hill", "idotrr",
"landmrk", "meadow_v", "mitchel", "n_ames",
"no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
"old_town", "sawyer", "sawyer_w", "somerst",
"stone_br", "swisu", "timber", "veenker")),
ms_sub_class = factor(ms_sub_class,
levels = c("020", "030", "040", "045", "050", "060",
"070", "075", "080", "085", "090", "120",
"150", "160", "180", "190")))
<- data_test |>
data_test mutate(neighborhood = factor(neighborhood,
levels = c("blmngtn", "blueste", "br_dale", "brk_side",
"clear_cr", "collg_cr", "crawfor", "edwards",
"gilbert", "greens", "grn_hill", "idotrr",
"landmrk", "meadow_v", "mitchel", "n_ames",
"no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
"old_town", "sawyer", "sawyer_w", "somerst",
"stone_br", "swisu", "timber", "veenker")),
ms_sub_class = factor(ms_sub_class,
levels = c("020", "030", "040", "045", "050", "060",
"070", "075", "080", "085", "090", "120",
"150", "160", "180", "190")))
Create tracking tibble
Create a tibble to track the validation errors across various model configurations.
<- tibble(model = character(), rmse_val = numeric()) |>
error_val glimpse()
Rows: 0
Columns: 2
$ model <chr>
$ rmse_val <dbl>
KNN 1
First we will try a fully numeric model, selecting down to just the top correlated predictors we found in modeling EDA. Remember you always need to range correct numeric predictors when you have more than one numeric predictor in a KNN model. This model includes the following homework requirements:
- 2+ numeric predictors
- Varied
k
Set up recipe
<-
rec_1 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_range(gr_liv_area, garage_area, garage_cars, overall_qual, total_bsmt_sf,
year_built, tot_rms_abv_grd)
Training feature matrix
prep recipe
<- rec_1 |>
rec_prep prep(training = data_trn)
bake recipe to get feature set
<- rec_prep |>
feat_trn bake(new_data = data_trn)
Fit model
<-
fit_knn_1_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual + total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn)
Validation feature matrix
Use the bake()
to generate the feature matrix of the validation data that we will use to assess your model.
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess model
Use rmse_vec()
to calculate the validation error (RMSE) of the model.
<- bind_rows(error_val,
error_val tibble(model = "knn_1_5nn",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_knn_1_5nn, feat_val)$.pred)))
error_val
# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
Lets also try with k
= 10:
<-
fit_knn_1_10nn nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual + total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn)
Assess validation error
<- bind_rows(error_val,
error_val tibble(model = "knn_1_10nn",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_knn_1_10nn, feat_val)$.pred)))
error_val
# A tibble: 2 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
Visualize performance
Visualize the relationship between raw and predicted sale price in your validation set using the plot_truth()
function.
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_5nn, feat_val)$.pred)
knn_1_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_10nn, feat_val)$.pred)
knn_1_10_nn
::plot_grid(knn_1_5nn, knn_1_10_nn,
cowplotlabels = list("k = 5NN", "k = 10nn"))
KNN 2
Next, we build on this model by adding in categorical variables that looked like they may have promising relationships with sale_price
during modeling EDA. We collapse the neighborhood
variable down to categories based on the relationship with sale_price
to reduce the number of levels in a way that is unifying. Note that we only have 27/28 neighborhoods in our training data so we will have to impute this new variable in case a new level shows up! We use step_impute_mode()
. We do not need to range correct the dummy coded variables, but we do need to range correct the quality factors that we converted to numeric variables with step_mutate_at()
. This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels
- Varied
k
2.1 Set up recipe
<-
rec_2 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_range(all_numeric_predictors()) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_dummy(neighborhood) |>
step_rm(ms_sub_class, central_air, fireplaces, lot_area, ms_zoning, lot_config, bldg_type)
Training feature matrix
prep recipe
<- rec_2 |>
rec_prep prep(training = data_trn)
bake recipe to get feature set
<- rec_prep |>
feat_trn bake(new_data = data_trn)
Fit your models
<-
fit_knn_2_3nn nearest_neighbor(neighbors = 3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_2_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_2_10nn nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
Validation feature matrix
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess your models
<- bind_rows(error_val,
error_val tibble(model = c("knn_2_3nn","knn_2_5nn","knn_2_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_2_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_2_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_2_10nn, feat_val)$.pred))))
error_val
# A tibble: 5 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
3 knn_2_3nn 35891.
4 knn_2_5nn 34864.
5 knn_2_10nn 34795.
Visualize performance
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_3nn, feat_val)$.pred)
knn_2_3nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_5nn, feat_val)$.pred)
knn_2_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_10nn, feat_val)$.pred)
knn_2_10nn
::plot_grid(knn_2_3nn, knn_2_5nn, knn_2_10nn,
cowplotlabels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)
KNN 3
In this KNN, we will add in more numeric and categorical variables to see what an increase in predictors does. Some categorical variables, like neighborhood
and ms_sub_class
, we manually collapse based on frequencies observed during EDA. Others are handled more simply by using step_other()
to automatically bin low frequency categories into their own other label. This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels’
- Modified categorical
Set up recipe
<-
rec_3 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(all_numeric_predictors()) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_range(all_numeric_predictors()) |>
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_other(all_nominal_predictors()) |>
step_rm(central_air) |>
step_dummy(all_nominal_predictors()) |>
step_nzv(all_predictors())
Training feature matrix
prep recipe
<- rec_3 |>
rec_prep prep(training = data_trn)
bake recipe
<- rec_prep |>
feat_trn bake(new_data = data_trn)
Fit your models
<-
fit_knn_3_3nn nearest_neighbor(neighbors=3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_3_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_3_10nn nearest_neighbor(neighbors=10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
Validation feature matrix
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess your models
<- bind_rows(error_val,
error_val tibble(model = c("knn_3_3nn", "knn_3_5nn","knn_3_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_3_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_3_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_3_10nn, feat_val)$.pred))))
error_val
# A tibble: 8 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
3 knn_2_3nn 35891.
4 knn_2_5nn 34864.
5 knn_2_10nn 34795.
6 knn_3_3nn 39689.
7 knn_3_5nn 38615.
8 knn_3_10nn 38787.
Visualize performance
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_3nn, feat_val)$.pred)
knn_3_3nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_5nn, feat_val)$.pred)
knn_3_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_10nn, feat_val)$.pred)
knn_3_10nn
::plot_grid(knn_3_3nn, knn_3_5nn, knn_3_10nn,
cowplotlabels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)
Additional configurations
In this next configuration, we add more complexity to the model. We manually created a few variables by combining other related variables (e.g. combining all variables that measure the size of different parts of the house into one size variable, resulting in a predictor that is highly correlated with sale_price
). We also allow knn
to consider all possible interactions of numeric variables (since it can handle high complexity). Because we add many terms, we also add step_nzv()
to the end of the recipe here to remove any variables that are highly sparse/ unbalanced (i.e., that will not be that useful for prediction). This ended up improving things quite a bit!
This model includes the following homework requirements:
- 2+ numeric predictors
- Categorical > 2 levels’
- Modified categorical
- Multiple
k
Create recipe
<-
rec_4 recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_mutate(all_size = gr_liv_area + total_bsmt_sf + garage_area,
all_qual = overall_qual + fireplace_qu + bsmt_qual + garage_qual,
all_garage = garage_qual*garage_area*garage_cars,
all_fireplace = fireplaces*fireplace_qu,
all_bsmt = bsmt_qual*total_bsmt_sf,
avg_room = gr_liv_area/tot_rms_abv_grd) |>
step_range(all_numeric_predictors()) %>%
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
step_other(all_nominal_predictors(), threshold = .1) |>
step_interact(~ all_numeric_predictors():all_numeric_predictors()) |>
step_rm(ms_zoning, bldg_type,lot_config) |>
step_dummy(all_nominal_predictors()) |>
step_nzv(all_predictors())
Training feature matrix
prep recipe
<- rec_4 |>
rec_prep prep(training = data_trn)
bake recipe
<- rec_prep |>
feat_trn bake(new_data = data_trn)
Fit models
<-
fit_knn_4_3nn nearest_neighbor(neighbors = 3) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_4_5nn nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
<-
fit_knn_4_10nn nearest_neighbor(neighbors = 10) |>
set_engine("kknn") |>
set_mode("regression") |>
fit(sale_price ~ ., data = feat_trn)
Validation feature matrix
<- rec_prep |>
feat_val bake(new_data = data_val)
Assess models
<- bind_rows(error_val,
error_val tibble(model = c("knn_4_3nn", "knn_4_5nn","knn_4_10nn"),
rmse_val = c(rmse_vec(feat_val$sale_price,
predict(fit_knn_4_3nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_4_5nn, feat_val)$.pred),
rmse_vec(feat_val$sale_price,
predict(fit_knn_4_10nn, feat_val)$.pred))))
error_val
# A tibble: 11 × 2
model rmse_val
<chr> <dbl>
1 knn_1_5nn 37195.
2 knn_1_10nn 36114.
3 knn_2_3nn 35891.
4 knn_2_5nn 34864.
5 knn_2_10nn 34795.
6 knn_3_3nn 39689.
7 knn_3_5nn 38615.
8 knn_3_10nn 38787.
9 knn_4_3nn 34400.
10 knn_4_5nn 34029.
11 knn_4_10nn 34365.
Visualize performance
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_3nn, feat_val)$.pred)
knn_4_3nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_5nn, feat_val)$.pred)
knn_4_5nn
<- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_10nn, feat_val)$.pred)
knn_4_10nn
::plot_grid(knn_4_3nn, knn_4_5nn, knn_4_10nn,
cowplotlabels = list("k = 3NN", "k = 5NN", "k = 10nn"),
nrow = 1)
Lets now check what our best configuration for knn
|>
error_val arrange(rmse_val) |>
slice(1)
# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 knn_4_5nn 34029.
Predictions
Since the knn
is the best model configuration that we’ve fit, we will use this model to generate predictions for the held out test set. Remember, we only make predictions in the test set once (important!).
One new step: when you are preparing to make predictions in your held out test set, your recipe should be prepped using the combination of your training and validation data in order to get the best estimates possible for the held out data. For the competition, we didn’t ask you to do this but we will show you what it looks like here.
Our best model is a KNN (i.e., from this/ the KNN script) so we worked out this process of re-fitting with train and validation below . If your best model is a GLM this would be applicable to that script in the assignment, and you could skip to Save & Knit at the end of this document.
Assign best model
Since our best model was a KNN model we change NA
to our best KNN model (fit_knn_4_5nn
). We also put one our names between the quotation marks.
<- fit_knn_4_5nn
best_model <- rec_4
best_rec <- "TA" last_name
Generate test predictions
Save out the best model’s predictions of raw sale_price
in the held-out test set.
# Make test set features with the best recipe
<- best_rec |>
rec_prep prep(training = data_trn)
<- rec_prep |>
feat_test bake(new_data = data_test)
# Generate predictions made by the best model
<- data_test |>
test_preds mutate(sale_price = predict(best_model, feat_test)$.pred) |>
select(pid, sale_price) #pid is the id variable to match predictions
# Save predictions as a csv file with your last name in the file name
|>
test_preds write_csv(here::here(path_data, str_c("test_preds_",last_name,".csv")))