Homework Unit 3: KNN

Author

TA Key

Published

February 11, 2025

Introduction

This file serves as the answer key for the KNN portion of the Unit_03 homework. Unit 3 Exploratory Introduction to Regression Models in the course web book contains all materials required for this assignment.

In this assignment, we demonstrate how to fit multiple configurations of LM and KNN regression models using data from the Ames housing data set. By evaluating these configurations in our validation set, we select the top-performing model out of all the LM and KNN models we fit. We then use this best model to generate predictions for our held-out test set, which we use only ONCE to evaluate our final best model.


Set up

Handle conflicts

options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()

Load required packages

library(tidyverse) 
library(tidymodels)

Source function scripts (John’s or your own)

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
# define a general data cleaning/engineering function to apply to all data sets
class_ames <- function(df){
  # quality levels in increasing order
  levels <- c("none", "po", "fa", "ta", "gd", "ex")
  df |>
    mutate(across(where(is.character), factor)) |> 
    mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |> 
    mutate(garage_qual = fct_relevel(garage_qual, levels)) |> 
    mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |> 
    mutate(neighborhood = factor(neighborhood,
                               levels = c("blmngtn", "blueste", "br_dale", "brk_side",
                                          "clear_cr", "collg_cr", "crawfor", "edwards",
                                          "gilbert", "greens", "grn_hill", "idotrr",
                                          "landmrk", "meadow_v", "mitchel", "n_ames",
                                          "no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
                                          "old_town", "sawyer", "sawyer_w", "somerst", 
                                          "stone_br", "swisu", "timber", "veenker")),
         ms_sub_class = factor(ms_sub_class,
                               levels = c("020", "030", "040", "045", "050", "060",
                                          "070", "075", "080", "085", "090", "120",
                                          "150", "160", "180", "190"))) |> 
    suppressWarnings()
}

Specify other global settings

theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max = Inf)

Paths

path_data <- "application_assignments/unit_03"

Load data

Load the cleaned training, validation, and test data files

Use here::here() and a relative path for your data. Make sure your iaml project is open.

data_trn <- read_csv(here::here(path_data,"ames_train_cln.csv"), 
                     col_types = cols()) |> 
  glimpse()
Rows: 1,467
Columns: 19
$ sale_price      <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area     <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood    <chr> "n_ames", "n_ames", "gilbert", "stone_br", "stone_br",…
$ ms_sub_class    <chr> "020", "020", "060", "120", "120", "120", "060", "060"…
$ total_bsmt_sf   <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual       <chr> "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", "gd", …
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces      <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu    <chr> "none", "none", "ta", "none", "none", "ta", "ta", "gd"…
$ gr_liv_area     <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area        <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built      <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual    <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars     <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual     <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning       <chr> "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config      <chr> "inside", "corner", "inside", "inside", "inside", "ins…
$ bldg_type       <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "twhs_ext…
data_val <- read_csv(here::here(path_data,"ames_val_cln.csv"),
                     col_types = cols()) |> 
  glimpse()
Rows: 488
Columns: 19
$ sale_price      <dbl> 215000, 175900, 115000, 127500, 275000, 224000, 192000…
$ garage_area     <dbl> 528, 440, 0, 440, 730, 484, 430, 440, 400, 676, 264, 5…
$ neighborhood    <chr> "n_ames", "gilbert", "n_ames", "n_pk_vill", "nridg_ht"…
$ ms_sub_class    <chr> "020", "060", "020", "120", "020", "120", "120", "060"…
$ total_bsmt_sf   <dbl> 1080, 763, 864, 1069, 1698, 1358, 1256, 860, 384, 1218…
$ bsmt_qual       <chr> "ta", "gd", "ta", "gd", "ex", "gd", "gd", "gd", "gd", …
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 7, 7, 5, 4, 7, 6, 6, 8, 7, 4, 5, 6, 6, 4, 5, 6, 9, 5, …
$ fireplaces      <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fireplace_qu    <chr> "gd", "ta", "po", "fa", "gd", "gd", "ta", "ta", "ta", …
$ gr_liv_area     <dbl> 1656, 1655, 864, 1069, 1698, 1358, 1269, 1960, 1430, 1…
$ lot_area        <dbl> 31770, 10000, 10500, 4043, 11520, 6371, 3182, 7851, 77…
$ year_built      <dbl> 1960, 1993, 1971, 1977, 2005, 2009, 2004, 2002, 2000, …
$ overall_qual    <dbl> 6, 6, 4, 6, 9, 7, 8, 6, 7, 7, 6, 5, 5, 7, 6, 7, 6, 6, …
$ garage_cars     <dbl> 2, 2, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
$ garage_qual     <chr> "ta", "ta", "none", "ta", "ta", "ta", "ta", "ta", "ta"…
$ ms_zoning       <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config      <chr> "corner", "corner", "fr2", "inside", "inside", "inside…
$ bldg_type       <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "one_fam"…
data_test <- read_csv(here::here(path_data,"ames_test_cln.csv"),
                      col_types = cols()) |> 
  glimpse()
Rows: 975
Columns: 20
$ pid             <chr> "0526353030", "0527105030", "0527165230", "0527358200"…
$ garage_area     <dbl> 522, 470, 420, 528, 500, 304, 511, 264, 264, 751, 532,…
$ neighborhood    <chr> "n_ames", "gilbert", "gilbert", "nw_ames", "n_ames", "…
$ ms_sub_class    <chr> "020", "060", "020", "085", "020", "020", "120", "160"…
$ total_bsmt_sf   <dbl> 2110, 926, 1168, 1053, 1078, 1056, 1405, 483, 525, 159…
$ bsmt_qual       <chr> "ta", "ta", "gd", "gd", "ta", "ta", "gd", "ta", "ta", …
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 8, 7, 6, 6, 6, 6, 5, 5, 6, 10, 7, 11, 10, 6, 7, 7, 7, …
$ fireplaces      <dbl> 2, 1, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 1, 1, 0, 0, …
$ fireplace_qu    <chr> "ta", "gd", "none", "ta", "fa", "fa", "fa", "none", "n…
$ gr_liv_area     <dbl> 2110, 1604, 1187, 1173, 1078, 1056, 1337, 987, 1092, 2…
$ lot_area        <dbl> 11160, 9978, 7980, 10625, 12537, 8450, 5858, 1680, 168…
$ year_built      <dbl> 1968, 1998, 1992, 1974, 1971, 1968, 1999, 1971, 1971, …
$ overall_qual    <dbl> 7, 6, 6, 7, 5, 5, 7, 6, 6, 9, 7, 9, 9, 6, 7, 7, 7, 8, …
$ garage_cars     <dbl> 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 2, …
$ garage_qual     <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning       <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rh", "rm", "rm", …
$ lot_config      <chr> "corner", "inside", "inside", "inside", "cul_d_sac", "…
$ bldg_type       <chr> "one_fam", "one_fam", "one_fam", "one_fam", "one_fam",…
$ sale_price      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Set appropriate variable classes:

  • Nominal and ordinal variables are set to factors
  • Interval and ratio variables are set to numeric
# general engineering for the full data
data_trn <- data_trn |> 
  class_ames()

data_val <- data_val |> 
  class_ames()

data_test <- data_test |> 
  class_ames()

Create tracking tibble

Create a tibble to track the validation errors across various model configurations.

error_val <- tibble(model = character(), rmse_val = numeric()) |> 
  glimpse()
Rows: 0
Columns: 2
$ model    <chr> 
$ rmse_val <dbl> 

KNN 1

First we will try a fully numeric model, selecting down to just the most highly correlated predictors we found in modeling EDA. Remember, you always need to range correct numeric predictors when you have more than one numeric predictor in a KNN model (see the sketch after this list). This model includes the following homework requirements:

  • 2+ numeric predictors
  • Varied k
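Why this matters: KNN predicts from Euclidean distance, so a predictor on a large scale (square feet of living area) would swamp one on a small scale (a count of garage cars) if left uncorrected. A toy sketch (made-up numbers, not the Ames data):

# two toy houses: raw distance is driven almost entirely by gr_liv_area
x <- tibble(gr_liv_area = c(900, 1500), garage_cars = c(1, 3))
sqrt(sum((x[1, ] - x[2, ])^2))   # ~600; garage_cars barely registers

# after range correction each predictor lives on [0, 1] and contributes equally
x_rng <- x |> 
  mutate(across(everything(), \(v) (v - min(v)) / (max(v) - min(v))))
sqrt(sum((x_rng[1, ] - x_rng[2, ])^2))   # sqrt(2)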

Set up recipe

rec_1 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |> 
  step_range(gr_liv_area, garage_area, garage_cars, overall_qual, total_bsmt_sf, 
             year_built, tot_rms_abv_grd) 

Training feature matrix

prep recipe

rec_prep <- rec_1 |> 
  prep(training = data_trn)

bake recipe to get feature set. Use new_data = NULL to get previously saved training features

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit model

fit_knn_1_5nn <- 
  nearest_neighbor(neighbors = 5) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual +  total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn) 

Validation feature matrix

Use bake() to generate the feature matrix of the validation data, which we will use to assess the model.

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess model

Use rmse_vec() to calculate the validation error (RMSE) of the model.

error_val <- bind_rows(error_val, 
                        tibble(model = "knn_1_5nn", 
                               rmse_val = rmse_vec(feat_val$sale_price, 
                                                   predict(fit_knn_1_5nn, feat_val)$.pred)))

error_val
# A tibble: 1 × 2
  model     rmse_val
  <chr>        <dbl>
1 knn_1_5nn   37195.
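For reference, rmse_vec() computes sqrt(mean((truth - estimate)^2)), so calculating it by hand should reproduce the value above:

# hand-computed validation RMSE; should match rmse_vec()
preds_val <- predict(fit_knn_1_5nn, feat_val)$.pred
sqrt(mean((feat_val$sale_price - preds_val)^2))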

Let's also try k = 10:

fit_knn_1_10nn <- 
  nearest_neighbor(neighbors = 10) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ gr_liv_area + garage_area + garage_cars + overall_qual +  total_bsmt_sf + year_built + tot_rms_abv_grd, data = feat_trn)

Assess validation error

error_val <- bind_rows(error_val, 
                        tibble(model = "knn_1_10nn", 
                               rmse_val = rmse_vec(feat_val$sale_price, 
                                                   predict(fit_knn_1_10nn, feat_val)$.pred)))

error_val
# A tibble: 2 × 2
  model      rmse_val
  <chr>         <dbl>
1 knn_1_5nn    37195.
2 knn_1_10nn   36114.

Visualize performance

Visualize the relationship between observed (raw) and predicted sale price in the validation set using the plot_truth() function.

knn_1_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_5nn, feat_val)$.pred)

knn_1_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_1_10nn, feat_val)$.pred)

cowplot::plot_grid(knn_1_5nn, knn_1_10nn, 
          labels = list("k = 5", "k = 10"))

KNN 2

Next, we build on this model by adding categorical variables that looked like they might have promising relationships with sale_price during modeling EDA. We collapse the neighborhood variable down to categories based on its relationship with sale_price, reducing the number of levels in a principled way. Note that we only have 27/28 neighborhoods in our training data, so we will have to impute this new variable in case a new level shows up! We use step_impute_mode(). We do not need to range correct the dummy-coded variables, but we do need to range correct the quality factors that we converted to numeric variables with step_mutate_at() (see the sketch after this list). This model includes the following homework requirements:

  • 2+ numeric predictors
  • Categorical > 2 levels
  • Varied k
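Here is a quick sketch (toy values, not from the data) of why as.numeric() produces sensible scores for the quality factors: class_ames() releveled them as none < po < fa < ta < gd < ex, so the underlying integer codes preserve that ordering:

qual <- factor(c("ta", "none", "ex"),
               levels = c("none", "po", "fa", "ta", "gd", "ex"))
as.numeric(qual)  # 4 1 6 -- ordered scores; step_range() then maps them to [0, 1]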

2.1 Set up recipe

rec_2 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |> 
  step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
  step_range(all_numeric_predictors()) |> 
  step_mutate(neighborhood = fct_collapse(neighborhood,
                                          "low_price" = c("meadow_v", "idotrr", "br_dale"),
                                          "mod_lo_price" = c("old_town","blueste", 
                                                             "edwards", "brk_side", "sawyer", 
                                                             "landmrk","swisu","n_ames", 
                                                             "n_pk_vill"),
                                          "mod_price" = c("mitchel","blmngtn", "nw_ames", 
                                                          "gilbert", "sawyer_w",
                                                          "crawfor", "collg_cr", "greens"),
                                          "mod_hi_price" = c("somerst","clear_cr", "timber",
                                                             "veenker") ,
                                          "high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
  step_impute_mode(neighborhood) |> 
  step_dummy(neighborhood) |>
  step_rm(ms_sub_class, central_air, fireplaces, lot_area, ms_zoning, lot_config, bldg_type) 

Training feature matrix

prep recipe

rec_prep <- rec_2 |> 
  prep(training = data_trn)

bake recipe to get feature set

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit your models

fit_knn_2_3nn <- 
  nearest_neighbor(neighbors = 3) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

fit_knn_2_5nn <- 
  nearest_neighbor(neighbors = 5) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

fit_knn_2_10nn <- 
  nearest_neighbor(neighbors = 10) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess your models

error_val <- bind_rows(error_val, 
                        tibble(model = c("knn_2_3nn","knn_2_5nn","knn_2_10nn"), 
                               rmse_val = c(rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_2_3nn, feat_val)$.pred),
                                            rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_2_5nn, feat_val)$.pred), 
                                            rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_2_10nn, feat_val)$.pred))))

error_val
# A tibble: 5 × 2
  model      rmse_val
  <chr>         <dbl>
1 knn_1_5nn    37195.
2 knn_1_10nn   36114.
3 knn_2_3nn    35891.
4 knn_2_5nn    34864.
5 knn_2_10nn   34795.

Visualize performance

knn_2_3nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_3nn, feat_val)$.pred)

knn_2_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_5nn, feat_val)$.pred)

knn_2_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_2_10nn, feat_val)$.pred)

cowplot::plot_grid(knn_2_3nn, knn_2_5nn, knn_2_10nn, 
         labels = list("k = 3", "k = 5", "k = 10"),
         nrow = 1)

KNN 3

In this KNN, we add more numeric and categorical variables to see what effect an increase in predictors has. Some categorical variables, like neighborhood and ms_sub_class, we collapse manually based on frequencies observed during EDA. Others are handled more simply with step_other(), which automatically bins low-frequency categories into a single "other" level (see the sketch after this list). This model includes the following homework requirements:

  • 2+ numeric predictors
  • Categorical > 2 levels
  • Modified categorical
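A toy illustration of step_other() (assuming its default threshold of 5%): a level that occurs in under 5% of rows gets pooled into an "other" level.

toy <- tibble(x = factor(c(rep("a", 50), rep("b", 46), rep("c", 4))),
              y = rnorm(100))
recipe(y ~ x, data = toy) |> 
  step_other(x) |>   # "c" appears in only 4% of rows, so it is pooled
  prep() |> 
  bake(new_data = NULL) |> 
  count(x)   # remaining levels: a, b, other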

Set up recipe

rec_3 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  step_impute_knn(all_numeric_predictors()) |> 
  step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
  step_range(all_numeric_predictors()) |> 
  step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
                                          "pud" = c("120", "150", "160", "180"),
                                          "multi" = c("080", "085", "090","190"),
                                          "one_st" = c("020", "030", "040", "045", "050"),
                                          "two_st" = c("060", "070", "075"))) |>
  step_mutate(neighborhood = fct_collapse(neighborhood,
                                          "low_price" = c("meadow_v", "idotrr", "br_dale"),
                                          "mod_lo_price" = c("old_town","blueste", 
                                                             "edwards", "brk_side", "sawyer", 
                                                             "landmrk","swisu","n_ames", 
                                                             "n_pk_vill"),
                                          "mod_price" = c("mitchel","blmngtn", "nw_ames", 
                                                          "gilbert", "sawyer_w",
                                                          "crawfor", "collg_cr", "greens"),
                                          "mod_hi_price" = c("somerst","clear_cr", "timber",
                                                             "veenker") ,
                                          "high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
  step_impute_mode(neighborhood) |> 
  step_other(all_nominal_predictors()) |> 
  step_rm(central_air) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_nzv(all_predictors())

Training feature matrix

prep recipe

rec_prep <- rec_3 |> 
  prep(training = data_trn)

bake recipe

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit your models

fit_knn_3_3nn <- 
  nearest_neighbor(neighbors = 3) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

fit_knn_3_5nn <- 
  nearest_neighbor(neighbors = 5) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

fit_knn_3_10nn <- 
  nearest_neighbor(neighbors = 10) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess your models

error_val <- bind_rows(error_val, 
                        tibble(model = c("knn_3_3nn", "knn_3_5nn","knn_3_10nn"),
                               rmse_val = c(rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_3_3nn, feat_val)$.pred), 
                                            rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_3_5nn, feat_val)$.pred), 
                                            rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_3_10nn, feat_val)$.pred))))

error_val
# A tibble: 8 × 2
  model      rmse_val
  <chr>         <dbl>
1 knn_1_5nn    37195.
2 knn_1_10nn   36114.
3 knn_2_3nn    35891.
4 knn_2_5nn    34864.
5 knn_2_10nn   34795.
6 knn_3_3nn    39689.
7 knn_3_5nn    38615.
8 knn_3_10nn   38787.

Visualize performance

knn_3_3nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_3nn, feat_val)$.pred)

knn_3_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_5nn, feat_val)$.pred)

knn_3_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_3_10nn, feat_val)$.pred)

cowplot::plot_grid(knn_3_3nn, knn_3_5nn, knn_3_10nn, 
         labels = list("k = 3", "k = 5", "k = 10"),
         nrow = 1)

Additional configurations

In this next configuration, we add more complexity to the model. We manually created a few variables by combining related variables (e.g., combining all variables that measure the size of different parts of the house into one size variable, resulting in a predictor that is highly correlated with sale_price). We also created multiplicative (interaction-style) features from related numeric variables, since KNN can handle high complexity. Because we add many terms, we again include step_nzv() at the end of the recipe to remove any features that are highly sparse or unbalanced (i.e., that will not be that useful for prediction; see the sketch after this list). This ended up improving things quite a bit!

This model includes the following homework requirements:

  • 2+ numeric predictors
  • Categorical > 2 levels
  • Modified categorical
  • Multiple k
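A small sketch of what step_nzv() does (toy data, default settings assumed): a nearly constant feature is dropped, while a balanced one is kept.

toy <- tibble(y = rnorm(100),
              common = rep(c(0, 1), 50),   # balanced dummy -> kept
              rare = c(1, rep(0, 99)))     # 99% zeros -> near-zero variance, dropped
recipe(y ~ ., data = toy) |> 
  step_nzv(all_predictors()) |> 
  prep() |> 
  bake(new_data = NULL) |> 
  names()   # returns "common" and "y"; "rare" has been removed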

Create recipe

rec_4 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  # a bit meta (KNN imputation inside a KNN model), but it should impute reasonable values for missing data
  step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
  # apply a function to the selected columns: convert ordered quality factors to numeric scores
  step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
  # self-defined features
  step_mutate(all_size = gr_liv_area + total_bsmt_sf + garage_area,
              all_qual = overall_qual + fireplace_qu + bsmt_qual + garage_qual,
              all_garage = garage_qual*garage_area*garage_cars,
              all_fireplace = fireplaces*fireplace_qu,
              all_bsmt = bsmt_qual*total_bsmt_sf,
              avg_room = gr_liv_area/tot_rms_abv_grd) |> 
  # normalization
  step_range(all_numeric_predictors()) |> 
  step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
                                          "pud" = c("120", "150", "160", "180"),
                                          "multi" = c("080", "085", "090","190"),
                                          "one_st" = c("020", "030", "040", "045", "050"),
                                          "two_st" = c("060", "070", "075"))) |>
  step_mutate(neighborhood = fct_collapse(neighborhood,
                                          "low_price" = c("meadow_v", "idotrr", "br_dale"),
                                          "mod_lo_price" = c("old_town","blueste", 
                                                             "edwards", "brk_side", "sawyer", 
                                                             "landmrk","swisu","n_ames", 
                                                             "n_pk_vill"),
                                          "mod_price" = c("mitchel","blmngtn", "nw_ames", 
                                                          "gilbert", "sawyer_w",
                                                          "crawfor", "collg_cr", "greens"),
                                          "mod_hi_price" = c("somerst","clear_cr", "timber",
                                                             "veenker") ,
                                          "high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
  step_impute_mode(neighborhood) |> 
  step_other(all_nominal_predictors(), threshold = .1) |> 
  # remove features not being used
  step_rm(ms_zoning, bldg_type,lot_config) |> 
  step_dummy(all_nominal_predictors()) |> 
  # remove near-zero variance features (little variation across observations)
  step_nzv(all_predictors())

Training feature matrix

prep recipe

rec_prep <- rec_4 |> 
  prep(training = data_trn)

bake recipe

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit models

fit_knn_4_3nn <- 
  nearest_neighbor(neighbors = 3) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

fit_knn_4_5nn <- 
  nearest_neighbor(neighbors = 5) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

fit_knn_4_10nn <- 
  nearest_neighbor(neighbors = 10) |>   
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess models

error_val <- bind_rows(error_val, 
                        tibble(model = c("knn_4_3nn", "knn_4_5nn","knn_4_10nn"),
                               rmse_val = c(rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_4_3nn, feat_val)$.pred), 
                                            rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_4_5nn, feat_val)$.pred), 
                                            rmse_vec(feat_val$sale_price, 
                                                     predict(fit_knn_4_10nn, feat_val)$.pred))))
 
error_val 
# A tibble: 11 × 2
   model      rmse_val
   <chr>         <dbl>
 1 knn_1_5nn    37195.
 2 knn_1_10nn   36114.
 3 knn_2_3nn    35891.
 4 knn_2_5nn    34864.
 5 knn_2_10nn   34795.
 6 knn_3_3nn    39689.
 7 knn_3_5nn    38615.
 8 knn_3_10nn   38787.
 9 knn_4_3nn    33547.
10 knn_4_5nn    33882.
11 knn_4_10nn   34825.

Visualize performance

knn_4_3nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_3nn, feat_val)$.pred)

knn_4_5nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_5nn, feat_val)$.pred)

knn_4_10nn <- plot_truth(truth = feat_val$sale_price, estimate = predict(fit_knn_4_10nn, feat_val)$.pred)

cowplot::plot_grid(knn_4_3nn, knn_4_5nn, knn_4_10nn, 
         labels = list("k = 3", "k = 5", "k = 10"),
         nrow = 1)

Let's now check which KNN configuration performed best:

error_val |> 
  arrange(rmse_val) |> 
  slice(1)
# A tibble: 1 × 2
  model     rmse_val
  <chr>        <dbl>
1 knn_4_3nn   33547.

Predictions

Since knn_4_3nn is the best model configuration we've fit, we will use it to generate predictions for the held-out test set. Remember, we only make predictions in the test set once (important!).

One new step: when you are preparing to make predictions in your held-out test set, your recipe should be prepped using the combination of your training and validation data in order to get the best estimates possible for the held-out data. For the competition, we didn't ask you to do this, but we will show you what it looks like here.

Our best model is a KNN (i.e., it comes from this KNN script), so we work through the process of re-fitting with train and validation below. If your best model were an LM, this process would apply in that script of the assignment instead, and you could skip to Save & Knit at the end of this document.

  1. Prep best recipe now with “held-in” train and val data
data_in <- data_trn |> 
  bind_rows(data_val) |> 
  glimpse()
Rows: 1,955
Columns: 19
$ sale_price      <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area     <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood    <fct> n_ames, n_ames, gilbert, stone_br, stone_br, stone_br,…
$ ms_sub_class    <fct> 020, 020, 060, 120, 120, 120, 060, 060, 020, 120, 060,…
$ total_bsmt_sf   <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual       <fct> ta, ta, gd, gd, gd, gd, ta, gd, gd, gd, gd, gd, ex, ta…
$ central_air     <fct> y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, …
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces      <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu    <fct> none, none, ta, none, none, ta, ta, gd, po, none, gd, …
$ gr_liv_area     <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area        <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built      <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual    <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars     <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual     <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ ms_zoning       <fct> rh, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl…
$ lot_config      <fct> inside, corner, inside, inside, inside, inside, inside…
$ bldg_type       <fct> one_fam, one_fam, one_fam, twhs_ext, twhs_ext, twhs_ex…
rec_prep_best <- rec_4 |>   # rec_4 is our recipe for our BEST model configuration
  prep(training = data_in)  # prepping this recipe with ALL held in data (trn and val)
  2. Bake features using the held-in data from train and val
feat_in <- rec_prep_best |> 
  bake(new_data = data_in) |> 
  glimpse()
Rows: 1,955
Columns: 26
$ garage_area               <dbl> 0.4905914, 0.2096774, 0.3239247, 0.3911290, …
$ total_bsmt_sf             <dbl> 0.14435352, 0.21751227, 0.15188216, 0.218985…
$ bsmt_qual                 <dbl> 0.50, 0.50, 0.75, 0.75, 0.75, 0.75, 0.50, 0.…
$ tot_rms_abv_grd           <dbl> 0.18181818, 0.27272727, 0.27272727, 0.272727…
$ fireplaces                <dbl> 0.0000000, 0.0000000, 0.3333333, 0.0000000, …
$ fireplace_qu              <dbl> 0.0, 0.0, 0.6, 0.0, 0.0, 0.6, 0.6, 0.8, 0.2,…
$ gr_liv_area               <dbl> 0.08800922, 0.17121445, 0.22886241, 0.172943…
$ lot_area                  <dbl> 0.0474624478, 0.0598356170, 0.0577913542, 0.…
$ year_built                <dbl> 0.6370370, 0.6148148, 0.9037037, 0.9333333, …
$ overall_qual              <dbl> 0.4444444, 0.5555556, 0.4444444, 0.7777778, …
$ garage_cars               <dbl> 0.25, 0.25, 0.50, 0.50, 0.50, 0.50, 0.50, 0.…
$ garage_qual               <dbl> 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6,…
$ sale_price                <dbl> 105000, 172000, 189900, 213500, 191500, 2365…
$ all_size                  <dbl> 0.13246542, 0.17005696, 0.17567128, 0.193490…
$ all_qual                  <dbl> 0.40, 0.45, 0.60, 0.60, 0.60, 0.75, 0.65, 0.…
$ all_garage                <dbl> 0.13888889, 0.05936073, 0.18340944, 0.221461…
$ all_fireplace             <dbl> 0.0000000, 0.0000000, 0.2222222, 0.0000000, …
$ all_bsmt                  <dbl> 0.08661211, 0.13050736, 0.12150573, 0.175188…
$ avg_room                  <dbl> 0.1345455, 0.2306818, 0.3443182, 0.2340909, …
$ neighborhood_mod_lo_price <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ neighborhood_mod_hi_price <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ neighborhood_other        <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,…
$ ms_sub_class_two_st       <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,…
$ ms_sub_class_multi        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ ms_sub_class_pud          <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
$ central_air_other         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
  3. Train the best model configuration using these held-in features
fit_knn_best <- 
  nearest_neighbor(neighbors = 3) |>   # k = 3: our best validation configuration (knn_4_3nn)
  set_engine("kknn") |>   
  set_mode("regression") |> 
  fit(sale_price ~ ., data = feat_in)
  4. Now generate features for the held-out (test) data using the same prepped recipe
feat_test <- rec_prep_best |> 
  bake(new_data = data_test)
  5. Generate predictions made by the best model
preds_test <- data_test |> 
  select(pid) |>   # keep only pid
  mutate(sale_price = predict(fit_knn_best, feat_test)$.pred) 

# IF you did any transformation of sale price (e.g., log), you will need a final 
# mutate to untransform sale_price back to raw dollars in the pipe above
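For illustration only (hypothetical here, since we did not transform sale_price), undoing a log transform would look like:

# hypothetical: only needed if the model had been fit to log(sale_price)
# preds_test <- preds_test |> 
#   mutate(sale_price = exp(sale_price))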

Save out the best model’s predictions of raw sale_price for the held-out test set. Name the file with your name.

preds_test |> 
  write_csv(here::here(path_data, "test_preds_TA.csv"))