Homework Unit 3: LM

Author

TA Key

Published

February 11, 2025

Introduction

This file serves as the answer key for the LM portion of the Unit_03 homework. Unit 3 Exploratory Introduction to Regression Models in the course web book contains all materials required for this assignment.

In this assignment, we demonstrate how to fit multiple configurations of LM and KNN regression models using data from the ames housing data set. By evaluating these various configurations in our validation set, we select the top performing model in our validation set out of all LM and KNN models we fit. We use this best model to generate predictions for our held out test set, which we only use ONCE for evaluation of our final best model.

Set up

Handle conflicts

options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()

Load required packages

library(tidyverse) 
library(tidymodels)

Source function scripts (John’s or your own)

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")

# define a data general cleaning/engineering function for all data
class_ames <- function(df){
  levels <- c("none", "po", "fa", "ta", "gd", "ex")
  df |>
    mutate(across(where(is.character), factor)) |> 
    mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |> 
    mutate(garage_qual = fct_relevel(garage_qual, levels)) |> 
    mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |> 
    mutate(neighborhood = factor(neighborhood,
                               levels = c("blmngtn", "blueste", "br_dale", "brk_side",
                                          "clear_cr", "collg_cr", "crawfor", "edwards",
                                          "gilbert", "greens", "grn_hill", "idotrr",
                                          "landmrk", "meadow_v", "mitchel", "n_ames",
                                          "no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
                                          "old_town", "sawyer", "sawyer_w", "somerst", 
                                          "stone_br", "swisu", "timber", "veenker")),
         ms_sub_class = factor(ms_sub_class,
                               levels = c("020", "030", "040", "045", "050", "060",
                                          "070", "075", "080", "085", "090", "120",
                                          "150", "160", "180", "190"))) |> 
    suppressWarnings()
}

Specify other global settings

theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)

Paths

path_data <- "application_assignments/unit_03"

Load data

Load the cleaned training, validation, and test data files

Use here::here() and relative path for your data. Make sure your iaml project is open

data_trn <- read_csv(here::here(path_data,"ames_train_cln.csv"), 
                     col_types = cols()) |> 
  glimpse()

Rows: 1,467
Columns: 19
$ sale_price      <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area     <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood    <chr> "n_ames", "n_ames", "gilbert", "stone_br", "stone_br",…
$ ms_sub_class    <chr> "020", "020", "060", "120", "120", "120", "060", "060"…
$ total_bsmt_sf   <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual       <chr> "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", "gd", …
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces      <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu    <chr> "none", "none", "ta", "none", "none", "ta", "ta", "gd"…
$ gr_liv_area     <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area        <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built      <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual    <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars     <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual     <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning       <chr> "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config      <chr> "inside", "corner", "inside", "inside", "inside", "ins…
$ bldg_type       <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "twhs_ext…

data_val <- read_csv(here::here(path_data,"ames_val_cln.csv"),
                     col_types = cols()) |> 
  glimpse()

Rows: 488
Columns: 19
$ sale_price      <dbl> 215000, 175900, 115000, 127500, 275000, 224000, 192000…
$ garage_area     <dbl> 528, 440, 0, 440, 730, 484, 430, 440, 400, 676, 264, 5…
$ neighborhood    <chr> "n_ames", "gilbert", "n_ames", "n_pk_vill", "nridg_ht"…
$ ms_sub_class    <chr> "020", "060", "020", "120", "020", "120", "120", "060"…
$ total_bsmt_sf   <dbl> 1080, 763, 864, 1069, 1698, 1358, 1256, 860, 384, 1218…
$ bsmt_qual       <chr> "ta", "gd", "ta", "gd", "ex", "gd", "gd", "gd", "gd", …
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 7, 7, 5, 4, 7, 6, 6, 8, 7, 4, 5, 6, 6, 4, 5, 6, 9, 5, …
$ fireplaces      <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fireplace_qu    <chr> "gd", "ta", "po", "fa", "gd", "gd", "ta", "ta", "ta", …
$ gr_liv_area     <dbl> 1656, 1655, 864, 1069, 1698, 1358, 1269, 1960, 1430, 1…
$ lot_area        <dbl> 31770, 10000, 10500, 4043, 11520, 6371, 3182, 7851, 77…
$ year_built      <dbl> 1960, 1993, 1971, 1977, 2005, 2009, 2004, 2002, 2000, …
$ overall_qual    <dbl> 6, 6, 4, 6, 9, 7, 8, 6, 7, 7, 6, 5, 5, 7, 6, 7, 6, 6, …
$ garage_cars     <dbl> 2, 2, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
$ garage_qual     <chr> "ta", "ta", "none", "ta", "ta", "ta", "ta", "ta", "ta"…
$ ms_zoning       <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config      <chr> "corner", "corner", "fr2", "inside", "inside", "inside…
$ bldg_type       <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "one_fam"…

data_test <- read_csv(here::here(path_data,"ames_test_cln.csv"),
                      col_types = cols()) |> 
  glimpse()

Rows: 975
Columns: 20
$ pid             <chr> "0526353030", "0527105030", "0527165230", "0527358200"…
$ garage_area     <dbl> 522, 470, 420, 528, 500, 304, 511, 264, 264, 751, 532,…
$ neighborhood    <chr> "n_ames", "gilbert", "gilbert", "nw_ames", "n_ames", "…
$ ms_sub_class    <chr> "020", "060", "020", "085", "020", "020", "120", "160"…
$ total_bsmt_sf   <dbl> 2110, 926, 1168, 1053, 1078, 1056, 1405, 483, 525, 159…
$ bsmt_qual       <chr> "ta", "ta", "gd", "gd", "ta", "ta", "gd", "ta", "ta", …
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 8, 7, 6, 6, 6, 6, 5, 5, 6, 10, 7, 11, 10, 6, 7, 7, 7, …
$ fireplaces      <dbl> 2, 1, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 1, 1, 0, 0, …
$ fireplace_qu    <chr> "ta", "gd", "none", "ta", "fa", "fa", "fa", "none", "n…
$ gr_liv_area     <dbl> 2110, 1604, 1187, 1173, 1078, 1056, 1337, 987, 1092, 2…
$ lot_area        <dbl> 11160, 9978, 7980, 10625, 12537, 8450, 5858, 1680, 168…
$ year_built      <dbl> 1968, 1998, 1992, 1974, 1971, 1968, 1999, 1971, 1971, …
$ overall_qual    <dbl> 7, 6, 6, 7, 5, 5, 7, 6, 6, 9, 7, 9, 9, 6, 7, 7, 7, 8, …
$ garage_cars     <dbl> 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 2, …
$ garage_qual     <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning       <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rh", "rm", "rm", …
$ lot_config      <chr> "corner", "inside", "inside", "inside", "cul_d_sac", "…
$ bldg_type       <chr> "one_fam", "one_fam", "one_fam", "one_fam", "one_fam",…
$ sale_price      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

Set appropriate variable classes:

Nominal and ordinal variables are set to factors
interval and ration variables are set to numeric

Remember ordinal variables should have the factor levels explicitly stated to retain the order (you will see why when we get to recipe 3!)

Also note you will get a warning stating there is an unknown level in the factor - this is because there are no basement rated as having “po” quality in data_trn or data_val and there is no garage rated as “po” in data_val. Since we investigated this warning we are suppressing warning messages using suppressWarnings() to prevent the warning from coming up again.

data_trn <- data_trn |> 
  class_ames()

data_val <- data_val |> 
  class_ames()

data_test <- data_test |> 
  class_ames()

Create tracking tibble

Create a tibble to track the validation errors across various model configurations.

error_val <- tibble(model = character(), rmse_val = numeric()) |> 
  glimpse()

Rows: 0
Columns: 2
$ model    <chr> 
$ rmse_val <dbl>

LM 1

First, we will try a simple LM using all numeric predictors in the set. We used the graphs generated in the modeling EDA script to decide which transformations to apply. We opt to use step_impute_knn() instead of step_impute_median() because it allowed us to get closer estimates of the missing data (although as mentioned in class, this can be more computationally expensive as models get more complex!), especially given that there are outliers on variables with missingness. This model includes the following homework requirements:

Transformed numeric predictor

Set up recipe

rec_1 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |> 
  step_log(gr_liv_area, lot_area, base = 10) |>  # for the transformation choice question -> lab shell
  step_YeoJohnson(garage_area, total_bsmt_sf, year_built) |> 
  step_rm(all_nominal_predictors())

Training feature matrix

prep recipe

rec_prep <- rec_1 |> 
  prep(training = data_trn)

bake recipe to get feature set. Set new_data = NULL to get training features

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit model

Linear model predicting raw sale_price from transformed numeric predictors.

fit_lm_1 <- 
  linear_reg() |> 
  set_engine("lm") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

Use the bake() to generate the feature matrix of the validation data that we will use to assess your model.

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess your model

Use the rmse_vec() function to calculate the validation error (RMSE) of the model. Add this value to the validation error tracking tibble.

error_val <- bind_rows(error_val, 
                       tibble(model = "lm_1", 
                              rmse_val = rmse_vec(feat_val$sale_price, 
                                                  predict(fit_lm_1, feat_val)$.pred)))

error_val

# A tibble: 1 × 2
  model rmse_val
  <chr>    <dbl>
1 lm_1    42129.

Visualize performance

Visualize the relationship between raw and predicted sale price in the validation set using the plot_truth().

plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_1, feat_val)$.pred)

LM 2

For our second model, we decided to add in all categorical variables. Rather than modifying individual categorical variables, we use step_other() to automatically collapse factor levels that occur infrequently (default threshold is .05). step_other() also handles new levels of categorical predictors by assigning them to “other” (since in this data, we had a couple factor levels in our validation sample that were not present in train). You can also handle new factor levels explicitly by using step_novel() or by addressing them when you correct for character class like we did at the top of the script before building our recipe. This model includes the following homework requirements:

Transformed numeric predictor
Categorical > 2 levels
Modified categorical

Set up recipe

rec_2 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  step_impute_median(garage_cars, garage_area, total_bsmt_sf) |> 
  step_log(gr_liv_area, lot_area, base = 10) |> 
  step_YeoJohnson(garage_area, total_bsmt_sf, year_built)|> 
  step_other(all_nominal_predictors(), -central_air) |> 
  step_dummy(all_nominal_predictors())

Training feature matrix

prep recipe

rec_prep <- rec_2 |> 
  prep(training = data_trn)

bake recipe

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit model

fit_lm_2 <- 
  linear_reg() |> 
  set_engine("lm") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess model in validation data

error_val <- bind_rows(error_val,
                       tibble(model = "lm_2", 
                              rmse_val = rmse_vec(feat_val$sale_price, 
                                                  predict(fit_lm_2, feat_val)$.pred)))
error_val

# A tibble: 2 × 2
  model rmse_val
  <chr>    <dbl>
1 lm_1    42129.
2 lm_2    38911.

Visualize performance

Visualize the relationship between raw and predicted sale price in your validation set using plot_truth().

plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_2, feat_val)$.pred)

LM 3

For a third LM, we build upon the previous model by adding interactions that we think make sense based on modeling EDA. For example, the interaction of overall_qual and neighborhood may help explain why some lower quality houses sell for higher prices when they are in more desirable neighborhoods. We also decide to convert all quality variables to numeric using step_mutate_at() (remember we already made sure these factor levels were ordered when we read the data in) so that it will take fewer variables to include these in interactions. This model includes the following homework requirements:

Transformed numeric predictor
Categorical > 2 levels
Modified categorical
Interactions

Set up recipe

rec_3 <- 
  recipe(sale_price ~ ., data = data_trn) |> 
  step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
  step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
  step_YeoJohnson(gr_liv_area, lot_area, garage_area, total_bsmt_sf, year_built) |> 
  step_other(all_nominal_predictors()) |> 
  step_dummy(all_nominal_predictors()) |>   # should dummy code the nominal before applying interactions
  step_interact(~ overall_qual:year_built:fireplace_qu) |>  # Here added a "3-way interaction" - only the 3-way term
  step_interact(~ total_bsmt_sf:starts_with("bsmt_qual")) |> 
  step_interact(~tot_rms_abv_grd:year_built) |> 
  step_interact(~overall_qual:starts_with("neighborhood")) |> 
  step_interact(~overall_qual:starts_with("ms_sub_class"))

# To check with the term info below
rec_3$term_info

# A tibble: 19 × 4
   variable        type      role      source  
   <chr>           <list>    <chr>     <chr>   
 1 garage_area     <chr [2]> predictor original
 2 neighborhood    <chr [3]> predictor original
 3 ms_sub_class    <chr [3]> predictor original
 4 total_bsmt_sf   <chr [2]> predictor original
 5 bsmt_qual       <chr [3]> predictor original
 6 central_air     <chr [3]> predictor original
 7 tot_rms_abv_grd <chr [2]> predictor original
 8 fireplaces      <chr [2]> predictor original
 9 fireplace_qu    <chr [3]> predictor original
10 gr_liv_area     <chr [2]> predictor original
11 lot_area        <chr [2]> predictor original
12 year_built      <chr [2]> predictor original
13 overall_qual    <chr [2]> predictor original
14 garage_cars     <chr [2]> predictor original
15 garage_qual     <chr [3]> predictor original
16 ms_zoning       <chr [3]> predictor original
17 lot_config      <chr [3]> predictor original
18 bldg_type       <chr [3]> predictor original
19 sale_price      <chr [2]> outcome   original

The step_interact here for the 3-way term only involves with the product of the three original predictors. It’s in effect manually adding an extra feature to the data, which is the 3-way interaction term of a canonical expression of linear regression. And thus, using this step is equal to adding the new feature by :

# ... |> 
#   step_mutate(new_3_way_interaction_term = overall_qual*year_built*fireplace_qu) |> 
#   ...

Training feature matrix

prep recipe

rec_prep <- rec_3 |> 
  prep(training = data_trn)

rec_prep$term_info$source

 [1] "original" "original" "original" "original" "original" "original"
 [7] "original" "original" "original" "original" "original" "original"
[13] "original" "derived"  "derived"  "derived"  "derived"  "derived" 
[19] "derived"  "derived"  "derived"  "derived"  "derived"  "derived" 
[25] "derived"  "derived"  "derived"  "derived"  "derived"  "derived" 
[31] "derived"  "derived"  "derived"  "derived"  "derived"  "derived" 
[37] "derived"  "derived"  "derived"  "derived"  "derived"  "derived" 
[43] "derived"  "derived"  "derived"  "derived"  "derived"  "derived"

bake recipe

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit model

fit_lm_3 <- 
  linear_reg() |> 
  set_engine("lm") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess model

Calculate the validation error (RMSE) of your model in validation data.

error_val <- bind_rows(error_val, 
                       tibble(model = "lm_3", 
                              rmse_val = rmse_vec(feat_val$sale_price, 
                                                  predict(fit_lm_3, feat_val)$.pred)))

error_val

# A tibble: 3 × 2
  model rmse_val
  <chr>    <dbl>
1 lm_1    42129.
2 lm_2    38911.
3 lm_3    37043.

Visualize performance

Visualize the relationship between raw and predicted sale price in your validation set using plot_truth().

plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_3, feat_val)$.pred)

4 Additional configurations

Here is where we did some deeper dives into the variables. First, we manually created a few variables by combining other related variables (e.g. combining all variables talking about the size of different parts of the house into one size variable, which resulted in a predictor that is highly correlated with sale_price). Then, we chose to manually assign levels to some categorical variables based on values that made sense to be grouped together (either because of low frequencies or relationship with sale_price). This allowed us to consider more complex interactions. For example, we noticed some homes with large basements, garages (etc.) were selling for less. We wanted to find a way to separate these homes from mansion-type houses that had large sizes and sold for much more. EDA revealed that homes with multiple nice fireplaces sell for a lot more, so we interacted all_fireplace with the all_size variable. We also collapsed the neighborhood variable down to categories based on the relationship with sale_price to reduce the number of levels in a way that is unifying. However, we only have 27/28 neighborhoods in our training data so we will have to impute this new variable in case a new level shows up! We use step_impute_mode(). This model contains the following homework requirements:

Transformed numeric predictor
Categorical > 2 levels
Modified categorical
Interactions
Bonus: Researcher created variables

Set up recipe

rec_4 <- recipe(sale_price ~., data = data_trn) |> 
  step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
  # apply functions to specific cols
  step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |> 
  # self-defined features
  step_mutate(all_size = gr_liv_area + total_bsmt_sf + garage_area,
              all_qual = overall_qual + fireplace_qu + bsmt_qual + garage_qual,
              all_garage = garage_qual*garage_area*garage_cars,
              all_fireplace = fireplaces*fireplace_qu,
              all_bsmt = bsmt_qual*total_bsmt_sf,
              avg_room = gr_liv_area/tot_rms_abv_grd) |> 
  step_YeoJohnson(gr_liv_area, lot_area,all_qual, avg_room, all_size, all_garage, garage_area,
                  total_bsmt_sf, year_built, all_bsmt) |> 
  step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
                                          "pud" = c("120", "150", "160", "180"),
                                          "multi" = c("080", "085", "090","190"),
                                          "one_st" = c("020", "030", "040", "045", "050"),
                                          "two_st" = c("060", "070", "075"))) |>
  step_mutate(neighborhood = fct_collapse(neighborhood,
                                          "low_price" = c("meadow_v", "idotrr", "br_dale"),
                                          "mod_lo_price" = c("old_town","blueste", 
                                                             "edwards", "brk_side", "sawyer", 
                                                             "landmrk","swisu","n_ames", 
                                                             "n_pk_vill"),
                                          "mod_price" = c("mitchel","blmngtn", "nw_ames", 
                                                          "gilbert", "sawyer_w",
                                                          "crawfor", "collg_cr", "greens"),
                                          "mod_hi_price" = c("somerst","clear_cr", "timber",
                                                             "veenker") ,
                                          "high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
  step_impute_mode(neighborhood) |> 
  # automatic collapsing by default, could set a threshold
  step_other(all_nominal_predictors()) |> 
  step_dummy(all_nominal_predictors()) |>
  step_interact(~ overall_qual:year_built) |>
  step_interact(~ avg_room:year_built) |> 
  step_interact(~ all_fireplace:all_size) |> 
  step_interact(~lot_area:starts_with("neighborhood")) |> 
  step_interact(~overall_qual:starts_with("neighborhood")) |> 
  step_interact(~overall_qual:starts_with("ms_sub_class")) |> 
  step_interact(~ all_size:starts_with("neighborhood")) |> 
  step_interact(~ all_size:starts_with("ms_sub_class"))

Training feature matrix

prep recipe

rec_prep <- rec_4 |> 
  prep(training = data_trn)

bake recipe

feat_trn <- rec_prep |> 
  bake(new_data = NULL)

Fit model

fit_lm_4 <- 
  linear_reg() |> 
  set_engine("lm") |> 
  fit(sale_price ~ ., data = feat_trn)

Validation feature matrix

feat_val <- rec_prep |> 
  bake(new_data = data_val)

Assess model

error_val<- bind_rows(error_val,  
                      tibble(model = "lm_4", 
                             rmse_val = rmse_vec(feat_val$sale_price, 
                                                 predict(fit_lm_4, feat_val)$.pred)))

error_val

# A tibble: 4 × 2
  model rmse_val
  <chr>    <dbl>
1 lm_1    42129.
2 lm_2    38911.
3 lm_3    37043.
4 lm_4    36281.

Visualize model

plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_4, feat_val)$.pred)

Lets now check what our best configuration for lm was

error_val |> 
  arrange(rmse_val) |> 
  slice(1)

# A tibble: 1 × 2
  model rmse_val
  <chr>    <dbl>
1 lm_4    36281.

We were able to improve on the LMs, but best model ended up being a KNN, so we won’t be generating predictions in this script (though you might have had a different outcome).