library(tidyverse)
library(tidymodels)
options(conflicts.policy = "depends.ok")Homework Unit 3: LM
Introduction
This file serves as the answer key for the LM portion of the Unit_03 homework. Unit 3 Exploratory Introduction to Regression Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to fit multiple configurations of LM and KNN regression models using data from the ames housing data set. By evaluating these various configurations in our validation set, we select the top performing model in our validation set out of all LM and KNN models we fit. We use this best model to generate predictions for our held out test set, which we only use ONCE for evaluation of our final best model.
Set up
Load 2 Very Important Packages (and set conflicts)
Source function scripts (John’s or your own)
source("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")# define a data general cleaning/engineering function for all data
class_ames <- function(df){
levels <- c("none", "po", "fa", "ta", "gd", "ex")
df |>
mutate(across(where(is.character), factor)) |>
mutate(bsmt_qual = fct_relevel(bsmt_qual, levels)) |>
mutate(garage_qual = fct_relevel(garage_qual, levels)) |>
mutate(fireplace_qu = fct_relevel(fireplace_qu, levels)) |>
mutate(neighborhood = factor(neighborhood,
levels = c("blmngtn", "blueste", "br_dale", "brk_side",
"clear_cr", "collg_cr", "crawfor", "edwards",
"gilbert", "greens", "grn_hill", "idotrr",
"landmrk", "meadow_v", "mitchel", "n_ames",
"no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
"old_town", "sawyer", "sawyer_w", "somerst",
"stone_br", "swisu", "timber", "veenker")),
ms_sub_class = factor(ms_sub_class,
levels = c("020", "030", "040", "045", "050", "060",
"070", "075", "080", "085", "090", "120",
"150", "160", "180", "190"))) |>
suppressWarnings()
}Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)Paths
path_data <- "application_assignments/unit_03"Load data
Load the cleaned training, validation, and test data files
Use here::here() and relative path for your data. Make sure your iaml project is open
data_trn <- read_csv(here::here(path_data,"ames_train_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 1,466
Columns: 19
$ sale_price <dbl> 105000, 172000, 189900, 213500, 191500, 236500, 189000…
$ garage_area <dbl> 730, 312, 482, 582, 506, 608, 442, 393, 506, 528, 841,…
$ neighborhood <chr> "n_ames", "n_ames", "gilbert", "stone_br", "stone_br",…
$ ms_sub_class <dbl> 20, 20, 60, 120, 120, 120, 60, 60, 20, 120, 60, 50, 20…
$ total_bsmt_sf <dbl> 882, 1329, 928, 1338, 1280, 1595, 994, 789, 1300, 1488…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 6, 6, 6, 5, 5, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7, 7, 5,…
$ fireplaces <dbl> 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, …
$ fireplace_qu <chr> "none", "none", "ta", "none", "none", "ta", "ta", "gd"…
$ gr_liv_area <dbl> 896, 1329, 1629, 1338, 1280, 1616, 1804, 1465, 1341, 1…
$ lot_area <dbl> 11622, 14267, 13830, 4920, 5005, 5389, 7500, 8402, 101…
$ year_built <dbl> 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1998, 1990, …
$ overall_qual <dbl> 5, 6, 5, 8, 8, 8, 7, 6, 7, 8, 8, 8, 9, 4, 6, 6, 7, 6, …
$ garage_cars <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "inside", "corner", "inside", "inside", "inside", "ins…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "twhs_ext…
data_val <- read_csv(here::here(path_data,"ames_val_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 486
Columns: 19
$ sale_price <dbl> 215000, 175900, 115000, 127500, 275000, 224000, 192000…
$ garage_area <dbl> 528, 440, 0, 440, 730, 484, 430, 440, 400, 676, 264, 5…
$ neighborhood <chr> "n_ames", "gilbert", "n_ames", "n_pk_vill", "nridg_ht"…
$ ms_sub_class <dbl> 20, 60, 20, 120, 20, 120, 120, 60, 80, 20, 120, 85, 20…
$ total_bsmt_sf <dbl> 1080, 763, 864, 1069, 1698, 1358, 1256, 860, 384, 1218…
$ bsmt_qual <chr> "ta", "gd", "ta", "gd", "ex", "gd", "gd", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 7, 7, 5, 4, 7, 6, 6, 8, 7, 4, 5, 6, 6, 4, 5, 6, 9, 5, …
$ fireplaces <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fireplace_qu <chr> "gd", "ta", "po", "fa", "gd", "gd", "ta", "ta", "ta", …
$ gr_liv_area <dbl> 1656, 1655, 864, 1069, 1698, 1358, 1269, 1960, 1430, 1…
$ lot_area <dbl> 31770, 10000, 10500, 4043, 11520, 6371, 3182, 7851, 77…
$ year_built <dbl> 1960, 1993, 1971, 1977, 2005, 2009, 2004, 2002, 2000, …
$ overall_qual <dbl> 6, 6, 4, 6, 9, 7, 8, 6, 7, 7, 6, 5, 5, 7, 6, 7, 6, 6, …
$ garage_cars <dbl> 2, 2, 0, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "none", "ta", "ta", "ta", "ta", "ta", "ta"…
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_config <chr> "corner", "corner", "fr2", "inside", "inside", "inside…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "twhs_ext", "one_fam"…
data_test <- read_csv(here::here(path_data,"ames_test_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 973
Columns: 20
$ pid <dbl> 526353030, 527105030, 527165230, 527358200, 527402250,…
$ garage_area <dbl> 522, 470, 420, 528, 500, 304, 511, 264, 264, 751, 532,…
$ neighborhood <chr> "n_ames", "gilbert", "gilbert", "nw_ames", "n_ames", "…
$ ms_sub_class <dbl> 20, 60, 20, 85, 20, 20, 120, 160, 160, 60, 20, 20, 20,…
$ total_bsmt_sf <dbl> 2110, 926, 1168, 1053, 1078, 1056, 1405, 483, 525, 159…
$ bsmt_qual <chr> "ta", "ta", "gd", "gd", "ta", "ta", "gd", "ta", "ta", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 8, 7, 6, 6, 6, 6, 5, 5, 6, 10, 7, 11, 10, 6, 7, 7, 7, …
$ fireplaces <dbl> 2, 1, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 0, 1, 1, 0, 0, …
$ fireplace_qu <chr> "ta", "gd", "none", "ta", "fa", "fa", "fa", "none", "n…
$ gr_liv_area <dbl> 2110, 1604, 1187, 1173, 1078, 1056, 1337, 987, 1092, 2…
$ lot_area <dbl> 11160, 9978, 7980, 10625, 12537, 8450, 5858, 1680, 168…
$ year_built <dbl> 1968, 1998, 1992, 1974, 1971, 1968, 1999, 1971, 1971, …
$ overall_qual <dbl> 7, 6, 6, 7, 5, 5, 7, 6, 6, 9, 7, 9, 9, 6, 7, 7, 7, 8, …
$ garage_cars <dbl> 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 3, 3, 2, 2, 2, 2, 2, …
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ ms_zoning <chr> "rl", "rl", "rl", "rl", "rl", "rl", "rh", "rm", "rm", …
$ lot_config <chr> "corner", "inside", "inside", "inside", "cul_d_sac", "…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "one_fam", "one_fam",…
$ sale_price <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
Set appropriate variable classes:
- Nominal and ordinal variables are set to factors
- interval and ration variables are set to numeric
Remember ordinal variables should have the factor levels explicitly stated to retain the order (you will see why when we get to recipe 3!)
Also note you will get a warning stating there is an unknown level in the factor - this is because there are no basement rated as having “po” quality in data_trn or data_val and there is no garage rated as “po” in data_val. Since we investigated this warning we are suppressing warning messages using suppressWarnings() to prevent the warning from coming up again.
data_trn <- data_trn |>
class_ames()
data_val <- data_val |>
class_ames()
data_test <- data_test |>
class_ames()Create tracking tibble
Create a tibble to track the validation errors across various model configurations.
error_val <- tibble(model = character(), rmse_val = numeric()) |>
glimpse()Rows: 0
Columns: 2
$ model <chr>
$ rmse_val <dbl>
LM 1
First, we will try a simple LM using all numeric predictors in the set. We used the graphs generated in the modeling EDA script to decide which transformations to apply. We opt to use step_impute_knn() instead of step_impute_median() because it allowed us to get closer estimates of the missing data (although as mentioned in class, this can be more computationally expensive as models get more complex!), especially given that there are outliers on variables with missingness. This model includes the following homework requirements:
- Transformed numeric predictor
Set up recipe
rec_1 <-
recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
step_log(gr_liv_area, lot_area, base = 10) |> # for the transformation choice question -> lab shell
step_YeoJohnson(garage_area, total_bsmt_sf, year_built) |>
step_rm(all_nominal_predictors())Training feature matrix
prep recipe
rec_prep <- rec_1 |>
prep(training = data_trn)bake recipe to get feature set. Set new_data = NULL to get training features
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit model
Linear model predicting raw sale_price from transformed numeric predictors.
fit_lm_1 <-
linear_reg() |>
set_engine("lm") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
Use the bake() to generate the feature matrix of the validation data that we will use to assess your model.
feat_val <- rec_prep |>
bake(new_data = data_val)Assess your model
Use the rmse_vec() function to calculate the validation error (RMSE) of the model. Add this value to the validation error tracking tibble.
error_val <- bind_rows(error_val,
tibble(model = "lm_1",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_lm_1, feat_val)$.pred)))
error_val# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 lm_1 35011.
Visualize performance
Visualize the relationship between raw and predicted sale price in the validation set using the plot_truth().
plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_1, feat_val)$.pred)LM 2
For our second model, we decided to add in all categorical variables. Rather than modifying individual categorical variables, we use step_other() to automatically collapse factor levels that occur infrequently (default threshold is .05). step_other() also handles new levels of categorical predictors by assigning them to “other” (since in this data, we had a couple factor levels in our validation sample that were not present in train). You can also handle new factor levels explicitly by using step_novel() or by addressing them when you correct for character class like we did at the top of the script before building our recipe. This model includes the following homework requirements:
- Transformed numeric predictor
- Categorical > 2 levels
- Modified categorical
Set up recipe
rec_2 <-
recipe(sale_price ~ ., data = data_trn) |>
step_impute_median(garage_cars, garage_area, total_bsmt_sf) |>
step_log(gr_liv_area, lot_area, base = 10) |>
step_YeoJohnson(garage_area, total_bsmt_sf, year_built)|>
step_other(all_nominal_predictors(), -central_air) |>
step_unknown(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) Training feature matrix
prep recipe
rec_prep <- rec_2 |>
prep(training = data_trn)bake recipe
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit model
fit_lm_2 <-
linear_reg() |>
set_engine("lm") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
feat_val <- rec_prep |>
bake(new_data = data_val)Assess model in validation data
error_val <- bind_rows(error_val,
tibble(model = "lm_2",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_lm_2, feat_val)$.pred)))Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
error_val# A tibble: 2 × 2
model rmse_val
<chr> <dbl>
1 lm_1 35011.
2 lm_2 31194.
Visualize performance
Visualize the relationship between raw and predicted sale price in your validation set using plot_truth().
plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_2, feat_val)$.pred)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
LM 3
For a third LM, we build upon the previous model by adding interactions that we think make sense based on modeling EDA. For example, the interaction of overall_qual and neighborhood may help explain why some lower quality houses sell for higher prices when they are in more desirable neighborhoods. We also decide to convert all quality variables to numeric using step_mutate_at() (remember we already made sure these factor levels were ordered when we read the data in) so that it will take fewer variables to include these in interactions. This model includes the following homework requirements:
- Transformed numeric predictor
- Categorical > 2 levels
- Modified categorical
- Interactions
Set up recipe
rec_3 <-
recipe(sale_price ~ ., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf, ms_sub_class) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_YeoJohnson(gr_liv_area, lot_area, garage_area, total_bsmt_sf, year_built) |>
step_other(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) |> # should dummy code the nominal before applying interactions
step_interact(~ overall_qual:year_built:fireplace_qu) |> # Here added a "3-way interaction" - only the 3-way term
step_interact(~ total_bsmt_sf:starts_with("bsmt_qual")) |>
step_interact(~tot_rms_abv_grd:year_built) |>
step_interact(~overall_qual:starts_with("neighborhood")) |>
step_interact(~overall_qual:starts_with("ms_sub_class"))# To check with the term info below
rec_3$term_info# A tibble: 19 × 4
variable type role source
<chr> <list> <chr> <chr>
1 garage_area <chr [2]> predictor original
2 neighborhood <chr [3]> predictor original
3 ms_sub_class <chr [3]> predictor original
4 total_bsmt_sf <chr [2]> predictor original
5 bsmt_qual <chr [3]> predictor original
6 central_air <chr [3]> predictor original
7 tot_rms_abv_grd <chr [2]> predictor original
8 fireplaces <chr [2]> predictor original
9 fireplace_qu <chr [3]> predictor original
10 gr_liv_area <chr [2]> predictor original
11 lot_area <chr [2]> predictor original
12 year_built <chr [2]> predictor original
13 overall_qual <chr [2]> predictor original
14 garage_cars <chr [2]> predictor original
15 garage_qual <chr [3]> predictor original
16 ms_zoning <chr [3]> predictor original
17 lot_config <chr [3]> predictor original
18 bldg_type <chr [3]> predictor original
19 sale_price <chr [2]> outcome original
The step_interact here for the 3-way term only involves with the product of the three original predictors. It’s in effect manually adding an extra feature to the data, which is the 3-way interaction term of a canonical expression of linear regression. And thus, using this step is equal to adding the new feature by :
# ... |>
# step_mutate(new_3_way_interaction_term = overall_qual*year_built*fireplace_qu) |>
# ...Training feature matrix
prep recipe
rec_prep <- rec_3 |>
prep(training = data_trn)rec_prep$term_info$source [1] "original" "original" "original" "original" "original" "original"
[7] "original" "original" "original" "original" "original" "original"
[13] "original" "derived" "derived" "derived" "derived" "derived"
[19] "derived" "derived" "derived" "derived" "derived" "derived"
[25] "derived" "derived" "derived" "derived" "derived" "derived"
[31] "derived" "derived" "derived" "derived" "derived" "derived"
[37] "derived" "derived" "derived" "derived" "derived" "derived"
[43] "derived" "derived" "derived" "derived"
bake recipe
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit model
fit_lm_3 <-
linear_reg() |>
set_engine("lm") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
feat_val <- rec_prep |>
bake(new_data = data_val)Assess model
Calculate the validation error (RMSE) of your model in validation data.
error_val <- bind_rows(error_val,
tibble(model = "lm_3",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_lm_3, feat_val)$.pred)))
error_val# A tibble: 3 × 2
model rmse_val
<chr> <dbl>
1 lm_1 35011.
2 lm_2 31194.
3 lm_3 29123.
Visualize performance
Visualize the relationship between raw and predicted sale price in your validation set using plot_truth().
plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_3, feat_val)$.pred)4 Additional configurations
Here is where we did some deeper dives into the variables. First, we manually created a few variables by combining other related variables (e.g. combining all variables talking about the size of different parts of the house into one size variable, which resulted in a predictor that is highly correlated with sale_price). Then, we chose to manually assign levels to some categorical variables based on values that made sense to be grouped together (either because of low frequencies or relationship with sale_price). This allowed us to consider more complex interactions. For example, we noticed some homes with large basements, garages (etc.) were selling for less. We wanted to find a way to separate these homes from mansion-type houses that had large sizes and sold for much more. EDA revealed that homes with multiple nice fireplaces sell for a lot more, so we interacted all_fireplace with the all_size variable. We also collapsed the neighborhood variable down to categories based on the relationship with sale_price to reduce the number of levels in a way that is unifying. However, we only have 27/28 neighborhoods in our training data so we will have to impute this new variable in case a new level shows up! We use step_impute_mode(). This model contains the following homework requirements:
- Transformed numeric predictor
- Categorical > 2 levels
- Modified categorical
- Interactions
- Bonus: Researcher created variables
Set up recipe
rec_4 <- recipe(sale_price ~., data = data_trn) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf) |>
# apply functions to specific cols
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
# self-defined features
step_mutate(all_size = gr_liv_area + total_bsmt_sf + garage_area,
all_qual = overall_qual + fireplace_qu + bsmt_qual + garage_qual,
all_garage = garage_qual*garage_area*garage_cars,
all_fireplace = fireplaces*fireplace_qu,
all_bsmt = bsmt_qual*total_bsmt_sf,
avg_room = gr_liv_area/tot_rms_abv_grd) |>
step_YeoJohnson(gr_liv_area, lot_area,all_qual, avg_room, all_size, all_garage, garage_area,
total_bsmt_sf, year_built, all_bsmt) |>
step_mutate(ms_sub_class = fct_collapse(ms_sub_class,
"pud" = c("120", "150", "160", "180"),
"multi" = c("080", "085", "090","190"),
"one_st" = c("020", "030", "040", "045", "050"),
"two_st" = c("060", "070", "075"))) |>
step_mutate(neighborhood = fct_collapse(neighborhood,
"low_price" = c("meadow_v", "idotrr", "br_dale"),
"mod_lo_price" = c("old_town","blueste",
"edwards", "brk_side", "sawyer",
"landmrk","swisu","n_ames",
"n_pk_vill"),
"mod_price" = c("mitchel","blmngtn", "nw_ames",
"gilbert", "sawyer_w",
"crawfor", "collg_cr", "greens"),
"mod_hi_price" = c("somerst","clear_cr", "timber",
"veenker") ,
"high_price" = c("no_ridge", "stone_br", "nridg_ht"))) |>
step_impute_mode(neighborhood) |>
# automatic collapsing by default, could set a threshold
step_other(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) |>
step_interact(~ overall_qual:year_built) |>
step_interact(~ avg_room:year_built) |>
step_interact(~ all_fireplace:all_size) |>
step_interact(~lot_area:starts_with("neighborhood")) |>
step_interact(~overall_qual:starts_with("neighborhood")) |>
step_interact(~overall_qual:starts_with("ms_sub_class")) |>
step_interact(~ all_size:starts_with("neighborhood")) |>
step_interact(~ all_size:starts_with("ms_sub_class"))Training feature matrix
prep recipe
rec_prep <- rec_4 |>
prep(training = data_trn)Warning: ! There are new levels in `ms_sub_class`: NA.
ℹ Consider using step_unknown() (`?recipes::step_unknown()`) before
`step_dummy()` to handle missing values.
bake recipe
feat_trn <- rec_prep |>
bake(new_data = NULL)Fit model
fit_lm_4 <-
linear_reg() |>
set_engine("lm") |>
fit(sale_price ~ ., data = feat_trn)Validation feature matrix
feat_val <- rec_prep |>
bake(new_data = data_val)Warning: ! There are new levels in `ms_sub_class`: NA.
ℹ Consider using step_unknown() (`?recipes::step_unknown()`) before
`step_dummy()` to handle missing values.
Assess model
error_val<- bind_rows(error_val,
tibble(model = "lm_4",
rmse_val = rmse_vec(feat_val$sale_price,
predict(fit_lm_4, feat_val)$.pred)))Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
error_val # A tibble: 4 × 2
model rmse_val
<chr> <dbl>
1 lm_1 35011.
2 lm_2 31194.
3 lm_3 29123.
4 lm_4 36082.
Visualize model
plot_truth(truth = feat_val$sale_price, estimate = predict(fit_lm_4, feat_val)$.pred)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
Warning: Removed 423 rows containing missing values or values outside the scale range
(`geom_point()`).
Lets now check what our best configuration for lm was
error_val |>
arrange(rmse_val) |>
slice(1)# A tibble: 1 × 2
model rmse_val
<chr> <dbl>
1 lm_3 29123.
Wow! That’s pretty good. Let’s write it out so we can compete in the competition:
best_model <- fit_lm_3
best_rec <- recipe(sale_price ~ ., data = data_trn) |>
step_novel(all_nominal_predictors()) |>
step_impute_knn(garage_cars, garage_area, total_bsmt_sf, ms_sub_class) |>
step_mutate_at(c(bsmt_qual, garage_qual, fireplace_qu), fn = as.numeric) |>
step_YeoJohnson(gr_liv_area, lot_area, garage_area, total_bsmt_sf, year_built) |>
step_other(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) |> # should dummy code the nominal before applying interactions
step_interact(~ overall_qual:year_built:fireplace_qu) |> # Here added a "3-way interaction" - only the 3-way term
step_interact(~ total_bsmt_sf:starts_with("bsmt_qual")) |>
step_interact(~tot_rms_abv_grd:year_built) |>
step_interact(~overall_qual:starts_with("neighborhood")) |>
step_interact(~overall_qual:starts_with("ms_sub_class"))
last_name <- "punturieri"if (!is.null(best_model)){
# Make test set features with the best recipe
rec_prep <- best_rec |>
prep(training = data_trn)
feat_test <- rec_prep |>
bake(new_data = data_test)
# Generate predictions made by the best model
test_preds <- data_test |>
mutate(sale_price = predict(best_model, feat_test)$.pred) |>
select(pid, sale_price) #pid is the id variable to match predictions
# Save predictions as a csv file with your last name in the file name
write.csv(test_preds, here::here(path_data,str_c("test_preds_",
last_name,
".csv")))
}