library(tidyverse)
library(tidymodels)
Homework Unit 6: Regularization and Penalized Models
Introduction
This file serves as the answer key for the Unit_06 homework. Unit 6 Regularization and Penalized Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to tune two regularization hyperparameters (\(\alpha\) and \(\lambda\)) and select among model configurations using resampling methods.
Setup
Load in your tidy packages
Set up your conflicts policy
options(conflicts.policy = "depends.ok")
conflictRules("Matrix", mask.ok = c("expand", "pack", "unpack"))
Load additional packages
library(future)
library(xfun, include.only = "cache_rds")
Source function scripts (John’s or your own)
source("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
Specify other global settings
Since we are going to use cache_rds(), we are also going to include rerun_setting <- FALSE in this chunk.
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
rerun_setting <- FALSE
Paths
path_data <- "application_assignments/unit_06"
Read in data
Read in the ames_full_cln.csv data file.
data_all <- read_csv(here::here(path_data, "ames_full_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 1,955
Columns: 81
$ pid <chr> "x0526301100", "x0526350040", "x0526351010", "x0527105…
$ ms_sub_class <chr> "x020", "x020", "x020", "x060", "x120", "x120", "x120"…
$ ms_zoning <chr> "rl", "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_frontage <dbl> 141, 80, 81, 74, 41, 43, 39, 60, 75, 63, 85, NA, 47, 1…
$ lot_area <dbl> 31770, 11622, 14267, 13830, 4920, 5005, 5389, 7500, 10…
$ street <chr> "pave", "pave", "pave", "pave", "pave", "pave", "pave"…
$ alley <chr> "none", "none", "none", "none", "none", "none", "none"…
$ lot_shape <chr> "ir1", "reg", "ir1", "ir1", "reg", "ir1", "ir1", "reg"…
$ land_contour <chr> "lvl", "lvl", "lvl", "lvl", "lvl", "hls", "lvl", "lvl"…
$ utilities <chr> "all_pub", "all_pub", "all_pub", "all_pub", "all_pub",…
$ lot_config <chr> "corner", "inside", "corner", "inside", "inside", "ins…
$ land_slope <chr> "gtl", "gtl", "gtl", "gtl", "gtl", "gtl", "gtl", "gtl"…
$ neighborhood <chr> "n_ames", "n_ames", "n_ames", "gilbert", "stone_br", "…
$ condition_1 <chr> "norm", "feedr", "norm", "norm", "norm", "norm", "norm…
$ condition_2 <chr> "norm", "norm", "norm", "norm", "norm", "norm", "norm"…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "one_fam", "twhs_ext"…
$ house_style <chr> "x1story", "x1story", "x1story", "x2story", "x1story",…
$ overall_qual <dbl> 6, 5, 6, 5, 8, 8, 8, 7, 6, 6, 7, 8, 8, 8, 9, 4, 6, 6, …
$ overall_cond <dbl> 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 2, 5, 6, 6, …
$ year_built <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ year_remod_add <dbl> 1960, 1961, 1958, 1998, 2001, 1992, 1996, 1999, 1994, …
$ roof_style <chr> "hip", "gable", "hip", "gable", "gable", "gable", "gab…
$ roof_matl <chr> "comp_shg", "comp_shg", "comp_shg", "comp_shg", "comp_…
$ exterior_1st <chr> "brk_face", "vinyl_sd", "wd_sdng", "vinyl_sd", "cemnt_…
$ exterior_2nd <chr> "plywood", "vinyl_sd", "wd_sdng", "vinyl_sd", "cment_b…
$ mas_vnr_type <chr> "stone", "none", "brk_face", "none", "none", "none", "…
$ mas_vnr_area <dbl> 112, 0, 108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603, 0, 350, 0…
$ exter_qual <chr> "ta", "ta", "ta", "ta", "gd", "gd", "gd", "ta", "ta", …
$ exter_cond <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ foundation <chr> "c_block", "c_block", "c_block", "p_conc", "p_conc", "…
$ bsmt_qual <chr> "ta", "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", …
$ bsmt_cond <chr> "gd", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ bsmt_exposure <chr> "gd", "no", "no", "no", "mn", "no", "no", "no", "no", …
$ bsmt_fin_type_1 <chr> "blq", "rec", "alq", "glq", "glq", "alq", "glq", "unf"…
$ bsmt_fin_sf_1 <dbl> 639, 468, 923, 791, 616, 263, 1180, 0, 0, 0, 637, 368,…
$ bsmt_fin_type_2 <chr> "unf", "lw_q", "unf", "unf", "unf", "unf", "unf", "unf…
$ bsmt_fin_sf_2 <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0, 0, 0, 1…
$ bsmt_unf_sf <dbl> 441, 270, 406, 137, 722, 1017, 415, 994, 763, 789, 663…
$ total_bsmt_sf <dbl> 1080, 882, 1329, 928, 1338, 1280, 1595, 994, 763, 789,…
$ heating <chr> "gas_a", "gas_a", "gas_a", "gas_a", "gas_a", "gas_a", …
$ heating_qc <chr> "fa", "ta", "ta", "gd", "ex", "ex", "ex", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ electrical <chr> "s_brkr", "s_brkr", "s_brkr", "s_brkr", "s_brkr", "s_b…
$ x1st_flr_sf <dbl> 1656, 896, 1329, 928, 1338, 1280, 1616, 1028, 763, 789…
$ x2nd_flr_sf <dbl> 0, 0, 0, 701, 0, 0, 0, 776, 892, 676, 0, 0, 1589, 672,…
$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gr_liv_area <dbl> 1656, 896, 1329, 1629, 1338, 1280, 1616, 1804, 1655, 1…
$ bsmt_full_bath <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
$ bsmt_half_bath <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ full_bath <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, 1, 2, 2, …
$ half_bath <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
$ bedroom_abv_gr <dbl> 3, 2, 3, 3, 2, 2, 2, 3, 3, 3, 2, 1, 4, 4, 1, 2, 3, 3, …
$ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ kitchen_qual <chr> "ta", "ta", "gd", "ta", "gd", "gd", "gd", "gd", "ta", …
$ tot_rms_abv_grd <dbl> 7, 5, 6, 6, 6, 5, 5, 7, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7,…
$ functional <chr> "typ", "typ", "typ", "typ", "typ", "typ", "typ", "typ"…
$ fireplaces <dbl> 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, …
$ fireplace_qu <chr> "gd", "none", "none", "ta", "none", "none", "ta", "ta"…
$ garage_type <chr> "attchd", "attchd", "attchd", "attchd", "attchd", "att…
$ garage_yr_blt <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ garage_finish <chr> "fin", "unf", "unf", "fin", "fin", "r_fn", "r_fn", "fi…
$ garage_cars <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, …
$ garage_area <dbl> 528, 730, 312, 482, 582, 506, 608, 442, 440, 393, 506,…
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ garage_cond <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ paved_drive <chr> "p", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ wood_deck_sf <dbl> 210, 140, 393, 212, 0, 0, 237, 140, 157, 0, 192, 0, 50…
$ open_porch_sf <dbl> 62, 0, 36, 34, 0, 82, 152, 60, 84, 75, 0, 54, 36, 12, …
$ enclosed_porch <dbl> 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ x3ssn_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ screen_porch <dbl> 0, 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 140, 210, 0, 0, 0…
$ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pool_qc <chr> "none", "none", "none", "none", "none", "none", "none"…
$ fence <chr> "none", "mn_prv", "none", "mn_prv", "none", "none", "n…
$ misc_feature <chr> "none", "none", "gar2", "none", "none", "none", "none"…
$ misc_val <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mo_sold <dbl> 5, 6, 6, 3, 4, 1, 3, 6, 4, 5, 2, 6, 6, 6, 6, 6, 2, 1, …
$ yr_sold <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ sale_type <chr> "wd", "wd", "wd", "wd", "wd", "wd", "wd", "wd", "wd", …
$ sale_condition <chr> "normal", "normal", "normal", "normal", "normal", "nor…
$ sale_price <dbl> 215000, 105000, 172000, 189900, 213500, 191500, 236500…
data_all |>
skim_some() |>
arrange(desc(n_missing))| Name | data_all |
| Number of rows | 1955 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| character | 45 |
| numeric | 36 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| mas_vnr_type | 17 | 0.99 | 4 | 8 | 0 | 5 | 0 |
| electrical | 1 | 1.00 | 3 | 6 | 0 | 5 | 0 |
| pid | 0 | 1.00 | 11 | 11 | 0 | 1955 | 0 |
| ms_sub_class | 0 | 1.00 | 4 | 4 | 0 | 16 | 0 |
| ms_zoning | 0 | 1.00 | 1 | 2 | 0 | 7 | 0 |
| street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| alley | 0 | 1.00 | 4 | 4 | 0 | 3 | 0 |
| lot_shape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| land_contour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| utilities | 0 | 1.00 | 7 | 7 | 0 | 2 | 0 |
| lot_config | 0 | 1.00 | 3 | 9 | 0 | 5 | 0 |
| land_slope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| neighborhood | 0 | 1.00 | 5 | 9 | 0 | 28 | 0 |
| condition_1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| condition_2 | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| bldg_type | 0 | 1.00 | 6 | 8 | 0 | 5 | 0 |
| house_style | 0 | 1.00 | 5 | 7 | 0 | 8 | 0 |
| roof_style | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| roof_matl | 0 | 1.00 | 5 | 8 | 0 | 7 | 0 |
| exterior_1st | 0 | 1.00 | 5 | 8 | 0 | 15 | 0 |
| exterior_2nd | 0 | 1.00 | 5 | 8 | 0 | 17 | 0 |
| exter_qual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| exter_cond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| foundation | 0 | 1.00 | 4 | 7 | 0 | 6 | 0 |
| bsmt_qual | 0 | 1.00 | 2 | 4 | 0 | 5 | 0 |
| bsmt_cond | 0 | 1.00 | 2 | 4 | 0 | 6 | 0 |
| bsmt_exposure | 0 | 1.00 | 2 | 4 | 0 | 5 | 0 |
| bsmt_fin_type_1 | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| bsmt_fin_type_2 | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
| heating_qc | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| central_air | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| kitchen_qual | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| functional | 0 | 1.00 | 3 | 4 | 0 | 8 | 0 |
| fireplace_qu | 0 | 1.00 | 2 | 4 | 0 | 6 | 0 |
| garage_type | 0 | 1.00 | 4 | 8 | 0 | 7 | 0 |
| garage_finish | 0 | 1.00 | 3 | 4 | 0 | 4 | 0 |
| garage_qual | 0 | 1.00 | 2 | 4 | 0 | 6 | 0 |
| garage_cond | 0 | 1.00 | 2 | 4 | 0 | 6 | 0 |
| paved_drive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| pool_qc | 0 | 1.00 | 2 | 4 | 0 | 5 | 0 |
| fence | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| misc_feature | 0 | 1.00 | 4 | 5 | 0 | 5 | 0 |
| sale_type | 0 | 1.00 | 2 | 6 | 0 | 10 | 0 |
| sale_condition | 0 | 1.00 | 6 | 8 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | p0 | p100 |
|---|---|---|---|---|
| lot_frontage | 319 | 0.84 | 21 | 313 |
| garage_yr_blt | 109 | 0.94 | 1896 | 2010 |
| mas_vnr_area | 17 | 0.99 | 0 | 1600 |
| bsmt_fin_sf_1 | 1 | 1.00 | 0 | 5644 |
| bsmt_fin_sf_2 | 1 | 1.00 | 0 | 1526 |
| bsmt_unf_sf | 1 | 1.00 | 0 | 2153 |
| total_bsmt_sf | 1 | 1.00 | 0 | 6110 |
| bsmt_full_bath | 1 | 1.00 | 0 | 3 |
| bsmt_half_bath | 1 | 1.00 | 0 | 2 |
| garage_cars | 1 | 1.00 | 0 | 4 |
| garage_area | 1 | 1.00 | 0 | 1488 |
| lot_area | 0 | 1.00 | 1476 | 215245 |
| overall_qual | 0 | 1.00 | 1 | 10 |
| overall_cond | 0 | 1.00 | 1 | 9 |
| year_built | 0 | 1.00 | 1875 | 2010 |
| year_remod_add | 0 | 1.00 | 1950 | 2010 |
| x1st_flr_sf | 0 | 1.00 | 372 | 4692 |
| x2nd_flr_sf | 0 | 1.00 | 0 | 2065 |
| low_qual_fin_sf | 0 | 1.00 | 0 | 1064 |
| gr_liv_area | 0 | 1.00 | 438 | 5642 |
| full_bath | 0 | 1.00 | 0 | 4 |
| half_bath | 0 | 1.00 | 0 | 2 |
| bedroom_abv_gr | 0 | 1.00 | 0 | 8 |
| kitchen_abv_gr | 0 | 1.00 | 0 | 3 |
| tot_rms_abv_grd | 0 | 1.00 | 3 | 14 |
| fireplaces | 0 | 1.00 | 0 | 3 |
| wood_deck_sf | 0 | 1.00 | 0 | 870 |
| open_porch_sf | 0 | 1.00 | 0 | 742 |
| enclosed_porch | 0 | 1.00 | 0 | 552 |
| x3ssn_porch | 0 | 1.00 | 0 | 508 |
| screen_porch | 0 | 1.00 | 0 | 576 |
| pool_area | 0 | 1.00 | 0 | 738 |
| misc_val | 0 | 1.00 | 0 | 12500 |
| mo_sold | 0 | 1.00 | 1 | 12 |
| yr_sold | 0 | 1.00 | 2006 | 2010 |
| sale_price | 0 | 1.00 | 12789 | 745000 |
Set variable classes
Set all variables to factor or numeric classes. Here is where you will also want to explicitly set factor levels for those with low frequency count levels (e.g., neighborhood, ms_sub_class) and do any ordering of factor levels.
Next we will use fct_relevel to order levels in our ordinal variables. We know which variables are ordinal by looking at the data dictionary!
You might have gotten a warning that bsmt_qual contains no values of the po level. We still want to keep this as a possible level so we will resolve this warning by first using fct_expand()!
# Quality scale (worst to best) shared by several ordinal variables below.
levels_qu <- c("none", "po", "fa", "ta", "gd", "ex")
# Convert every character column to a factor, setting levels explicitly where
# needed and ordering the levels of ordinal variables.
data_all <- data_all |>
# neighborhood and ms_sub_class get explicit level sets (they have low
# frequency levels). Any value not listed in `levels` would become NA.
mutate(neighborhood = factor(neighborhood,
levels = c("blmngtn", "blueste", "br_dale", "brk_side",
"clear_cr", "collg_cr", "crawfor", "edwards",
"gilbert", "greens", "grn_hill", "idotrr",
"landmrk", "meadow_v", "mitchel", "n_ames",
"no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
"old_town", "sawyer", "sawyer_w", "somerst",
"stone_br", "swisu", "timber", "veenker")),
ms_sub_class = factor(ms_sub_class,
levels = c("x020", "x030", "x040", "x045", "x050",
"x060", "x070", "x075", "x080", "x085",
"x090", "x120", "x150", "x160", "x180",
"x190")),
# All columns that are still character at this point (including pid) become
# factors with default, alphabetical levels.
across(where(is.character), as.factor),
# fct_expand() adds "po" where it is not observed, so that the following
# fct_relevel() call can reference it.
bsmt_qual = fct_expand(bsmt_qual, "po"), # here is where we use fct_expand()!
bsmt_qual = fct_relevel(bsmt_qual, levels_qu),
# NOTE(review): garage_cond appears to use the same quality codes as
# garage_qual but is not releveled here -- confirm whether that is intentional.
garage_qual = fct_relevel(garage_qual, levels_qu),
fireplace_qu = fct_relevel(fireplace_qu, levels_qu),
bsmt_cond = fct_relevel(bsmt_cond, levels_qu),
exter_qual = fct_expand(exter_qual, "po"), # and here!
exter_qual = fct_relevel(exter_qual, c("po", "fa", "ta", "gd", "ex")),
exter_cond = fct_relevel(exter_cond, c("po", "fa", "ta", "gd", "ex")),
pool_qc = fct_expand(pool_qc, "po"), # and here!
pool_qc = fct_relevel(pool_qc, levels_qu),
heating_qc = fct_relevel(heating_qc, c("po", "fa", "ta", "gd", "ex")),
kitchen_qual = fct_relevel(kitchen_qual, c("po", "fa", "ta", "gd", "ex")),
functional = fct_relevel(functional, c("typ", "min1", "min2", "mod", "maj1", "maj2", "sev", "sal")),
lot_shape = fct_relevel(lot_shape, c("reg", "ir1", "ir2", "ir3")),
land_slope = fct_relevel(land_slope, c("gtl", "mod", "sev"))) |>
glimpse() Rows: 1,955
Columns: 81
$ pid <fct> x0526301100, x0526350040, x0526351010, x0527105010, x0…
$ ms_sub_class <fct> x020, x020, x020, x060, x120, x120, x120, x060, x060, …
$ ms_zoning <fct> rl, rh, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl…
$ lot_frontage <dbl> 141, 80, 81, 74, 41, 43, 39, 60, 75, 63, 85, NA, 47, 1…
$ lot_area <dbl> 31770, 11622, 14267, 13830, 4920, 5005, 5389, 7500, 10…
$ street <fct> pave, pave, pave, pave, pave, pave, pave, pave, pave, …
$ alley <fct> none, none, none, none, none, none, none, none, none, …
$ lot_shape <fct> ir1, reg, ir1, ir1, reg, ir1, ir1, reg, ir1, ir1, reg,…
$ land_contour <fct> lvl, lvl, lvl, lvl, lvl, hls, lvl, lvl, lvl, lvl, lvl,…
$ utilities <fct> all_pub, all_pub, all_pub, all_pub, all_pub, all_pub, …
$ lot_config <fct> corner, inside, corner, inside, inside, inside, inside…
$ land_slope <fct> gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl,…
$ neighborhood <fct> n_ames, n_ames, n_ames, gilbert, stone_br, stone_br, s…
$ condition_1 <fct> norm, feedr, norm, norm, norm, norm, norm, norm, norm,…
$ condition_2 <fct> norm, norm, norm, norm, norm, norm, norm, norm, norm, …
$ bldg_type <fct> one_fam, one_fam, one_fam, one_fam, twhs_ext, twhs_ext…
$ house_style <fct> x1story, x1story, x1story, x2story, x1story, x1story, …
$ overall_qual <dbl> 6, 5, 6, 5, 8, 8, 8, 7, 6, 6, 7, 8, 8, 8, 9, 4, 6, 6, …
$ overall_cond <dbl> 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 2, 5, 6, 6, …
$ year_built <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ year_remod_add <dbl> 1960, 1961, 1958, 1998, 2001, 1992, 1996, 1999, 1994, …
$ roof_style <fct> hip, gable, hip, gable, gable, gable, gable, gable, ga…
$ roof_matl <fct> comp_shg, comp_shg, comp_shg, comp_shg, comp_shg, comp…
$ exterior_1st <fct> brk_face, vinyl_sd, wd_sdng, vinyl_sd, cemnt_bd, hd_bo…
$ exterior_2nd <fct> plywood, vinyl_sd, wd_sdng, vinyl_sd, cment_bd, hd_boa…
$ mas_vnr_type <fct> stone, none, brk_face, none, none, none, none, none, n…
$ mas_vnr_area <dbl> 112, 0, 108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603, 0, 350, 0…
$ exter_qual <fct> ta, ta, ta, ta, gd, gd, gd, ta, ta, ta, ta, gd, ex, gd…
$ exter_cond <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ foundation <fct> c_block, c_block, c_block, p_conc, p_conc, p_conc, p_c…
$ bsmt_qual <fct> ta, ta, ta, gd, gd, gd, gd, ta, gd, gd, gd, gd, gd, gd…
$ bsmt_cond <fct> gd, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ bsmt_exposure <fct> gd, no, no, no, mn, no, no, no, no, no, gd, av, gd, av…
$ bsmt_fin_type_1 <fct> blq, rec, alq, glq, glq, alq, glq, unf, unf, unf, glq,…
$ bsmt_fin_sf_1 <dbl> 639, 468, 923, 791, 616, 263, 1180, 0, 0, 0, 637, 368,…
$ bsmt_fin_type_2 <fct> unf, lw_q, unf, unf, unf, unf, unf, unf, unf, unf, unf…
$ bsmt_fin_sf_2 <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0, 0, 0, 1…
$ bsmt_unf_sf <dbl> 441, 270, 406, 137, 722, 1017, 415, 994, 763, 789, 663…
$ total_bsmt_sf <dbl> 1080, 882, 1329, 928, 1338, 1280, 1595, 994, 763, 789,…
$ heating <fct> gas_a, gas_a, gas_a, gas_a, gas_a, gas_a, gas_a, gas_a…
$ heating_qc <fct> fa, ta, ta, gd, ex, ex, ex, gd, gd, gd, gd, ta, ex, gd…
$ central_air <fct> y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, …
$ electrical <fct> s_brkr, s_brkr, s_brkr, s_brkr, s_brkr, s_brkr, s_brkr…
$ x1st_flr_sf <dbl> 1656, 896, 1329, 928, 1338, 1280, 1616, 1028, 763, 789…
$ x2nd_flr_sf <dbl> 0, 0, 0, 701, 0, 0, 0, 776, 892, 676, 0, 0, 1589, 672,…
$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gr_liv_area <dbl> 1656, 896, 1329, 1629, 1338, 1280, 1616, 1804, 1655, 1…
$ bsmt_full_bath <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
$ bsmt_half_bath <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ full_bath <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, 1, 2, 2, …
$ half_bath <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
$ bedroom_abv_gr <dbl> 3, 2, 3, 3, 2, 2, 2, 3, 3, 3, 2, 1, 4, 4, 1, 2, 3, 3, …
$ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ kitchen_qual <fct> ta, ta, gd, ta, gd, gd, gd, gd, ta, ta, gd, gd, ex, ta…
$ tot_rms_abv_grd <dbl> 7, 5, 6, 6, 6, 5, 5, 7, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7,…
$ functional <fct> typ, typ, typ, typ, typ, typ, typ, typ, typ, typ, typ,…
$ fireplaces <dbl> 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, …
$ fireplace_qu <fct> gd, none, none, ta, none, none, ta, ta, ta, gd, po, no…
$ garage_type <fct> attchd, attchd, attchd, attchd, attchd, attchd, attchd…
$ garage_yr_blt <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ garage_finish <fct> fin, unf, unf, fin, fin, r_fn, r_fn, fin, fin, fin, un…
$ garage_cars <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, …
$ garage_area <dbl> 528, 730, 312, 482, 582, 506, 608, 442, 440, 393, 506,…
$ garage_qual <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ garage_cond <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ paved_drive <fct> p, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, …
$ wood_deck_sf <dbl> 210, 140, 393, 212, 0, 0, 237, 140, 157, 0, 192, 0, 50…
$ open_porch_sf <dbl> 62, 0, 36, 34, 0, 82, 152, 60, 84, 75, 0, 54, 36, 12, …
$ enclosed_porch <dbl> 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ x3ssn_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ screen_porch <dbl> 0, 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 140, 210, 0, 0, 0…
$ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pool_qc <fct> none, none, none, none, none, none, none, none, none, …
$ fence <fct> none, mn_prv, none, mn_prv, none, none, none, none, no…
$ misc_feature <fct> none, none, gar2, none, none, none, none, none, none, …
$ misc_val <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mo_sold <dbl> 5, 6, 6, 3, 4, 1, 3, 6, 4, 5, 2, 6, 6, 6, 6, 6, 2, 1, …
$ yr_sold <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ sale_type <fct> wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd…
$ sale_condition <fct> normal, normal, normal, normal, normal, normal, normal…
$ sale_price <dbl> 215000, 105000, 172000, 189900, 213500, 191500, 236500…
data_all |>
skim_some() |>
arrange(desc(n_missing))| Name | data_all |
| Number of rows | 1955 |
| Number of columns | 81 |
| _______________________ | |
| Column type frequency: | |
| factor | 45 |
| numeric | 36 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| mas_vnr_type | 17 | 0.99 | FALSE | 5 | non: 1167, brk: 581, sto: 171, brk: 18 |
| electrical | 1 | 1.00 | FALSE | 5 | s_b: 1792, fus: 125, fus: 29, fus: 7 |
| pid | 0 | 1.00 | FALSE | 1955 | x05: 1, x05: 1, x05: 1, x05: 1 |
| ms_sub_class | 0 | 1.00 | FALSE | 16 | x02: 730, x06: 388, x05: 208, x12: 122 |
| ms_zoning | 0 | 1.00 | FALSE | 7 | rl: 1530, rm: 297, fv: 91, c: 19 |
| street | 0 | 1.00 | FALSE | 2 | pav: 1946, grv: 9 |
| alley | 0 | 1.00 | FALSE | 3 | non: 1821, grv: 86, pav: 48 |
| lot_shape | 0 | 1.00 | FALSE | 4 | reg: 1258, ir1: 636, ir2: 49, ir3: 12 |
| land_contour | 0 | 1.00 | FALSE | 4 | lvl: 1769, hls: 75, bnk: 72, low: 39 |
| utilities | 0 | 1.00 | FALSE | 2 | all: 1953, no_: 2 |
| lot_config | 0 | 1.00 | FALSE | 5 | ins: 1454, cor: 328, cul: 114, fr2: 55 |
| land_slope | 0 | 1.00 | FALSE | 3 | gtl: 1864, mod: 78, sev: 13 |
| neighborhood | 0 | 1.00 | FALSE | 28 | n_a: 299, col: 174, old: 161, edw: 135 |
| condition_1 | 0 | 1.00 | FALSE | 9 | nor: 1693, fee: 114, art: 54, rr_: 31 |
| condition_2 | 0 | 1.00 | FALSE | 6 | nor: 1938, fee: 6, art: 4, pos: 3 |
| bldg_type | 0 | 1.00 | FALSE | 5 | one: 1631, twh: 145, dup: 77, twh: 64 |
| house_style | 0 | 1.00 | FALSE | 8 | x1s: 989, x2s: 580, x1_: 224, s_l: 79 |
| roof_style | 0 | 1.00 | FALSE | 6 | gab: 1557, hip: 362, gam: 16, fla: 9 |
| roof_matl | 0 | 1.00 | FALSE | 7 | com: 1929, tar: 11, wd_: 8, wd_: 4 |
| exterior_1st | 0 | 1.00 | FALSE | 15 | vin: 671, hd_: 301, met: 298, wd_: 283 |
| exterior_2nd | 0 | 1.00 | FALSE | 17 | vin: 662, met: 295, hd_: 279, wd_: 267 |
| exter_qual | 0 | 1.00 | FALSE | 4 | ta: 1215, gd: 651, ex: 63, fa: 26 |
| exter_cond | 0 | 1.00 | FALSE | 5 | ta: 1707, gd: 195, fa: 42, ex: 8 |
| foundation | 0 | 1.00 | FALSE | 6 | p_c: 865, c_b: 849, brk: 198, sla: 33 |
| bsmt_qual | 0 | 1.00 | FALSE | 5 | ta: 861, gd: 808, ex: 167, fa: 62 |
| bsmt_cond | 0 | 1.00 | FALSE | 6 | ta: 1739, gd: 85, fa: 69, non: 57 |
| bsmt_exposure | 0 | 1.00 | FALSE | 5 | no: 1271, av: 274, gd: 183, mn: 168 |
| bsmt_fin_type_1 | 0 | 1.00 | FALSE | 7 | unf: 576, glq: 535, alq: 294, rec: 202 |
| bsmt_fin_type_2 | 0 | 1.00 | FALSE | 7 | unf: 1655, rec: 75, lw_: 69, non: 57 |
| heating | 0 | 1.00 | FALSE | 6 | gas: 1920, gas: 20, gra: 8, wal: 5 |
| heating_qc | 0 | 1.00 | FALSE | 5 | ex: 979, ta: 590, gd: 324, fa: 60 |
| central_air | 0 | 1.00 | FALSE | 2 | y: 1821, n: 134 |
| kitchen_qual | 0 | 1.00 | FALSE | 5 | ta: 1011, gd: 765, ex: 126, fa: 52 |
| functional | 0 | 1.00 | FALSE | 8 | typ: 1822, min: 48, min: 41, mod: 23 |
| fireplace_qu | 0 | 1.00 | FALSE | 6 | non: 960, gd: 481, ta: 407, fa: 44 |
| garage_type | 0 | 1.00 | FALSE | 7 | att: 1161, det: 521, bui: 123, non: 107 |
| garage_finish | 0 | 1.00 | FALSE | 4 | unf: 826, r_f: 547, fin: 473, non: 109 |
| garage_qual | 0 | 1.00 | FALSE | 6 | ta: 1745, non: 109, fa: 79, gd: 16 |
| garage_cond | 0 | 1.00 | FALSE | 6 | ta: 1778, non: 109, fa: 46, gd: 12 |
| paved_drive | 0 | 1.00 | FALSE | 3 | y: 1775, n: 139, p: 41 |
| pool_qc | 0 | 1.00 | FALSE | 5 | non: 1945, gd: 3, ex: 3, fa: 2 |
| fence | 0 | 1.00 | FALSE | 5 | non: 1599, mn_: 215, gd_: 70, gd_: 61 |
| misc_feature | 0 | 1.00 | FALSE | 5 | non: 1887, she: 62, oth: 3, gar: 2 |
| sale_type | 0 | 1.00 | FALSE | 10 | wd: 1695, new: 158, cod: 57, con: 16 |
| sale_condition | 0 | 1.00 | FALSE | 6 | nor: 1616, par: 161, abn: 120, fam: 30 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | p0 | p100 |
|---|---|---|---|---|
| lot_frontage | 319 | 0.84 | 21 | 313 |
| garage_yr_blt | 109 | 0.94 | 1896 | 2010 |
| mas_vnr_area | 17 | 0.99 | 0 | 1600 |
| bsmt_fin_sf_1 | 1 | 1.00 | 0 | 5644 |
| bsmt_fin_sf_2 | 1 | 1.00 | 0 | 1526 |
| bsmt_unf_sf | 1 | 1.00 | 0 | 2153 |
| total_bsmt_sf | 1 | 1.00 | 0 | 6110 |
| bsmt_full_bath | 1 | 1.00 | 0 | 3 |
| bsmt_half_bath | 1 | 1.00 | 0 | 2 |
| garage_cars | 1 | 1.00 | 0 | 4 |
| garage_area | 1 | 1.00 | 0 | 1488 |
| lot_area | 0 | 1.00 | 1476 | 215245 |
| overall_qual | 0 | 1.00 | 1 | 10 |
| overall_cond | 0 | 1.00 | 1 | 9 |
| year_built | 0 | 1.00 | 1875 | 2010 |
| year_remod_add | 0 | 1.00 | 1950 | 2010 |
| x1st_flr_sf | 0 | 1.00 | 372 | 4692 |
| x2nd_flr_sf | 0 | 1.00 | 0 | 2065 |
| low_qual_fin_sf | 0 | 1.00 | 0 | 1064 |
| gr_liv_area | 0 | 1.00 | 438 | 5642 |
| full_bath | 0 | 1.00 | 0 | 4 |
| half_bath | 0 | 1.00 | 0 | 2 |
| bedroom_abv_gr | 0 | 1.00 | 0 | 8 |
| kitchen_abv_gr | 0 | 1.00 | 0 | 3 |
| tot_rms_abv_grd | 0 | 1.00 | 3 | 14 |
| fireplaces | 0 | 1.00 | 0 | 3 |
| wood_deck_sf | 0 | 1.00 | 0 | 870 |
| open_porch_sf | 0 | 1.00 | 0 | 742 |
| enclosed_porch | 0 | 1.00 | 0 | 552 |
| x3ssn_porch | 0 | 1.00 | 0 | 508 |
| screen_porch | 0 | 1.00 | 0 | 576 |
| pool_area | 0 | 1.00 | 0 | 738 |
| misc_val | 0 | 1.00 | 0 | 12500 |
| mo_sold | 0 | 1.00 | 1 | 12 |
| yr_sold | 0 | 1.00 | 2006 | 2010 |
| sale_price | 0 | 1.00 | 12789 | 745000 |
Set up splits
Divide data into train and test
Hold out 25% of the data as data_test for evaluation using the initial_split() function. Stratify on sale_price.
set.seed(102030)
splits_test <- data_all |>
initial_split(prop = 0.75, strata = "sale_price")
data_trn <- splits_test |>
analysis()
data_test <- splits_test |>
assessment()
Make splits
For parts 2 and 3, you’ll need splits within data_trn to select among model configurations using held out data. Create 10 bootstrap splits stratified on sale_price.
splits_boot <- data_trn |>
bootstraps(times = 10, strata = "sale_price")
Build recipe
You will build one recipe that can be used across three model fits. Please follow these instructions to build your recipe:
- Regress the outcome sale_price on all predictors
- Remove the ID variable (pid) with step_rm()
- Use step_impute_median() to impute the median for any missing values in numeric features
- Use step_impute_mode() to impute the mode for any missing values in the factor features
- Use step_normalize() to normalize all numeric features (necessary for regularized models)
- Apply dummy coding to all factor features
rec <- recipe(sale_price ~ ., data = data_trn) |>
step_rm(pid) |>
step_impute_median(all_numeric_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_normalize(all_numeric_predictors()) |>
step_dummy(all_nominal_predictors())
Create error tracking tibble
track_rmse <- tibble(model = character(),
rmse_trn = numeric(),
rmse_test = numeric(),
n_features = numeric())
Part 1: Fitting an OLS linear regression
Fit a regression model in the full training set
Make a feature matrix for all training data and for test data.
rec_prep <- rec |>
prep(data_trn)
feat_trn <- rec_prep |>
bake(NULL)
feat_test <- rec_prep |>
bake(data_test)
Fit a linear regression model. No resampling is needed because there are no hyperparameters to tune.
fit_linear <- linear_reg() |>
set_engine("lm") |>
fit(sale_price ~ ., data = feat_trn)
Get RMSE in train & test
Use rmse_vec() to get error in feat_trn.
lin_trn_rmse <- rmse_vec(truth = feat_trn$sale_price,
estimate = predict(fit_linear, feat_trn)$.pred)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
Use rmse_vec() to get error in feat_test.
lin_test_rmse <-rmse_vec(truth = feat_test$sale_price,
estimate = predict(fit_linear, feat_test)$.pred)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
Get number of features.
lin_n_feat <- fit_linear |>
tidy() |>
filter(estimate != 0 & term != "(Intercept)") |>
nrow()
Add to tracking tibble.
track_rmse <- add_row(track_rmse,
model = "OLS",
rmse_trn = lin_trn_rmse,
rmse_test = lin_test_rmse,
n_features = lin_n_feat)
track_rmse# A tibble: 1 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 19285. 25964. 265
We see the error is much higher in the test set. This is because with 265 features our model is overfit to noise in our training data.
Also, notice the warnings we get when making these predictions. R is telling us that our models are rank-deficient - this means that this model may not have determined a unique set of parameter estimates to minimize the cost function. This can occur for a variety of reasons, some of which are real problems and other times for reasons that are not a problem. However, you should never use such a model unless you know the source of the rank deficiency. You can read more about this warning message here and here.
step_nzv() and step_corr() can be used to reduce the number of features by eliminating highly correlated features (default is > .9 absolute correlation) or features with almost no variance (e.g., a dummy coded feature that is all or almost all 0’s). Here is an example of what that would look like.
Let’s add these two additional steps to our recipe
rec_nzv <- rec |>
step_nzv(all_predictors()) |>
step_corr(all_predictors())
Now let’s remake our feature matrices, refit the model and evaluate its performance.
# Estimate the NZV/correlation-filtered recipe on the training data.
rec_prep <- prep(rec_nzv, data_trn)

# Build the feature matrices: NULL returns the processed training set, and
# the held-out test set is run through the same prepped recipe.
feat_trn_nzv <- bake(rec_prep, NULL)
feat_test_nzv <- bake(rec_prep, data_test)

# Refit the OLS model on the reduced feature set.
fit_linear_nzv <- fit(
  set_engine(linear_reg(), "lm"),
  sale_price ~ .,
  data = feat_trn_nzv
)
lin_nzv_trn_rmse <- rmse_vec(truth = data_trn$sale_price,
estimate = predict(fit_linear_nzv, feat_trn_nzv)$.pred)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
lin_nzv_test_rmse <- rmse_vec(truth = data_test$sale_price,
estimate = predict(fit_linear_nzv, feat_test_nzv)$.pred)Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
# Count the features that kept a non-zero (and non-missing) coefficient;
# the intercept is not a feature, so it is excluded.
lin_nzv_n_feat <- nrow(
  filter(tidy(fit_linear_nzv), estimate != 0, term != "(Intercept)")
)

# Record train/test RMSE and the feature count for the reduced OLS model.
track_rmse <- track_rmse |>
  add_row(
    model = "OLS with NZV/Corr",
    rmse_trn = lin_nzv_trn_rmse,
    rmse_test = lin_nzv_test_rmse,
    n_features = lin_nzv_n_feat
  )
track_rmse# A tibble: 2 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 19285. 25964. 265
2 OLS with NZV/Corr 26567. 28277. 105
Our model is still performing similarly in test, but now we get a more realistic idea of this performance in our training data. We have also cut our feature set down to 105 features! Note, however, that the output above still shows the rank-deficient warning for this model, so removing near-zero-variance and highly correlated features did not fully eliminate the linear dependence among our features. Our model also still has many features. As an aside, you don’t need to wait to use step_nzv() or step_corr() until you get those kinds of warnings; they are generally helpful preprocessing steps that you can use as desired/appropriate!
Now let’s see how regularization handles sparse data!
Part 2: Fitting a LASSO regression
Set up a hyperparameter grid
In the LASSO, the mixture hyperparameter (\(\alpha\)) will be set to 1, but we’ll need to tune the penalty hyperparameter (\(\lambda\)).
grid_penalty <- expand_grid(penalty = exp(seq(-6, 8, length.out = 20)))
Tune a LASSO regression
Use linear_reg(), set_engine("glmnet"), and tune_grid() to fit your LASSO models.
# Run the resampling in parallel background R sessions; the worker count is
# taken from parallel::detectCores(logical = FALSE).
# NOTE(review): detectCores() can return NA on some platforms -- confirm this
# is acceptable on the machines that run this script.
plan(multisession, workers = parallel::detectCores(logical = FALSE))
# Tune the LASSO (mixture = 1) penalty over grid_penalty using the bootstrap
# splits, scoring each configuration by RMSE. The result is cached by
# cache_rds() under cache/ and reused unless rerun_setting is TRUE.
# Comments are kept outside `expr` on purpose so the cached expression code is
# left untouched.
fits_lasso <- cache_rds(
expr = {
linear_reg(penalty = tune(),
mixture = 1) |>
set_engine("glmnet") |>
tune_grid(preprocessor = rec,
resamples = splits_boot,
grid = grid_penalty,
metrics = metric_set(rmse))
},
dir = "cache/",
file = "fits_lasso",
rerun = rerun_setting)
plan(sequential)
Plot performance in the validation sets by hyperparameter
Use the plot_hyperparameters() function in fun_ml.R or your own code.
plot_hyperparameters(fits_lasso, hp1 = "penalty", metric = "rmse")
Fit your best configuration in data_trn
Use select_best() to identify your best configuration (i.e., your best \(\lambda\) value), and then use it to fit a model in the full training set (data_trn).
# Show the best penalty value. The metric is named explicitly so that
# select_best() does not warn that no `metric` was given.
select_best(fits_lasso, metric = "rmse")
# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 1427. pre0_mod19_post0
# Look the best configuration up once, naming the metric explicitly, instead of
# calling select_best() inside the model specification. This avoids the
# "No value of `metric` was given" warning being raised repeatedly.
best_lasso <- select_best(fits_lasso, metric = "rmse")

# Fit the LASSO (mixture = 1) with the selected penalty on the training features
fit_lasso <- linear_reg(penalty = best_lasso$penalty,
                        mixture = 1) |>
  set_engine("glmnet") |>
  fit(sale_price ~ ., data = feat_trn)
Examine parameter estimates
# Print every coefficient row; suppressMessages() hides the message that
# appears when the required Matrix package is loaded
suppressMessages(
  fit_lasso |>
    tidy() |>
    print(n = Inf)
)
# A tibble: 286 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) 155754. 1427.
2 lot_frontage 0 1427.
3 lot_area 405. 1427.
4 overall_qual 16446. 1427.
5 overall_cond 3176. 1427.
6 year_built 5183. 1427.
7 year_remod_add 2007. 1427.
8 mas_vnr_area 3395. 1427.
9 bsmt_fin_sf_1 2147. 1427.
10 bsmt_fin_sf_2 0 1427.
11 bsmt_unf_sf 0 1427.
12 total_bsmt_sf 1743. 1427.
13 x1st_flr_sf 2429. 1427.
14 x2nd_flr_sf 0 1427.
15 low_qual_fin_sf -459. 1427.
16 gr_liv_area 20534. 1427.
17 bsmt_full_bath 3395. 1427.
18 bsmt_half_bath 0 1427.
19 full_bath 896. 1427.
20 half_bath 0 1427.
21 bedroom_abv_gr 0 1427.
22 kitchen_abv_gr 0 1427.
23 tot_rms_abv_grd 0 1427.
24 fireplaces 3485. 1427.
25 garage_yr_blt 1347. 1427.
26 garage_cars 4251. 1427.
27 garage_area 1868. 1427.
28 wood_deck_sf 2591. 1427.
29 open_porch_sf 0 1427.
30 enclosed_porch 0 1427.
31 x3ssn_porch 0 1427.
32 screen_porch 0 1427.
33 pool_area 0 1427.
34 misc_val 0 1427.
35 mo_sold 0 1427.
36 yr_sold 0 1427.
37 ms_sub_class_x030 0 1427.
38 ms_sub_class_x040 0 1427.
39 ms_sub_class_x045 0 1427.
40 ms_sub_class_x050 0 1427.
41 ms_sub_class_x060 0 1427.
42 ms_sub_class_x070 0 1427.
43 ms_sub_class_x075 0 1427.
44 ms_sub_class_x080 0 1427.
45 ms_sub_class_x085 0 1427.
46 ms_sub_class_x090 0 1427.
47 ms_sub_class_x120 0 1427.
48 ms_sub_class_x150 0 1427.
49 ms_sub_class_x160 0 1427.
50 ms_sub_class_x180 0 1427.
51 ms_sub_class_x190 0 1427.
52 ms_zoning_c 0 1427.
53 ms_zoning_fv 0 1427.
54 ms_zoning_i 0 1427.
55 ms_zoning_rh 0 1427.
56 ms_zoning_rl 0 1427.
57 ms_zoning_rm -3292. 1427.
58 street_pave 0 1427.
59 alley_none 0 1427.
60 alley_pave 0 1427.
61 lot_shape_ir1 0 1427.
62 lot_shape_ir2 382. 1427.
63 lot_shape_ir3 -29460. 1427.
64 land_contour_hls 10315. 1427.
65 land_contour_low 0 1427.
66 land_contour_lvl 0 1427.
67 utilities_no_sewr 0 1427.
68 lot_config_cul_d_sac 4764. 1427.
69 lot_config_fr2 0 1427.
70 lot_config_fr3 0 1427.
71 lot_config_inside 0 1427.
72 land_slope_mod 0 1427.
73 land_slope_sev 0 1427.
74 neighborhood_blueste 0 1427.
75 neighborhood_br_dale 0 1427.
76 neighborhood_brk_side 0 1427.
77 neighborhood_clear_cr 0 1427.
78 neighborhood_collg_cr 0 1427.
79 neighborhood_crawfor 15277. 1427.
80 neighborhood_edwards -3135. 1427.
81 neighborhood_gilbert 0 1427.
82 neighborhood_greens 0 1427.
83 neighborhood_grn_hill 21477. 1427.
84 neighborhood_idotrr 0 1427.
85 neighborhood_landmrk 0 1427.
86 neighborhood_meadow_v 0 1427.
87 neighborhood_mitchel 0 1427.
88 neighborhood_n_ames 0 1427.
89 neighborhood_no_ridge 38049. 1427.
90 neighborhood_n_pk_vill 0 1427.
91 neighborhood_nridg_ht 21747. 1427.
92 neighborhood_nw_ames 0 1427.
93 neighborhood_old_town 0 1427.
94 neighborhood_sawyer 0 1427.
95 neighborhood_sawyer_w 0 1427.
96 neighborhood_somerst 6169. 1427.
97 neighborhood_stone_br 32297. 1427.
98 neighborhood_swisu 0 1427.
99 neighborhood_timber 0 1427.
100 neighborhood_veenker 0 1427.
101 condition_1_feedr -720. 1427.
102 condition_1_norm 6991. 1427.
103 condition_1_pos_a 0 1427.
104 condition_1_pos_n 0 1427.
105 condition_1_rr_ae 0 1427.
106 condition_1_rr_an 0 1427.
107 condition_1_rr_ne 0 1427.
108 condition_1_rr_nn 0 1427.
109 condition_2_feedr 0 1427.
110 condition_2_norm 0 1427.
111 condition_2_pos_a 6675. 1427.
112 condition_2_pos_n -79108. 1427.
113 condition_2_rr_nn 0 1427.
114 bldg_type_one_fam 16339. 1427.
115 bldg_type_twhs_ext 0 1427.
116 bldg_type_twhs_int 0 1427.
117 bldg_type_two_fam 0 1427.
118 house_style_s_lvl 0 1427.
119 house_style_x1_5fin 0 1427.
120 house_style_x1_5unf 0 1427.
121 house_style_x1story 7.02 1427.
122 house_style_x2_5fin 0 1427.
123 house_style_x2_5unf 0 1427.
124 house_style_x2story 0 1427.
125 roof_style_gable -770. 1427.
126 roof_style_gambrel 0 1427.
127 roof_style_hip 0 1427.
128 roof_style_mansard 0 1427.
129 roof_style_shed 0 1427.
130 roof_matl_comp_shg 0 1427.
131 roof_matl_membran 0 1427.
132 roof_matl_metal 0 1427.
133 roof_matl_tar_grv 0 1427.
134 roof_matl_wd_shake 0 1427.
135 roof_matl_wd_shngl 0 1427.
136 exterior_1st_asph_shn 0 1427.
137 exterior_1st_brk_comm 0 1427.
138 exterior_1st_brk_face 11288. 1427.
139 exterior_1st_c_block 0 1427.
140 exterior_1st_cemnt_bd 0 1427.
141 exterior_1st_hd_board 0 1427.
142 exterior_1st_metal_sd 0 1427.
143 exterior_1st_plywood 0 1427.
144 exterior_1st_pre_cast 31009. 1427.
145 exterior_1st_stone 0 1427.
146 exterior_1st_stucco -14690. 1427.
147 exterior_1st_vinyl_sd 0 1427.
148 exterior_1st_wd_sdng 0 1427.
149 exterior_1st_wd_shing 0 1427.
150 exterior_2nd_asph_shn 0 1427.
151 exterior_2nd_brk_cmn 0 1427.
152 exterior_2nd_brk_face 0 1427.
153 exterior_2nd_c_block 0 1427.
154 exterior_2nd_cment_bd 1589. 1427.
155 exterior_2nd_hd_board 0 1427.
156 exterior_2nd_im_stucc 7986. 1427.
157 exterior_2nd_metal_sd 0 1427.
158 exterior_2nd_other 0 1427.
159 exterior_2nd_plywood 0 1427.
160 exterior_2nd_pre_cast 21.0 1427.
161 exterior_2nd_stone 0 1427.
162 exterior_2nd_stucco 0 1427.
163 exterior_2nd_vinyl_sd 0 1427.
164 exterior_2nd_wd_sdng 0 1427.
165 exterior_2nd_wd_shng 0 1427.
166 mas_vnr_type_brk_face 0 1427.
167 mas_vnr_type_c_block -789. 1427.
168 mas_vnr_type_none 0 1427.
169 mas_vnr_type_stone 0 1427.
170 exter_qual_fa 0 1427.
171 exter_qual_ta -5654. 1427.
172 exter_qual_gd 0 1427.
173 exter_qual_ex 794. 1427.
174 exter_cond_fa 0 1427.
175 exter_cond_ta 0 1427.
176 exter_cond_gd 0 1427.
177 exter_cond_ex 0 1427.
178 foundation_c_block 0 1427.
179 foundation_p_conc 0 1427.
180 foundation_slab 0 1427.
181 foundation_stone 0 1427.
182 foundation_wood 0 1427.
183 bsmt_qual_po 0 1427.
184 bsmt_qual_fa 0 1427.
185 bsmt_qual_ta 0 1427.
186 bsmt_qual_gd 0 1427.
187 bsmt_qual_ex 23299. 1427.
188 bsmt_cond_po 0 1427.
189 bsmt_cond_fa 0 1427.
190 bsmt_cond_ta 0 1427.
191 bsmt_cond_gd 0 1427.
192 bsmt_cond_ex 0 1427.
193 bsmt_exposure_gd 11743. 1427.
194 bsmt_exposure_mn 0 1427.
195 bsmt_exposure_no -3417. 1427.
196 bsmt_exposure_none 0 1427.
197 bsmt_fin_type_1_blq 0 1427.
198 bsmt_fin_type_1_glq 3466. 1427.
199 bsmt_fin_type_1_lw_q 0 1427.
200 bsmt_fin_type_1_none 0 1427.
201 bsmt_fin_type_1_rec 0 1427.
202 bsmt_fin_type_1_unf -186. 1427.
203 bsmt_fin_type_2_blq 0 1427.
204 bsmt_fin_type_2_glq 2267. 1427.
205 bsmt_fin_type_2_lw_q 0 1427.
206 bsmt_fin_type_2_none 0 1427.
207 bsmt_fin_type_2_rec 0 1427.
208 bsmt_fin_type_2_unf 0 1427.
209 heating_gas_a 0 1427.
210 heating_gas_w 0 1427.
211 heating_grav 0 1427.
212 heating_oth_w 0 1427.
213 heating_wall 0 1427.
214 heating_qc_fa 0 1427.
215 heating_qc_ta 0 1427.
216 heating_qc_gd 0 1427.
217 heating_qc_ex 2431. 1427.
218 central_air_y 731. 1427.
219 electrical_fuse_f 0 1427.
220 electrical_fuse_p 0 1427.
221 electrical_mix 0 1427.
222 electrical_s_brkr 0 1427.
223 kitchen_qual_fa 0 1427.
224 kitchen_qual_ta -1664. 1427.
225 kitchen_qual_gd 0 1427.
226 kitchen_qual_ex 26994. 1427.
227 functional_min1 0 1427.
228 functional_min2 0 1427.
229 functional_mod 0 1427.
230 functional_maj1 0 1427.
231 functional_maj2 0 1427.
232 functional_sev -21475. 1427.
233 functional_sal -2795. 1427.
234 fireplace_qu_po 0 1427.
235 fireplace_qu_fa 0 1427.
236 fireplace_qu_ta 0 1427.
237 fireplace_qu_gd 0 1427.
238 fireplace_qu_ex 1540. 1427.
239 garage_type_basment 0 1427.
240 garage_type_built_in 0 1427.
241 garage_type_car_port 0 1427.
242 garage_type_detchd 0 1427.
243 garage_type_none 0 1427.
244 garage_type_x2types 0 1427.
245 garage_finish_none 0 1427.
246 garage_finish_r_fn 0 1427.
247 garage_finish_unf 0 1427.
248 garage_qual_po 0 1427.
249 garage_qual_fa 0 1427.
250 garage_qual_ta 0 1427.
251 garage_qual_gd 0 1427.
252 garage_qual_ex 0 1427.
253 garage_cond_fa 0 1427.
254 garage_cond_gd 0 1427.
255 garage_cond_none 0 1427.
256 garage_cond_po 0 1427.
257 garage_cond_ta 0 1427.
258 paved_drive_p 0 1427.
259 paved_drive_y 0 1427.
260 pool_qc_po 0 1427.
261 pool_qc_fa 0 1427.
262 pool_qc_ta 0 1427.
263 pool_qc_gd -74783. 1427.
264 pool_qc_ex 57865. 1427.
265 fence_gd_wo 0 1427.
266 fence_mn_prv 0 1427.
267 fence_mn_ww 0 1427.
268 fence_none 0 1427.
269 misc_feature_none 0 1427.
270 misc_feature_othr 0 1427.
271 misc_feature_shed 0 1427.
272 misc_feature_ten_c 0 1427.
273 sale_type_con 0 1427.
274 sale_type_con_ld 0 1427.
275 sale_type_con_li 0 1427.
276 sale_type_con_lw 0 1427.
277 sale_type_cwd 0 1427.
278 sale_type_new 12688. 1427.
279 sale_type_oth 0 1427.
280 sale_type_vwd 0 1427.
281 sale_type_wd 0 1427.
282 sale_condition_adj_land 0 1427.
283 sale_condition_alloca 0 1427.
284 sale_condition_family 0 1427.
285 sale_condition_normal 0 1427.
286 sale_condition_partial 16.5 1427.
Get RMSE in train & test
Use rmse_vec() to get error in feat_trn
# RMSE of the LASSO predictions against observed sale_price in feat_trn
lasso_rmse_trn <- rmse_vec(
  truth = pull(feat_trn, sale_price),
  estimate = pull(predict(fit_lasso, feat_trn), .pred)
)
Use rmse_vec() to get error in feat_test
# RMSE of the LASSO predictions against observed sale_price in feat_test
lasso_rmse_test <- rmse_vec(
  truth = pull(feat_test, sale_price),
  estimate = pull(predict(fit_lasso, feat_test), .pred)
)
Get number of features
# Number of predictors the LASSO kept, i.e. rows with a non-zero estimate
# other than the intercept
lasso_n_feat <- tidy(fit_lasso) |>
  filter(term != "(Intercept)", estimate != 0) |>
  nrow()
Add to track_rmse
# Append the LASSO results to the running comparison table
track_rmse <- track_rmse |>
  add_row(
    model = "LASSO",
    rmse_trn = lasso_rmse_trn,
    rmse_test = lasso_rmse_test,
    n_features = lasso_n_feat
  )
track_rmse# A tibble: 3 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 19285. 25964. 265
2 OLS with NZV/Corr 26567. 28277. 105
3 LASSO 24967. 26860. 63
Performance is again about comparable in train and test. Also, LASSO didn’t really do any better than the OLS in test once we removed nzv and highly correlated features.
LASSO retained 63 features with non-zero parameter estimates. This is fewer than what we got with our recipe that eliminated near zero variance and highly correlated features (105) and far fewer features than our OLS model with 265 features (which we know is bad, but helpful here for comparison). We know that a simpler model is likely to have slightly more bias, but likely with a large reduction in variance.
Part 3: Fitting an Elastic Net regression
Set up a hyperparameter grid
Now we’ll need to tune both the mixture hyperparameter (\(\alpha\)) and the penalty hyperparameter (\(\lambda\)).
# Full crossing of 20 log-spaced penalty (lambda) values with three mixture
# (alpha) values: 0, 0.5 and 1
grid_glmnet <- expand_grid(
  penalty = exp(seq(from = -10, to = 11, length.out = 20)),
  mixture = seq(from = 0, to = 1, length.out = 3)
)
Tune an elasticnet regression
Use linear_reg(), set_engine("glmnet"), and tune_grid() to fit your elastic net models.
# Switch to a multisession plan with one worker per core reported by
# parallel::detectCores(logical = FALSE).
plan(multisession, workers = parallel::detectCores(logical = FALSE))
# Tune both penalty and mixture over grid_glmnet, using the recipe `rec` and
# the resamples in `splits_boot`, with RMSE as the metric.
# The tuning result is wrapped in cache_rds() with files under cache/;
# `rerun = rerun_setting` controls whether it is recomputed.
# NOTE(review): cache_rds() presumably also invalidates the cache when the
# code inside `expr` changes -- confirm before editing that expression.
fits_glmnet <- cache_rds(
expr = {
linear_reg(penalty = tune(),
mixture = tune()) |>
set_engine("glmnet") |>
tune_grid(preprocessor = rec,
resamples = splits_boot,
grid = grid_glmnet,
metrics = metric_set(rmse))
},
dir = "cache/",
file = "fits_glmnet",
rerun = rerun_setting)
plan(sequential)Plot performance in the validation sets by hyperparameter
Use the plot_hyperparameters() function or your own code.
plot_hyperparameters(fits_glmnet,
hp1 = "penalty", hp2 = "mixture",
metric = "rmse")Fit your best configuration in training data
Use select_best() to identify your best configuration (i.e., your best combination of \(\alpha\) & \(\lambda\) values), and then use it to fit a model in the full training set (data_trn).
# Show the best penalty/mixture combination. The metric is named explicitly so
# that select_best() does not warn that no `metric` was given.
select_best(fits_glmnet, metric = "rmse")
# A tibble: 1 × 3
penalty mixture .config
<dbl> <dbl> <chr>
1 2174. 0.5 pre0_mod50_post0
# Look the best configuration up once, naming the metric explicitly, instead of
# calling select_best() twice inside the model specification. This avoids the
# "No value of `metric` was given" warning being raised repeatedly.
best_glmnet <- select_best(fits_glmnet, metric = "rmse")

# Fit the elastic net with the selected penalty and mixture on the training
# features
fit_glmnet <- linear_reg(penalty = best_glmnet$penalty,
                         mixture = best_glmnet$mixture) |>
  set_engine("glmnet") |>
  fit(sale_price ~ ., data = feat_trn)
Examine parameter estimates
fit_glmnet |>
tidy() |>
print(n = Inf)# A tibble: 286 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) 148417. 2174.
2 lot_frontage 0 2174.
3 lot_area 594. 2174.
4 overall_qual 14968. 2174.
5 overall_cond 3644. 2174.
6 year_built 4825. 2174.
7 year_remod_add 1869. 2174.
8 mas_vnr_area 3681. 2174.
9 bsmt_fin_sf_1 2227. 2174.
10 bsmt_fin_sf_2 0 2174.
11 bsmt_unf_sf 0 2174.
12 total_bsmt_sf 2004. 2174.
13 x1st_flr_sf 2036. 2174.
14 x2nd_flr_sf 0 2174.
15 low_qual_fin_sf -765. 2174.
16 gr_liv_area 20152. 2174.
17 bsmt_full_bath 3457. 2174.
18 bsmt_half_bath 0 2174.
19 full_bath 1663. 2174.
20 half_bath 0 2174.
21 bedroom_abv_gr 0 2174.
22 kitchen_abv_gr -1.54 2174.
23 tot_rms_abv_grd 0 2174.
24 fireplaces 3559. 2174.
25 garage_yr_blt 1696. 2174.
26 garage_cars 4129. 2174.
27 garage_area 1988. 2174.
28 wood_deck_sf 2721. 2174.
29 open_porch_sf 0 2174.
30 enclosed_porch 0 2174.
31 x3ssn_porch 0 2174.
32 screen_porch 108. 2174.
33 pool_area 0 2174.
34 misc_val 0 2174.
35 mo_sold 0 2174.
36 yr_sold 0 2174.
37 ms_sub_class_x030 0 2174.
38 ms_sub_class_x040 0 2174.
39 ms_sub_class_x045 0 2174.
40 ms_sub_class_x050 0 2174.
41 ms_sub_class_x060 0 2174.
42 ms_sub_class_x070 0 2174.
43 ms_sub_class_x075 0 2174.
44 ms_sub_class_x080 0 2174.
45 ms_sub_class_x085 0 2174.
46 ms_sub_class_x090 0 2174.
47 ms_sub_class_x120 0 2174.
48 ms_sub_class_x150 0 2174.
49 ms_sub_class_x160 -147. 2174.
50 ms_sub_class_x180 0 2174.
51 ms_sub_class_x190 0 2174.
52 ms_zoning_c 0 2174.
53 ms_zoning_fv 0 2174.
54 ms_zoning_i 0 2174.
55 ms_zoning_rh 0 2174.
56 ms_zoning_rl 0 2174.
57 ms_zoning_rm -3734. 2174.
58 street_pave 4616. 2174.
59 alley_none 0 2174.
60 alley_pave 0 2174.
61 lot_shape_ir1 0 2174.
62 lot_shape_ir2 1800. 2174.
63 lot_shape_ir3 -31308. 2174.
64 land_contour_hls 11063. 2174.
65 land_contour_low 0 2174.
66 land_contour_lvl 0 2174.
67 utilities_no_sewr 0 2174.
68 lot_config_cul_d_sac 5489. 2174.
69 lot_config_fr2 0 2174.
70 lot_config_fr3 0 2174.
71 lot_config_inside 0 2174.
72 land_slope_mod 0 2174.
73 land_slope_sev 0 2174.
74 neighborhood_blueste 0 2174.
75 neighborhood_br_dale 0 2174.
76 neighborhood_brk_side 0 2174.
77 neighborhood_clear_cr 0 2174.
78 neighborhood_collg_cr 0 2174.
79 neighborhood_crawfor 16776. 2174.
80 neighborhood_edwards -4403. 2174.
81 neighborhood_gilbert 0 2174.
82 neighborhood_greens 0 2174.
83 neighborhood_grn_hill 35635. 2174.
84 neighborhood_idotrr -158. 2174.
85 neighborhood_landmrk 0 2174.
86 neighborhood_meadow_v 0 2174.
87 neighborhood_mitchel 0 2174.
88 neighborhood_n_ames 0 2174.
89 neighborhood_no_ridge 39510. 2174.
90 neighborhood_n_pk_vill 0 2174.
91 neighborhood_nridg_ht 23702. 2174.
92 neighborhood_nw_ames 0 2174.
93 neighborhood_old_town 0 2174.
94 neighborhood_sawyer 0 2174.
95 neighborhood_sawyer_w 0 2174.
96 neighborhood_somerst 8400. 2174.
97 neighborhood_stone_br 35257. 2174.
98 neighborhood_swisu 0 2174.
99 neighborhood_timber 0 2174.
100 neighborhood_veenker 0 2174.
101 condition_1_feedr -1843. 2174.
102 condition_1_norm 7171. 2174.
103 condition_1_pos_a 0 2174.
104 condition_1_pos_n 0 2174.
105 condition_1_rr_ae 0 2174.
106 condition_1_rr_an 0 2174.
107 condition_1_rr_ne 0 2174.
108 condition_1_rr_nn 0 2174.
109 condition_2_feedr 0 2174.
110 condition_2_norm 0 2174.
111 condition_2_pos_a 13822. 2174.
112 condition_2_pos_n -87122. 2174.
113 condition_2_rr_nn 0 2174.
114 bldg_type_one_fam 17222. 2174.
115 bldg_type_twhs_ext 0 2174.
116 bldg_type_twhs_int -287. 2174.
117 bldg_type_two_fam 0 2174.
118 house_style_s_lvl 0 2174.
119 house_style_x1_5fin 0 2174.
120 house_style_x1_5unf 0 2174.
121 house_style_x1story 131. 2174.
122 house_style_x2_5fin 0 2174.
123 house_style_x2_5unf 0 2174.
124 house_style_x2story 0 2174.
125 roof_style_gable -1429. 2174.
126 roof_style_gambrel 0 2174.
127 roof_style_hip 0 2174.
128 roof_style_mansard 0 2174.
129 roof_style_shed 0 2174.
130 roof_matl_comp_shg 1336. 2174.
131 roof_matl_membran 0 2174.
132 roof_matl_metal 0 2174.
133 roof_matl_tar_grv 5820. 2174.
134 roof_matl_wd_shake 0 2174.
135 roof_matl_wd_shngl 0 2174.
136 exterior_1st_asph_shn 0 2174.
137 exterior_1st_brk_comm 0 2174.
138 exterior_1st_brk_face 12950. 2174.
139 exterior_1st_c_block 0 2174.
140 exterior_1st_cemnt_bd 0 2174.
141 exterior_1st_hd_board -55.8 2174.
142 exterior_1st_metal_sd 0 2174.
143 exterior_1st_plywood 0 2174.
144 exterior_1st_pre_cast 22265. 2174.
145 exterior_1st_stone 0 2174.
146 exterior_1st_stucco -16495. 2174.
147 exterior_1st_vinyl_sd 0 2174.
148 exterior_1st_wd_sdng 0 2174.
149 exterior_1st_wd_shing 0 2174.
150 exterior_2nd_asph_shn 0 2174.
151 exterior_2nd_brk_cmn 0 2174.
152 exterior_2nd_brk_face 0 2174.
153 exterior_2nd_c_block 0 2174.
154 exterior_2nd_cment_bd 2678. 2174.
155 exterior_2nd_hd_board -361. 2174.
156 exterior_2nd_im_stucc 11372. 2174.
157 exterior_2nd_metal_sd 0 2174.
158 exterior_2nd_other 0 2174.
159 exterior_2nd_plywood 0 2174.
160 exterior_2nd_pre_cast 15355. 2174.
161 exterior_2nd_stone -75.8 2174.
162 exterior_2nd_stucco -827. 2174.
163 exterior_2nd_vinyl_sd 0 2174.
164 exterior_2nd_wd_sdng 0 2174.
165 exterior_2nd_wd_shng -259. 2174.
166 mas_vnr_type_brk_face 0 2174.
167 mas_vnr_type_c_block -12735. 2174.
168 mas_vnr_type_none 0 2174.
169 mas_vnr_type_stone 0 2174.
170 exter_qual_fa 0 2174.
171 exter_qual_ta -5698. 2174.
172 exter_qual_gd 0 2174.
173 exter_qual_ex 2145. 2174.
174 exter_cond_fa 0 2174.
175 exter_cond_ta 0 2174.
176 exter_cond_gd 0 2174.
177 exter_cond_ex 0 2174.
178 foundation_c_block 0 2174.
179 foundation_p_conc 0 2174.
180 foundation_slab -665. 2174.
181 foundation_stone 0 2174.
182 foundation_wood 0 2174.
183 bsmt_qual_po 0 2174.
184 bsmt_qual_fa 0 2174.
185 bsmt_qual_ta 0 2174.
186 bsmt_qual_gd 0 2174.
187 bsmt_qual_ex 22373. 2174.
188 bsmt_cond_po 0 2174.
189 bsmt_cond_fa 0 2174.
190 bsmt_cond_ta 0 2174.
191 bsmt_cond_gd 0 2174.
192 bsmt_cond_ex 0 2174.
193 bsmt_exposure_gd 12153. 2174.
194 bsmt_exposure_mn 0 2174.
195 bsmt_exposure_no -3765. 2174.
196 bsmt_exposure_none 0 2174.
197 bsmt_fin_type_1_blq 0 2174.
198 bsmt_fin_type_1_glq 4036. 2174.
199 bsmt_fin_type_1_lw_q 0 2174.
200 bsmt_fin_type_1_none 0 2174.
201 bsmt_fin_type_1_rec 0 2174.
202 bsmt_fin_type_1_unf -335. 2174.
203 bsmt_fin_type_2_blq 0 2174.
204 bsmt_fin_type_2_glq 5628. 2174.
205 bsmt_fin_type_2_lw_q 0 2174.
206 bsmt_fin_type_2_none 0 2174.
207 bsmt_fin_type_2_rec 0 2174.
208 bsmt_fin_type_2_unf 0 2174.
209 heating_gas_a 0 2174.
210 heating_gas_w 0 2174.
211 heating_grav 0 2174.
212 heating_oth_w 0 2174.
213 heating_wall 0 2174.
214 heating_qc_fa 0 2174.
215 heating_qc_ta 0 2174.
216 heating_qc_gd 0 2174.
217 heating_qc_ex 2567. 2174.
218 central_air_y 1336. 2174.
219 electrical_fuse_f 0 2174.
220 electrical_fuse_p 0 2174.
221 electrical_mix 0 2174.
222 electrical_s_brkr 0 2174.
223 kitchen_qual_fa 0 2174.
224 kitchen_qual_ta -1744. 2174.
225 kitchen_qual_gd 0 2174.
226 kitchen_qual_ex 26943. 2174.
227 functional_min1 -188. 2174.
228 functional_min2 -1081. 2174.
229 functional_mod 0 2174.
230 functional_maj1 0 2174.
231 functional_maj2 0 2174.
232 functional_sev -30817. 2174.
233 functional_sal -12338. 2174.
234 fireplace_qu_po 0 2174.
235 fireplace_qu_fa -427. 2174.
236 fireplace_qu_ta 0 2174.
237 fireplace_qu_gd 584. 2174.
238 fireplace_qu_ex 3850. 2174.
239 garage_type_basment -721. 2174.
240 garage_type_built_in 0 2174.
241 garage_type_car_port 0 2174.
242 garage_type_detchd 0 2174.
243 garage_type_none 0 2174.
244 garage_type_x2types 0 2174.
245 garage_finish_none 0 2174.
246 garage_finish_r_fn 0 2174.
247 garage_finish_unf 0 2174.
248 garage_qual_po 0 2174.
249 garage_qual_fa 0 2174.
250 garage_qual_ta 0 2174.
251 garage_qual_gd 0 2174.
252 garage_qual_ex 0 2174.
253 garage_cond_fa 0 2174.
254 garage_cond_gd 0 2174.
255 garage_cond_none 0 2174.
256 garage_cond_po 0 2174.
257 garage_cond_ta 0 2174.
258 paved_drive_p 0 2174.
259 paved_drive_y 0 2174.
260 pool_qc_po 0 2174.
261 pool_qc_fa 0 2174.
262 pool_qc_ta 0 2174.
263 pool_qc_gd -80053. 2174.
264 pool_qc_ex 66967. 2174.
265 fence_gd_wo 0 2174.
266 fence_mn_prv 0 2174.
267 fence_mn_ww 0 2174.
268 fence_none 0 2174.
269 misc_feature_none 0 2174.
270 misc_feature_othr 0 2174.
271 misc_feature_shed 0 2174.
272 misc_feature_ten_c 0 2174.
273 sale_type_con 0 2174.
274 sale_type_con_ld 0 2174.
275 sale_type_con_li 0 2174.
276 sale_type_con_lw 0 2174.
277 sale_type_cwd 0 2174.
278 sale_type_new 9744. 2174.
279 sale_type_oth 0 2174.
280 sale_type_vwd 0 2174.
281 sale_type_wd 0 2174.
282 sale_condition_adj_land 0 2174.
283 sale_condition_alloca 0 2174.
284 sale_condition_family 0 2174.
285 sale_condition_normal 0 2174.
286 sale_condition_partial 3870. 2174.
Get RMSE in train & test
Use rmse_vec() to get error in feat_trn
# RMSE of the elastic net predictions against observed sale_price in feat_trn
glmnet_rmse_trn <- rmse_vec(
  truth = pull(feat_trn, sale_price),
  estimate = pull(predict(fit_glmnet, feat_trn), .pred)
)
Warning in select_best(fits_glmnet): No value of `metric` was given; "rmse"
will be used.
Use rmse_vec() to get error in feat_test
# RMSE of the elastic net predictions against observed sale_price in feat_test
glmnet_rmse_test <- rmse_vec(
  truth = pull(feat_test, sale_price),
  estimate = pull(predict(fit_glmnet, feat_test), .pred)
)
Warning in select_best(fits_glmnet): No value of `metric` was given; "rmse"
will be used.
Get number of features
# Number of predictors the elastic net kept, i.e. rows with a non-zero estimate
# other than the intercept
glmnet_n_feat <- tidy(fit_glmnet) |>
  filter(term != "(Intercept)", estimate != 0) |>
  nrow()
Add to track_rmse
# Append the elastic net results to the running comparison table
track_rmse <- track_rmse |>
  add_row(
    model = "GLMnet",
    rmse_trn = glmnet_rmse_trn,
    rmse_test = glmnet_rmse_test,
    n_features = glmnet_n_feat
  )
track_rmse# A tibble: 4 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 19285. 25964. 265
2 OLS with NZV/Corr 26567. 28277. 105
3 LASSO 24967. 26860. 63
4 GLMnet 24368. 26435. 82
The elastic net performs about comparably in training and test too.
There were 82 features retained in the elasticnet model. This is more features than were retained in the LASSO, but fewer features than were retained using step_nzv() and step_corr().
The OLS model with all features was descriptively the most overfit: it showed by far the largest gap between training and test RMSE, even though its test RMSE was not the worst in the table. However, the various methods to reduce overfitting didn’t help all that much in test. Maybe a little. It may be that none of these models are very overfit because the sample size remains quite large relative to the number of features.
Save & render
Save this .qmd file with your last name at the end (e.g., hw_unit_6_regularization_wyant). Make sure you changed “Your name here” at the top of the file to be your own name. Render the file to .html, and upload the rendered file to Canvas.
Way to go!!