# Turn on R's conflict checking using the "depends.ok" policy.
options(conflicts.policy = "depends.ok")
# Source the lab's shared modeling helper script from GitHub.
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
# NOTE(review): presumably defined by the fun_ml.R script sourced above -- confirm.
tidymodels_conflictRules()
# We also will need to resolve a new conflict using the following code
conflictRules("Matrix", mask.ok = c("expand", "pack", "unpack"))
Homework Unit 6: Regularization and Penalized Models
Introduction
This file serves as the answer key for the Unit_06 homework. Unit 6 Regularization and Penalized Models in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how to tune two regularization hyperparameters (\(\alpha\) and \(\lambda\)) and select among model configurations using resampling methods.
Setup
Handle conflicts
Load required packages
library(tidyverse)
library(tidymodels)
library(xfun, include.only = "cache_rds")
Source function scripts (John’s or your own)
# Source the lab's plotting and EDA helper scripts from GitHub.
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
Specify other global settings
Since we are going to use cache_rds()
, we are also going to include rerun_setting <- FALSE
in this chunk
# Use the classic ggplot2 theme for every plot in this document.
theme_set(theme_classic())
# Print all tibble columns and do not truncate printed rows.
options(tibble.width = Inf, dplyr.print_max = Inf)
# Passed as `rerun =` to cache_rds(). FALSE reuses cached fits when they
# exist; set to TRUE temporarily to force the models to be refit.
rerun_setting <- FALSE
Paths
# Directory (resolved later with here::here()) that holds the data file.
path_data <- "homework/unit_06"
Set up parallel processing
Note you can type cl
into your console to see how many cores your computer has.
# Start a PSOCK cluster with one worker per physical core and register it as
# the parallel backend.
# NOTE(review): no parallel::stopCluster(cl) is visible in this part of the
# file -- confirm the cluster is released at the end of the script.
cl <- parallel::makePSOCKcluster(parallel::detectCores(logical = FALSE))
doParallel::registerDoParallel(cl)
Read in data
Read in the ames_full_cln.csv
data file
# Read the cleaned Ames data; col_types = cols() lets readr guess column types
# without printing the column specification message.
data_all <- read_csv(here::here(path_data, "ames_full_cln.csv"),
                     col_types = cols()) |>
  glimpse()
Rows: 1,955
Columns: 81
$ pid <chr> "x0526301100", "x0526350040", "x0526351010", "x0527105…
$ ms_sub_class <chr> "x020", "x020", "x020", "x060", "x120", "x120", "x120"…
$ ms_zoning <chr> "rl", "rh", "rl", "rl", "rl", "rl", "rl", "rl", "rl", …
$ lot_frontage <dbl> 141, 80, 81, 74, 41, 43, 39, 60, 75, 63, 85, NA, 47, 1…
$ lot_area <dbl> 31770, 11622, 14267, 13830, 4920, 5005, 5389, 7500, 10…
$ street <chr> "pave", "pave", "pave", "pave", "pave", "pave", "pave"…
$ alley <chr> "none", "none", "none", "none", "none", "none", "none"…
$ lot_shape <chr> "ir1", "reg", "ir1", "ir1", "reg", "ir1", "ir1", "reg"…
$ land_contour <chr> "lvl", "lvl", "lvl", "lvl", "lvl", "hls", "lvl", "lvl"…
$ utilities <chr> "all_pub", "all_pub", "all_pub", "all_pub", "all_pub",…
$ lot_config <chr> "corner", "inside", "corner", "inside", "inside", "ins…
$ land_slope <chr> "gtl", "gtl", "gtl", "gtl", "gtl", "gtl", "gtl", "gtl"…
$ neighborhood <chr> "n_ames", "n_ames", "n_ames", "gilbert", "stone_br", "…
$ condition_1 <chr> "norm", "feedr", "norm", "norm", "norm", "norm", "norm…
$ condition_2 <chr> "norm", "norm", "norm", "norm", "norm", "norm", "norm"…
$ bldg_type <chr> "one_fam", "one_fam", "one_fam", "one_fam", "twhs_ext"…
$ house_style <chr> "x1story", "x1story", "x1story", "x2story", "x1story",…
$ overall_qual <dbl> 6, 5, 6, 5, 8, 8, 8, 7, 6, 6, 7, 8, 8, 8, 9, 4, 6, 6, …
$ overall_cond <dbl> 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 2, 5, 6, 6, …
$ year_built <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ year_remod_add <dbl> 1960, 1961, 1958, 1998, 2001, 1992, 1996, 1999, 1994, …
$ roof_style <chr> "hip", "gable", "hip", "gable", "gable", "gable", "gab…
$ roof_matl <chr> "comp_shg", "comp_shg", "comp_shg", "comp_shg", "comp_…
$ exterior_1st <chr> "brk_face", "vinyl_sd", "wd_sdng", "vinyl_sd", "cemnt_…
$ exterior_2nd <chr> "plywood", "vinyl_sd", "wd_sdng", "vinyl_sd", "cment_b…
$ mas_vnr_type <chr> "stone", "none", "brk_face", "none", "none", "none", "…
$ mas_vnr_area <dbl> 112, 0, 108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603, 0, 350, 0…
$ exter_qual <chr> "ta", "ta", "ta", "ta", "gd", "gd", "gd", "ta", "ta", …
$ exter_cond <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ foundation <chr> "c_block", "c_block", "c_block", "p_conc", "p_conc", "…
$ bsmt_qual <chr> "ta", "ta", "ta", "gd", "gd", "gd", "gd", "ta", "gd", …
$ bsmt_cond <chr> "gd", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ bsmt_exposure <chr> "gd", "no", "no", "no", "mn", "no", "no", "no", "no", …
$ bsmt_fin_type_1 <chr> "blq", "rec", "alq", "glq", "glq", "alq", "glq", "unf"…
$ bsmt_fin_sf_1 <dbl> 639, 468, 923, 791, 616, 263, 1180, 0, 0, 0, 637, 368,…
$ bsmt_fin_type_2 <chr> "unf", "lw_q", "unf", "unf", "unf", "unf", "unf", "unf…
$ bsmt_fin_sf_2 <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0, 0, 0, 1…
$ bsmt_unf_sf <dbl> 441, 270, 406, 137, 722, 1017, 415, 994, 763, 789, 663…
$ total_bsmt_sf <dbl> 1080, 882, 1329, 928, 1338, 1280, 1595, 994, 763, 789,…
$ heating <chr> "gas_a", "gas_a", "gas_a", "gas_a", "gas_a", "gas_a", …
$ heating_qc <chr> "fa", "ta", "ta", "gd", "ex", "ex", "ex", "gd", "gd", …
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ electrical <chr> "s_brkr", "s_brkr", "s_brkr", "s_brkr", "s_brkr", "s_b…
$ x1st_flr_sf <dbl> 1656, 896, 1329, 928, 1338, 1280, 1616, 1028, 763, 789…
$ x2nd_flr_sf <dbl> 0, 0, 0, 701, 0, 0, 0, 776, 892, 676, 0, 0, 1589, 672,…
$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gr_liv_area <dbl> 1656, 896, 1329, 1629, 1338, 1280, 1616, 1804, 1655, 1…
$ bsmt_full_bath <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
$ bsmt_half_bath <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ full_bath <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, 1, 2, 2, …
$ half_bath <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
$ bedroom_abv_gr <dbl> 3, 2, 3, 3, 2, 2, 2, 3, 3, 3, 2, 1, 4, 4, 1, 2, 3, 3, …
$ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ kitchen_qual <chr> "ta", "ta", "gd", "ta", "gd", "gd", "gd", "gd", "ta", …
$ tot_rms_abv_grd <dbl> 7, 5, 6, 6, 6, 5, 5, 7, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7,…
$ functional <chr> "typ", "typ", "typ", "typ", "typ", "typ", "typ", "typ"…
$ fireplaces <dbl> 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, …
$ fireplace_qu <chr> "gd", "none", "none", "ta", "none", "none", "ta", "ta"…
$ garage_type <chr> "attchd", "attchd", "attchd", "attchd", "attchd", "att…
$ garage_yr_blt <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ garage_finish <chr> "fin", "unf", "unf", "fin", "fin", "r_fn", "r_fn", "fi…
$ garage_cars <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, …
$ garage_area <dbl> 528, 730, 312, 482, 582, 506, 608, 442, 440, 393, 506,…
$ garage_qual <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ garage_cond <chr> "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", "ta", …
$ paved_drive <chr> "p", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",…
$ wood_deck_sf <dbl> 210, 140, 393, 212, 0, 0, 237, 140, 157, 0, 192, 0, 50…
$ open_porch_sf <dbl> 62, 0, 36, 34, 0, 82, 152, 60, 84, 75, 0, 54, 36, 12, …
$ enclosed_porch <dbl> 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ x3ssn_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ screen_porch <dbl> 0, 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 140, 210, 0, 0, 0…
$ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pool_qc <chr> "none", "none", "none", "none", "none", "none", "none"…
$ fence <chr> "none", "mn_prv", "none", "mn_prv", "none", "none", "n…
$ misc_feature <chr> "none", "none", "gar2", "none", "none", "none", "none"…
$ misc_val <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mo_sold <dbl> 5, 6, 6, 3, 4, 1, 3, 6, 4, 5, 2, 6, 6, 6, 6, 6, 2, 1, …
$ yr_sold <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ sale_type <chr> "wd", "wd", "wd", "wd", "wd", "wd", "wd", "wd", "wd", …
$ sale_condition <chr> "normal", "normal", "normal", "normal", "normal", "nor…
$ sale_price <dbl> 215000, 105000, 172000, 189900, 213500, 191500, 236500…
Set variable classes
Set all variables to factor or numeric classes. We will also explicitly set factor levels for those with low frequency count levels (e.g., neighborhood
, ms_sub_class
).
Next we will use fct_relevel
to order levels in our ordinal variables. We know which variables are ordinal by looking at the data dictionary!
You might have gotten a warning that bsmt_qual
contains no values of the po level. We still want to keep this as a possible level so we will resolve this warning by first using fct_expand()
!
# Ordered quality levels (worst to best) shared by several ordinal variables.
levels_qu <- c("none", "po", "fa", "ta", "gd", "ex")

# Convert every character column to a factor and put ordinal levels in order.
data_all <- data_all |>
  mutate(neighborhood = factor(neighborhood,
                               levels = c("blmngtn", "blueste", "br_dale", "brk_side",
                                          "clear_cr", "collg_cr", "crawfor", "edwards",
                                          "gilbert", "greens", "grn_hill", "idotrr",
                                          "landmrk", "meadow_v", "mitchel", "n_ames",
                                          "no_ridge", "n_pk_vill", "nridg_ht", "nw_ames",
                                          "old_town", "sawyer", "sawyer_w", "somerst",
                                          "stone_br", "swisu", "timber", "veenker")),
         # Levels must carry the "x" prefix used in the cleaned data (e.g.,
         # "x020"); without it no value matches and the whole column becomes NA.
         ms_sub_class = factor(ms_sub_class,
                               levels = c("x020", "x030", "x040", "x045", "x050", "x060",
                                          "x070", "x075", "x080", "x085", "x090", "x120",
                                          "x150", "x160", "x180", "x190")),
         across(where(is.character), as.factor),
         # fct_expand() adds "po" where it is not observed so that the
         # following fct_relevel() does not warn about an unknown level.
         bsmt_qual = fct_expand(bsmt_qual, "po"),
         bsmt_qual = fct_relevel(bsmt_qual, levels_qu),
         garage_qual = fct_relevel(garage_qual, levels_qu),
         fireplace_qu = fct_relevel(fireplace_qu, levels_qu),
         bsmt_cond = fct_relevel(bsmt_cond, levels_qu),
         exter_qual = fct_expand(exter_qual, "po"),
         exter_qual = fct_relevel(exter_qual, c("po", "fa", "ta", "gd", "ex")),
         exter_cond = fct_relevel(exter_cond, c("po", "fa", "ta", "gd", "ex")),
         pool_qc = fct_expand(pool_qc, "po"),
         pool_qc = fct_relevel(pool_qc, levels_qu),
         heating_qc = fct_relevel(heating_qc, c("po", "fa", "ta", "gd", "ex")),
         kitchen_qual = fct_relevel(kitchen_qual, c("po", "fa", "ta", "gd", "ex")),
         functional = fct_relevel(functional, c("typ", "min1", "min2", "mod", "maj1", "maj2", "sev", "sal")),
         lot_shape = fct_relevel(lot_shape, c("reg", "ir1", "ir2", "ir3")),
         land_slope = fct_relevel(land_slope, c("gtl", "mod", "sev"))) |>
  glimpse()
Rows: 1,955
Columns: 81
$ pid <fct> x0526301100, x0526350040, x0526351010, x0527105010, x0…
$ ms_sub_class <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ms_zoning <fct> rl, rh, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl, rl…
$ lot_frontage <dbl> 141, 80, 81, 74, 41, 43, 39, 60, 75, 63, 85, NA, 47, 1…
$ lot_area <dbl> 31770, 11622, 14267, 13830, 4920, 5005, 5389, 7500, 10…
$ street <fct> pave, pave, pave, pave, pave, pave, pave, pave, pave, …
$ alley <fct> none, none, none, none, none, none, none, none, none, …
$ lot_shape <fct> ir1, reg, ir1, ir1, reg, ir1, ir1, reg, ir1, ir1, reg,…
$ land_contour <fct> lvl, lvl, lvl, lvl, lvl, hls, lvl, lvl, lvl, lvl, lvl,…
$ utilities <fct> all_pub, all_pub, all_pub, all_pub, all_pub, all_pub, …
$ lot_config <fct> corner, inside, corner, inside, inside, inside, inside…
$ land_slope <fct> gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl, gtl,…
$ neighborhood <fct> n_ames, n_ames, n_ames, gilbert, stone_br, stone_br, s…
$ condition_1 <fct> norm, feedr, norm, norm, norm, norm, norm, norm, norm,…
$ condition_2 <fct> norm, norm, norm, norm, norm, norm, norm, norm, norm, …
$ bldg_type <fct> one_fam, one_fam, one_fam, one_fam, twhs_ext, twhs_ext…
$ house_style <fct> x1story, x1story, x1story, x2story, x1story, x1story, …
$ overall_qual <dbl> 6, 5, 6, 5, 8, 8, 8, 7, 6, 6, 7, 8, 8, 8, 9, 4, 6, 6, …
$ overall_cond <dbl> 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 2, 5, 6, 6, …
$ year_built <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ year_remod_add <dbl> 1960, 1961, 1958, 1998, 2001, 1992, 1996, 1999, 1994, …
$ roof_style <fct> hip, gable, hip, gable, gable, gable, gable, gable, ga…
$ roof_matl <fct> comp_shg, comp_shg, comp_shg, comp_shg, comp_shg, comp…
$ exterior_1st <fct> brk_face, vinyl_sd, wd_sdng, vinyl_sd, cemnt_bd, hd_bo…
$ exterior_2nd <fct> plywood, vinyl_sd, wd_sdng, vinyl_sd, cment_bd, hd_boa…
$ mas_vnr_type <fct> stone, none, brk_face, none, none, none, none, none, n…
$ mas_vnr_area <dbl> 112, 0, 108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603, 0, 350, 0…
$ exter_qual <fct> ta, ta, ta, ta, gd, gd, gd, ta, ta, ta, ta, gd, ex, gd…
$ exter_cond <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ foundation <fct> c_block, c_block, c_block, p_conc, p_conc, p_conc, p_c…
$ bsmt_qual <fct> ta, ta, ta, gd, gd, gd, gd, ta, gd, gd, gd, gd, gd, gd…
$ bsmt_cond <fct> gd, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ bsmt_exposure <fct> gd, no, no, no, mn, no, no, no, no, no, gd, av, gd, av…
$ bsmt_fin_type_1 <fct> blq, rec, alq, glq, glq, alq, glq, unf, unf, unf, glq,…
$ bsmt_fin_sf_1 <dbl> 639, 468, 923, 791, 616, 263, 1180, 0, 0, 0, 637, 368,…
$ bsmt_fin_type_2 <fct> unf, lw_q, unf, unf, unf, unf, unf, unf, unf, unf, unf…
$ bsmt_fin_sf_2 <dbl> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0, 0, 0, 1…
$ bsmt_unf_sf <dbl> 441, 270, 406, 137, 722, 1017, 415, 994, 763, 789, 663…
$ total_bsmt_sf <dbl> 1080, 882, 1329, 928, 1338, 1280, 1595, 994, 763, 789,…
$ heating <fct> gas_a, gas_a, gas_a, gas_a, gas_a, gas_a, gas_a, gas_a…
$ heating_qc <fct> fa, ta, ta, gd, ex, ex, ex, gd, gd, gd, gd, ta, ex, gd…
$ central_air <fct> y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, …
$ electrical <fct> s_brkr, s_brkr, s_brkr, s_brkr, s_brkr, s_brkr, s_brkr…
$ x1st_flr_sf <dbl> 1656, 896, 1329, 928, 1338, 1280, 1616, 1028, 763, 789…
$ x2nd_flr_sf <dbl> 0, 0, 0, 701, 0, 0, 0, 776, 892, 676, 0, 0, 1589, 672,…
$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gr_liv_area <dbl> 1656, 896, 1329, 1629, 1338, 1280, 1616, 1804, 1655, 1…
$ bsmt_full_bath <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, …
$ bsmt_half_bath <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ full_bath <dbl> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, 1, 2, 2, …
$ half_bath <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, …
$ bedroom_abv_gr <dbl> 3, 2, 3, 3, 2, 2, 2, 3, 3, 3, 2, 1, 4, 4, 1, 2, 3, 3, …
$ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ kitchen_qual <fct> ta, ta, gd, ta, gd, gd, gd, gd, ta, ta, gd, gd, ex, ta…
$ tot_rms_abv_grd <dbl> 7, 5, 6, 6, 6, 5, 5, 7, 7, 7, 5, 4, 12, 8, 8, 4, 7, 7,…
$ functional <fct> typ, typ, typ, typ, typ, typ, typ, typ, typ, typ, typ,…
$ fireplaces <dbl> 2, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, …
$ fireplace_qu <fct> gd, none, none, ta, none, none, ta, ta, ta, gd, po, no…
$ garage_type <fct> attchd, attchd, attchd, attchd, attchd, attchd, attchd…
$ garage_yr_blt <dbl> 1960, 1961, 1958, 1997, 2001, 1992, 1995, 1999, 1993, …
$ garage_finish <fct> fin, unf, unf, fin, fin, r_fn, r_fn, fin, fin, fin, un…
$ garage_cars <dbl> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, …
$ garage_area <dbl> 528, 730, 312, 482, 582, 506, 608, 442, 440, 393, 506,…
$ garage_qual <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ garage_cond <fct> ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta, ta…
$ paved_drive <fct> p, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, y, …
$ wood_deck_sf <dbl> 210, 140, 393, 212, 0, 0, 237, 140, 157, 0, 192, 0, 50…
$ open_porch_sf <dbl> 62, 0, 36, 34, 0, 82, 152, 60, 84, 75, 0, 54, 36, 12, …
$ enclosed_porch <dbl> 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ x3ssn_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ screen_porch <dbl> 0, 120, 0, 0, 0, 144, 0, 0, 0, 0, 0, 140, 210, 0, 0, 0…
$ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pool_qc <fct> none, none, none, none, none, none, none, none, none, …
$ fence <fct> none, mn_prv, none, mn_prv, none, none, none, none, no…
$ misc_feature <fct> none, none, gar2, none, none, none, none, none, none, …
$ misc_val <dbl> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mo_sold <dbl> 5, 6, 6, 3, 4, 1, 3, 6, 4, 5, 2, 6, 6, 6, 6, 6, 2, 1, …
$ yr_sold <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
$ sale_type <fct> wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd, wd…
$ sale_condition <fct> normal, normal, normal, normal, normal, normal, normal…
$ sale_price <dbl> 215000, 105000, 172000, 189900, 213500, 191500, 236500…
Set up splits
Divide data into train and test
Hold out 25% of the data as data_test
for evaluation using the initial_split()
function. Stratify on sale_price
.
# Seed the RNG so the train/test split is reproducible.
set.seed(102030)

# 75% of rows go to training, stratified on the outcome.
splits_test <- data_all |>
  initial_split(prop = 0.75, strata = "sale_price")

# Training portion of the split.
data_trn <- splits_test |>
  analysis()

# Held-out 25% used only for final evaluation.
data_test <- splits_test |>
  assessment()
Make splits
Create 100 bootstrap splits using data_trn
stratified on sale_price
.
# 100 bootstrap resamples of the training data, stratified on the outcome.
splits_boot <- data_trn |>
  bootstraps(times = 100, strata = "sale_price")
Build recipe
You will build one recipe that can be used across three model fits. Please follow these instructions to build your recipe:
- Regress the outcome sale_price on all predictors
- Remove the ID variable (pid) with step_rm()
- Use step_impute_median() to impute the median for any missing values in numeric features
- Use step_impute_mode() to impute the mode for any missing values in the factor features
- Use step_YeoJohnson() to apply Yeo-Johnson transformations to all numeric features
- Use step_normalize() to normalize all numeric features (necessary for regularized models)
- Apply dummy coding to all factor features
# One recipe shared by the OLS, LASSO, and elastic net fits.
rec <- recipe(sale_price ~ ., data = data_trn) |>
  step_rm(pid) |>                                  # drop the ID column
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_YeoJohnson(all_numeric_predictors()) |>
  step_normalize(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors())
Create error tracking tibble
# Empty tibble that accumulates one row of results per fitted model.
track_rmse <- tibble(model = character(),
                     rmse_trn = numeric(),
                     rmse_test = numeric(),
                     n_features = numeric())
Part 1: Fitting an OLS linear regression
Fit a regression model in the full training set
Make a feature matrix for all training data and for test data.
# Estimate the recipe's preprocessing parameters from the training data only.
rec_prep <- rec |>
  prep(data_trn)

# Apply the prepped recipe to build the training feature matrix.
feat_trn <- rec_prep |>
  bake(data_trn)

# Apply the same prepped recipe to build the test feature matrix.
feat_test <- rec_prep |>
  bake(data_test)
No resampling is needed because there are no hyperparameters to tune.
# Ordinary least squares fit on all training features.
fit_linear <- linear_reg() |>
  set_engine("lm") |>
  fit(sale_price ~ ., data = feat_trn)
Get RMSE in train & test
Use rmse_vec()
to get error in feat_trn
# RMSE of the OLS model in the training features.
lin_trn_rmse <- rmse_vec(truth = feat_trn$sale_price,
                         estimate = predict(fit_linear, feat_trn)$.pred)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
Use rmse_vec()
to get error in feat_test
# RMSE of the OLS model in the held-out test features.
lin_test_rmse <- rmse_vec(truth = feat_test$sale_price,
                          estimate = predict(fit_linear, feat_test)$.pred)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response", : prediction from rank-deficient fit; consider predict(.,
rankdeficient="NA")
Get number of features
# Count features with a non-zero coefficient, excluding the intercept.
# Coefficients reported as NA are also dropped, because filter() removes rows
# where the condition evaluates to NA.
lin_n_feat <- fit_linear |>
  tidy() |>
  filter(estimate != 0 & term != "(Intercept)") |>
  nrow()
Add to tracking tibble
# Record the OLS results, then print the tracking table.
track_rmse <- add_row(track_rmse,
                      model = "OLS",
                      rmse_trn = lin_trn_rmse,
                      rmse_test = lin_test_rmse,
                      n_features = lin_n_feat)

track_rmse
# A tibble: 1 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 20992. 27493. 255
We see the error is much higher in the test set. This is because with 255 features our model is overfit to noise in our training data.
Also, notice the warnings we get when making these predictions. R is telling us that our models are rank-deficient - this means that this model may not have determined a unique set of parameter estimates to minimize the cost function. This can occur for a variety of reasons, some of which are real problems and other times for reasons that are not a problem. However, you should never use such a model unless you know the source of the rank deficiency. You can read more about this warning message here and here.
step_nzv()
and step_corr()
can be used to reduce the number of features by eliminating highly correlated features (default is > .9 absolute correlation) or features with almost no variance (e.g., a dummy coded feature that is all or almost all 0’s). Here is an example of what that would look like.
Let’s add these two additional steps to our recipe
# Extend the base recipe: drop near-zero-variance predictors, then drop
# highly correlated predictors.
rec_nzv <- rec |>
  step_nzv(all_predictors()) |>
  step_corr(all_predictors())
Now let’s remake our feature matrices, refit the model and evaluate its performance
# Prep the extended recipe on the training data. This overwrites rec_prep,
# which previously held the prepped base recipe.
rec_prep <- rec_nzv |>
  prep(data_trn)

# Reduced feature matrices for train and test.
feat_trn_nzv <- rec_prep |>
  bake(data_trn)

feat_test_nzv <- rec_prep |>
  bake(data_test)

# Refit OLS on the reduced feature set.
fit_linear_nzv <- linear_reg() |>
  set_engine("lm") |>
  fit(sale_price ~ ., data = feat_trn_nzv)

# The recipe has no step that alters the outcome, so sale_price in the baked
# features equals the raw values; the baked copy is used here to match the
# other RMSE calculations in this file.
lin_nzv_trn_rmse <- rmse_vec(truth = feat_trn_nzv$sale_price,
                             estimate = predict(fit_linear_nzv, feat_trn_nzv)$.pred)

lin_nzv_test_rmse <- rmse_vec(truth = feat_test_nzv$sale_price,
                              estimate = predict(fit_linear_nzv, feat_test_nzv)$.pred)

# Count features with a non-zero coefficient, excluding the intercept.
lin_nzv_n_feat <- fit_linear_nzv |>
  tidy() |>
  filter(estimate != 0 & term != "(Intercept)") |>
  nrow()

# Record the results, then print the tracking table.
track_rmse <- add_row(track_rmse,
                      model = "OLS with NZV/Corr",
                      rmse_trn = lin_nzv_trn_rmse,
                      rmse_test = lin_nzv_test_rmse,
                      n_features = lin_nzv_n_feat)

track_rmse
# A tibble: 2 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 20992. 27493. 255
2 OLS with NZV/Corr 27875. 28540. 103
Our model is still performing similarly in test, but now we get a realistic idea of this performance in our training data. We also have cut down our features to 103 features! As a result we no longer get the rank deficient warning. Our model still has many features, but it is no longer rank-deficient. As an aside, you don’t need to wait to use step_nzv()
or step_corr()
until you get those kinds of warnings; they are generally helpful preprocessing steps that you can use as desired/appropriate!
Now let’s see how regularization handles sparse data!
Part 2: Fitting a LASSO regression
Set up a hyperparameter grid
In the LASSO, the mixture hyperparameter (\(\alpha\)) will be set to 1, but we’ll need to tune the penalty hyperparameter (\(\lambda\)).
# 500 candidate penalty values, evenly spaced on the log scale from
# exp(-6) to exp(8).
grid_penalty <- expand_grid(penalty = exp(seq(-6, 8, length.out = 500)))
Tune a LASSO regression
Use linear_reg()
, set_engine("glmnet")
, and tune_grid()
to fit your LASSO models.
# Tune the LASSO penalty across the bootstrap resamples. cache_rds() stores the
# result under cache/ and recomputes it when rerun_setting is TRUE.
fits_lasso <- cache_rds(
  expr = {
    linear_reg(penalty = tune(),
               mixture = 1) |>     # mixture = 1 is the pure LASSO penalty
      set_engine("glmnet") |>
      tune_grid(preprocessor = rec,
                resamples = splits_boot,
                grid = grid_penalty,
                metrics = metric_set(rmse))
  },
  dir = "cache/",
  file = "fits_lasso",
  rerun = rerun_setting)
Plot performance in the validation sets by hyperparameter
Use the plot_hyperparameters()
function in fun_ml.R
or your own code.
# Plot resampled RMSE across the penalty values that were tuned.
fits_lasso |>
  plot_hyperparameters(hp1 = "penalty", metric = "rmse")
Fit your best configuration in data_trn
Use your best configuration (i.e., your best \(\lambda\) value) to fit a model in the full training set (data_trn
) using select_best()
.
# Show the penalty with the lowest resampled RMSE. The metric is named
# explicitly rather than relying on the default choice.
select_best(fits_lasso, metric = "rmse")
# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 1564. Preprocessor1_Model477
# Refit the LASSO on the full training features at the selected penalty.
fit_lasso <- linear_reg(penalty = select_best(fits_lasso, metric = "rmse")$penalty,
                        mixture = 1) |>
  set_engine("glmnet") |>
  fit(sale_price ~ ., data = feat_trn)
Examine parameter estimates
# Print every coefficient estimate at the selected penalty.
fit_lasso |>
  tidy() |>
  print(n = Inf) |>
  suppressMessages() # suppressing message that appears when required Matrix package is loaded
# A tibble: 286 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) 162262. 1564.
2 lot_frontage 0 1564.
3 lot_area 4407. 1564.
4 overall_qual 16008. 1564.
5 overall_cond 3306. 1564.
6 year_built 4821. 1564.
7 year_remod_add 2226. 1564.
8 mas_vnr_area 0 1564.
9 bsmt_fin_sf_1 4061. 1564.
10 bsmt_fin_sf_2 0 1564.
11 bsmt_unf_sf 0 1564.
12 total_bsmt_sf 2475. 1564.
13 x1st_flr_sf 2533. 1564.
14 x2nd_flr_sf 0 1564.
15 low_qual_fin_sf -119. 1564.
16 gr_liv_area 18743. 1564.
17 bsmt_full_bath 2322. 1564.
18 bsmt_half_bath 0 1564.
19 full_bath 818. 1564.
20 half_bath 20.0 1564.
21 bedroom_abv_gr 0 1564.
22 kitchen_abv_gr -784. 1564.
23 tot_rms_abv_grd 0 1564.
24 fireplaces 3358. 1564.
25 garage_yr_blt 1984. 1564.
26 garage_cars 5698. 1564.
27 garage_area 457. 1564.
28 wood_deck_sf 1020. 1564.
29 open_porch_sf 0 1564.
30 enclosed_porch 0 1564.
31 x3ssn_porch 0 1564.
32 screen_porch 0 1564.
33 pool_area 0 1564.
34 misc_val 0 1564.
35 mo_sold 0 1564.
36 yr_sold 0 1564.
37 ms_sub_class_X030 0 1564.
38 ms_sub_class_X040 0 1564.
39 ms_sub_class_X045 0 1564.
40 ms_sub_class_X050 0 1564.
41 ms_sub_class_X060 0 1564.
42 ms_sub_class_X070 0 1564.
43 ms_sub_class_X075 0 1564.
44 ms_sub_class_X080 0 1564.
45 ms_sub_class_X085 0 1564.
46 ms_sub_class_X090 0 1564.
47 ms_sub_class_X120 0 1564.
48 ms_sub_class_X150 0 1564.
49 ms_sub_class_X160 0 1564.
50 ms_sub_class_X180 0 1564.
51 ms_sub_class_X190 0 1564.
52 ms_zoning_c 0 1564.
53 ms_zoning_fv 0 1564.
54 ms_zoning_i 0 1564.
55 ms_zoning_rh 0 1564.
56 ms_zoning_rl 0 1564.
57 ms_zoning_rm 0 1564.
58 street_pave 0 1564.
59 alley_none 0 1564.
60 alley_pave 0 1564.
61 lot_shape_ir1 0 1564.
62 lot_shape_ir2 0 1564.
63 lot_shape_ir3 -20763. 1564.
64 land_contour_hls 8567. 1564.
65 land_contour_low 0 1564.
66 land_contour_lvl 0 1564.
67 utilities_no_sewr 0 1564.
68 lot_config_cul_d_sac 2361. 1564.
69 lot_config_fr2 -80.7 1564.
70 lot_config_fr3 0 1564.
71 lot_config_inside 0 1564.
72 land_slope_mod 0 1564.
73 land_slope_sev 0 1564.
74 neighborhood_blueste 0 1564.
75 neighborhood_br_dale 0 1564.
76 neighborhood_brk_side 0 1564.
77 neighborhood_clear_cr 0 1564.
78 neighborhood_collg_cr 0 1564.
79 neighborhood_crawfor 13788. 1564.
80 neighborhood_edwards -2014. 1564.
81 neighborhood_gilbert 0 1564.
82 neighborhood_greens 0 1564.
83 neighborhood_grn_hill 10176. 1564.
84 neighborhood_idotrr 0 1564.
85 neighborhood_landmrk 0 1564.
86 neighborhood_meadow_v 0 1564.
87 neighborhood_mitchel 0 1564.
88 neighborhood_n_ames 0 1564.
89 neighborhood_no_ridge 53071. 1564.
90 neighborhood_n_pk_vill 0 1564.
91 neighborhood_nridg_ht 21956. 1564.
92 neighborhood_nw_ames 0 1564.
93 neighborhood_old_town 0 1564.
94 neighborhood_sawyer 0 1564.
95 neighborhood_sawyer_w 0 1564.
96 neighborhood_somerst 6240. 1564.
97 neighborhood_stone_br 31853. 1564.
98 neighborhood_swisu 0 1564.
99 neighborhood_timber 0 1564.
100 neighborhood_veenker 0 1564.
101 condition_1_feedr 0 1564.
102 condition_1_norm 6347. 1564.
103 condition_1_pos_a 0 1564.
104 condition_1_pos_n 0 1564.
105 condition_1_rr_ae 0 1564.
106 condition_1_rr_an 0 1564.
107 condition_1_rr_ne 0 1564.
108 condition_1_rr_nn 0 1564.
109 condition_2_feedr 0 1564.
110 condition_2_norm 0 1564.
111 condition_2_pos_a 0 1564.
112 condition_2_pos_n -56105. 1564.
113 condition_2_rr_nn 0 1564.
114 bldg_type_one_fam 9920. 1564.
115 bldg_type_twhs_ext 0 1564.
116 bldg_type_twhs_int 0 1564.
117 bldg_type_two_fam 0 1564.
118 house_style_s_lvl 0 1564.
119 house_style_x1_5fin -231. 1564.
120 house_style_x1_5unf 0 1564.
121 house_style_x1story 0 1564.
122 house_style_x2_5fin 0 1564.
123 house_style_x2_5unf 0 1564.
124 house_style_x2story 0 1564.
125 roof_style_gable -1157. 1564.
126 roof_style_gambrel 0 1564.
127 roof_style_hip 0 1564.
128 roof_style_mansard 0 1564.
129 roof_style_shed 0 1564.
130 roof_matl_comp_shg 0 1564.
131 roof_matl_membran 0 1564.
132 roof_matl_metal 0 1564.
133 roof_matl_tar_grv 0 1564.
134 roof_matl_wd_shake 0 1564.
135 roof_matl_wd_shngl 0 1564.
136 exterior_1st_asph_shn 0 1564.
137 exterior_1st_brk_comm 0 1564.
138 exterior_1st_brk_face 9887. 1564.
139 exterior_1st_c_block 0 1564.
140 exterior_1st_cemnt_bd 0 1564.
141 exterior_1st_hd_board 0 1564.
142 exterior_1st_metal_sd 0 1564.
143 exterior_1st_plywood 0 1564.
144 exterior_1st_pre_cast 8590. 1564.
145 exterior_1st_stone 0 1564.
146 exterior_1st_stucco -6730. 1564.
147 exterior_1st_vinyl_sd 0 1564.
148 exterior_1st_wd_sdng 0 1564.
149 exterior_1st_wd_shing 0 1564.
150 exterior_2nd_asph_shn 0 1564.
151 exterior_2nd_brk_cmn 0 1564.
152 exterior_2nd_brk_face 0 1564.
153 exterior_2nd_c_block 0 1564.
154 exterior_2nd_cment_bd 2888. 1564.
155 exterior_2nd_hd_board 0 1564.
156 exterior_2nd_im_stucc 5929. 1564.
157 exterior_2nd_metal_sd 0 1564.
158 exterior_2nd_other 0 1564.
159 exterior_2nd_plywood 0 1564.
160 exterior_2nd_pre_cast 0 1564.
161 exterior_2nd_stone 0 1564.
162 exterior_2nd_stucco 0 1564.
163 exterior_2nd_vinyl_sd 0 1564.
164 exterior_2nd_wd_sdng 0 1564.
165 exterior_2nd_wd_shng 0 1564.
166 mas_vnr_type_brk_face 0 1564.
167 mas_vnr_type_c_block -4770. 1564.
168 mas_vnr_type_none 0 1564.
169 mas_vnr_type_stone 0 1564.
170 exter_qual_fa 0 1564.
171 exter_qual_ta -5276. 1564.
172 exter_qual_gd 0 1564.
173 exter_qual_ex 3992. 1564.
174 exter_cond_fa 0 1564.
175 exter_cond_ta 0 1564.
176 exter_cond_gd 0 1564.
177 exter_cond_ex 0 1564.
178 foundation_c_block 0 1564.
179 foundation_p_conc 0 1564.
180 foundation_slab 0 1564.
181 foundation_stone 0 1564.
182 foundation_wood 0 1564.
183 bsmt_qual_po 0 1564.
184 bsmt_qual_fa 0 1564.
185 bsmt_qual_ta 0 1564.
186 bsmt_qual_gd 0 1564.
187 bsmt_qual_ex 27334. 1564.
188 bsmt_cond_po 0 1564.
189 bsmt_cond_fa 0 1564.
190 bsmt_cond_ta 0 1564.
191 bsmt_cond_gd 0 1564.
192 bsmt_cond_ex 0 1564.
193 bsmt_exposure_gd 12150. 1564.
194 bsmt_exposure_mn 0 1564.
195 bsmt_exposure_no -4043. 1564.
196 bsmt_exposure_none 0 1564.
197 bsmt_fin_type_1_blq 0 1564.
198 bsmt_fin_type_1_glq 628. 1564.
199 bsmt_fin_type_1_lw_q 0 1564.
200 bsmt_fin_type_1_none 0 1564.
201 bsmt_fin_type_1_rec 0 1564.
202 bsmt_fin_type_1_unf 0 1564.
203 bsmt_fin_type_2_blq 0 1564.
204 bsmt_fin_type_2_glq 99.6 1564.
205 bsmt_fin_type_2_lw_q 0 1564.
206 bsmt_fin_type_2_none 0 1564.
207 bsmt_fin_type_2_rec 0 1564.
208 bsmt_fin_type_2_unf 0 1564.
209 heating_gas_a 0 1564.
210 heating_gas_w 0 1564.
211 heating_grav 0 1564.
212 heating_oth_w 0 1564.
213 heating_wall 0 1564.
214 heating_qc_fa 0 1564.
215 heating_qc_ta 0 1564.
216 heating_qc_gd 0 1564.
217 heating_qc_ex 2753. 1564.
218 central_air_y 0 1564.
219 electrical_fuse_f 0 1564.
220 electrical_fuse_p 0 1564.
221 electrical_mix 0 1564.
222 electrical_s_brkr 0 1564.
223 kitchen_qual_fa 0 1564.
224 kitchen_qual_ta -2959. 1564.
225 kitchen_qual_gd 0 1564.
226 kitchen_qual_ex 30329. 1564.
227 functional_min1 0 1564.
228 functional_min2 -70.2 1564.
229 functional_mod 0 1564.
230 functional_maj1 0 1564.
231 functional_maj2 0 1564.
232 functional_sev -14773. 1564.
233 functional_sal 0 1564.
234 fireplace_qu_po 0 1564.
235 fireplace_qu_fa 0 1564.
236 fireplace_qu_ta 0 1564.
237 fireplace_qu_gd 0 1564.
238 fireplace_qu_ex 4082. 1564.
239 garage_type_basment 0 1564.
240 garage_type_built_in 0 1564.
241 garage_type_car_port 0 1564.
242 garage_type_detchd 0 1564.
243 garage_type_none 0 1564.
244 garage_type_x2types 0 1564.
245 garage_finish_none 0 1564.
246 garage_finish_r_fn 0 1564.
247 garage_finish_unf 0 1564.
248 garage_qual_po 0 1564.
249 garage_qual_fa 0 1564.
250 garage_qual_ta 0 1564.
251 garage_qual_gd 0 1564.
252 garage_qual_ex 0 1564.
253 garage_cond_fa 0 1564.
254 garage_cond_gd 0 1564.
255 garage_cond_none 0 1564.
256 garage_cond_po 0 1564.
257 garage_cond_ta 0 1564.
258 paved_drive_p 0 1564.
259 paved_drive_y 0 1564.
260 pool_qc_po 0 1564.
261 pool_qc_fa 0 1564.
262 pool_qc_ta 0 1564.
263 pool_qc_gd -35820. 1564.
264 pool_qc_ex 77229. 1564.
265 fence_gd_wo 0 1564.
266 fence_mn_prv 0 1564.
267 fence_mn_ww 0 1564.
268 fence_none 0 1564.
269 misc_feature_none 0 1564.
270 misc_feature_othr 0 1564.
271 misc_feature_shed 0 1564.
272 misc_feature_ten_c 0 1564.
273 sale_type_con 0 1564.
274 sale_type_con_ld 0 1564.
275 sale_type_con_li 0 1564.
276 sale_type_con_lw 0 1564.
277 sale_type_cwd 0 1564.
278 sale_type_new 12654. 1564.
279 sale_type_oth 0 1564.
280 sale_type_vwd 0 1564.
281 sale_type_wd 0 1564.
282 sale_condition_adj_land 0 1564.
283 sale_condition_alloca 0 1564.
284 sale_condition_family 0 1564.
285 sale_condition_normal 0 1564.
286 sale_condition_partial 2.97 1564.
Get RMSE in train & test
Use rmse_vec()
to get error in feat_trn
# RMSE of the LASSO model in the training features.
lasso_rmse_trn <- rmse_vec(truth = feat_trn$sale_price,
                           estimate = predict(fit_lasso, feat_trn)$.pred)
Use rmse_vec()
to get error in feat_test
# RMSE of the LASSO model in the held-out test features.
lasso_rmse_test <- rmse_vec(truth = feat_test$sale_price,
                            estimate = predict(fit_lasso, feat_test)$.pred)
Get number of features
# Count features the LASSO kept (non-zero coefficient), excluding the intercept.
lasso_n_feat <- fit_lasso |>
  tidy() |>
  filter(estimate != 0 & term != "(Intercept)") |>
  nrow()
Add to track_rmse
# Record the LASSO results, then print the tracking table.
track_rmse <- add_row(track_rmse,
                      model = "LASSO",
                      rmse_trn = lasso_rmse_trn,
                      rmse_test = lasso_rmse_test,
                      n_features = lasso_n_feat)

track_rmse
# A tibble: 3 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 20992. 27493. 255
2 OLS with NZV/Corr 27875. 28540. 103
3 LASSO 25814. 28165. 58
Performance is again about comparable in train and test. Also, LASSO didn’t really do any better than the OLS in test once we removed nzv and highly correlated features.
LASSO retained 58 features with non-zero parameter estimates. This is fewer than what we got with our recipe that eliminated near zero variance and highly correlated features (103) and far fewer features than our OLS model with 255 features (which we know is bad, but helpful here for comparison). We know that a simpler model is likely to have slightly more bias but likely with a large reduction in variance.
Part 3: Fitting an Elastic Net regression
Set up a hyperparameter grid
Now we’ll need to tune both the mixture hyperparameter (\(\alpha\)) and the penalty hyperparameter (\(\lambda\)).
# Full cross of 250 penalty values (log-spaced from exp(-10) to exp(11)) with
# 11 mixture values from 0 to 1.
grid_glmnet <- expand_grid(penalty = exp(seq(-10, 11, length.out = 250)),
                           mixture = seq(0, 1, length.out = 11))
Tune an elasticnet regression
Use linear_reg()
, set_engine("glmnet")
, and tune_grid()
to fit your elastic net models.
# Tune penalty and mixture together across the bootstrap resamples.
# cache_rds() stores the result under cache/ and recomputes it when
# rerun_setting is TRUE.
fits_glmnet <- cache_rds(
  expr = {
    linear_reg(penalty = tune(),
               mixture = tune()) |>
      set_engine("glmnet") |>
      tune_grid(preprocessor = rec,
                resamples = splits_boot,
                grid = grid_glmnet,
                metrics = metric_set(rmse))
  },
  dir = "cache/",
  file = "fits_glmnet",
  rerun = rerun_setting)
Plot performance in the validation sets by hyperparameter
Use the `plot_hyperparameters()` function or your own code.
# Plot the rmse metric from the tuning results against the two tuned
# hyperparameters (penalty and mixture).
plot_hyperparameters(
  fits_glmnet,
  hp1 = "penalty",
  hp2 = "mixture",
  metric = "rmse"
)
Fit your best configuration in training data
Use your best configuration (i.e., your best combination of \(\alpha\) & \(\lambda\) values) to fit a model in the full training set (`data_trn`) using `select_best()`.
# Show the best (penalty, mixture) configuration. The metric is named
# explicitly instead of relying on select_best()'s default choice.
select_best(fits_glmnet, metric = "rmse")
# A tibble: 1 × 3
penalty mixture .config
<dbl> <dbl> <chr>
1 13120. 0.1 Preprocessor1_Model0482
# Look up the best configuration once (by rmse) and reuse it for both
# hyperparameters, rather than calling select_best() separately for each.
best_glmnet <- select_best(fits_glmnet, metric = "rmse")

# Refit a glmnet linear regression at that configuration on the training
# features, predicting sale_price from all other columns of feat_trn.
fit_glmnet <- linear_reg(penalty = best_glmnet$penalty,
                         mixture = best_glmnet$mixture) |>
  set_engine("glmnet") |>
  fit(sale_price ~ ., data = feat_trn)
Examine parameter estimates
# Print every parameter estimate from the fitted model (n = Inf shows all
# rows of the tidied tibble instead of the default truncated print).
fit_glmnet |>
  tidy() |>
  print(n = Inf)
# A tibble: 286 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) 158450. 13120.
2 lot_frontage 0 13120.
3 lot_area 4182. 13120.
4 overall_qual 12386. 13120.
5 overall_cond 3113. 13120.
6 year_built 2943. 13120.
7 year_remod_add 2409. 13120.
8 mas_vnr_area 0 13120.
9 bsmt_fin_sf_1 3487. 13120.
10 bsmt_fin_sf_2 0 13120.
11 bsmt_unf_sf 0 13120.
12 total_bsmt_sf 3473. 13120.
13 x1st_flr_sf 4512. 13120.
14 x2nd_flr_sf 0 13120.
15 low_qual_fin_sf -172. 13120.
16 gr_liv_area 11794. 13120.
17 bsmt_full_bath 2271. 13120.
18 bsmt_half_bath 0 13120.
19 full_bath 3299. 13120.
20 half_bath 2343. 13120.
21 bedroom_abv_gr 0 13120.
22 kitchen_abv_gr -1477. 13120.
23 tot_rms_abv_grd 2475. 13120.
24 fireplaces 3873. 13120.
25 garage_yr_blt 2010. 13120.
26 garage_cars 4408. 13120.
27 garage_area 2582. 13120.
28 wood_deck_sf 1292. 13120.
29 open_porch_sf 0 13120.
30 enclosed_porch 0 13120.
31 x3ssn_porch 0 13120.
32 screen_porch 0 13120.
33 pool_area 0 13120.
34 misc_val 0 13120.
35 mo_sold 0 13120.
36 yr_sold -32.6 13120.
37 ms_sub_class_X030 0 13120.
38 ms_sub_class_X040 0 13120.
39 ms_sub_class_X045 0 13120.
40 ms_sub_class_X050 0 13120.
41 ms_sub_class_X060 0 13120.
42 ms_sub_class_X070 0 13120.
43 ms_sub_class_X075 0 13120.
44 ms_sub_class_X080 0 13120.
45 ms_sub_class_X085 0 13120.
46 ms_sub_class_X090 0 13120.
47 ms_sub_class_X120 0 13120.
48 ms_sub_class_X150 0 13120.
49 ms_sub_class_X160 0 13120.
50 ms_sub_class_X180 0 13120.
51 ms_sub_class_X190 0 13120.
52 ms_zoning_c 0 13120.
53 ms_zoning_fv 1223. 13120.
54 ms_zoning_i 0 13120.
55 ms_zoning_rh 0 13120.
56 ms_zoning_rl 0 13120.
57 ms_zoning_rm -151. 13120.
58 street_pave 4230. 13120.
59 alley_none 0 13120.
60 alley_pave 0 13120.
61 lot_shape_ir1 0 13120.
62 lot_shape_ir2 0 13120.
63 lot_shape_ir3 -21975. 13120.
64 land_contour_hls 9491. 13120.
65 land_contour_low 0 13120.
66 land_contour_lvl 0 13120.
67 utilities_no_sewr 0 13120.
68 lot_config_cul_d_sac 4346. 13120.
69 lot_config_fr2 -869. 13120.
70 lot_config_fr3 0 13120.
71 lot_config_inside 0 13120.
72 land_slope_mod 710. 13120.
73 land_slope_sev 0 13120.
74 neighborhood_blueste 0 13120.
75 neighborhood_br_dale 0 13120.
76 neighborhood_brk_side 0 13120.
77 neighborhood_clear_cr 0 13120.
78 neighborhood_collg_cr 0 13120.
79 neighborhood_crawfor 13106. 13120.
80 neighborhood_edwards -3411. 13120.
81 neighborhood_gilbert 0 13120.
82 neighborhood_greens 0 13120.
83 neighborhood_grn_hill 21187. 13120.
84 neighborhood_idotrr -1380. 13120.
85 neighborhood_landmrk 0 13120.
86 neighborhood_meadow_v 0 13120.
87 neighborhood_mitchel 0 13120.
88 neighborhood_n_ames 0 13120.
89 neighborhood_no_ridge 47784. 13120.
90 neighborhood_n_pk_vill 0 13120.
91 neighborhood_nridg_ht 20087. 13120.
92 neighborhood_nw_ames 0 13120.
93 neighborhood_old_town 0 13120.
94 neighborhood_sawyer 0 13120.
95 neighborhood_sawyer_w 0 13120.
96 neighborhood_somerst 5379. 13120.
97 neighborhood_stone_br 30200. 13120.
98 neighborhood_swisu 0 13120.
99 neighborhood_timber 0 13120.
100 neighborhood_veenker 0 13120.
101 condition_1_feedr -480. 13120.
102 condition_1_norm 5949. 13120.
103 condition_1_pos_a 0 13120.
104 condition_1_pos_n 0 13120.
105 condition_1_rr_ae 0 13120.
106 condition_1_rr_an 0 13120.
107 condition_1_rr_ne 0 13120.
108 condition_1_rr_nn 0 13120.
109 condition_2_feedr 0 13120.
110 condition_2_norm 0 13120.
111 condition_2_pos_a 8092. 13120.
112 condition_2_pos_n -50677. 13120.
113 condition_2_rr_nn 0 13120.
114 bldg_type_one_fam 8466. 13120.
115 bldg_type_twhs_ext 0 13120.
116 bldg_type_twhs_int 0 13120.
117 bldg_type_two_fam 0 13120.
118 house_style_s_lvl 0 13120.
119 house_style_x1_5fin 0 13120.
120 house_style_x1_5unf 0 13120.
121 house_style_x1story 0 13120.
122 house_style_x2_5fin 0 13120.
123 house_style_x2_5unf 0 13120.
124 house_style_x2story 1819. 13120.
125 roof_style_gable -1672. 13120.
126 roof_style_gambrel 0 13120.
127 roof_style_hip 1409. 13120.
128 roof_style_mansard 0 13120.
129 roof_style_shed 0 13120.
130 roof_matl_comp_shg 0 13120.
131 roof_matl_membran 0 13120.
132 roof_matl_metal 0 13120.
133 roof_matl_tar_grv 0 13120.
134 roof_matl_wd_shake 0 13120.
135 roof_matl_wd_shngl 0 13120.
136 exterior_1st_asph_shn 0 13120.
137 exterior_1st_brk_comm 0 13120.
138 exterior_1st_brk_face 8869. 13120.
139 exterior_1st_c_block 0 13120.
140 exterior_1st_cemnt_bd 1479. 13120.
141 exterior_1st_hd_board 0 13120.
142 exterior_1st_metal_sd 0 13120.
143 exterior_1st_plywood 0 13120.
144 exterior_1st_pre_cast 10554. 13120.
145 exterior_1st_stone 0 13120.
146 exterior_1st_stucco -8332. 13120.
147 exterior_1st_vinyl_sd 0 13120.
148 exterior_1st_wd_sdng 0 13120.
149 exterior_1st_wd_shing 0 13120.
150 exterior_2nd_asph_shn 0 13120.
151 exterior_2nd_brk_cmn 0 13120.
152 exterior_2nd_brk_face 0 13120.
153 exterior_2nd_c_block 0 13120.
154 exterior_2nd_cment_bd 2406. 13120.
155 exterior_2nd_hd_board 0 13120.
156 exterior_2nd_im_stucc 11890. 13120.
157 exterior_2nd_metal_sd 0 13120.
158 exterior_2nd_other 0 13120.
159 exterior_2nd_plywood 0 13120.
160 exterior_2nd_pre_cast 10019. 13120.
161 exterior_2nd_stone 0 13120.
162 exterior_2nd_stucco 0 13120.
163 exterior_2nd_vinyl_sd 0 13120.
164 exterior_2nd_wd_sdng 0 13120.
165 exterior_2nd_wd_shng 0 13120.
166 mas_vnr_type_brk_face 0 13120.
167 mas_vnr_type_c_block -10282. 13120.
168 mas_vnr_type_none 0 13120.
169 mas_vnr_type_stone 0 13120.
170 exter_qual_fa 0 13120.
171 exter_qual_ta -5352. 13120.
172 exter_qual_gd 1423. 13120.
173 exter_qual_ex 10057. 13120.
174 exter_cond_fa 0 13120.
175 exter_cond_ta 0 13120.
176 exter_cond_gd 199. 13120.
177 exter_cond_ex 0 13120.
178 foundation_c_block 0 13120.
179 foundation_p_conc 56.6 13120.
180 foundation_slab 0 13120.
181 foundation_stone 0 13120.
182 foundation_wood 0 13120.
183 bsmt_qual_po 0 13120.
184 bsmt_qual_fa 0 13120.
185 bsmt_qual_ta -984. 13120.
186 bsmt_qual_gd 0 13120.
187 bsmt_qual_ex 24518. 13120.
188 bsmt_cond_po 0 13120.
189 bsmt_cond_fa 0 13120.
190 bsmt_cond_ta 0 13120.
191 bsmt_cond_gd 0 13120.
192 bsmt_cond_ex 0 13120.
193 bsmt_exposure_gd 11553. 13120.
194 bsmt_exposure_mn 0 13120.
195 bsmt_exposure_no -4256. 13120.
196 bsmt_exposure_none 0 13120.
197 bsmt_fin_type_1_blq 0 13120.
198 bsmt_fin_type_1_glq 3011. 13120.
199 bsmt_fin_type_1_lw_q 0 13120.
200 bsmt_fin_type_1_none 0 13120.
201 bsmt_fin_type_1_rec 0 13120.
202 bsmt_fin_type_1_unf 0 13120.
203 bsmt_fin_type_2_blq 0 13120.
204 bsmt_fin_type_2_glq 3385. 13120.
205 bsmt_fin_type_2_lw_q 0 13120.
206 bsmt_fin_type_2_none 0 13120.
207 bsmt_fin_type_2_rec 0 13120.
208 bsmt_fin_type_2_unf 0 13120.
209 heating_gas_a 0 13120.
210 heating_gas_w 0 13120.
211 heating_grav 0 13120.
212 heating_oth_w 0 13120.
213 heating_wall 0 13120.
214 heating_qc_fa 0 13120.
215 heating_qc_ta -349. 13120.
216 heating_qc_gd 0 13120.
217 heating_qc_ex 3388. 13120.
218 central_air_y 690. 13120.
219 electrical_fuse_f 0 13120.
220 electrical_fuse_p 0 13120.
221 electrical_mix 0 13120.
222 electrical_s_brkr 0 13120.
223 kitchen_qual_fa 0 13120.
224 kitchen_qual_ta -4219. 13120.
225 kitchen_qual_gd 0 13120.
226 kitchen_qual_ex 26783. 13120.
227 functional_min1 -284. 13120.
228 functional_min2 -1208. 13120.
229 functional_mod 0 13120.
230 functional_maj1 0 13120.
231 functional_maj2 0 13120.
232 functional_sev -17251. 13120.
233 functional_sal -5779. 13120.
234 fireplace_qu_po 0 13120.
235 fireplace_qu_fa 0 13120.
236 fireplace_qu_ta 0 13120.
237 fireplace_qu_gd 2014. 13120.
238 fireplace_qu_ex 10824. 13120.
239 garage_type_basment 0 13120.
240 garage_type_built_in 0 13120.
241 garage_type_car_port 0 13120.
242 garage_type_detchd 0 13120.
243 garage_type_none 0 13120.
244 garage_type_x2types 0 13120.
245 garage_finish_none 0 13120.
246 garage_finish_r_fn 0 13120.
247 garage_finish_unf -451. 13120.
248 garage_qual_po 0 13120.
249 garage_qual_fa 0 13120.
250 garage_qual_ta 0 13120.
251 garage_qual_gd 0 13120.
252 garage_qual_ex 0 13120.
253 garage_cond_fa -83.2 13120.
254 garage_cond_gd 0 13120.
255 garage_cond_none 0 13120.
256 garage_cond_po 0 13120.
257 garage_cond_ta 0 13120.
258 paved_drive_p 0 13120.
259 paved_drive_y 172. 13120.
260 pool_qc_po 0 13120.
261 pool_qc_fa 0 13120.
262 pool_qc_ta 0 13120.
263 pool_qc_gd -34582. 13120.
264 pool_qc_ex 77020. 13120.
265 fence_gd_wo 0 13120.
266 fence_mn_prv 0 13120.
267 fence_mn_ww 0 13120.
268 fence_none 0 13120.
269 misc_feature_none 0 13120.
270 misc_feature_othr 0 13120.
271 misc_feature_shed 0 13120.
272 misc_feature_ten_c 0 13120.
273 sale_type_con 0 13120.
274 sale_type_con_ld 0 13120.
275 sale_type_con_li 0 13120.
276 sale_type_con_lw 0 13120.
277 sale_type_cwd 0 13120.
278 sale_type_new 6772. 13120.
279 sale_type_oth 0 13120.
280 sale_type_vwd 0 13120.
281 sale_type_wd 0 13120.
282 sale_condition_adj_land 0 13120.
283 sale_condition_alloca 0 13120.
284 sale_condition_family 0 13120.
285 sale_condition_normal 0 13120.
286 sale_condition_partial 5389. 13120.
Get RMSE in train & test
Use `rmse_vec()` to get error in `feat_trn`
# RMSE of the elastic net fit in the training features: observed sale_price
# in feat_trn vs. the model's predictions for those same rows.
glmnet_rmse_trn <- rmse_vec(truth = feat_trn$sale_price,
                            estimate = predict(fit_glmnet, feat_trn)$.pred)
Use `rmse_vec()` to get error in `feat_test`
# RMSE of the elastic net fit in the test features: observed sale_price in
# feat_test vs. the model's predictions for those same rows.
glmnet_rmse_test <- rmse_vec(truth = feat_test$sale_price,
                             estimate = predict(fit_glmnet, feat_test)$.pred)
Get number of features
# Number of features the elastic net retained: rows of the tidied fit with a
# non-zero estimate, not counting the intercept.
glmnet_n_feat <- fit_glmnet |>
  tidy() |>
  filter(estimate != 0 & term != "(Intercept)") |>
  nrow()
Add to track_rmse
# Append the elastic net results to the running comparison table, then
# print it.
track_rmse <- add_row(track_rmse,
                      model = "GLMnet",
                      rmse_trn = glmnet_rmse_trn,
                      rmse_test = glmnet_rmse_test,
                      n_features = glmnet_n_feat)

track_rmse
# A tibble: 4 × 4
model rmse_trn rmse_test n_features
<chr> <dbl> <dbl> <dbl>
1 OLS 20992. 27493. 255
2 OLS with NZV/Corr 27875. 28540. 103
3 LASSO 25814. 28165. 58
4 GLMnet 25764. 27963. 82
The elastic net performs about comparably in training and test too.
There were 82 features retained in the elastic net model. This is more features than were retained in the LASSO, but fewer features than were retained using `step_nzv()` and `step_corr()`.
The OLS model with all features showed descriptively the largest gap between training and test RMSE (i.e., the most overfitting), though its test RMSE was actually no worse than that of the other models. Overall, the various methods to reduce overfitting didn’t help all that much. Maybe a little. It may be that none of these models are very overfit because the sample size remains quite large relative to the number of features.
Save & render
Save this .qmd file with your last name at the end (e.g., hw_unit_6_regularization_wyant). Make sure you changed “Your name here” at the top of the file to be your own name. Render the file to .html, and upload the rendered file to Canvas.
Way to go!!