Unit 2: EDA Modeling

Author

Key

Published

February 1, 2024

Packages, Source, Etc

This code chunk will set up conflict policies to reduce errors associated with function conflicts

options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()

This code chunk loads all packages you will need for this assignment.

library(cowplot, include.only = c("plot_grid", "theme_half_open"))
library(corrplot, include.only = "corrplot.mixed") # or just use namespace 
library(tidyverse) # for general data wrangling
library(tidymodels) # for modeling

Source John’s function scripts

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")

Set up some other environment settings

options(tibble.width = Inf, tibble.print_max = Inf)
theme_set(theme_half_open()) # maybe you prefer this to classic theme?

In the chunk below, set path_data to the location of your cleaned data files.

path_data <- "homework/unit_02"

Read and set up data

Read data

Load the cleaned training dataset and do skim/glimpse it to get oriented.

data_trn <- read_csv(here::here(path_data, "ames_clean_class_trn.csv"), 
                     col_types = cols()) |> 
  glimpse()
Rows: 1,465
Columns: 10
$ sale_price      <dbl> 105000, 126000, 115000, 127500, 120000, 99500, 125000,…
$ garage_area     <dbl> 730, 525, 0, 440, 308, 264, 264, 429, 539, 260, 0, 0, …
$ neighborhood    <chr> "names", "names", "names", "npkvill", "npkvill", "sawy…
$ ms_sub_class    <chr> "020", "020", "020", "120", "120", "120", "120", "030"…
$ total_bsmt_sf   <dbl> 882, 882, 864, 1069, 836, 918, 744, 816, 0, 1040, 950,…
$ bsmt_qual       <chr> "ta", "ta", "ta", "gd", "gd", "gd", "gd", "ta", "no_bs…
$ central_air     <chr> "y", "y", "y", "y", "y", "y", "y", "n", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 4, 5, 4, 4, 5, 4, 5, 8, 6, 6, 10, 4, 6, 6, 6, 5, 4,…
$ fireplaces      <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fireplace_qu    <chr> "no_fireplace", "no_fireplace", "po", "fa", "no_firepl…

Set up variable classes

As before, set up all categorical variables as factors (and confirm order if needed) and set up interval/ration variables as numeric

# note use of suppressWarnings().  Once you understand a warning, its often good practice
# to suppress it so that you know you've resolved it.
data_trn <- data_trn |>
  mutate(across(where(is.character), factor)) |> 
  mutate(bsmt_qual = suppressWarnings(fct_relevel(bsmt_qual, c("no_bsmt", "po", "fa", 
                                                  "ta", "gd", "ex"))),
         fireplace_qu = fct_relevel(fireplace_qu, c("no_fireplace", "po", "fa", 
                                                  "ta", "gd", "ex"))) |>  
  glimpse()
Rows: 1,465
Columns: 10
$ sale_price      <dbl> 105000, 126000, 115000, 127500, 120000, 99500, 125000,…
$ garage_area     <dbl> 730, 525, 0, 440, 308, 264, 264, 429, 539, 260, 0, 0, …
$ neighborhood    <fct> names, names, names, npkvill, npkvill, sawyerw, sawyer…
$ ms_sub_class    <fct> 020, 020, 020, 120, 120, 120, 120, 030, 090, 020, 020,…
$ total_bsmt_sf   <dbl> 882, 882, 864, 1069, 836, 918, 744, 816, 0, 1040, 950,…
$ bsmt_qual       <fct> ta, ta, ta, gd, gd, gd, gd, ta, no_bsmt, ta, ta, ta, t…
$ central_air     <fct> y, y, y, y, y, y, y, n, y, y, y, y, y, y, y, y, y, y, …
$ tot_rms_abv_grd <dbl> 5, 4, 5, 4, 4, 5, 4, 5, 8, 6, 6, 10, 4, 6, 6, 6, 5, 4,…
$ fireplaces      <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fireplace_qu    <fct> no_fireplace, no_fireplace, po, fa, no_fireplace, ta, …
data_trn |> 
  skim_all() # big picture view of the data
Data summary
Name data_trn
Number of rows 1465
Number of columns 10
_______________________
Column type frequency:
factor 5
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate n_unique top_counts
neighborhood 0 1 28 nam: 216, col: 124, old: 121, edw: 100
ms_sub_class 0 1 15 020: 544, 060: 289, 050: 160, 120: 94
bsmt_qual 0 1 5 ta: 636, gd: 611, ex: 125, fa: 51
central_air 0 1 2 y: 1367, n: 98
fireplace_qu 0 1 6 no_: 703, gd: 359, ta: 314, fa: 37

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 skew kurtosis
sale_price 0 1 180761.93 79016.72 13100 129500.0 160000 213500 745000 1.67 4.59
garage_area 1 1 473.56 214.12 0 321.5 480 577 1488 0.18 1.01
total_bsmt_sf 0 1 1047.90 439.71 0 780.0 992 1314 6110 1.26 12.12
tot_rms_abv_grd 0 1 6.47 1.55 3 5.0 6 7 14 0.76 1.19
fireplaces 0 1 0.60 0.64 0 0.0 1 1 3 0.66 -0.17

Univariate Distributions

Numeric variables

examine the univariate distributions of your numeric variables. Use map() to iterate through all numeric variables and create a plot_grid in a single piped chain.

# only need one of these.  Which do you prefer?
data_trn |> plot_hist("sale_price") # historgram

data_trn |> plot_freqpoly("sale_price") # frequency polygon

# This one gives some additional info
data_trn |> plot_boxplot("sale_price") # boxplot

# box violin plot
# the warning is due to the missing value for garage_area
# We see that on the next skim.
# In a later code chunk we show how to resolve this by filter out NA temporarily
data_trn |> 
  select(where(is.numeric)) |> 
  names() |> 
  map(\(name) plot_box_violin(df = data_trn, x = name)) |> 
  plot_grid(plotlist = _, ncol = 2)
Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

# summary statistics
data_trn |> 
  skim_all() |> 
  filter(skim_type == "numeric")
Data summary
Name data_trn
Number of rows 1465
Number of columns 10
_______________________
Column type frequency:
numeric 5
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 skew kurtosis
sale_price 0 1 180761.93 79016.72 13100 129500.0 160000 213500 745000 1.67 4.59
garage_area 1 1 473.56 214.12 0 321.5 480 577 1488 0.18 1.01
total_bsmt_sf 0 1 1047.90 439.71 0 780.0 992 1314 6110 1.26 12.12
tot_rms_abv_grd 0 1 6.47 1.55 3 5.0 6 7 14 0.76 1.19
fireplaces 0 1 0.60 0.64 0 0.0 1 1 3 0.66 -0.17

Numeric Univariate Notes: sale_price, garage_area, and tot_bsmt_sf show positive skews, though the skew in tot_bsmt_sf may be driven by a handful of outliers (vs sale_price which has a high number of outliers and garage_area which has a moderate number of outliers). fireplaces appears mostly bimodal due to roughly equal numbers of homes having no vs any fireplace (with those who have one being significantly more likely to have only one fireplace, and the 3 fireplace observation appears to be an outlier). tot_rms_abv_grd may have a few outlier observations above ten rooms.

Categorical variables

Examine univariate distributions of your categorical variables. Use map() to iterate through all categorical variables and display them together using plot_grid().

# Bar plots
data_trn |> 
  select(where(is.factor)) |> 
  names() |> 
  map(\(name) plot_bar(df = data_trn, x = name)) |> 
  plot_grid(plotlist = _, ncol = 2) 

# Table
# Maybe you like a table?  I think the above plot is still easier to digest
# though maybe the table makes clear how few observations there are in some
# categories
data_trn |> tab(ms_sub_class)
# A tibble: 15 × 3
   ms_sub_class     n    prop
   <fct>        <int>   <dbl>
 1 020            544 0.371  
 2 030             64 0.0437 
 3 040              2 0.00137
 4 045              5 0.00341
 5 050            160 0.109  
 6 060            289 0.197  
 7 070             65 0.0444 
 8 075             12 0.00819
 9 080             53 0.0362 
10 085             21 0.0143 
11 090             57 0.0389 
12 120             94 0.0642 
13 160             66 0.0451 
14 180              7 0.00478
15 190             26 0.0177 

Univariate categorical variable notes: We notice a lot of levels in neighborhood and ms_sub_class that have far fewer observations than other categories when we take a closer look at these variables at later steps we may consider collapsing some factor levels. We will also continue to wonder if fireplace and basement quality are best considered nominal or ordinal now that we have included a “no basment/fireplace” category. …who is to say that having a poor quality fireplace may be more detrimental to sale price than not having one at all? We will explore that and more with bivariate plots

Bivariate Relationships

Numeric variables

Examine the relationship between numeric predictors and the outcome variable sale_price. Use map() to complete these in a single pipe – specify sale_price as y value.

data_trn |>
  select(where(is.numeric) &!sale_price) %>% 
  names() |>   # pass in the variable names as strings
  map(\(name) plot_scatter(df = data_trn, x = name, y = "sale_price")) %>% 
  plot_grid(plotlist = ., ncol = 2)
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 1 rows containing missing values (`geom_point()`).
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at -0.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 1.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 9.8322e-29
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
-0.015
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
1.015
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
number 9.8322e-29
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : There are other near
singularities as well. 1

# use warnings() to explore these warnings
# a few due to missing observation.  Not a problem
# The lowess line is having some trouble with fit but this is really just a visual
# aid so not concerned.

Bivariate Relationship notes: There is a pretty strong relationship between basement square footage and sale price which is encouraging. Total number of rooms and garage also hold interesting relationships – they appear to have a strong linear relationship with sale_price but drop off after around 10 rooms for total rooms. For garage areas above 1000 square feet, we wonder if this is potentially related to the type of property the residence is – while some houses with many rooms and large garages may be mansions, others may be multi-family homes with lower sale prices.

One possibly useful transformation: let’s see what it looks like if we take the log of sale_price:

data_trn |>
  filter(!is.na(garage_area)) |> # this removes the warning due to missing value
  mutate(sale_price = log(sale_price)) |> 
  plot_scatter("garage_area", "sale_price")

data_trn |> 
  mutate(sale_price = log(sale_price)) |> 
  plot_scatter("total_bsmt_sf", "sale_price")

data_trn |> 
  mutate(sale_price = log(sale_price)) |> 
  plot_scatter("tot_rms_abv_grd", "sale_price")

data_trn |> 
  mutate(sale_price = log(sale_price)) |> 
  plot_scatter("fireplaces", "sale_price")

# the warnings about lowess lines are here too
# we suppressed them for the full code chunk because we understand all of them
# use #| warning: false at the top of the code chunk to do this

Correlations

Convert any ordinal predictors to numeric, select() down to numeric variables, and plot all variable correlations with sale_price.

data_trn |> 
  mutate(bsmt_qual = as.numeric(bsmt_qual),
         fireplace_qu = as.numeric(fireplace_qu)) |> 
  select(where(is.numeric)) |> 
  cor(use = "pairwise.complete.obs") |> # again, to handle missing values
  corrplot::corrplot.mixed()

Correlation notes: This looks pretty good – all of our numeric variables seem to be moderately correlated with the outcome. Maybe there was a benefit to replacing the NAs in the quality variables.

Categorical variables

Now choose the best plot(s) to visualize the relationship between your categorical predictors and sale_price().

# Start with grouped_box_violin plots to get oriented
# then choose what to look at more in depth.
data_trn |> 
  select(where(is.factor)) |> 
  names() |> 
  map(\(name) plot_grouped_box_violin(df = data_trn, x = name, y = "sale_price")) |> 
  plot_grid(plotlist = _, ncol = 2)
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.

# we left this warning to remind us that some of the groups will be missing from 
# these plots

Categorical bivariate notes: Linear looking relationship with bsmt_qual, but missing poor quality in this set. Possible that having a poor basement is worse than not having one. Similar pattern seen with fireplace quality as well as central air.

Guided categorical exploration

ms_sub_class is a mysterious variable – take a closer look at it’s relationship with sale_price. Do some types of homes sell for more than others? (order ms_sub_class factor based on values of sale_price)

# Order levels of the categorical predictor by the outcome.
data_trn |>
  plot_categorical("ms_sub_class","sale_price", ordered = TRUE)|>
   plot_grid(plotlist = _, ncol = 1)

Guided categorical exploration notes: Write out what the codes stand to help group these together a little more:

030: 1 story, pre 1945 045: 1.5 story unfinished 190: 2 family home 180: multilevel planned unit development (pud) 160: 2 story pud, 1946+ 050: 1.5 story finished 090: duplex 040: 1 story finished attic 150: 1-1/2 story pud - all ages 085: split foyer 070: 2 story pre 1945 020: 1 story 1946+ (most common) 080: split/multilevel 120: 1 story pud 1946+ 075: 2.5 story 060: 2 story 1946+ (second most common)

The relationship with price is more complex than just more stories = higher price or older = lower price. In particular, we were surprised by the wide spread in sale_price of the three PUD (planned unit development) property types. Maybe something to do with location?

Categorical/categorical relationships

Use this space to examine how the relationship between ms_sub_class and sale_price might be related to another categorical variable in our data.

# Check on how the price of a building sale may be affected by the neighborhood its in.
# First get a good look at neighborhood:

data_trn |>
  plot_categorical("neighborhood", "sale_price", ordered = TRUE) |> 
   plot_grid(plotlist = _, ncol = 1)
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.

# left warnings again

Cat/Cat Notes: There are some neighborhoods that seem to have higher sale_prices on average, but some barely have any observations. Adding color and ordering the factor by its relationship with sale price will help us remember groups of neighborhoods in the next step as we look at the neighborhoods of various dwelling types and sales.

Let’s start just by looking at the three PUD home type sales and their neighborhoods. Since color is assigned based on the relationship with y, keep the following color groups in your head when we look at the next set of graphs: Red/orange = low sale_price, Green/Blue = moderate sale_price, *Pink/Purple = high sale_price

data_trn |> 
  filter(ms_sub_class %in% c("180","160","120")) |> 
  mutate(neighborhood = fct_reorder(neighborhood, sale_price)) |> 
  mutate(ms_sub_class = fct_reorder(ms_sub_class, sale_price)) |> 
  plot_grouped_barplot_count("neighborhood", "ms_sub_class")

It seems like neighborhood may be explaining a lot of the variation in PUD property prices (e.g. I’m only seeing red/green neighborhoods in lowest PUD class 180, and the most pink/purple neighborhoods are in our highest PUD class 120). Lets take a look across all property classes.

data_trn |> 
  mutate(neighborhood = fct_reorder(neighborhood, sale_price)) |> 
  mutate(ms_sub_class = fct_reorder(ms_sub_class, sale_price)) |> 
  plot_grouped_barplot_count("neighborhood", "ms_sub_class")

# We can also look at the opposite
data_trn |> 
  mutate(neighborhood = fct_reorder(neighborhood, sale_price)) |> 
  mutate(ms_sub_class = fct_reorder(ms_sub_class, sale_price)) |>
  plot_grouped_barplot_count("ms_sub_class", "neighborhood")

There is definitely a pattern where the highest selling sub_classes appear to be mostly in the highest selling neighborhoods (pink/blue bars). 1 and 2 stories built pre 1945 are one of the more stable appearances across neighborhoods – this makes sense since old houses are likely to vary widely in condition and desirability.