options(conflicts.policy = "depends.ok")
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
devtoolstidymodels_conflictRules()
Unit 2: EDA Modeling
Packages, Source, Etc
This code chunk will set up conflict policies to reduce errors associated with function conflicts
This code chunk loads all packages you will need for this assignment.
library(cowplot, include.only = c("plot_grid", "theme_half_open"))
library(corrplot, include.only = "corrplot.mixed") # or just use namespace
library(tidyverse) # for general data wrangling
library(tidymodels) # for modeling
Source John’s function scripts
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true") devtools
Set up some other environment settings
options(tibble.width = Inf, tibble.print_max = Inf)
theme_set(theme_half_open()) # maybe you prefer this to classic theme?
In the chunk below, set path_data to the location of your cleaned data files.
<- "homework/unit_02" path_data
Read and set up data
Read data
Load the cleaned training dataset and do skim/glimpse it to get oriented.
<- read_csv(here::here(path_data, "ames_clean_class_trn.csv"),
data_trn col_types = cols()) |>
glimpse()
Rows: 1,465
Columns: 10
$ sale_price <dbl> 105000, 126000, 115000, 127500, 120000, 99500, 125000,…
$ garage_area <dbl> 730, 525, 0, 440, 308, 264, 264, 429, 539, 260, 0, 0, …
$ neighborhood <chr> "names", "names", "names", "npkvill", "npkvill", "sawy…
$ ms_sub_class <chr> "020", "020", "020", "120", "120", "120", "120", "030"…
$ total_bsmt_sf <dbl> 882, 882, 864, 1069, 836, 918, 744, 816, 0, 1040, 950,…
$ bsmt_qual <chr> "ta", "ta", "ta", "gd", "gd", "gd", "gd", "ta", "no_bs…
$ central_air <chr> "y", "y", "y", "y", "y", "y", "y", "n", "y", "y", "y",…
$ tot_rms_abv_grd <dbl> 5, 4, 5, 4, 4, 5, 4, 5, 8, 6, 6, 10, 4, 6, 6, 6, 5, 4,…
$ fireplaces <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fireplace_qu <chr> "no_fireplace", "no_fireplace", "po", "fa", "no_firepl…
Set up variable classes
As before, set up all categorical variables as factors (and confirm order if needed) and set up interval/ration variables as numeric
# note use of suppressWarnings(). Once you understand a warning, its often good practice
# to suppress it so that you know you've resolved it.
<- data_trn |>
data_trn mutate(across(where(is.character), factor)) |>
mutate(bsmt_qual = suppressWarnings(fct_relevel(bsmt_qual, c("no_bsmt", "po", "fa",
"ta", "gd", "ex"))),
fireplace_qu = fct_relevel(fireplace_qu, c("no_fireplace", "po", "fa",
"ta", "gd", "ex"))) |>
glimpse()
Rows: 1,465
Columns: 10
$ sale_price <dbl> 105000, 126000, 115000, 127500, 120000, 99500, 125000,…
$ garage_area <dbl> 730, 525, 0, 440, 308, 264, 264, 429, 539, 260, 0, 0, …
$ neighborhood <fct> names, names, names, npkvill, npkvill, sawyerw, sawyer…
$ ms_sub_class <fct> 020, 020, 020, 120, 120, 120, 120, 030, 090, 020, 020,…
$ total_bsmt_sf <dbl> 882, 882, 864, 1069, 836, 918, 744, 816, 0, 1040, 950,…
$ bsmt_qual <fct> ta, ta, ta, gd, gd, gd, gd, ta, no_bsmt, ta, ta, ta, t…
$ central_air <fct> y, y, y, y, y, y, y, n, y, y, y, y, y, y, y, y, y, y, …
$ tot_rms_abv_grd <dbl> 5, 4, 5, 4, 4, 5, 4, 5, 8, 6, 6, 10, 4, 6, 6, 6, 5, 4,…
$ fireplaces <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ fireplace_qu <fct> no_fireplace, no_fireplace, po, fa, no_fireplace, ta, …
|>
data_trn skim_all() # big picture view of the data
Name | data_trn |
Number of rows | 1465 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
factor | 5 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | n_unique | top_counts |
---|---|---|---|---|
neighborhood | 0 | 1 | 28 | nam: 216, col: 124, old: 121, edw: 100 |
ms_sub_class | 0 | 1 | 15 | 020: 544, 060: 289, 050: 160, 120: 94 |
bsmt_qual | 0 | 1 | 5 | ta: 636, gd: 611, ex: 125, fa: 51 |
central_air | 0 | 1 | 2 | y: 1367, n: 98 |
fireplace_qu | 0 | 1 | 6 | no_: 703, gd: 359, ta: 314, fa: 37 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | skew | kurtosis |
---|---|---|---|---|---|---|---|---|---|---|---|
sale_price | 0 | 1 | 180761.93 | 79016.72 | 13100 | 129500.0 | 160000 | 213500 | 745000 | 1.67 | 4.59 |
garage_area | 1 | 1 | 473.56 | 214.12 | 0 | 321.5 | 480 | 577 | 1488 | 0.18 | 1.01 |
total_bsmt_sf | 0 | 1 | 1047.90 | 439.71 | 0 | 780.0 | 992 | 1314 | 6110 | 1.26 | 12.12 |
tot_rms_abv_grd | 0 | 1 | 6.47 | 1.55 | 3 | 5.0 | 6 | 7 | 14 | 0.76 | 1.19 |
fireplaces | 0 | 1 | 0.60 | 0.64 | 0 | 0.0 | 1 | 1 | 3 | 0.66 | -0.17 |
Univariate Distributions
Numeric variables
examine the univariate distributions of your numeric variables. Use map()
to iterate through all numeric variables and create a plot_grid
in a single piped chain.
# only need one of these. Which do you prefer?
|> plot_hist("sale_price") # historgram data_trn
|> plot_freqpoly("sale_price") # frequency polygon data_trn
# This one gives some additional info
|> plot_boxplot("sale_price") # boxplot data_trn
# box violin plot
# the warning is due to the missing value for garage_area
# We see that on the next skim.
# In a later code chunk we show how to resolve this by filter out NA temporarily
|>
data_trn select(where(is.numeric)) |>
names() |>
map(\(name) plot_box_violin(df = data_trn, x = name)) |>
plot_grid(plotlist = _, ncol = 2)
Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
# summary statistics
|>
data_trn skim_all() |>
filter(skim_type == "numeric")
Name | data_trn |
Number of rows | 1465 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | skew | kurtosis |
---|---|---|---|---|---|---|---|---|---|---|---|
sale_price | 0 | 1 | 180761.93 | 79016.72 | 13100 | 129500.0 | 160000 | 213500 | 745000 | 1.67 | 4.59 |
garage_area | 1 | 1 | 473.56 | 214.12 | 0 | 321.5 | 480 | 577 | 1488 | 0.18 | 1.01 |
total_bsmt_sf | 0 | 1 | 1047.90 | 439.71 | 0 | 780.0 | 992 | 1314 | 6110 | 1.26 | 12.12 |
tot_rms_abv_grd | 0 | 1 | 6.47 | 1.55 | 3 | 5.0 | 6 | 7 | 14 | 0.76 | 1.19 |
fireplaces | 0 | 1 | 0.60 | 0.64 | 0 | 0.0 | 1 | 1 | 3 | 0.66 | -0.17 |
Numeric Univariate Notes: sale_price
, garage_area
, and tot_bsmt_sf
show positive skews, though the skew in tot_bsmt_sf
may be driven by a handful of outliers (vs sale_price
which has a high number of outliers and garage_area
which has a moderate number of outliers). fireplaces
appears mostly bimodal due to roughly equal numbers of homes having no vs any fireplace (with those who have one being significantly more likely to have only one fireplace, and the 3 fireplace observation appears to be an outlier). tot_rms_abv_grd
may have a few outlier observations above ten rooms.
Categorical variables
Examine univariate distributions of your categorical variables. Use map()
to iterate through all categorical variables and display them together using plot_grid()
.
# Bar plots
|>
data_trn select(where(is.factor)) |>
names() |>
map(\(name) plot_bar(df = data_trn, x = name)) |>
plot_grid(plotlist = _, ncol = 2)
# Table
# Maybe you like a table? I think the above plot is still easier to digest
# though maybe the table makes clear how few observations there are in some
# categories
|> tab(ms_sub_class) data_trn
# A tibble: 15 × 3
ms_sub_class n prop
<fct> <int> <dbl>
1 020 544 0.371
2 030 64 0.0437
3 040 2 0.00137
4 045 5 0.00341
5 050 160 0.109
6 060 289 0.197
7 070 65 0.0444
8 075 12 0.00819
9 080 53 0.0362
10 085 21 0.0143
11 090 57 0.0389
12 120 94 0.0642
13 160 66 0.0451
14 180 7 0.00478
15 190 26 0.0177
Univariate categorical variable notes: We notice a lot of levels in neighborhood
and ms_sub_class
that have far fewer observations than other categories when we take a closer look at these variables at later steps we may consider collapsing some factor levels. We will also continue to wonder if fireplace and basement quality are best considered nominal or ordinal now that we have included a “no basment/fireplace” category. …who is to say that having a poor quality fireplace may be more detrimental to sale price than not having one at all? We will explore that and more with bivariate plots
Bivariate Relationships
Numeric variables
Examine the relationship between numeric predictors and the outcome variable sale_price
. Use map()
to complete these in a single pipe – specify sale_price
as y
value.
|>
data_trn select(where(is.numeric) &!sale_price) %>%
names() |> # pass in the variable names as strings
map(\(name) plot_scatter(df = data_trn, x = name, y = "sale_price")) %>%
plot_grid(plotlist = ., ncol = 2)
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 1 rows containing missing values (`geom_point()`).
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: pseudoinverse used at -0.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: neighborhood radius 1.015
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: reciprocal condition number 9.8322e-29
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
: There are other near singularities as well. 1
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
-0.015
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
1.015
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
number 9.8322e-29
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
else if (is.data.frame(newdata))
as.matrix(model.frame(delete.response(terms(object)), : There are other near
singularities as well. 1
# use warnings() to explore these warnings
# a few due to missing observation. Not a problem
# The lowess line is having some trouble with fit but this is really just a visual
# aid so not concerned.
Bivariate Relationship notes: There is a pretty strong relationship between basement square footage and sale price which is encouraging. Total number of rooms and garage also hold interesting relationships – they appear to have a strong linear relationship with sale_price but drop off after around 10 rooms for total rooms. For garage areas above 1000 square feet, we wonder if this is potentially related to the type of property the residence is – while some houses with many rooms and large garages may be mansions, others may be multi-family homes with lower sale prices.
One possibly useful transformation: let’s see what it looks like if we take the log of sale_price
:
|>
data_trn filter(!is.na(garage_area)) |> # this removes the warning due to missing value
mutate(sale_price = log(sale_price)) |>
plot_scatter("garage_area", "sale_price")
|>
data_trn mutate(sale_price = log(sale_price)) |>
plot_scatter("total_bsmt_sf", "sale_price")
|>
data_trn mutate(sale_price = log(sale_price)) |>
plot_scatter("tot_rms_abv_grd", "sale_price")
|>
data_trn mutate(sale_price = log(sale_price)) |>
plot_scatter("fireplaces", "sale_price")
# the warnings about lowess lines are here too
# we suppressed them for the full code chunk because we understand all of them
# use #| warning: false at the top of the code chunk to do this
Correlations
Convert any ordinal predictors to numeric, select()
down to numeric variables, and plot all variable correlations with sale_price
.
|>
data_trn mutate(bsmt_qual = as.numeric(bsmt_qual),
fireplace_qu = as.numeric(fireplace_qu)) |>
select(where(is.numeric)) |>
cor(use = "pairwise.complete.obs") |> # again, to handle missing values
::corrplot.mixed() corrplot
Correlation notes: This looks pretty good – all of our numeric variables seem to be moderately correlated with the outcome. Maybe there was a benefit to replacing the NAs in the quality variables.
Categorical variables
Now choose the best plot(s) to visualize the relationship between your categorical predictors and sale_price()
.
# Start with grouped_box_violin plots to get oriented
# then choose what to look at more in depth.
|>
data_trn select(where(is.factor)) |>
names() |>
map(\(name) plot_grouped_box_violin(df = data_trn, x = name, y = "sale_price")) |>
plot_grid(plotlist = _, ncol = 2)
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
# we left this warning to remind us that some of the groups will be missing from
# these plots
Categorical bivariate notes: Linear looking relationship with bsmt_qual
, but missing poor quality in this set. Possible that having a poor basement is worse than not having one. Similar pattern seen with fireplace quality as well as central air.
Guided categorical exploration
ms_sub_class
is a mysterious variable – take a closer look at it’s relationship with sale_price
. Do some types of homes sell for more than others? (order ms_sub_class
factor based on values of sale_price
)
# Order levels of the categorical predictor by the outcome.
|>
data_trn plot_categorical("ms_sub_class","sale_price", ordered = TRUE)|>
plot_grid(plotlist = _, ncol = 1)
Guided categorical exploration notes: Write out what the codes stand to help group these together a little more:
030: 1 story, pre 1945 045: 1.5 story unfinished 190: 2 family home 180: multilevel planned unit development (pud) 160: 2 story pud, 1946+ 050: 1.5 story finished 090: duplex 040: 1 story finished attic 150: 1-1/2 story pud - all ages 085: split foyer 070: 2 story pre 1945 020: 1 story 1946+ (most common) 080: split/multilevel 120: 1 story pud 1946+ 075: 2.5 story 060: 2 story 1946+ (second most common)
The relationship with price is more complex than just more stories = higher price or older = lower price. In particular, we were surprised by the wide spread in sale_price
of the three PUD (planned unit development) property types. Maybe something to do with location?
Categorical/categorical relationships
Use this space to examine how the relationship between ms_sub_class
and sale_price
might be related to another categorical variable in our data.
# Check on how the price of a building sale may be affected by the neighborhood its in.
# First get a good look at neighborhood:
|>
data_trn plot_categorical("neighborhood", "sale_price", ordered = TRUE) |>
plot_grid(plotlist = _, ncol = 1)
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
# left warnings again
Cat/Cat Notes: There are some neighborhoods that seem to have higher sale_prices on average, but some barely have any observations. Adding color and ordering the factor by its relationship with sale price will help us remember groups of neighborhoods in the next step as we look at the neighborhoods of various dwelling types and sales.
Let’s start just by looking at the three PUD home type sales and their neighborhoods. Since color is assigned based on the relationship with y, keep the following color groups in your head when we look at the next set of graphs: Red/orange = low sale_price
, Green/Blue = moderate sale_price
, *Pink/Purple = high sale_price
|>
data_trn filter(ms_sub_class %in% c("180","160","120")) |>
mutate(neighborhood = fct_reorder(neighborhood, sale_price)) |>
mutate(ms_sub_class = fct_reorder(ms_sub_class, sale_price)) |>
plot_grouped_barplot_count("neighborhood", "ms_sub_class")
It seems like neighborhood may be explaining a lot of the variation in PUD property prices (e.g. I’m only seeing red/green neighborhoods in lowest PUD class 180, and the most pink/purple neighborhoods are in our highest PUD class 120). Lets take a look across all property classes.
|>
data_trn mutate(neighborhood = fct_reorder(neighborhood, sale_price)) |>
mutate(ms_sub_class = fct_reorder(ms_sub_class, sale_price)) |>
plot_grouped_barplot_count("neighborhood", "ms_sub_class")
# We can also look at the opposite
|>
data_trn mutate(neighborhood = fct_reorder(neighborhood, sale_price)) |>
mutate(ms_sub_class = fct_reorder(ms_sub_class, sale_price)) |>
plot_grouped_barplot_count("ms_sub_class", "neighborhood")
There is definitely a pattern where the highest selling sub_classes
appear to be mostly in the highest selling neighborhoods (pink/blue bars). 1 and 2 stories built pre 1945 are one of the more stable appearances across neighborhoods – this makes sense since old houses are likely to vary widely in condition and desirability.