options(conflicts.policy = "depends.ok")
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
devtoolstidymodels_conflictRules()
Unit 4 (Classification) Application Assignment: Modeling EDA
Packages, Source, Etc
This code chunk will set up conflict policies to reduce errors associated with function conflicts
This code chunk loads all packages needed for this assignment.
library(cowplot, include.only = c("plot_grid", "theme_half_open"))
library(tidyverse) # for general data wrangling
library(tidymodels) # for modeling
Source John’s function scripts
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true") devtools
Set up some other environment settings
options(tibble.width = Inf, tibble.print_max = Inf)
theme_set(theme_half_open()) # maybe you prefer this to classic theme?
In the chunk below, set path_data to the location of your cleaned data files.
<- "homework/unit_04" path_data
Read and set up data
Read data
Load the cleaned training dataset and glimpse it to get oriented.
<- read_csv(here::here(path_data, "titanic_train_cln.csv"),
data_trn col_types = cols()) |>
glimpse()
Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <chr> "female", "male", "male", "male", "male", "male", "male…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <chr> "113781", "113781", "PC 17757", "PC 17318", "PC 17558",…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <chr> "C22 C26", "C22 C26", "C62 C64", NA, "B58 B60", NA, "A2…
$ embarked <chr> "s", "s", "c", "s", "c", "c", "s", "c", "c", "s", "s", …
Set up variable classes
As before, set up all categorical variables as factors (and confirm order if needed) and set up interval/ratio variables as numeric
<- data_trn |>
data_trn mutate(across(where(is.character), factor)) |>
mutate(pclass = factor(pclass, levels = c("1", "2", "3"))) |>
glimpse()
Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <fct> female, male, male, male, male, male, male, male, male,…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
|>
data_trn skim_all() # big picture view of the data
Name | data_trn |
Number of rows | 736 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
factor | 7 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | n_unique | top_counts |
---|---|---|---|---|
survived | 0 | 1.00 | 2 | no: 455, yes: 281 |
pclass | 0 | 1.00 | 3 | 3: 387, 2: 175, 1: 174 |
name | 0 | 1.00 | 736 | Abb: 1, Abb: 1, Abb: 1, Abe: 1 |
sex | 0 | 1.00 | 2 | mal: 479, fem: 257 |
ticket | 0 | 1.00 | 588 | CA.: 6, S.O: 6, 113: 4, 160: 4 |
cabin | 579 | 0.21 | 118 | F4: 4, B57: 3, C23: 3, D: 3 |
embarked | 0 | 1.00 | 3 | s: 524, c: 144, q: 68 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | skew | kurtosis |
---|---|---|---|---|---|---|---|---|---|---|---|
passenger_id | 0 | 1.0 | 647.63 | 372.37 | 3.00 | 343.75 | 635.0 | 974.50 | 1308.00 | 0.02 | -1.16 |
age | 146 | 0.8 | 29.71 | 13.96 | 0.17 | 21.00 | 28.0 | 38.00 | 76.00 | 0.35 | 0.15 |
n_sib_spouse | 0 | 1.0 | 0.49 | 1.00 | 0.00 | 0.00 | 0.0 | 1.00 | 8.00 | 3.74 | 19.59 |
n_parent_child | 0 | 1.0 | 0.37 | 0.88 | 0.00 | 0.00 | 0.0 | 0.00 | 9.00 | 4.28 | 29.26 |
fare | 0 | 1.0 | 32.47 | 50.00 | 0.00 | 7.90 | 14.4 | 31.07 | 512.33 | 4.44 | 28.25 |
Univariate Distributions
Numeric variables
Examine the univariate distributions of your numeric variables. Use map()
to iterate through all numeric variables and create a plot_grid
in a single piped chain.
# box violin plot
# the warnings are due to missing values in age
# We see that on the next skim.
|>
data_trn select(where(is.numeric)) |>
select(-passenger_id) |>
names() |>
map(\(name) plot_box_violin(df = data_trn, x = name)) |>
plot_grid(plotlist = _, ncol = 2)
Warning: Removed 146 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).
# summary statistics
|>
data_trn skim_all() |>
filter(skim_type == "numeric")
Name | data_trn |
Number of rows | 736 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | skew | kurtosis |
---|---|---|---|---|---|---|---|---|---|---|---|
passenger_id | 0 | 1.0 | 647.63 | 372.37 | 3.00 | 343.75 | 635.0 | 974.50 | 1308.00 | 0.02 | -1.16 |
age | 146 | 0.8 | 29.71 | 13.96 | 0.17 | 21.00 | 28.0 | 38.00 | 76.00 | 0.35 | 0.15 |
n_sib_spouse | 0 | 1.0 | 0.49 | 1.00 | 0.00 | 0.00 | 0.0 | 1.00 | 8.00 | 3.74 | 19.59 |
n_parent_child | 0 | 1.0 | 0.37 | 0.88 | 0.00 | 0.00 | 0.0 | 0.00 | 9.00 | 4.28 | 29.26 |
fare | 0 | 1.0 | 32.47 | 50.00 | 0.00 | 7.90 | 14.4 | 31.07 | 512.33 | 4.44 | 28.25 |
Numeric Univariate Notes:
Looks like some transformations might be helpful - lots of highly skewed distributions. There could potentially be an outlier for the highest value in fare.
Categorical variables
Examine univariate distributions of your categorical variables. Use map()
to iterate through all categorical variables and display them together using plot_grid()
. Ignoring name, cabin, ticket for now because they have so many unique values (we know this from cleaning EDA).
# Bar plots
|>
data_trn select(where(is.factor), -survived, -cabin, -ticket, -name) |>
names() |>
map(\(name) plot_bar(df = data_trn, x = name)) |>
plot_grid(plotlist = _, ncol = 2)
|> plot_bar("survived") data_trn
Univariate categorical variable notes: We can see that most people were in third class (though there’s good representation across classes), there are more males than females (though again good representation for each), and the embarkation port of Southampton is by far the modal response. There’s at least one missing value on embarked, but dealing with it/them will be trivial given how few missing values there are.
Bivariate Relationships
Numeric variables
Examine the relationship between numeric predictors and the outcome variable survived
. Use map()
to complete these in a single pipe – specify sale_price
as y
value.
|>
data_trn select(where(is.numeric) &!passenger_id) |>
names() |>
map(~ plot_grouped_box_violin(df = data_trn, x = "survived", y = .x)) |>
plot_grid(plotlist = _, ncol = 2)
Warning: Removed 146 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).
# again the warnings are dur to missing values in age. Not a problem.
Bivariate Relationship notes: Most numeric variables don’t seem like they’ll be that helpful - there are relatively similar distributions across survival outcomes. Maybe if we transform those variables to deal with skew, the relationships will improve. We’ll test that out in model fitting.
Correlations
Convert any ordinal predictors to numeric, select()
down to numeric variables, and plot all variable correlations with survived
.
|>
data_trn mutate(pclass = as.numeric(pclass)) |>
mutate(survived_yes = if_else(survived == "yes", 1, 0), # Can dummy code categorical predictors to examine correlations
embarked_s = if_else(embarked == "s", 1, 0),
embarked_q = if_else(embarked == "q", 1, 0)) |>
select(-embarked, -survived) |>
select(where(is.numeric) & !passenger_id) |>
relocate(survived_yes, .after = last_col()) |> # put outcome last to make it easy to see
cor(use = "pairwise.complete.obs") |> # deals with missing values in age
::corrplot.mixed() corrplot
Correlation notes: It looks like there are correlations between survivors and their fair and the number of children they have. There is also correlation between surviving and pclass such that first class passengers were more likely to survive.
Categorical variables
Now choose the best plot(s) to visualize the relationship between your categorical predictors and survived
.
|>
data_trn plot_grouped_barplot_percent(x = "embarked", y = "survived")
|>
data_trn plot_grouped_barplot_percent(x = "survived", y = "embarked")
|>
data_trn plot_grouped_barplot_percent(x = "sex", y = "survived")
|>
data_trn plot_grouped_barplot_percent(x = "pclass", y = "survived")
Categorical bivariate notes: One thing to note is that the relationship of pclass with survived seems somewhat monotonic, meaning that as pclass increases from 1 to 3, survival decreases - we may be able to keep that ordered factor as numeric. We can try it both ways!
Guided categorical exploration
Let’s look more closely at the cabin variable. How many unique values (levels) does it have? Are their any patterns in the way cabins are categorized?
length(unique(data_trn$cabin))
[1] 119
skim_some(data_trn$cabin)
Name | data_trn$cabin |
Number of rows | 736 |
Number of columns | 1 |
_______________________ | |
Column type frequency: | |
factor | 1 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
data | 579 | 0.21 | FALSE | 118 | F4: 4, B57: 3, C23: 3, D: 3 |
Guided categorical exploration notes: At first glance, this is probably too many unique values to be useful. Also, there are a lot of missing values. Looking more closely, however, we can see that there may be some valuable information here: each cabin number starts with a letter (corresponding to a deck), and some passengers have multiple cabins separated by a space. Let’s create two new variables with that information (deck & n_cabins).
Let’s look at the cabin variable divided into deck and number of cabins
<- data_trn |>
data_trn mutate(deck = str_extract(cabin, "[:alpha:]"),
n_cabins = str_count(cabin, "[:alpha:]"))
|>
data_trn plot_grouped_barplot_percent(x = "survived", y = "deck")
|>
data_trn plot_grouped_barplot_percent(x = "survived", y = "n_cabins")
Warning: Removed 579 rows containing non-finite values (`stat_count()`).
Guided categorical exploration notes: There appears to be some variation in deck in that those who were in deck D or E were more likely to survive but this variable still contains a lot of missing values. There will still be just as many missing values on deck and n_cabins as with cabin, but they would both be easier to impute (if we wanted to) and have greater representation across levels (e.g., only 8 unique values of deck). However, we might decide that the level of missingness isn’t worth dealing with. This is especially true given how overlapping deck is with pclass (see below)
|>
data_trn plot_grouped_barplot_count(x = "pclass", y = "deck")
Guided categorical exploration notes: We might just be better off using pclass.
Now, let’s look more closely at the name variable. How many unique values (levels) does it have?
length(unique(data_trn$name))
[1] 736
Guided categorical exploration notes: Like with cabin, this number of unique values is daunting (every passenger has a name), but there may be some valuable information here. Each name follows the same format: Last name, Title. First name [optional: middle name] [optional: (Maiden name)]. Let’s extract title, and then collapse into a sparse_title variable that groups similar categories of titles together (using domain knowledge & the internet). I did this using functions from the stringr package (loaded as part of the tidyverse) (see below).
Here we extract categories from the name variable to see if these categories may be useful in telling us more about who survived
# create a "title" variable for each name
<- data_trn %>%
data_trn mutate(title = str_to_lower(str_extract(name,
"(?<=[:punct:][:space:])[:alpha:]{2,}(?=[:punct:])"))) %>%
mutate(sparse_title = case_when(
== "master" ~ "boy",
title %in% c("miss", "ms") ~ "young_woman",
title %in% c("dona", "lady", "mrs") ~ "adult_woman",
title %in% c("mr", "sir", "don", "jonkheer") ~ "adult_man",
title %in% c("col", "dr", "major", "rev") ~ "professional",
title TRUE ~ NA_character_
%>%
)) mutate(sparse_title = case_when(
is.na(sparse_title) & age < 12 & sex == "male" ~ "boy",
is.na(sparse_title) & age < 18 & sex == "female" ~ "young_woman",
is.na(sparse_title) & sex == "male" ~ "adult_man",
is.na(sparse_title) & sex == "female" ~ "adult_woman",
!is.na(sparse_title) ~ as.character(sparse_title),
TRUE ~ NA_character_
))
|>
data_trn group_by(sparse_title) |>
summarize(mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE),
n = n())
# A tibble: 5 × 4
sparse_title mean_age median_age n
<chr> <dbl> <dbl> <int>
1 adult_man 31.9 29 436
2 adult_woman 37.5 35 106
3 boy 5.22 5.5 30
4 professional 46 49 14
5 young_woman 21.3 22 150
Guided categorical exploration notes: We can see that titles include information about both age and sex and we see that more clearly by grouping title by age. This indicates we might be able to use this variable to impute more strategically for the many missing values of age (e.g., impute mean or median within sparse title group rather than mean/median of the full sample).