library(tidyverse) # for general data wrangling
library(tidymodels) # for modeling
options(conflicts.policy = "depends.ok")
library(cowplot, include.only = c("plot_grid", "theme_half_open"))
source("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
source("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
options(tibble.width = Inf, tibble.print_max = Inf)
theme_set(theme_half_open()) # maybe you prefer this to classic theme?
path_data <- "application_assignments/unit_04"Unit 4 (Classification) Application Assignment: Modeling EDA
Packages, Source, Etc
Here we demonstrate how you could put all of your set up in one code chunk!
Read and set up data
Read data
Load the cleaned training dataset and glimpse it to get oriented.
data_trn <- read_csv(here::here(path_data, "titanic_train_cln.csv"),
col_types = cols()) |>
glimpse()Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ pclass <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <chr> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <chr> "female", "male", "male", "male", "male", "male", "male…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <chr> "113781", "113781", "PC 17757", "PC 17318", "PC 17558",…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <chr> "C22 C26", "C22 C26", "C62 C64", NA, "B58 B60", NA, "A2…
$ embarked <chr> "s", "s", "c", "s", "c", "c", "s", "c", "c", "s", "s", …
Set up variable classes
As before, set up all categorical variables as factors (and confirm order if needed) and set up interval/ratio variables as numeric
data_trn <- data_trn |>
mutate(across(where(is.character), factor)) |>
mutate(pclass = factor(pclass, levels = c("1", "2", "3"))) |>
glimpse()Rows: 736
Columns: 12
$ passenger_id <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex <fct> female, male, male, male, male, male, male, male, male,…
$ age <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
data_trn |>
skim_all() # big picture view of the data| Name | data_trn |
| Number of rows | 736 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| factor | 7 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | n_unique | top_counts |
|---|---|---|---|---|
| survived | 0 | 1.00 | 2 | no: 455, yes: 281 |
| pclass | 0 | 1.00 | 3 | 3: 387, 2: 175, 1: 174 |
| name | 0 | 1.00 | 736 | Abb: 1, Abb: 1, Abb: 1, Abe: 1 |
| sex | 0 | 1.00 | 2 | mal: 479, fem: 257 |
| ticket | 0 | 1.00 | 588 | CA.: 6, S.O: 6, 113: 4, 160: 4 |
| cabin | 579 | 0.21 | 118 | F4: 4, B57: 3, C23: 3, D: 3 |
| embarked | 0 | 1.00 | 3 | s: 524, c: 144, q: 68 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | skew | kurtosis |
|---|---|---|---|---|---|---|---|---|---|---|---|
| passenger_id | 0 | 1.0 | 647.63 | 372.37 | 3.00 | 343.75 | 635.0 | 974.50 | 1308.00 | 0.02 | -1.16 |
| age | 146 | 0.8 | 29.71 | 13.96 | 0.17 | 21.00 | 28.0 | 38.00 | 76.00 | 0.35 | 0.15 |
| n_sib_spouse | 0 | 1.0 | 0.49 | 1.00 | 0.00 | 0.00 | 0.0 | 1.00 | 8.00 | 3.74 | 19.59 |
| n_parent_child | 0 | 1.0 | 0.37 | 0.88 | 0.00 | 0.00 | 0.0 | 0.00 | 9.00 | 4.28 | 29.26 |
| fare | 0 | 1.0 | 32.47 | 50.00 | 0.00 | 7.90 | 14.4 | 31.07 | 512.33 | 4.44 | 28.25 |
Univariate Distributions
Numeric variables
Examine the univariate distributions of your numeric variables. Use map() to iterate through all numeric variables and create a plot_grid in a single piped chain.
# box violin plot
# the warnings are due to missing values in age
# We see that on the next skim.
data_trn |>
select(where(is.numeric)) |>
select(-passenger_id) |>
names() |>
map(\(name) plot_box_violin(df = data_trn, x = name)) |>
plot_grid(plotlist = _, ncol = 2)Warning: Removed 146 rows containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 146 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: The `fatten` argument of `geom_boxplot()` is deprecated as of ggplot2 4.0.0.
ℹ Please use the `median.linewidth` argument instead.
# summary statistics
data_trn |>
skim_all() |>
filter(skim_type == "numeric")| Name | data_trn |
| Number of rows | 736 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | skew | kurtosis |
|---|---|---|---|---|---|---|---|---|---|---|---|
| passenger_id | 0 | 1.0 | 647.63 | 372.37 | 3.00 | 343.75 | 635.0 | 974.50 | 1308.00 | 0.02 | -1.16 |
| age | 146 | 0.8 | 29.71 | 13.96 | 0.17 | 21.00 | 28.0 | 38.00 | 76.00 | 0.35 | 0.15 |
| n_sib_spouse | 0 | 1.0 | 0.49 | 1.00 | 0.00 | 0.00 | 0.0 | 1.00 | 8.00 | 3.74 | 19.59 |
| n_parent_child | 0 | 1.0 | 0.37 | 0.88 | 0.00 | 0.00 | 0.0 | 0.00 | 9.00 | 4.28 | 29.26 |
| fare | 0 | 1.0 | 32.47 | 50.00 | 0.00 | 7.90 | 14.4 | 31.07 | 512.33 | 4.44 | 28.25 |
Numeric Univariate Notes:
Looks like some transformations might be helpful - lots of highly skewed distributions. There could potentially be an outlier for the highest value in fare.
Categorical variables
Examine univariate distributions of your categorical variables. Use map() to iterate through all categorical variables and display them together using plot_grid(). Ignoring name, cabin, ticket for now because they have so many unique values (we know this from cleaning EDA).
# Bar plots
data_trn |>
select(where(is.factor), -survived, -cabin, -ticket, -name) |>
names() |>
map(\(name) plot_bar(df = data_trn, x = name)) |>
plot_grid(plotlist = _, ncol = 2) data_trn |> plot_bar("survived")Univariate categorical variable notes: We can see that most people were in third class (though there’s good representation across classes), there are more males than females (though again good representation for each), and the embarkation port of Southampton is by far the modal response. There’s at least one missing value on embarked, but dealing with it/them will be trivial given how few missing values there are.
Bivariate Relationships
Numeric variables
Examine the relationship between numeric predictors and the outcome variable survived. Use map() to complete these in a single pipe – specify sale_price as y value.
data_trn |>
select(where(is.numeric) &!passenger_id) |>
names() |>
map(~ plot_grouped_box_violin(df = data_trn, x = "survived", y = .x)) |>
plot_grid(plotlist = _, ncol = 2)Warning: Removed 146 rows containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 146 rows containing non-finite outside the scale range
(`stat_boxplot()`).
# again the warnings are dur to missing values in age. Not a problem.Bivariate Relationship notes: Most numeric variables don’t seem like they’ll be that helpful - there are relatively similar distributions across survival outcomes. Maybe if we transform those variables to deal with skew, the relationships will improve. We’ll test that out in model fitting.
Correlations
Convert any ordinal predictors to numeric, select() down to numeric variables, and plot all variable correlations with survived.
data_trn |>
mutate(pclass = as.numeric(pclass)) |>
mutate(survived_yes = if_else(survived == "yes", 1, 0), # Can dummy code categorical predictors to examine correlations
embarked_s = if_else(embarked == "s", 1, 0),
embarked_q = if_else(embarked == "q", 1, 0)) |>
select(-embarked, -survived) |>
select(where(is.numeric) & !passenger_id) |>
relocate(survived_yes, .after = last_col()) |> # put outcome last to make it easy to see
cor(use = "pairwise.complete.obs") |> # deals with missing values in age
corrplot::corrplot.mixed()Correlation notes: It looks like there are correlations between survivors and their fair and the number of children they have. There is also correlation between surviving and pclass such that first class passengers were more likely to survive.
Categorical variables
Now choose the best plot(s) to visualize the relationship between your categorical predictors and survived.
data_trn |>
plot_grouped_barplot_percent(x = "embarked", y = "survived")data_trn |>
plot_grouped_barplot_percent(x = "survived", y = "embarked")data_trn |>
plot_grouped_barplot_percent(x = "sex", y = "survived")data_trn |>
plot_grouped_barplot_percent(x = "pclass", y = "survived")Categorical bivariate notes: One thing to note is that the relationship of pclass with survived seems somewhat monotonic, meaning that as pclass increases from 1 to 3, survival decreases - we may be able to keep that ordered factor as numeric. We can try it both ways!
Guided categorical exploration
Let’s look more closely at the cabin variable. How many unique values (levels) does it have? Are their any patterns in the way cabins are categorized?
length(unique(data_trn$cabin))[1] 119
skim_some(data_trn$cabin)| Name | data_trn$cabin |
| Number of rows | 736 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| data | 579 | 0.21 | FALSE | 118 | F4: 4, B57: 3, C23: 3, D: 3 |
Guided categorical exploration notes: At first glance, this is probably too many unique values to be useful. Also, there are a lot of missing values. Looking more closely, however, we can see that there may be some valuable information here: each cabin number starts with a letter (corresponding to a deck), and some passengers have multiple cabins separated by a space. Let’s create two new variables with that information (deck & n_cabins).
Let’s look at the cabin variable divided into deck and number of cabins
data_trn <- data_trn |>
mutate(deck = str_extract(cabin, "[:alpha:]"),
n_cabins = str_count(cabin, "[:alpha:]"))
data_trn |>
plot_grouped_barplot_percent(x = "survived", y = "deck")data_trn |>
plot_grouped_barplot_percent(x = "survived", y = "n_cabins")Warning: Removed 579 rows containing non-finite outside the scale range
(`stat_count()`).
Guided categorical exploration notes: There appears to be some variation in deck in that those who were in deck D or E were more likely to survive but this variable still contains a lot of missing values. There will still be just as many missing values on deck and n_cabins as with cabin, but they would both be easier to impute (if we wanted to) and have greater representation across levels (e.g., only 8 unique values of deck). However, we might decide that the level of missingness isn’t worth dealing with. This is especially true given how overlapping deck is with pclass (see below)
data_trn |>
plot_grouped_barplot_count(x = "pclass", y = "deck")Guided categorical exploration notes: We might just be better off using pclass.
Now, let’s look more closely at the name variable. How many unique values (levels) does it have?
length(unique(data_trn$name))[1] 736
Guided categorical exploration notes: Like with cabin, this number of unique values is daunting (every passenger has a name), but there may be some valuable information here. Each name follows the same format: Last name, Title. First name [optional: middle name] [optional: (Maiden name)]. Let’s extract title, and then collapse into a sparse_title variable that groups similar categories of titles together (using domain knowledge & the internet). I did this using functions from the stringr package (loaded as part of the tidyverse) (see below).
Here we extract categories from the name variable to see if these categories may be useful in telling us more about who survived
# create a "title" variable for each name
data_trn <- data_trn |>
mutate(title = str_to_lower(str_extract(name,
"(?<=[:punct:][:space:])[:alpha:]{2,}(?=[:punct:])"))) %>%
mutate(sparse_title = case_when(
title == "master" ~ "boy",
title %in% c("miss", "ms") ~ "young_woman",
title %in% c("dona", "lady", "mrs") ~ "adult_woman",
title %in% c("mr", "sir", "don", "jonkheer") ~ "adult_man",
title %in% c("col", "dr", "major", "rev") ~ "professional",
TRUE ~ NA_character_
)) |>
mutate(sparse_title = case_when(
is.na(sparse_title) & age < 12 & sex == "male" ~ "boy",
is.na(sparse_title) & age < 18 & sex == "female" ~ "young_woman",
is.na(sparse_title) & sex == "male" ~ "adult_man",
is.na(sparse_title) & sex == "female" ~ "adult_woman",
!is.na(sparse_title) ~ as.character(sparse_title),
TRUE ~ NA_character_
))
data_trn |>
group_by(sparse_title) |>
summarize(mean_age = mean(age, na.rm = TRUE),
median_age = median(age, na.rm = TRUE),
n = n())# A tibble: 5 × 4
sparse_title mean_age median_age n
<chr> <dbl> <dbl> <int>
1 adult_man 31.9 29 436
2 adult_woman 37.5 35 106
3 boy 5.22 5.5 30
4 professional 46 49 14
5 young_woman 21.3 22 150
Guided categorical exploration notes: We can see that titles include information about both age and sex and we see that more clearly by grouping title by age. This indicates we might be able to use this variable to impute more strategically for the many missing values of age (e.g., impute mean or median within sparse title group rather than mean/median of the full sample).