Unit 4 (Classification) Application Assignment: Modeling EDA

Author

TA Key

Published

February 13, 2024

Packages, Source, Etc

This code chunk will set up conflict policies to reduce errors associated with function conflicts

options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()

This code chunk loads all packages needed for this assignment.

library(cowplot, include.only = c("plot_grid", "theme_half_open"))
library(tidyverse) # for general data wrangling
library(tidymodels) # for modeling

Source John’s function scripts

devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")

Set up some other environment settings

options(tibble.width = Inf, tibble.print_max = Inf)
theme_set(theme_half_open()) # maybe you prefer this to classic theme?

In the chunk below, set path_data to the location of your cleaned data files.

path_data <- "homework/unit_04"

Read and set up data

Read data

Load the cleaned training dataset and glimpse it to get oriented.

data_trn <- read_csv(here::here(path_data, "titanic_train_cln.csv"),
                     col_types = cols()) |> 
  glimpse()
Rows: 736
Columns: 12
$ passenger_id   <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived       <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
$ pclass         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name           <chr> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex            <chr> "female", "male", "male", "male", "male", "male", "male…
$ age            <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket         <chr> "113781", "113781", "PC 17757", "PC 17318", "PC 17558",…
$ fare           <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin          <chr> "C22 C26", "C22 C26", "C62 C64", NA, "B58 B60", NA, "A2…
$ embarked       <chr> "s", "s", "c", "s", "c", "c", "s", "c", "c", "s", "s", …

Set up variable classes

As before, set up all categorical variables as factors (and confirm order if needed) and set up interval/ratio variables as numeric

data_trn <- data_trn |>
  mutate(across(where(is.character), factor)) |> 
  mutate(pclass = factor(pclass, levels  = c("1", "2", "3"))) |>
  glimpse()
Rows: 736
Columns: 12
$ passenger_id   <dbl> 3, 4, 11, 16, 17, 26, 39, 40, 41, 46, 53, 54, 59, 61, 6…
$ survived       <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ pclass         <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ name           <fct> "Allison, Miss. Helen Loraine", "Allison, Mr. Hudson Jo…
$ sex            <fct> female, male, male, male, male, male, male, male, male,…
$ age            <dbl> 2, 30, 47, NA, 24, 25, 41, 48, NA, 45, 28, 17, 49, 36, …
$ n_sib_spouse   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0…
$ n_parent_child <dbl> 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ticket         <fct> 113781, 113781, PC 17757, PC 17318, PC 17558, 13905, 11…
$ fare           <dbl> 151.5500, 151.5500, 227.5250, 25.9250, 247.5208, 26.000…
$ cabin          <fct> C22 C26, C22 C26, C62 C64, NA, B58 B60, NA, A21, B10, N…
$ embarked       <fct> s, s, c, s, c, c, s, c, c, s, s, s, s, s, s, c, s, c, s…
data_trn |> 
  skim_all() # big picture view of the data
Data summary
Name data_trn
Number of rows 736
Number of columns 12
_______________________
Column type frequency:
factor 7
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate n_unique top_counts
survived 0 1.00 2 no: 455, yes: 281
pclass 0 1.00 3 3: 387, 2: 175, 1: 174
name 0 1.00 736 Abb: 1, Abb: 1, Abb: 1, Abe: 1
sex 0 1.00 2 mal: 479, fem: 257
ticket 0 1.00 588 CA.: 6, S.O: 6, 113: 4, 160: 4
cabin 579 0.21 118 F4: 4, B57: 3, C23: 3, D: 3
embarked 0 1.00 3 s: 524, c: 144, q: 68

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 skew kurtosis
passenger_id 0 1.0 647.63 372.37 3.00 343.75 635.0 974.50 1308.00 0.02 -1.16
age 146 0.8 29.71 13.96 0.17 21.00 28.0 38.00 76.00 0.35 0.15
n_sib_spouse 0 1.0 0.49 1.00 0.00 0.00 0.0 1.00 8.00 3.74 19.59
n_parent_child 0 1.0 0.37 0.88 0.00 0.00 0.0 0.00 9.00 4.28 29.26
fare 0 1.0 32.47 50.00 0.00 7.90 14.4 31.07 512.33 4.44 28.25

Univariate Distributions

Numeric variables

Examine the univariate distributions of your numeric variables. Use map() to iterate through all numeric variables and create a plot_grid in a single piped chain.

# box violin plot
# the warnings are due to missing values in age
# We see that on the next skim.
data_trn |> 
  select(where(is.numeric)) |> 
  select(-passenger_id) |>
  names() |> 
  map(\(name) plot_box_violin(df = data_trn, x = name)) |> 
  plot_grid(plotlist = _, ncol = 2)
Warning: Removed 146 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).

# summary statistics
data_trn |> 
  skim_all() |> 
  filter(skim_type == "numeric")
Data summary
Name data_trn
Number of rows 736
Number of columns 12
_______________________
Column type frequency:
numeric 5
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 skew kurtosis
passenger_id 0 1.0 647.63 372.37 3.00 343.75 635.0 974.50 1308.00 0.02 -1.16
age 146 0.8 29.71 13.96 0.17 21.00 28.0 38.00 76.00 0.35 0.15
n_sib_spouse 0 1.0 0.49 1.00 0.00 0.00 0.0 1.00 8.00 3.74 19.59
n_parent_child 0 1.0 0.37 0.88 0.00 0.00 0.0 0.00 9.00 4.28 29.26
fare 0 1.0 32.47 50.00 0.00 7.90 14.4 31.07 512.33 4.44 28.25

Numeric Univariate Notes:
Looks like some transformations might be helpful - lots of highly skewed distributions. There could potentially be an outlier for the highest value in fare.

Categorical variables

Examine univariate distributions of your categorical variables. Use map() to iterate through all categorical variables and display them together using plot_grid(). Ignoring name, cabin, ticket for now because they have so many unique values (we know this from cleaning EDA).

# Bar plots
data_trn |> 
  select(where(is.factor), -survived, -cabin, -ticket, -name) |> 
  names() |> 
  map(\(name) plot_bar(df = data_trn, x = name)) |> 
  plot_grid(plotlist = _, ncol = 2) 

data_trn |> plot_bar("survived")

Univariate categorical variable notes: We can see that most people were in third class (though there’s good representation across classes), there are more males than females (though again good representation for each), and the embarkation port of Southampton is by far the modal response. There’s at least one missing value on embarked, but dealing with it/them will be trivial given how few missing values there are.

Bivariate Relationships

Numeric variables

Examine the relationship between numeric predictors and the outcome variable survived. Use map() to complete these in a single pipe – specify sale_price as y value.

data_trn |> 
  select(where(is.numeric) &!passenger_id) |> 
  names() |> 
  map(~ plot_grouped_box_violin(df = data_trn, x = "survived", y = .x)) |> 
  plot_grid(plotlist = _, ncol = 2)
Warning: Removed 146 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).

# again the warnings are dur to missing values in age. Not a problem.

Bivariate Relationship notes: Most numeric variables don’t seem like they’ll be that helpful - there are relatively similar distributions across survival outcomes. Maybe if we transform those variables to deal with skew, the relationships will improve. We’ll test that out in model fitting.

Correlations

Convert any ordinal predictors to numeric, select() down to numeric variables, and plot all variable correlations with survived.

data_trn |> 
  mutate(pclass = as.numeric(pclass)) |> 
  mutate(survived_yes = if_else(survived == "yes", 1, 0), # Can dummy code categorical predictors to examine correlations
         embarked_s = if_else(embarked == "s", 1, 0),
         embarked_q = if_else(embarked == "q", 1, 0)) |> 
  select(-embarked, -survived) |>
  select(where(is.numeric) & !passenger_id) |>
  relocate(survived_yes, .after = last_col()) |> # put outcome last to make it easy to see
  cor(use = "pairwise.complete.obs") |> # deals with missing values in age
  corrplot::corrplot.mixed()

Correlation notes: It looks like there are correlations between survivors and their fair and the number of children they have. There is also correlation between surviving and pclass such that first class passengers were more likely to survive.

Categorical variables

Now choose the best plot(s) to visualize the relationship between your categorical predictors and survived.

data_trn |> 
  plot_grouped_barplot_percent(x = "embarked", y = "survived")

data_trn |> 
  plot_grouped_barplot_percent(x = "survived", y = "embarked")

data_trn |> 
  plot_grouped_barplot_percent(x = "sex", y = "survived")

data_trn |> 
  plot_grouped_barplot_percent(x = "pclass", y = "survived")

Categorical bivariate notes: One thing to note is that the relationship of pclass with survived seems somewhat monotonic, meaning that as pclass increases from 1 to 3, survival decreases - we may be able to keep that ordered factor as numeric. We can try it both ways!

Guided categorical exploration

Let’s look more closely at the cabin variable. How many unique values (levels) does it have? Are their any patterns in the way cabins are categorized?

length(unique(data_trn$cabin))
[1] 119
skim_some(data_trn$cabin)
Data summary
Name data_trn$cabin
Number of rows 736
Number of columns 1
_______________________
Column type frequency:
factor 1
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
data 579 0.21 FALSE 118 F4: 4, B57: 3, C23: 3, D: 3

Guided categorical exploration notes: At first glance, this is probably too many unique values to be useful. Also, there are a lot of missing values. Looking more closely, however, we can see that there may be some valuable information here: each cabin number starts with a letter (corresponding to a deck), and some passengers have multiple cabins separated by a space. Let’s create two new variables with that information (deck & n_cabins).

Let’s look at the cabin variable divided into deck and number of cabins

data_trn <- data_trn |>
  mutate(deck = str_extract(cabin, "[:alpha:]"),
              n_cabins = str_count(cabin, "[:alpha:]"))

data_trn |> 
  plot_grouped_barplot_percent(x = "survived", y = "deck")

data_trn |> 
  plot_grouped_barplot_percent(x = "survived", y = "n_cabins")
Warning: Removed 579 rows containing non-finite values (`stat_count()`).

Guided categorical exploration notes: There appears to be some variation in deck in that those who were in deck D or E were more likely to survive but this variable still contains a lot of missing values. There will still be just as many missing values on deck and n_cabins as with cabin, but they would both be easier to impute (if we wanted to) and have greater representation across levels (e.g., only 8 unique values of deck). However, we might decide that the level of missingness isn’t worth dealing with. This is especially true given how overlapping deck is with pclass (see below)

data_trn |> 
  plot_grouped_barplot_count(x = "pclass", y = "deck")

Guided categorical exploration notes: We might just be better off using pclass.

Now, let’s look more closely at the name variable. How many unique values (levels) does it have?

length(unique(data_trn$name))
[1] 736

Guided categorical exploration notes: Like with cabin, this number of unique values is daunting (every passenger has a name), but there may be some valuable information here. Each name follows the same format: Last name, Title. First name [optional: middle name] [optional: (Maiden name)]. Let’s extract title, and then collapse into a sparse_title variable that groups similar categories of titles together (using domain knowledge & the internet). I did this using functions from the stringr package (loaded as part of the tidyverse) (see below).

Here we extract categories from the name variable to see if these categories may be useful in telling us more about who survived

# create a "title" variable for each name
data_trn <- data_trn %>%
  mutate(title = str_to_lower(str_extract(name,
                            "(?<=[:punct:][:space:])[:alpha:]{2,}(?=[:punct:])"))) %>% 
  mutate(sparse_title = case_when(
           title == "master" ~ "boy",
           title %in% c("miss", "ms") ~ "young_woman",
           title %in% c("dona", "lady", "mrs") ~ "adult_woman",
           title %in% c("mr", "sir", "don", "jonkheer") ~ "adult_man",
           title %in% c("col", "dr", "major", "rev") ~ "professional",
           TRUE ~ NA_character_ 
         )) %>% 
  mutate(sparse_title = case_when(
    is.na(sparse_title) & age < 12 & sex == "male" ~ "boy",
    is.na(sparse_title) & age < 18 & sex == "female" ~ "young_woman",
    is.na(sparse_title) & sex == "male" ~ "adult_man",
    is.na(sparse_title) & sex == "female" ~ "adult_woman",
    !is.na(sparse_title) ~ as.character(sparse_title),
    TRUE ~ NA_character_
  ))

data_trn |> 
  group_by(sparse_title) |> 
  summarize(mean_age = mean(age, na.rm = TRUE),
            median_age = median(age, na.rm = TRUE),
            n = n())
# A tibble: 5 × 4
  sparse_title mean_age median_age     n
  <chr>           <dbl>      <dbl> <int>
1 adult_man       31.9        29     436
2 adult_woman     37.5        35     106
3 boy              5.22        5.5    30
4 professional    46          49      14
5 young_woman     21.3        22     150

Guided categorical exploration notes: We can see that titles include information about both age and sex and we see that more clearly by grouping title by age. This indicates we might be able to use this variable to impute more strategically for the many missing values of age (e.g., impute mean or median within sparse title group rather than mean/median of the full sample).