Unit 9 (Advanced Models: Decision Trees, Bagging Trees, and Random Forest)
Assignment overview
This is the key for the unit 9 assignment, predicting overall ratings of soccer players from a variety of characteristics about the player (e.g., age, nationality) and their soccer-playing skills (e.g., stats on balance, dribbling, etc.).
Setup
Handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()
Load packages
library(tidyverse)
library(tidymodels)
library(doParallel)
library(ranger)
library(xfun, include.only = "cache_rds")
Source function scripts (John’s or your own)
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true")
Specify global settings
Since we are going to use cache_rds(), we are also going to include rerun_setting <- FALSE in this chunk.
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
rerun_setting <- FALSE
Set up parallel processing
Note that you can type cl into your console to see how many cores your computer has.
cl <- parallel::makePSOCKcluster(parallel::detectCores(logical = FALSE))
doParallel::registerDoParallel(cl)
Create file path object
<- "homework/unit_09" path_data
Prep data
First we prepare the data to be used to fit random forest models predicting overall player ratings from all predictor variables.
Read in fifa.csv
data_all <- read_csv(here::here(path_data, "fifa.csv"), col_types = cols()) |>
  glimpse()
Rows: 2,000
Columns: 18
$ name <chr> "j_george", "j_abu_hanna", "n_stefanelli", "b…
$ nationality <chr> "germany", "germany", "argentina", "iceland",…
$ age <dbl> 25, 20, 24, 27, 29, 28, 17, 36, 30, 19, 24, 2…
$ weight <dbl> 165, 176, 157, 190, 161, 207, 181, 179, 161, …
$ dribbling <dbl> 73, 46, 70, 73, 37, 15, 29, 64, 66, 62, 79, 6…
$ interceptions <dbl> 21, 58, 39, 42, 68, 17, 43, 63, 65, 34, 68, 7…
$ heading_accuracy <dbl> 59, 60, 55, 68, 76, 13, 53, 74, 61, 40, 52, 6…
$ reactions <dbl> 67, 57, 60, 74, 65, 54, 43, 63, 57, 56, 74, 7…
$ balance <dbl> 85, 66, 92, 76, 62, 41, 60, 61, 69, 73, 91, 7…
$ jumping <dbl> 61, 77, 91, 78, 75, 61, 51, 88, 72, 51, 40, 7…
$ stamina <dbl> 83, 60, 73, 90, 64, 39, 46, 64, 77, 58, 80, 7…
$ strength <dbl> 72, 77, 55, 85, 76, 80, 54, 88, 64, 52, 69, 7…
$ composure <dbl> 59, 48, 56, 76, 58, 41, 46, 78, 61, 60, 78, 6…
$ preferred_foot <chr> "right", "left", "right", "right", "right", "…
$ international_reputation <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, …
$ work_rate <chr> "high_medium", "medium_medium", "medium_high"…
$ position <chr> "am", "df", "st", "st", "df", "gk", "df", "df…
$ overall <dbl> 69, 63, 67, 73, 70, 61, 50, 71, 64, 59, 75, 7…
# light cleaning EDA
data_all <- data_all |>
  janitor::clean_names("snake") |>
  mutate(across(where(is.character), tidy_responses)) |>
  mutate(across(where(is.character), factor)) |>
  glimpse()
Rows: 2,000
Columns: 18
$ name <fct> j_george, j_abu_hanna, n_stefanelli, b_sigur_…
$ nationality <fct> germany, germany, argentina, iceland, colombi…
$ age <dbl> 25, 20, 24, 27, 29, 28, 17, 36, 30, 19, 24, 2…
$ weight <dbl> 165, 176, 157, 190, 161, 207, 181, 179, 161, …
$ dribbling <dbl> 73, 46, 70, 73, 37, 15, 29, 64, 66, 62, 79, 6…
$ interceptions <dbl> 21, 58, 39, 42, 68, 17, 43, 63, 65, 34, 68, 7…
$ heading_accuracy <dbl> 59, 60, 55, 68, 76, 13, 53, 74, 61, 40, 52, 6…
$ reactions <dbl> 67, 57, 60, 74, 65, 54, 43, 63, 57, 56, 74, 7…
$ balance <dbl> 85, 66, 92, 76, 62, 41, 60, 61, 69, 73, 91, 7…
$ jumping <dbl> 61, 77, 91, 78, 75, 61, 51, 88, 72, 51, 40, 7…
$ stamina <dbl> 83, 60, 73, 90, 64, 39, 46, 64, 77, 58, 80, 7…
$ strength <dbl> 72, 77, 55, 85, 76, 80, 54, 88, 64, 52, 69, 7…
$ composure <dbl> 59, 48, 56, 76, 58, 41, 46, 78, 61, 60, 78, 6…
$ preferred_foot <fct> right, left, right, right, right, right, righ…
$ international_reputation <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, …
$ work_rate <fct> high_medium, medium_medium, medium_high, high…
$ position <fct> am, df, st, st, df, gk, df, df, mf, mf, mf, m…
$ overall <dbl> 69, 63, 67, 73, 70, 61, 50, 71, 64, 59, 75, 7…
# no missing data
data_all |>
  skim_some()
Name | data_all |
Number of rows | 2000 |
Number of columns | 18 |
_______________________ | |
Column type frequency: | |
factor | 5 |
numeric | 13 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
name | 0 | 1 | FALSE | 1984 | a_d: 2, a_m: 2, j_c: 2, j_h: 2 |
nationality | 0 | 1 | FALSE | 109 | eng: 179, ger: 139, spa: 104, arg: 101 |
preferred_foot | 0 | 1 | FALSE | 2 | rig: 1539, lef: 461 |
work_rate | 0 | 1 | FALSE | 9 | med: 1069, hig: 357, med: 170, hig: 134 |
position | 0 | 1 | FALSE | 6 | df: 657, mf: 476, st: 314, gk: 223 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
age | 0 | 1 | 16 | 40 |
weight | 0 | 1 | 117 | 243 |
dribbling | 0 | 1 | 5 | 95 |
interceptions | 0 | 1 | 5 | 92 |
heading_accuracy | 0 | 1 | 8 | 92 |
reactions | 0 | 1 | 30 | 90 |
balance | 0 | 1 | 19 | 96 |
jumping | 0 | 1 | 29 | 94 |
stamina | 0 | 1 | 13 | 94 |
strength | 0 | 1 | 26 | 97 |
composure | 0 | 1 | 12 | 93 |
international_reputation | 0 | 1 | 1 | 4 |
overall | 0 | 1 | 46 | 91 |
Initial train/test split
set.seed(123456)
splits_test <- data_all |>
  initial_split(prop = 2/3, strata = "overall")

data_trn <- splits_test |>
  analysis()

data_test <- splits_test |>
  assessment()
Recipe
Decision trees and random forests have low feature engineering requirements (which is not to say that some feature engineering can't help your model). Since we have no missing data, the recipes for our decision tree and random forest models can be the same. If we had missing data, we would need to impute it (or use another missing-data strategy) before the recipe could be used with the random forest.
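If missing data were present, a minimal sketch of what the recipe might look like with imputation steps added (step_impute_median() and step_impute_mode() come from the recipes package; rec_impute is a hypothetical name and is not used below):
# Hypothetical recipe if fifa.csv had missing values (not needed for this assignment)
rec_impute <- recipe(overall ~ ., data = data_trn) |>
  step_rm(name) |>
  step_impute_median(all_numeric_predictors()) |>  # numeric predictors: median imputation
  step_impute_mode(all_nominal_predictors()) |>    # factor predictors: modal imputation
  step_other(nationality, threshold = .05)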
The only required step is to make sure that the name variable (the players' names) is not used as a predictor in your models. You could also remove name during cleaning if you weren't going to use it at all.
step_other is not required, but remember that while you don't NEED to do anything with your factors, performance may still be improved by collapsing levels. In this case, step_other on nationality can lower the variance of the solution because it collapses a factor with a large number of levels (109 nationalities) into a handful of common levels plus "other".
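As a quick check of why this helps, you could count players per nationality in the training set and see that most of the 109 levels are rare (a sketch; output not shown):
# How many players per nationality?
data_trn |>
  count(nationality, sort = TRUE)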
# Recipe
rec <- recipe(overall ~ ., data = data_trn) |>
  step_rm(name) |>
  step_other(nationality, threshold = .05)

rec_prep <- rec |>
  prep(data_trn)

feat_trn <- rec_prep |>
  bake(NULL)

feat_trn |> skim_some()
Name | feat_trn |
Number of rows | 1331 |
Number of columns | 17 |
_______________________ | |
Column type frequency: | |
factor | 4 |
numeric | 13 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
nationality | 0 | 1 | FALSE | 5 | oth: 969, eng: 126, ger: 91, spa: 73 |
preferred_foot | 0 | 1 | FALSE | 2 | rig: 1026, lef: 305 |
work_rate | 0 | 1 | FALSE | 9 | med: 713, hig: 226, med: 121, hig: 88 |
position | 0 | 1 | FALSE | 6 | df: 446, mf: 308, st: 210, gk: 158 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
age | 0 | 1 | 16 | 40 |
weight | 0 | 1 | 117 | 243 |
dribbling | 0 | 1 | 5 | 95 |
interceptions | 0 | 1 | 5 | 88 |
heading_accuracy | 0 | 1 | 8 | 92 |
reactions | 0 | 1 | 30 | 90 |
balance | 0 | 1 | 20 | 96 |
jumping | 0 | 1 | 29 | 94 |
stamina | 0 | 1 | 13 | 93 |
strength | 0 | 1 | 26 | 97 |
composure | 0 | 1 | 12 | 91 |
international_reputation | 0 | 1 | 1 | 4 |
overall | 0 | 1 | 46 | 91 |
Decision tree
Let's start by fitting a standard decision tree. First, we create bootstrap splits. You would typically want to consider more bootstraps, but to lower computational costs we use 100.
set.seed(123456)
splits_boot <- data_trn |>
  bootstraps(times = 100, strata = "overall")

splits_boot
# Bootstrap sampling using stratification
# A tibble: 100 × 2
splits id
<list> <chr>
1 <split [1331/488]> Bootstrap001
2 <split [1331/482]> Bootstrap002
3 <split [1331/505]> Bootstrap003
4 <split [1331/498]> Bootstrap004
5 <split [1331/503]> Bootstrap005
6 <split [1331/473]> Bootstrap006
7 <split [1331/505]> Bootstrap007
8 <split [1331/453]> Bootstrap008
9 <split [1331/485]> Bootstrap009
10 <split [1331/474]> Bootstrap010
11 <split [1331/492]> Bootstrap011
12 <split [1331/483]> Bootstrap012
13 <split [1331/498]> Bootstrap013
14 <split [1331/500]> Bootstrap014
15 <split [1331/476]> Bootstrap015
16 <split [1331/490]> Bootstrap016
17 <split [1331/490]> Bootstrap017
18 <split [1331/477]> Bootstrap018
19 <split [1331/512]> Bootstrap019
20 <split [1331/489]> Bootstrap020
21 <split [1331/497]> Bootstrap021
22 <split [1331/478]> Bootstrap022
23 <split [1331/502]> Bootstrap023
24 <split [1331/489]> Bootstrap024
25 <split [1331/480]> Bootstrap025
26 <split [1331/482]> Bootstrap026
27 <split [1331/460]> Bootstrap027
28 <split [1331/496]> Bootstrap028
29 <split [1331/491]> Bootstrap029
30 <split [1331/501]> Bootstrap030
31 <split [1331/496]> Bootstrap031
32 <split [1331/486]> Bootstrap032
33 <split [1331/493]> Bootstrap033
34 <split [1331/499]> Bootstrap034
35 <split [1331/479]> Bootstrap035
36 <split [1331/499]> Bootstrap036
37 <split [1331/501]> Bootstrap037
38 <split [1331/492]> Bootstrap038
39 <split [1331/503]> Bootstrap039
40 <split [1331/507]> Bootstrap040
41 <split [1331/488]> Bootstrap041
42 <split [1331/491]> Bootstrap042
43 <split [1331/500]> Bootstrap043
44 <split [1331/499]> Bootstrap044
45 <split [1331/504]> Bootstrap045
46 <split [1331/503]> Bootstrap046
47 <split [1331/498]> Bootstrap047
48 <split [1331/491]> Bootstrap048
49 <split [1331/486]> Bootstrap049
50 <split [1331/506]> Bootstrap050
51 <split [1331/491]> Bootstrap051
52 <split [1331/488]> Bootstrap052
53 <split [1331/494]> Bootstrap053
54 <split [1331/503]> Bootstrap054
55 <split [1331/477]> Bootstrap055
56 <split [1331/490]> Bootstrap056
57 <split [1331/493]> Bootstrap057
58 <split [1331/473]> Bootstrap058
59 <split [1331/482]> Bootstrap059
60 <split [1331/474]> Bootstrap060
61 <split [1331/496]> Bootstrap061
62 <split [1331/499]> Bootstrap062
63 <split [1331/486]> Bootstrap063
64 <split [1331/463]> Bootstrap064
65 <split [1331/471]> Bootstrap065
66 <split [1331/497]> Bootstrap066
67 <split [1331/501]> Bootstrap067
68 <split [1331/477]> Bootstrap068
69 <split [1331/501]> Bootstrap069
70 <split [1331/499]> Bootstrap070
71 <split [1331/486]> Bootstrap071
72 <split [1331/491]> Bootstrap072
73 <split [1331/493]> Bootstrap073
74 <split [1331/486]> Bootstrap074
75 <split [1331/457]> Bootstrap075
76 <split [1331/481]> Bootstrap076
77 <split [1331/471]> Bootstrap077
78 <split [1331/472]> Bootstrap078
79 <split [1331/472]> Bootstrap079
80 <split [1331/497]> Bootstrap080
81 <split [1331/486]> Bootstrap081
82 <split [1331/488]> Bootstrap082
83 <split [1331/498]> Bootstrap083
84 <split [1331/505]> Bootstrap084
85 <split [1331/490]> Bootstrap085
86 <split [1331/482]> Bootstrap086
87 <split [1331/479]> Bootstrap087
88 <split [1331/481]> Bootstrap088
89 <split [1331/473]> Bootstrap089
90 <split [1331/488]> Bootstrap090
91 <split [1331/483]> Bootstrap091
92 <split [1331/487]> Bootstrap092
93 <split [1331/483]> Bootstrap093
94 <split [1331/496]> Bootstrap094
95 <split [1331/506]> Bootstrap095
96 <split [1331/498]> Bootstrap096
97 <split [1331/486]> Bootstrap097
98 <split [1331/494]> Bootstrap098
99 <split [1331/467]> Bootstrap099
100 <split [1331/481]> Bootstrap100
Then, we set up our grid to search for hyperparameters.
Note: You would typically also want to tune tree_depth, but we have you set it to 10 to lower computational costs for this assignment.
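If tree_depth were tuned as well, the grid might look like the hypothetical sketch below (grid_tree_full is an illustrative name; it is not used in this assignment):
# Hypothetical 3-parameter grid (4^3 = 64 combinations)
grid_tree_full <- grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 4)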
grid_tree <- grid_regular(cost_complexity(), min_n(), levels = 4)
grid_tree
# A tibble: 16 × 2
cost_complexity min_n
<dbl> <int>
1 0.0000000001 2
2 0.0000001 2
3 0.0001 2
4 0.1 2
5 0.0000000001 14
6 0.0000001 14
7 0.0001 14
8 0.1 14
9 0.0000000001 27
10 0.0000001 27
11 0.0001 27
12 0.1 27
13 0.0000000001 40
14 0.0000001 40
15 0.0001 40
16 0.1 40
Now let's fit our decision tree.
fits_tree <- cache_rds(
  expr = {
    decision_tree(cost_complexity = tune(),
                  tree_depth = 10,
                  min_n = tune()) |>
      set_engine("rpart") |>
      set_mode("regression") |>
      tune_grid(preprocessor = rec,
                resamples = splits_boot,
                grid = grid_tree,
                metrics = metric_set(rmse))
  },
  rerun = rerun_setting,
  dir = "cache/009",
  file = "fits_tree")
We can use autoplot() to view performance by hyperparameter values.
autoplot(fits_tree)
There was no large difference between min_n values of 27 and 40, so there is little benefit to increasing the minimum node size beyond 40; this suggests we considered a reasonable range of values. With respect to cost complexity, the best values are already very near zero (essentially no penalty), so we can't meaningfully go lower; we are pruning almost not at all.
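To check these impressions numerically rather than visually, you could also pull the summarized tuning results directly (a sketch; output not shown):
# Same information as autoplot(), as a sorted table
collect_metrics(fits_tree) |>
  arrange(mean)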
Take a look at our best performances
show_best(fits_tree)
# A tibble: 5 × 8
cost_complexity min_n .metric .estimator mean n std_err
<dbl> <int> <chr> <chr> <dbl> <int> <dbl>
1 0.0000000001 40 rmse standard 3.34 100 0.0140
2 0.0000001 40 rmse standard 3.34 100 0.0140
3 0.0001 40 rmse standard 3.34 100 0.0140
4 0.0001 27 rmse standard 3.35 100 0.0137
5 0.0000000001 27 rmse standard 3.35 100 0.0137
.config
<chr>
1 Preprocessor1_Model13
2 Preprocessor1_Model14
3 Preprocessor1_Model15
4 Preprocessor1_Model11
5 Preprocessor1_Model09
Train our best model on the training set
# Best model
best_decision_tree <- decision_tree(
    cost_complexity = select_best(fits_tree)$cost_complexity,
    tree_depth = 10,
    min_n = select_best(fits_tree)$min_n) |>
  set_engine("rpart", model = TRUE) |>
  set_mode("regression") |>
  fit(overall ~ ., data = feat_trn)
Plot the decision tree
best_decision_tree$fit |> rpart.plot::rpart.plot()
Random Forest
Now let’s see what bagging and decorrelating the base learner trees can do for our model’s performance. Since bagging performs bootstrapping internally and we are using bootstrapping for selection of tuning parameters, we have a good old fashioned double bootstrap!
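As an aside, ranger's internal bagging can be made explicit through its engine arguments; this is only a sketch showing the ranger defaults and is not part of the tuning below:
# Each tree is fit to a bootstrap resample of the analysis set (ranger defaults)
rand_forest(trees = 95, mtry = tune(), min_n = tune()) |>
  set_engine("ranger",
             replace = TRUE,          # sample rows with replacement
             sample.fraction = 1) |>  # resample is the same size as the analysis set
  set_mode("regression")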
Make bootstrap splits for selection
set.seed(123456)
splits_boot <- data_trn |>
  bootstraps(times = 100, strata = "overall")
Make tuning grid and fit random forest
Note: You would likely want to consider more than 100 bootstraps and more than 95 trees; these were limited for computational purposes during this assignment.
# mtry should go up to the number of predictors
# Not including trees in the grid because it is held constant (at 95) in the fit below
grid_rf <- expand_grid(mtry = c(2, 5, 10, 16),
                       min_n = c(1, 2, 5, 10))
grid_rf
# A tibble: 16 × 2
mtry min_n
<dbl> <dbl>
1 2 1
2 2 2
3 2 5
4 2 10
5 5 1
6 5 2
7 5 5
8 5 10
9 10 1
10 10 2
11 10 5
12 10 10
13 16 1
14 16 2
15 16 5
16 16 10
# Fit random forest
fits_rf <- cache_rds(
  expr = {
    rand_forest(trees = 95,
                mtry = tune(),
                min_n = tune()) |>
      set_engine("ranger",
                 respect.unordered.factors = "order",
                 oob.error = FALSE,
                 seed = 102030) |>
      set_mode("regression") |>
      tune_grid(preprocessor = rec,
                resamples = splits_boot,
                grid = grid_rf,
                metrics = metric_set(rmse))
  },
  rerun = rerun_setting,
  dir = "cache/009/",
  file = "fits_rf")
collect_metrics(fits_rf, summarize = TRUE)
# A tibble: 16 × 8
mtry min_n .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2 1 rmse standard 2.70 100 0.0110 Preprocessor1_Model01
2 2 2 rmse standard 2.71 100 0.0111 Preprocessor1_Model02
3 2 5 rmse standard 2.72 100 0.0110 Preprocessor1_Model03
4 2 10 rmse standard 2.73 100 0.0114 Preprocessor1_Model04
5 5 1 rmse standard 2.54 100 0.0101 Preprocessor1_Model05
6 5 2 rmse standard 2.54 100 0.0100 Preprocessor1_Model06
7 5 5 rmse standard 2.54 100 0.0101 Preprocessor1_Model07
8 5 10 rmse standard 2.55 100 0.0101 Preprocessor1_Model08
9 10 1 rmse standard 2.58 100 0.00958 Preprocessor1_Model09
10 10 2 rmse standard 2.58 100 0.00943 Preprocessor1_Model10
11 10 5 rmse standard 2.58 100 0.00940 Preprocessor1_Model11
12 10 10 rmse standard 2.59 100 0.00943 Preprocessor1_Model12
13 16 1 rmse standard 2.63 100 0.00951 Preprocessor1_Model13
14 16 2 rmse standard 2.63 100 0.00953 Preprocessor1_Model14
15 16 5 rmse standard 2.63 100 0.00955 Preprocessor1_Model15
16 16 10 rmse standard 2.63 100 0.00942 Preprocessor1_Model16
Tuning parameter plots
We use these plots to confirm that we considered good combinations of hyperparameter values and that the best values fall inside the range we considered (or at a boundary that cannot sensibly be extended, e.g., min_n = 1).
autoplot(fits_rf)
Best tuning values and RMSE
show_best(fits_rf)
# A tibble: 5 × 8
mtry min_n .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 1 rmse standard 2.54 100 0.0101 Preprocessor1_Model05
2 5 2 rmse standard 2.54 100 0.0100 Preprocessor1_Model06
3 5 5 rmse standard 2.54 100 0.0101 Preprocessor1_Model07
4 5 10 rmse standard 2.55 100 0.0101 Preprocessor1_Model08
5 10 2 rmse standard 2.58 100 0.00943 Preprocessor1_Model10
Fit best model on full training data
fit_rf_best <- rand_forest(trees = 10,
                           mtry = select_best(fits_rf)$mtry,
                           min_n = select_best(fits_rf)$min_n) |>
  set_engine("ranger",
             respect.unordered.factors = "order",
             oob.error = FALSE,
             seed = 102030) |>
  set_mode("regression") |>
  fit(overall ~ ., data = feat_trn)
Evaluate best model in test
Based on the bootstrapped training RMSE (about 3.34 for the best decision tree vs. about 2.54 for the best random forest), we can see that the bagging and decorrelation provided by the random forest were helpful for reducing model RMSE.
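One way to put the two models side by side (a sketch; output not shown):
# Compare best bootstrapped RMSE for the decision tree vs. the random forest
bind_rows(
  show_best(fits_tree, metric = "rmse", n = 1) |> mutate(model = "decision tree"),
  show_best(fits_rf, metric = "rmse", n = 1) |> mutate(model = "random forest")) |>
  select(model, .metric, mean, std_err)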
Evaluate best random forest in test
feat_test <- rec_prep |>
  bake(data_test)
rmse_vec(truth = feat_test$overall,
estimate = predict(fit_rf_best, feat_test)$.pred)
[1] 2.509799
Nice: the test RMSE (2.51) is not far from the bootstrapped training RMSE (2.54), suggesting our model is not overfit.
Here is a visualization of how well the model is doing:
plot_truth(truth = feat_test$overall,
estimate = predict(fit_rf_best, feat_test)$.pred)
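Beyond RMSE and the truth plot, you could also compute yardstick's default regression metrics on the test set as a final sanity check (a sketch; output not shown):
# RMSE, R-squared, and MAE on the held-out test set
tibble(truth = feat_test$overall,
       estimate = predict(fit_rf_best, feat_test)$.pred) |>
  metrics(truth = truth, estimate = estimate)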