Homework Unit 8: Advanced Performance Metrics
Introduction
This file serves as the answer key for the Unit_08 homework. Unit 8 Advanced Performance Metrics in the course web book contains all materials required for this assignment.
In this assignment, we demonstrate how the selected performance metric and the outcome balance affect our selection and evaluation of the best performing model.
Setup
Handle conflicts
options(conflicts.policy = "depends.ok")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_ml.R?raw=true")
tidymodels_conflictRules()
Load required packages
library(tidyverse)
library(tidymodels)
Source function scripts (John’s or your own)
::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_plots.R?raw=true")
devtools::source_url("https://github.com/jjcurtin/lab_support/blob/main/fun_eda.R?raw=true") devtools
Specify other global settings
theme_set(theme_classic())
options(tibble.width = Inf, dplyr.print_max=Inf)
Paths
<- "homework/unit_08" path_data
Set up parallel processing
Note: you can type cl into your console to see how many cores your computer has.
cl <- parallel::makePSOCKcluster(parallel::detectCores(logical = FALSE))
doParallel::registerDoParallel(cl)
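Although not included in the original script, it is good practice to shut the cluster down once you are finished fitting models (e.g., at the very end of your script):
# stop the parallel workers when all model fitting is complete
parallel::stopCluster(cl)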
Read in data
Read in the breast_cancer.csv data file and save it as an object called data_all, perform any checks (i.e., light cleaning EDA) needed on the data, and set the outcome diagnosis as a factor with malignant as the first level.
data_all <- read_csv(here::here(path_data, "breast_cancer.csv"), col_types = cols()) |>
  glimpse()
Rows: 423
Columns: 31
$ diagnosis <chr> "malignant", "benign", "benign", "malignant", …
$ perimeter_se <dbl> 1.9740, 1.6670, 1.4890, 2.9890, 2.6840, 1.4450…
$ fractal_dimension_mean <dbl> 0.05986, 0.06320, 0.05828, 0.06768, 0.05934, 0…
$ concave_points_worst <dbl> 0.12520, 0.11050, 0.03002, 0.20270, 0.06544, 0…
$ symmetry_mean <dbl> 0.1594, 0.1886, 0.1845, 0.2157, 0.1834, 0.1514…
$ texture_se <dbl> 0.3621, 0.7339, 1.6470, 0.9489, 0.8429, 1.0660…
$ concave_points_se <dbl> 0.008260, 0.013040, 0.003419, 0.012710, 0.0091…
$ concavity_mean <dbl> 0.075500, 0.070970, 0.004967, 0.169000, 0.0263…
$ fractal_dimension_se <dbl> 0.002881, 0.001982, 0.002534, 0.003884, 0.0014…
$ radius_worst <dbl> 17.770, 12.640, 12.360, 18.810, 12.970, 14.200…
$ concave_points_mean <dbl> 0.040790, 0.044970, 0.006434, 0.089230, 0.0206…
$ radius_mean <dbl> 15.120, 11.610, 11.220, 14.870, 11.500, 12.620…
$ smoothness_se <dbl> 0.005472, 0.005884, 0.004359, 0.006985, 0.0063…
$ smoothness_worst <dbl> 0.14910, 0.14150, 0.09994, 0.18780, 0.11830, 0…
$ symmetry_se <dbl> 0.01523, 0.01848, 0.01916, 0.01602, 0.02292, 0…
$ radius_se <dbl> 0.2711, 0.2456, 0.2239, 0.4266, 0.3927, 0.2449…
$ concavity_worst <dbl> 0.33270, 0.23020, 0.02318, 0.47040, 0.08105, 0…
$ concavity_se <dbl> 0.020390, 0.026310, 0.003223, 0.030110, 0.0124…
$ compactness_se <dbl> 0.019190, 0.020050, 0.006813, 0.025630, 0.0106…
$ smoothness_mean <dbl> 0.08876, 0.10880, 0.07780, 0.11620, 0.09345, 0…
$ area_se <dbl> 26.440, 15.890, 15.460, 41.180, 26.990, 18.510…
$ area_worst <dbl> 989.5, 475.7, 470.9, 1095.0, 508.9, 624.0, 544…
$ perimeter_mean <dbl> 98.78, 75.46, 70.79, 98.64, 73.28, 81.35, 79.0…
$ compactness_mean <dbl> 0.09588, 0.11680, 0.03574, 0.16490, 0.05991, 0…
$ area_mean <dbl> 716.6, 408.2, 386.8, 682.5, 407.4, 496.4, 466.…
$ fractal_dimension_worst <dbl> 0.09740, 0.07427, 0.07307, 0.10650, 0.06487, 0…
$ texture_mean <dbl> 16.68, 16.02, 33.81, 16.67, 18.45, 23.97, 18.5…
$ perimeter_worst <dbl> 117.70, 81.93, 78.44, 127.10, 83.12, 90.67, 85…
$ symmetry_worst <dbl> 0.3415, 0.2787, 0.2911, 0.3585, 0.2740, 0.2826…
$ texture_worst <dbl> 20.24, 19.67, 41.78, 27.37, 22.46, 31.31, 27.4…
$ compactness_worst <dbl> 0.33310, 0.21700, 0.06885, 0.44800, 0.10490, 0…
data_all |>
  skim_some()
Name | data_all |
Number of rows | 423 |
Number of columns | 31 |
_______________________ | |
Column type frequency: | |
character | 1 |
numeric | 30 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
diagnosis | 0 | 1 | 6 | 9 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | p0 | p100 |
---|---|---|---|---|
perimeter_se | 0 | 1 | 0.76 | 21.98 |
fractal_dimension_mean | 0 | 1 | 0.05 | 0.10 |
concave_points_worst | 0 | 1 | 0.00 | 0.29 |
symmetry_mean | 0 | 1 | 0.11 | 0.29 |
texture_se | 0 | 1 | 0.36 | 4.88 |
concave_points_se | 0 | 1 | 0.00 | 0.05 |
concavity_mean | 0 | 1 | 0.00 | 0.43 |
fractal_dimension_se | 0 | 1 | 0.00 | 0.03 |
radius_worst | 0 | 1 | 7.93 | 33.13 |
concave_points_mean | 0 | 1 | 0.00 | 0.19 |
radius_mean | 0 | 1 | 6.98 | 28.11 |
smoothness_se | 0 | 1 | 0.00 | 0.02 |
smoothness_worst | 0 | 1 | 0.07 | 0.21 |
symmetry_se | 0 | 1 | 0.01 | 0.08 |
radius_se | 0 | 1 | 0.11 | 2.87 |
concavity_worst | 0 | 1 | 0.00 | 1.25 |
concavity_se | 0 | 1 | 0.00 | 0.40 |
compactness_se | 0 | 1 | 0.00 | 0.11 |
smoothness_mean | 0 | 1 | 0.05 | 0.16 |
area_se | 0 | 1 | 6.80 | 525.60 |
area_worst | 0 | 1 | 185.20 | 3432.00 |
perimeter_mean | 0 | 1 | 43.79 | 188.50 |
compactness_mean | 0 | 1 | 0.02 | 0.35 |
area_mean | 0 | 1 | 143.50 | 2499.00 |
fractal_dimension_worst | 0 | 1 | 0.06 | 0.17 |
texture_mean | 0 | 1 | 9.71 | 39.28 |
perimeter_worst | 0 | 1 | 50.41 | 229.30 |
symmetry_worst | 0 | 1 | 0.16 | 0.66 |
texture_worst | 0 | 1 | 12.02 | 49.54 |
compactness_worst | 0 | 1 | 0.03 | 0.87 |
data_all <- data_all |>
  mutate(diagnosis = factor(diagnosis, levels = c("malignant", "benign"))) |>
  glimpse()
Rows: 423
Columns: 31
$ diagnosis <fct> malignant, benign, benign, malignant, benign, …
$ perimeter_se <dbl> 1.9740, 1.6670, 1.4890, 2.9890, 2.6840, 1.4450…
$ fractal_dimension_mean <dbl> 0.05986, 0.06320, 0.05828, 0.06768, 0.05934, 0…
$ concave_points_worst <dbl> 0.12520, 0.11050, 0.03002, 0.20270, 0.06544, 0…
$ symmetry_mean <dbl> 0.1594, 0.1886, 0.1845, 0.2157, 0.1834, 0.1514…
$ texture_se <dbl> 0.3621, 0.7339, 1.6470, 0.9489, 0.8429, 1.0660…
$ concave_points_se <dbl> 0.008260, 0.013040, 0.003419, 0.012710, 0.0091…
$ concavity_mean <dbl> 0.075500, 0.070970, 0.004967, 0.169000, 0.0263…
$ fractal_dimension_se <dbl> 0.002881, 0.001982, 0.002534, 0.003884, 0.0014…
$ radius_worst <dbl> 17.770, 12.640, 12.360, 18.810, 12.970, 14.200…
$ concave_points_mean <dbl> 0.040790, 0.044970, 0.006434, 0.089230, 0.0206…
$ radius_mean <dbl> 15.120, 11.610, 11.220, 14.870, 11.500, 12.620…
$ smoothness_se <dbl> 0.005472, 0.005884, 0.004359, 0.006985, 0.0063…
$ smoothness_worst <dbl> 0.14910, 0.14150, 0.09994, 0.18780, 0.11830, 0…
$ symmetry_se <dbl> 0.01523, 0.01848, 0.01916, 0.01602, 0.02292, 0…
$ radius_se <dbl> 0.2711, 0.2456, 0.2239, 0.4266, 0.3927, 0.2449…
$ concavity_worst <dbl> 0.33270, 0.23020, 0.02318, 0.47040, 0.08105, 0…
$ concavity_se <dbl> 0.020390, 0.026310, 0.003223, 0.030110, 0.0124…
$ compactness_se <dbl> 0.019190, 0.020050, 0.006813, 0.025630, 0.0106…
$ smoothness_mean <dbl> 0.08876, 0.10880, 0.07780, 0.11620, 0.09345, 0…
$ area_se <dbl> 26.440, 15.890, 15.460, 41.180, 26.990, 18.510…
$ area_worst <dbl> 989.5, 475.7, 470.9, 1095.0, 508.9, 624.0, 544…
$ perimeter_mean <dbl> 98.78, 75.46, 70.79, 98.64, 73.28, 81.35, 79.0…
$ compactness_mean <dbl> 0.09588, 0.11680, 0.03574, 0.16490, 0.05991, 0…
$ area_mean <dbl> 716.6, 408.2, 386.8, 682.5, 407.4, 496.4, 466.…
$ fractal_dimension_worst <dbl> 0.09740, 0.07427, 0.07307, 0.10650, 0.06487, 0…
$ texture_mean <dbl> 16.68, 16.02, 33.81, 16.67, 18.45, 23.97, 18.5…
$ perimeter_worst <dbl> 117.70, 81.93, 78.44, 127.10, 83.12, 90.67, 85…
$ symmetry_worst <dbl> 0.3415, 0.2787, 0.2911, 0.3585, 0.2740, 0.2826…
$ texture_worst <dbl> 20.24, 19.67, 41.78, 27.37, 22.46, 31.31, 27.4…
$ compactness_worst <dbl> 0.33310, 0.21700, 0.06885, 0.44800, 0.10490, 0…
data_all |>
  skim_some() |>
  skimr::yank("factor")
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
diagnosis | 0 | 1 | FALSE | 2 | ben: 357, mal: 66 |
Print a table to see the balance of positive and negative cases of your outcome variable.
data_all |>
  janitor::tabyl(diagnosis)
diagnosis n percent
malignant 66 0.1560284
benign 357 0.8439716
Our outcome variable is unbalanced (357 benign vs. 66 malignant). This imbalance can depress the Positive Predictive Value of our model and make it harder to achieve high sensitivity (i.e., to detect the true positives).
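To see why imbalance matters for PPV, here is a minimal back-of-the-envelope sketch using Bayes' rule. The sensitivity and specificity values are hypothetical (not estimates from any model in this assignment); only the prevalence comes from our data.
# illustrative only: how low prevalence limits PPV even for a strong classifier
prevalence <- 66 / 423   # proportion of malignant cases
sens <- .90              # hypothetical sensitivity
spec <- .90              # hypothetical specificity
(sens * prevalence) / (sens * prevalence + (1 - spec) * (1 - prevalence))
# PPV works out to only about .62 despite 90% sensitivity and specificity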
Split data into train and test
Hold out 1/3 of the data as a test set for evaluation using the initial_split() function. Use the provided seed. Stratify on diagnosis.
set.seed(12345)
splits_test <- data_all |>
  initial_split(prop = 2/3, strata = "diagnosis")

data_train <- splits_test |>
  analysis()

data_test <- splits_test |>
  assessment()
Light modeling EDA
Look at correlations between predictors in data_train.
data_train |>
  select(where(is.numeric)) |>
  cor() |>
  corrplot::corrplot(type = "upper")
Visualize the variance of your predictors in data_train.
data_train |>
  select(where(is.numeric)) |>
  names() |>
  map(\(name) plot_box_violin(df = data_train, x = name)) |>
  cowplot::plot_grid(plotlist = _, ncol = 5)
Looking at the variability of our predictors allows us to examine each variable's distribution and flag potential outlier values. Examining the distributions is especially important here because we are considering a linear (logistic regression) model, and heavily skewed predictors can produce high-leverage observations. Outlier values can be problematic for all types of models if they have high influence and distort the shape of the relationship between our predictors and the outcome.
Transformations (e.g., log, Yeo-Johnson) can be applied to normalize the distributional shape of predictors. Options for handling outliers include excluding values that are clearly errors/invalid, retaining outliers that we have no evidence are incorrect, or bringing outlier values to the fence.
Almost all variables are strongly positively skewed, and a handful (e.g., concave_points_se, perimeter_se, area_se; basically all of the _se variables) have a few unusually high values that appear separated from the rest of the distribution and may be outliers.
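We do not apply any transformations in this assignment, but for illustration, a Yeo-Johnson step could be added to a recipe as sketched below (step_YeoJohnson() is from the recipes package; rec_demo is just a hypothetical name):
# sketch only: normalize skewed numeric predictors with a Yeo-Johnson transformation
rec_demo <- recipe(diagnosis ~ ., data = data_train) |>
  step_YeoJohnson(all_numeric_predictors())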
GLM vs KNN
Bootstrap splits
Split data_train into 100 bootstrap samples stratified on diagnosis.
set.seed(12345)
splits_boot <- data_train |>
  bootstraps(times = 100, strata = "diagnosis")
splits_boot
# Bootstrap sampling using stratification
# A tibble: 100 × 2
splits id
<list> <chr>
1 <split [282/106]> Bootstrap001
2 <split [282/101]> Bootstrap002
3 <split [282/104]> Bootstrap003
4 <split [282/98]> Bootstrap004
5 <split [282/109]> Bootstrap005
6 <split [282/91]> Bootstrap006
7 <split [282/103]> Bootstrap007
8 <split [282/95]> Bootstrap008
9 <split [282/108]> Bootstrap009
10 <split [282/101]> Bootstrap010
11 <split [282/110]> Bootstrap011
12 <split [282/100]> Bootstrap012
13 <split [282/107]> Bootstrap013
14 <split [282/106]> Bootstrap014
15 <split [282/100]> Bootstrap015
16 <split [282/98]> Bootstrap016
17 <split [282/108]> Bootstrap017
18 <split [282/106]> Bootstrap018
19 <split [282/114]> Bootstrap019
20 <split [282/110]> Bootstrap020
21 <split [282/108]> Bootstrap021
22 <split [282/107]> Bootstrap022
23 <split [282/103]> Bootstrap023
24 <split [282/111]> Bootstrap024
25 <split [282/101]> Bootstrap025
26 <split [282/98]> Bootstrap026
27 <split [282/107]> Bootstrap027
28 <split [282/102]> Bootstrap028
29 <split [282/113]> Bootstrap029
30 <split [282/104]> Bootstrap030
31 <split [282/99]> Bootstrap031
32 <split [282/92]> Bootstrap032
33 <split [282/103]> Bootstrap033
34 <split [282/100]> Bootstrap034
35 <split [282/102]> Bootstrap035
36 <split [282/93]> Bootstrap036
37 <split [282/99]> Bootstrap037
38 <split [282/97]> Bootstrap038
39 <split [282/101]> Bootstrap039
40 <split [282/97]> Bootstrap040
41 <split [282/107]> Bootstrap041
42 <split [282/106]> Bootstrap042
43 <split [282/97]> Bootstrap043
44 <split [282/103]> Bootstrap044
45 <split [282/119]> Bootstrap045
46 <split [282/99]> Bootstrap046
47 <split [282/103]> Bootstrap047
48 <split [282/99]> Bootstrap048
49 <split [282/100]> Bootstrap049
50 <split [282/112]> Bootstrap050
51 <split [282/101]> Bootstrap051
52 <split [282/103]> Bootstrap052
53 <split [282/115]> Bootstrap053
54 <split [282/94]> Bootstrap054
55 <split [282/113]> Bootstrap055
56 <split [282/100]> Bootstrap056
57 <split [282/101]> Bootstrap057
58 <split [282/100]> Bootstrap058
59 <split [282/94]> Bootstrap059
60 <split [282/102]> Bootstrap060
61 <split [282/105]> Bootstrap061
62 <split [282/105]> Bootstrap062
63 <split [282/111]> Bootstrap063
64 <split [282/106]> Bootstrap064
65 <split [282/100]> Bootstrap065
66 <split [282/104]> Bootstrap066
67 <split [282/98]> Bootstrap067
68 <split [282/107]> Bootstrap068
69 <split [282/95]> Bootstrap069
70 <split [282/102]> Bootstrap070
71 <split [282/108]> Bootstrap071
72 <split [282/106]> Bootstrap072
73 <split [282/101]> Bootstrap073
74 <split [282/97]> Bootstrap074
75 <split [282/102]> Bootstrap075
76 <split [282/98]> Bootstrap076
77 <split [282/100]> Bootstrap077
78 <split [282/102]> Bootstrap078
79 <split [282/109]> Bootstrap079
80 <split [282/112]> Bootstrap080
81 <split [282/108]> Bootstrap081
82 <split [282/106]> Bootstrap082
83 <split [282/112]> Bootstrap083
84 <split [282/104]> Bootstrap084
85 <split [282/103]> Bootstrap085
86 <split [282/102]> Bootstrap086
87 <split [282/102]> Bootstrap087
88 <split [282/114]> Bootstrap088
89 <split [282/107]> Bootstrap089
90 <split [282/112]> Bootstrap090
91 <split [282/100]> Bootstrap091
92 <split [282/99]> Bootstrap092
93 <split [282/106]> Bootstrap093
94 <split [282/103]> Bootstrap094
95 <split [282/98]> Bootstrap095
96 <split [282/101]> Bootstrap096
97 <split [282/110]> Bootstrap097
98 <split [282/100]> Bootstrap098
99 <split [282/108]> Bootstrap099
100 <split [282/97]> Bootstrap100
Build recipes
rec_glm <- recipe(diagnosis ~ ., data = data_train)

rec_knn <- recipe(diagnosis ~ ., data = data_train) |>
  step_range(all_predictors())
Fit GLM
fit_lr_boot <-
  logistic_reg() |>
  set_engine("glm") |>
  fit_resamples(preprocessor = rec_glm,
                resamples = splits_boot,
                metrics = metric_set(roc_auc))
Fitting our bootstrap splits in this way appears to work just fine. However, a closer look reveals some warnings. First, let's look for the warnings by calling the fit object directly:
fit_lr_boot
# Resampling results
# Bootstrap sampling using stratification
# A tibble: 100 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [282/106]> Bootstrap001 <tibble [1 × 4]> <tibble [1 × 3]>
2 <split [282/101]> Bootstrap002 <tibble [1 × 4]> <tibble [1 × 3]>
3 <split [282/104]> Bootstrap003 <tibble [1 × 4]> <tibble [1 × 3]>
4 <split [282/98]> Bootstrap004 <tibble [1 × 4]> <tibble [1 × 3]>
5 <split [282/109]> Bootstrap005 <tibble [1 × 4]> <tibble [1 × 3]>
6 <split [282/91]> Bootstrap006 <tibble [1 × 4]> <tibble [1 × 3]>
7 <split [282/103]> Bootstrap007 <tibble [1 × 4]> <tibble [1 × 3]>
8 <split [282/95]> Bootstrap008 <tibble [1 × 4]> <tibble [1 × 3]>
9 <split [282/108]> Bootstrap009 <tibble [1 × 4]> <tibble [1 × 3]>
10 <split [282/101]> Bootstrap010 <tibble [1 × 4]> <tibble [1 × 3]>
11 <split [282/110]> Bootstrap011 <tibble [1 × 4]> <tibble [1 × 3]>
12 <split [282/100]> Bootstrap012 <tibble [1 × 4]> <tibble [1 × 3]>
13 <split [282/107]> Bootstrap013 <tibble [1 × 4]> <tibble [1 × 3]>
14 <split [282/106]> Bootstrap014 <tibble [1 × 4]> <tibble [1 × 3]>
15 <split [282/100]> Bootstrap015 <tibble [1 × 4]> <tibble [1 × 3]>
16 <split [282/98]> Bootstrap016 <tibble [1 × 4]> <tibble [1 × 3]>
17 <split [282/108]> Bootstrap017 <tibble [1 × 4]> <tibble [1 × 3]>
18 <split [282/106]> Bootstrap018 <tibble [1 × 4]> <tibble [1 × 3]>
19 <split [282/114]> Bootstrap019 <tibble [1 × 4]> <tibble [1 × 3]>
20 <split [282/110]> Bootstrap020 <tibble [1 × 4]> <tibble [1 × 3]>
21 <split [282/108]> Bootstrap021 <tibble [1 × 4]> <tibble [1 × 3]>
22 <split [282/107]> Bootstrap022 <tibble [1 × 4]> <tibble [1 × 3]>
23 <split [282/103]> Bootstrap023 <tibble [1 × 4]> <tibble [1 × 3]>
24 <split [282/111]> Bootstrap024 <tibble [1 × 4]> <tibble [1 × 3]>
25 <split [282/101]> Bootstrap025 <tibble [1 × 4]> <tibble [1 × 3]>
26 <split [282/98]> Bootstrap026 <tibble [1 × 4]> <tibble [1 × 3]>
27 <split [282/107]> Bootstrap027 <tibble [1 × 4]> <tibble [1 × 3]>
28 <split [282/102]> Bootstrap028 <tibble [1 × 4]> <tibble [1 × 3]>
29 <split [282/113]> Bootstrap029 <tibble [1 × 4]> <tibble [1 × 3]>
30 <split [282/104]> Bootstrap030 <tibble [1 × 4]> <tibble [1 × 3]>
31 <split [282/99]> Bootstrap031 <tibble [1 × 4]> <tibble [1 × 3]>
32 <split [282/92]> Bootstrap032 <tibble [1 × 4]> <tibble [1 × 3]>
33 <split [282/103]> Bootstrap033 <tibble [1 × 4]> <tibble [1 × 3]>
34 <split [282/100]> Bootstrap034 <tibble [1 × 4]> <tibble [1 × 3]>
35 <split [282/102]> Bootstrap035 <tibble [1 × 4]> <tibble [1 × 3]>
36 <split [282/93]> Bootstrap036 <tibble [1 × 4]> <tibble [1 × 3]>
37 <split [282/99]> Bootstrap037 <tibble [1 × 4]> <tibble [1 × 3]>
38 <split [282/97]> Bootstrap038 <tibble [1 × 4]> <tibble [1 × 3]>
39 <split [282/101]> Bootstrap039 <tibble [1 × 4]> <tibble [1 × 3]>
40 <split [282/97]> Bootstrap040 <tibble [1 × 4]> <tibble [1 × 3]>
41 <split [282/107]> Bootstrap041 <tibble [1 × 4]> <tibble [1 × 3]>
42 <split [282/106]> Bootstrap042 <tibble [1 × 4]> <tibble [1 × 3]>
43 <split [282/97]> Bootstrap043 <tibble [1 × 4]> <tibble [1 × 3]>
44 <split [282/103]> Bootstrap044 <tibble [1 × 4]> <tibble [1 × 3]>
45 <split [282/119]> Bootstrap045 <tibble [1 × 4]> <tibble [1 × 3]>
46 <split [282/99]> Bootstrap046 <tibble [1 × 4]> <tibble [1 × 3]>
47 <split [282/103]> Bootstrap047 <tibble [1 × 4]> <tibble [1 × 3]>
48 <split [282/99]> Bootstrap048 <tibble [1 × 4]> <tibble [1 × 3]>
49 <split [282/100]> Bootstrap049 <tibble [1 × 4]> <tibble [1 × 3]>
50 <split [282/112]> Bootstrap050 <tibble [1 × 4]> <tibble [1 × 3]>
51 <split [282/101]> Bootstrap051 <tibble [1 × 4]> <tibble [1 × 3]>
52 <split [282/103]> Bootstrap052 <tibble [1 × 4]> <tibble [1 × 3]>
53 <split [282/115]> Bootstrap053 <tibble [1 × 4]> <tibble [1 × 3]>
54 <split [282/94]> Bootstrap054 <tibble [1 × 4]> <tibble [1 × 3]>
55 <split [282/113]> Bootstrap055 <tibble [1 × 4]> <tibble [1 × 3]>
56 <split [282/100]> Bootstrap056 <tibble [1 × 4]> <tibble [1 × 3]>
57 <split [282/101]> Bootstrap057 <tibble [1 × 4]> <tibble [1 × 3]>
58 <split [282/100]> Bootstrap058 <tibble [1 × 4]> <tibble [1 × 3]>
59 <split [282/94]> Bootstrap059 <tibble [1 × 4]> <tibble [1 × 3]>
60 <split [282/102]> Bootstrap060 <tibble [1 × 4]> <tibble [1 × 3]>
61 <split [282/105]> Bootstrap061 <tibble [1 × 4]> <tibble [1 × 3]>
62 <split [282/105]> Bootstrap062 <tibble [1 × 4]> <tibble [1 × 3]>
63 <split [282/111]> Bootstrap063 <tibble [1 × 4]> <tibble [1 × 3]>
64 <split [282/106]> Bootstrap064 <tibble [1 × 4]> <tibble [1 × 3]>
65 <split [282/100]> Bootstrap065 <tibble [1 × 4]> <tibble [1 × 3]>
66 <split [282/104]> Bootstrap066 <tibble [1 × 4]> <tibble [1 × 3]>
67 <split [282/98]> Bootstrap067 <tibble [1 × 4]> <tibble [1 × 3]>
68 <split [282/107]> Bootstrap068 <tibble [1 × 4]> <tibble [1 × 3]>
69 <split [282/95]> Bootstrap069 <tibble [1 × 4]> <tibble [1 × 3]>
70 <split [282/102]> Bootstrap070 <tibble [1 × 4]> <tibble [1 × 3]>
71 <split [282/108]> Bootstrap071 <tibble [1 × 4]> <tibble [1 × 3]>
72 <split [282/106]> Bootstrap072 <tibble [1 × 4]> <tibble [1 × 3]>
73 <split [282/101]> Bootstrap073 <tibble [1 × 4]> <tibble [1 × 3]>
74 <split [282/97]> Bootstrap074 <tibble [1 × 4]> <tibble [1 × 3]>
75 <split [282/102]> Bootstrap075 <tibble [1 × 4]> <tibble [1 × 3]>
76 <split [282/98]> Bootstrap076 <tibble [1 × 4]> <tibble [1 × 3]>
77 <split [282/100]> Bootstrap077 <tibble [1 × 4]> <tibble [1 × 3]>
78 <split [282/102]> Bootstrap078 <tibble [1 × 4]> <tibble [1 × 3]>
79 <split [282/109]> Bootstrap079 <tibble [1 × 4]> <tibble [1 × 3]>
80 <split [282/112]> Bootstrap080 <tibble [1 × 4]> <tibble [1 × 3]>
81 <split [282/108]> Bootstrap081 <tibble [1 × 4]> <tibble [1 × 3]>
82 <split [282/106]> Bootstrap082 <tibble [1 × 4]> <tibble [1 × 3]>
83 <split [282/112]> Bootstrap083 <tibble [1 × 4]> <tibble [1 × 3]>
84 <split [282/104]> Bootstrap084 <tibble [1 × 4]> <tibble [1 × 3]>
85 <split [282/103]> Bootstrap085 <tibble [1 × 4]> <tibble [1 × 3]>
86 <split [282/102]> Bootstrap086 <tibble [1 × 4]> <tibble [1 × 3]>
87 <split [282/102]> Bootstrap087 <tibble [1 × 4]> <tibble [1 × 3]>
88 <split [282/114]> Bootstrap088 <tibble [1 × 4]> <tibble [1 × 3]>
89 <split [282/107]> Bootstrap089 <tibble [1 × 4]> <tibble [1 × 3]>
90 <split [282/112]> Bootstrap090 <tibble [1 × 4]> <tibble [1 × 3]>
91 <split [282/100]> Bootstrap091 <tibble [1 × 4]> <tibble [1 × 3]>
92 <split [282/99]> Bootstrap092 <tibble [1 × 4]> <tibble [1 × 3]>
93 <split [282/106]> Bootstrap093 <tibble [1 × 4]> <tibble [1 × 3]>
94 <split [282/103]> Bootstrap094 <tibble [1 × 4]> <tibble [1 × 3]>
95 <split [282/98]> Bootstrap095 <tibble [1 × 4]> <tibble [1 × 3]>
96 <split [282/101]> Bootstrap096 <tibble [1 × 4]> <tibble [1 × 3]>
97 <split [282/110]> Bootstrap097 <tibble [1 × 4]> <tibble [1 × 3]>
98 <split [282/100]> Bootstrap098 <tibble [1 × 4]> <tibble [1 × 3]>
99 <split [282/108]> Bootstrap099 <tibble [1 × 4]> <tibble [1 × 3]>
100 <split [282/97]> Bootstrap100 <tibble [1 × 4]> <tibble [1 × 3]>
There were issues with some computations:
- Warning(s) x100: glm.fit: algorithm did not converge, glm.fit: fitted probabilitie...
Run `show_notes(.Last.tune.result)` for more information.
This tells us that we should run show_notes(.Last.tune.result). So let's run that:
show_notes(.Last.tune.result)
unique notes:
────────────────────────────────────────────────────────────────────────────────
glm.fit: algorithm did not converge, glm.fit: fitted probabilities numerically 0 or 1 occurred
In this case, it just recapitulates the same message that was presented by printing the object. The bigger point here is: always inspect error and warning messages.
We are getting a warning message about our fitted probabilities. This occurs because the outcome is completely separable in the training data (remember that issue from our earlier discussions of logistic regression). The model can still be used for prediction, but we would not be able to use the statistical tests of the parameter estimates. Those tests are not our focus here.
We are going to fit this model on the full training sample so we can show you why we get this warning message. When we aren't bootstrapping, the warning is displayed as soon as we fit.
# make feature matrix
rec_prep <- rec_glm |>
  prep(data_train)

feat_train_glm <- rec_prep |>
  bake(data_train)

# fit on full training sample
fit_best_glm <- logistic_reg() |>
  set_engine("glm", control = list(maxit = 100)) |>
  fit(diagnosis ~ ., data = feat_train_glm)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
fit_best_glm$fit |> summary()
Call:
stats::glm(formula = diagnosis ~ ., family = stats::binomial,
data = data, control = ~list(maxit = 100))
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.531e+02 1.708e+07 0 1
perimeter_se 1.707e+01 1.334e+06 0 1
fractal_dimension_mean 6.355e+02 1.607e+08 0 1
concave_points_worst 8.240e+01 1.631e+07 0 1
symmetry_mean 1.176e+02 1.443e+07 0 1
texture_se -1.961e+01 3.189e+05 0 1
concave_points_se -7.613e+03 1.292e+08 0 1
concavity_mean -1.484e+03 2.391e+07 0 1
fractal_dimension_se 1.078e+04 3.475e+08 0 1
radius_worst -8.568e+00 2.694e+06 0 1
concave_points_mean 2.814e+02 4.573e+07 0 1
radius_mean 2.524e+02 2.835e+06 0 1
smoothness_se 1.435e+04 1.270e+08 0 1
smoothness_worst -7.010e+02 4.114e+07 0 1
symmetry_se 1.954e+03 3.878e+07 0 1
radius_se -2.772e+02 1.306e+07 0 1
concavity_worst 5.805e+01 6.757e+06 0 1
concavity_se 1.542e+03 3.167e+07 0 1
compactness_se -1.799e+03 4.536e+07 0 1
smoothness_mean -1.274e+03 8.379e+07 0 1
area_se 1.624e+00 2.984e+04 0 1
area_worst -2.532e-01 1.314e+04 0 1
perimeter_mean -3.272e+01 3.664e+05 0 1
compactness_mean 2.339e+03 3.687e+07 0 1
area_mean -2.674e-01 1.237e+04 0 1
fractal_dimension_worst -1.091e+03 7.320e+07 0 1
texture_mean -6.494e+00 1.553e+05 0 1
perimeter_worst 2.138e+00 2.153e+05 0 1
symmetry_worst -1.873e+02 7.068e+06 0 1
texture_worst 4.069e+00 1.055e+05 0 1
compactness_worst 4.504e+01 1.423e+07 0 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2.4423e+02 on 281 degrees of freedom
Residual deviance: 2.6409e-10 on 251 degrees of freedom
AIC: 62
Number of Fisher Scoring iterations: 29
Let’s take a closer look at the predictions our model is making.
# probabilities in train
predict(fit_best_glm, feat_train_glm, type = "prob") |>
print(n = 50)
# A tibble: 282 × 2
.pred_malignant .pred_benign
<dbl> <dbl>
1 2.22e-16 1
2 2.22e-16 1
3 2.94e-12 1.00
4 2.22e-16 1
5 2.22e-16 1
6 2.22e-16 1
7 2.22e-16 1
8 2.22e-16 1
9 2.22e-16 1
10 2.22e-16 1
11 3.18e-12 1.00
12 2.22e-16 1
13 2.22e-16 1
14 2.22e-16 1
15 5.41e-12 1.00
16 2.22e-16 1
17 2.22e-16 1
18 2.22e-16 1
19 1.22e-12 1.00
20 2.22e-16 1
21 2.22e-16 1
22 2.22e-16 1
23 2.22e-16 1
24 2.22e-16 1
25 2.22e-16 1
26 2.22e-16 1
27 2.22e-16 1
28 2.22e-16 1
29 2.22e-16 1
30 3.15e-12 1.00
31 2.22e-16 1
32 2.22e-16 1
33 2.22e-16 1
34 2.22e-16 1
35 2.22e-16 1
36 2.22e-16 1
37 2.22e-16 1
38 2.22e-16 1
39 2.22e-16 1
40 2.22e-16 1
41 2.22e-16 1
42 2.22e-16 1
43 2.22e-16 1
44 2.22e-16 1
45 2.22e-16 1
46 2.22e-16 1
47 2.22e-16 1
48 2.22e-16 1
49 2.22e-16 1
50 2.22e-16 1
# ℹ 232 more rows
# confusion matrix for train
cm_trn <- tibble(truth = feat_train_glm$diagnosis,
                 estimate = predict(fit_best_glm, feat_train_glm)$.pred_class) |>
  conf_mat(truth, estimate)

cm_trn
Truth
Prediction malignant benign
malignant 44 0
benign 0 238
cm_trn |>
  summary()
# A tibble: 13 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 1
2 kap binary 1
3 sens binary 1
4 spec binary 1
5 ppv binary 1
6 npv binary 1
7 mcc binary 1
8 j_index binary 1
9 bal_accuracy binary 1
10 detection_prevalence binary 0.156
11 precision binary 1
12 recall binary 1
13 f_meas binary 1
Here we can see the cause of our warning message: our features perfectly predict the classes of the outcome in the training data ("perfect separation"), as seen in the confusion matrix above. This likely occurs because these features predict the outcome very well even in held-out data (see below), and some overfitting in training moves us from very good prediction to perfect prediction. Since we have confirmed that the warning message makes sense to us here, it is OK to proceed.
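If you would like to see the same warning in a minimal setting, a toy example with one perfectly separable predictor typically reproduces it (illustrative only; not part of the assignment):
# toy example of complete separation: x below 6 is always "neg", otherwise "pos"
toy <- tibble(x = 1:10,
              y = factor(rep(c("neg", "pos"), each = 5)))
glm(y ~ x, family = binomial, data = toy)
# typically warns: glm.fit: fitted probabilities numerically 0 or 1 occurred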
Print the average ROC AUC of your logistic regression model
collect_metrics(fit_lr_boot, summarize = TRUE)
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 roc_auc binary 0.945 100 0.00398 Preprocessor1_Model1
You might be wondering why our performance is not perfect across bootstrapped samples, given that we got the warning for each of the 100 bootstraps above, which means our model always predicted perfectly in its in-bag training data. For demonstration purposes, we will also assess this model in test below.
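One optional way to see this directly is to pull the unsummarized metrics; each row is one bootstrap's held-out (out-of-bag) ROC AUC, and these values vary and are generally below 1 even though in-bag prediction was perfect:
# per-resample (out-of-bag) ROC AUCs for the logistic regression fits
collect_metrics(fit_lr_boot, summarize = FALSE) |>
  arrange(.estimate) |>
  print(n = 10)
Next, the test-set assessment: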
# test feature matrix
feat_test_glm <- rec_prep |>
  bake(data_test)
# confusion matrix for test
cm_test <- tibble(truth = feat_test_glm$diagnosis,
                  estimate = predict(fit_best_glm, feat_test_glm)$.pred_class) |>
  conf_mat(truth, estimate)

cm_test
Truth
Prediction malignant benign
malignant 21 1
benign 1 118
Performance is still pretty good here, but because the number of predictors is large relative to the number of training cases in each bootstrap sample, our models can fit their in-bag training data perfectly, yet they do not reproduce that perfect performance in the corresponding held-out (out-of-bag) cases.
You should always check for these sorts of warnings by printing the model fit object.
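In addition to printing the fit object, newer versions of tune can gather the notes from all resamples into a tibble, which can be easier to scan (assuming your installed version provides collect_notes()):
# tabulate the warnings/notes captured across the 100 bootstrap fits
collect_notes(fit_lr_boot) |>
  count(note)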
Fit KNN
Set up a hyperparameter grid to consider a range of values for neighbors in your KNN models.
NOTE be sure that you are not considering values higher than the N of your training sample! In this dataset N = 282
hyper_grid <- expand.grid(neighbors = seq(1, 280, by = 10))
Fit a KNN model using the recipe you created and your bootstrap splits. Use ROC AUC (roc_auc) as your performance metric.
fit_knn_boot <-
  nearest_neighbor(neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  tune_grid(preprocessor = rec_knn,
            resamples = splits_boot,
            grid = hyper_grid,
            metrics = metric_set(roc_auc))
Generate a plot to help you determine if you considered a wide enough range of values for neighbors.
plot_hyperparameters(fit_knn_boot, hp1 = "neighbors", metric = "roc_auc")
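plot_hyperparameters() comes from the sourced fun_ml.R script. If you prefer not to rely on that helper, a roughly equivalent plot can be built directly from the tuning results, as sketched here:
# mean out-of-bag ROC AUC as a function of neighbors, built from collect_metrics()
collect_metrics(fit_knn_boot, summarize = TRUE) |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line() +
  geom_point() +
  labs(x = "neighbors", y = "mean ROC AUC (across bootstraps)")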
Print the best value for the neighbors hyperparameter across resamples based on model ROC AUC.
select_best(fit_knn_boot)
# A tibble: 1 × 2
neighbors .config
<dbl> <chr>
1 121 Preprocessor1_Model13
Print the average ROC AUC of your best KNN model
collect_metrics(fit_knn_boot, summarize = TRUE) |>
filter(neighbors == select_best(fit_knn_boot)$neighbors)
# A tibble: 1 × 7
neighbors .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 121 roc_auc binary 0.982 100 0.00173 Preprocessor1_Model13
Select and fit best model
Select based on average roc_auc. Our best model configuration was a KNN algorithm with 121 neighbors.
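If you want to see more than the single best configuration, show_best() lists the top candidates by mean ROC AUC (an optional check):
# top 5 values of neighbors by mean out-of-bag ROC AUC
show_best(fit_knn_boot, metric = "roc_auc", n = 5)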
Create training and test feature matrices for our best KNN model
rec_prep <- rec_knn |>
  prep(data_train)

feat_train <- rec_prep |>
  bake(data_train)

feat_test <- rec_prep |>
  bake(data_test)
Fit on the full training sample
fit_best_knn <- nearest_neighbor(neighbors = select_best(fit_knn_boot)$neighbors) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(diagnosis ~ ., data = feat_train)
Evaluate best model
Plot the ROC of your best model in the test set.
roc_plot <-
  tibble(truth = feat_test$diagnosis,
         prob = predict(fit_best_knn, feat_test, type = "prob")$.pred_malignant) |>
  roc_curve(prob, truth = truth)

roc_plot |>
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = .threshold)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal() +
  labs(x = "1 - Specificity (FPR)",
       y = "Sensitivity (TPR)")
Confusion matrix depicting performance in test
cm <- tibble(truth = feat_test$diagnosis,
             estimate = predict(fit_best_knn, feat_test)$.pred_class) |>
  conf_mat(truth, estimate)

cm
Truth
Prediction malignant benign
malignant 11 0
benign 11 119
cm |>
  summary()
# A tibble: 13 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.922
2 kap binary 0.628
3 sens binary 0.5
4 spec binary 1
5 ppv binary 1
6 npv binary 0.915
7 mcc binary 0.677
8 j_index binary 0.5
9 bal_accuracy binary 0.75
10 detection_prevalence binary 0.0780
11 precision binary 1
12 recall binary 0.5
13 f_meas binary 0.667
Make a plot of your confusion matrix
autoplot(cm)
Report the AUC, accuracy, sensitivity, specificity, PPV, and NPV of your best model in test
Let’s set up a tibble to track results across portions of the assignment.
# ROC AUC
auc <- tibble(truth = feat_test$diagnosis,
              prob = predict(fit_best_knn, feat_test, type = "prob")$.pred_malignant) |>
  roc_auc(prob, truth = truth) |>
  pull(.estimate)

# Accuracy, sensitivity, specificity, PPV, NPV
# Using pivot_wider() to make a tracking tibble
results <- cm |>
  summary() |>
  filter(.metric %in% c("accuracy", "sens", "spec", "ppv", "npv")) |>
  select(-.estimator) |>
  pivot_wider(names_from = .metric, values_from = .estimate) |>
  mutate(model = str_c("knn_", fit_best_knn$fit$best.parameters$k),
         roc_auc = auc) |>
  select(model, roc_auc, everything())

print_kbl(results, height = "100px")
model | roc_auc | accuracy | sens | spec | ppv | npv |
---|---|---|---|---|---|---|
knn_121 | 1 | 0.92 | 0.5 | 1 | 1 | 0.92 |
Part 2: Addressing class imbalance
Since only about 15% of our cases are malignant, let's see if we can achieve higher sensitivity by up-sampling our data with SMOTE.
Build recipes
Update your previous recipes to up-sample the minority class (malignant) in diagnosis using step_smote().
Remember to make 2 recipes (one for glm, one for KNN)
rec_glm_up <- recipe(diagnosis ~ ., data = data_train) |>
  themis::step_smote(diagnosis, over_ratio = 1, neighbors = 5)

rec_knn_up <- recipe(diagnosis ~ ., data = data_train) |>
  themis::step_smote(diagnosis, over_ratio = 1, neighbors = 5) |>
  step_range(all_predictors())
Fit GLM
Fit an up-sampled logistic regression classifier
fit_lr_boot_up <-
  logistic_reg() |>
  set_engine("glm", control = list(maxit = 100)) |>
  fit_resamples(preprocessor = rec_glm_up,
                resamples = splits_boot,
                metrics = metric_set(roc_auc))
We are seeing the same problem as before.
show_notes(.Last.tune.result)
unique notes:
─────────────────────────────────────────────────────────
glm.fit: fitted probabilities numerically 0 or 1 occurred
Print the average ROC AUC of your logistic regression model
collect_metrics(fit_lr_boot_up, summarize = TRUE)
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 roc_auc binary 0.916 100 0.00469 Preprocessor1_Model1
Fit KNN
Set up a hyperparameter grid to consider a range of values for neighbors in your KNN models.
hyper_grid <- expand.grid(neighbors = seq(1, 280, by = 10))
Fit an up-sampled KNN using the new KNN recipe you created and your bootstrap splits. Use ROC AUC as your performance metric.
fit_knn_boot_up <-
  nearest_neighbor(neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  tune_grid(preprocessor = rec_knn_up,
            resamples = splits_boot,
            grid = hyper_grid,
            metrics = metric_set(roc_auc))
Generate a plot to help you determine if you considered a wide enough range of values for neighbors.
plot_hyperparameters(fit_knn_boot_up, hp1 = "neighbors", metric = "roc_auc")
Print the best value for the neighbors hyperparameter across resamples based on model ROC AUC.
select_best(fit_knn_boot_up)
# A tibble: 1 × 2
neighbors .config
<dbl> <chr>
1 71 Preprocessor1_Model08
Print the average ROC AUC of your best KNN model
collect_metrics(fit_knn_boot_up, summarize = TRUE) |>
filter(neighbors == select_best(fit_knn_boot_up)$neighbors)
# A tibble: 1 × 7
neighbors .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 71 roc_auc binary 0.978 100 0.00186 Preprocessor1_Model08
Select and fit best model
Create the up-sampled training feature matrix using your best recipe (for us this is KNN). Remember, do not upsample your test data!
rec_prep <- rec_knn_up |>
  prep(data_train)

feat_train_up <- rec_prep |>
  bake(new_data = NULL)
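As a quick sanity check (not part of the original key), you can tabulate the outcome in the up-sampled feature matrix to confirm that SMOTE balanced the classes. Note that step_smote() uses skip = TRUE by default, so synthetic cases are only ever added when baking the training data, never when baking new data such as the test set.
# confirm the training feature matrix now has balanced classes after SMOTE
feat_train_up |>
  janitor::tabyl(diagnosis)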
Fit your best performing up-sampled model on the full training sample
fit_best_knn_up <- nearest_neighbor(neighbors = select_best(fit_knn_boot_up)$neighbors) |>
  set_engine("kknn") |>
  set_mode("classification") |>
  fit(diagnosis ~ ., data = feat_train_up)
Evaluate best model
Make a figure to plot the ROC of your best up-sampled model in the test set
roc_plot_up <-
  tibble(truth = feat_test$diagnosis,
         prob = predict(fit_best_knn_up, feat_test, type = "prob")$.pred_malignant) |>
  roc_curve(prob, truth = truth)

roc_plot_up |>
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = .threshold)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal() +
  labs(x = "1 - Specificity (FPR)",
       y = "Sensitivity (TPR)")
Generate a confusion matrix depicting your up-sampled model’s performance in test
cm_up <- tibble(truth = feat_test$diagnosis,
                estimate = predict(fit_best_knn_up, feat_test)$.pred_class) |>
  conf_mat(truth, estimate)

cm_up
Truth
Prediction malignant benign
malignant 21 2
benign 1 117
Make a plot of your confusion matrix
autoplot(cm_up)
Report the AUC, accuracy, sensitivity, specificity, PPV, and NPV of your best up-sampled model in the held out test set
# ROC AUC
auc <- tibble(truth = feat_test$diagnosis,
              prob = predict(fit_best_knn_up, feat_test, type = "prob")$.pred_malignant) |>
  roc_auc(prob, truth = truth) |>
  pull(.estimate)

# Accuracy, sensitivity, specificity, PPV, NPV
results_up <- cm_up |>
  summary() |>
  filter(.metric %in% c("accuracy", "sens", "spec", "ppv", "npv")) |>
  select(-.estimator) |>
  pivot_wider(names_from = .metric, values_from = .estimate) |>
  mutate(model = str_c("knn_smote_", fit_best_knn_up$fit$best.parameters$k),
         roc_auc = auc) |>
  select(model, roc_auc, everything())
Joining results with our tibble from Part 1 for comparison
results <- results |>
  bind_rows(results_up)

print_kbl(results, height = "150px")
model | roc_auc | accuracy | sens | spec | ppv | npv |
---|---|---|---|---|---|---|
knn_121 | 1 | 0.92 | 0.50 | 1.00 | 1.00 | 0.92 |
knn_smote_71 | 1 | 0.98 | 0.95 | 0.98 | 0.91 | 0.99 |
We get improvement with SMOTE! Increased sensitivity without much drop in specificity.
Part 3: New Classification Threshold
Now you want to check whether there may be an additional benefit to your model's performance if you adjust the classification threshold from its default of 50% to a threshold of 40%.
Adjust classification threshold to 40%
preds <- tibble(truth = feat_test$diagnosis,
                prob = predict(fit_best_knn_up, feat_test, type = "prob")$.pred_malignant) |>
  mutate(estimate_40 = if_else(prob > .40, "malignant", "benign"),
         estimate_40 = factor(estimate_40, levels = c("malignant", "benign")))
preds
# A tibble: 141 × 3
truth prob estimate_40
<fct> <dbl> <fct>
1 benign 0.0142 benign
2 benign 0.0144 benign
3 malignant 1 malignant
4 benign 0.120 benign
5 malignant 0.956 malignant
6 benign 0 benign
7 benign 0.285 benign
8 benign 0.0139 benign
9 benign 0.00253 benign
10 benign 0 benign
11 benign 0.281 benign
12 benign 0.0258 benign
13 benign 0.249 benign
14 benign 0 benign
15 benign 0.0345 benign
16 benign 0.351 benign
17 benign 0.0169 benign
18 benign 0.304 benign
19 benign 0.264 benign
20 benign 0 benign
21 benign 0.0489 benign
22 malignant 0.552 malignant
23 benign 0.585 malignant
24 benign 0.212 benign
25 benign 0.0287 benign
26 benign 0.00512 benign
27 malignant 1 malignant
28 benign 0.119 benign
29 benign 0.450 malignant
30 benign 0.0573 benign
31 benign 0.00615 benign
32 benign 0.00718 benign
33 benign 0 benign
34 benign 0 benign
35 benign 0.0661 benign
36 benign 0 benign
37 benign 0.0644 benign
38 benign 0.00732 benign
39 malignant 0.996 malignant
40 benign 0.352 benign
41 benign 0.0113 benign
42 benign 0.0901 benign
43 benign 0.113 benign
44 benign 0 benign
45 benign 0.0785 benign
46 malignant 0.896 malignant
47 benign 0.0880 benign
48 malignant 0.922 malignant
49 benign 0.181 benign
50 benign 0.000859 benign
51 malignant 1 malignant
52 benign 0.184 benign
53 benign 0 benign
54 benign 0.0937 benign
55 benign 0.0935 benign
56 benign 0.000321 benign
57 benign 0 benign
58 benign 0.0455 benign
59 malignant 0.933 malignant
60 benign 0 benign
61 benign 0.0698 benign
62 benign 0.0425 benign
63 benign 0.603 malignant
64 benign 0 benign
65 malignant 1 malignant
66 benign 0 benign
67 benign 0.0459 benign
68 benign 0.0101 benign
69 benign 0 benign
70 benign 0.355 benign
71 benign 0.0163 benign
72 benign 0.0421 benign
73 benign 0.0552 benign
74 malignant 0.995 malignant
75 malignant 0.368 benign
76 benign 0 benign
77 benign 0.253 benign
78 benign 0.187 benign
79 benign 0 benign
80 benign 0.0446 benign
81 benign 0.0651 benign
82 malignant 0.947 malignant
83 benign 0.260 benign
84 malignant 1 malignant
85 malignant 0.949 malignant
86 benign 0.224 benign
87 benign 0.00242 benign
88 benign 0.00543 benign
89 benign 0.00974 benign
90 benign 0.00793 benign
91 benign 0.0618 benign
92 malignant 0.944 malignant
93 benign 0 benign
94 benign 0 benign
95 benign 0.230 benign
96 benign 0.0260 benign
97 benign 0.103 benign
98 benign 0.0146 benign
99 benign 0.0437 benign
100 benign 0.181 benign
101 benign 0 benign
102 malignant 0.748 malignant
103 benign 0.380 benign
104 benign 0 benign
105 benign 0.274 benign
106 benign 0.0431 benign
107 benign 0 benign
108 malignant 1 malignant
109 benign 0.0742 benign
110 benign 0.449 malignant
111 benign 0.00891 benign
112 malignant 0.901 malignant
113 benign 0 benign
114 benign 0 benign
115 benign 0.148 benign
116 benign 0 benign
117 benign 0 benign
118 benign 0 benign
119 benign 0 benign
120 benign 0.00751 benign
121 benign 0.000538 benign
122 benign 0.261 benign
123 benign 0.125 benign
124 benign 0.0479 benign
125 benign 0.0583 benign
126 malignant 1 malignant
127 benign 0.0161 benign
128 benign 0.0992 benign
129 benign 0.369 benign
130 malignant 1 malignant
131 benign 0 benign
132 benign 0.0818 benign
133 benign 0.123 benign
134 benign 0.108 benign
135 benign 0.0629 benign
136 benign 0 benign
137 benign 0.00798 benign
138 malignant 0.831 malignant
139 benign 0.0230 benign
140 benign 0.0261 benign
141 benign 0 benign
Evaluate model at new threshold
Generate a confusion matrix depicting your up-sampled model’s performance in test at your new threshold
cm_40 <- preds |>
  conf_mat(truth = truth, estimate = estimate_40)
cm_40
Truth
Prediction malignant benign
malignant 21 4
benign 1 115
Make a plot of your confusion matrix
autoplot(cm_40)
Report the AUC, accuracy, sensitivity, specificity, PPV, and NPV of your best up-sampled model in the held-out test set at the new threshold
# ROC AUC
auc <- tibble(truth = feat_test$diagnosis,
              prob = predict(fit_best_knn_up, feat_test, type = "prob")$.pred_malignant) |>
  roc_auc(prob, truth = truth) |>
  pull(.estimate)

# Accuracy, sensitivity, specificity, PPV, NPV
results_40 <- cm_40 |>
  summary() |>
  filter(.metric %in% c("accuracy", "sens", "spec", "ppv", "npv")) |>
  select(-.estimator) |>
  pivot_wider(names_from = .metric, values_from = .estimate) |>
  mutate(model = str_c("knn_smote_", fit_best_knn_up$fit$best.parameters$k, "_thresh_40"),
         roc_auc = auc) |>
  select(model, roc_auc, everything())
Join results with the tibbles from Parts 1 and 2 for comparison
results <- results |>
  bind_rows(results_40)

print_kbl(results, height = "200px")
model | roc_auc | accuracy | sens | spec | ppv | npv |
---|---|---|---|---|---|---|
knn_121 | 1 | 0.92 | 0.50 | 1.00 | 1.00 | 0.92 |
knn_smote_71 | 1 | 0.98 | 0.95 | 0.98 | 0.91 | 0.99 |
knn_smote_71_thresh_40 | 1 | 0.96 | 0.95 | 0.97 | 0.84 | 0.99 |
Adjusting the threshold did not help performance: sensitivity stayed the same and specificity decreased! The histograms below show why. The black line marks the default threshold of .5, which already separates the classes fairly well. The red line at .4 creates more false positives (benign cases labeled as malignant) without capturing any additional true positives.
ggplot(data = preds, aes(x = prob)) +
geom_histogram(bins = 15) +
facet_wrap(vars(truth), nrow = 2) +
xlab("Pr(Disease)") +
geom_vline(xintercept = .5) +
geom_vline(xintercept = .4, color = "red")
Let's see what would happen if we changed the threshold to .3.
ggplot(data = preds, aes(x = prob)) +
geom_histogram(bins = 15) +
facet_wrap(vars(truth), nrow = 2) +
xlab("Pr(Disease)") +
geom_vline(xintercept = .3, color = "red")
preds <- tibble(truth = feat_test$diagnosis,
                prob = predict(fit_best_knn_up, feat_test, type = "prob")$.pred_malignant) |>
  mutate(estimate_30 = if_else(prob > .30, "malignant", "benign"),
         estimate_30 = factor(estimate_30, levels = c("malignant", "benign")))

cm_30 <- preds |>
  conf_mat(truth = truth, estimate = estimate_30)

results_30 <- cm_30 |>
  summary() |>
  filter(.metric %in% c("accuracy", "sens", "spec", "ppv", "npv")) |>
  select(-.estimator) |>
  pivot_wider(names_from = .metric, values_from = .estimate) |>
  mutate(model = str_c("knn_smote_", fit_best_knn_up$fit$best.parameters$k, "_thresh_30"),
         roc_auc = auc) |>
  select(model, roc_auc, everything())
results <- results |>
  bind_rows(results_30)

print_kbl(results, height = "200px")
model | roc_auc | accuracy | sens | spec | ppv | npv |
---|---|---|---|---|---|---|
knn_121 | 1 | 0.92 | 0.50 | 1.00 | 1.00 | 0.92 |
knn_smote_71 | 1 | 0.98 | 0.95 | 0.98 | 0.91 | 0.99 |
knn_smote_71_thresh_40 | 1 | 0.96 | 0.95 | 0.97 | 0.84 | 0.99 |
knn_smote_71_thresh_30 | 1 | 0.93 | 1.00 | 0.92 | 0.69 | 1.00 |
We now have perfect sensitivity because we correctly predict all malignant cases as positive, though at the cost of lower specificity and PPV. That trade-off may be appropriate in this context, since it is better to falsely label some benign cases as malignant than to miss malignant cases!
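Rather than checking thresholds one at a time, you could sweep a grid of thresholds in a single call. A sketch is below, assuming the probably package is installed (it is not used elsewhere in this assignment); threshold_perf() reports sensitivity, specificity, and related metrics at each threshold.
# sweep candidate thresholds for the up-sampled KNN model's test-set probabilities
library(probably)
preds |>
  threshold_perf(truth = truth, estimate = prob,
                 thresholds = seq(.1, .9, by = .1))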
We can also go back and look at our KNN model that was not upsampled.
With the default threshold of .5, what do you notice? Does this explain why we had perfect PPV and not very good sensitivity?
preds <- tibble(truth = feat_test$diagnosis,
                prob = predict(fit_best_knn, feat_test, type = "prob")$.pred_malignant)
ggplot(data = preds, aes(x = prob)) +
geom_histogram(bins = 15) +
facet_wrap(vars(truth), nrow = 2) +
xlab("Pr(Disease)") +
geom_vline(xintercept = .5, color = "red")