-
Notifications
You must be signed in to change notification settings - Fork 2
/
shap_example.R
162 lines (122 loc) · 5.7 KB
/
shap_example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# load the required libraries for building the base-learners and the ensemble models
library(h2o)
library(h2otools)
library(autoEnsemble)
# initiate the h2o server
h2o.init(ignore_config = TRUE, nthreads = 2, bind_to_localhost = FALSE, insecure = TRUE)
# upload data to h2o cloud
prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
df <- read.csv(prostate_path)
prostate <- h2o.importFile(path = prostate_path, header = TRUE)
### H2O provides 2 types of grid search for tuning the models, which are
### AutoML and Grid. Below, I tune 2 set of model grids and use them both
### for building the ensemble, just to set an example ...
#######################################################
### PREPARE AutoML Grid (takes a couple of minutes)
#######################################################
# run AutoML to tune various models (GLM, GBM, XGBoost, DRF, DeepLearning) for 120 seconds
y <- "CAPSULE"
prostate[,y] <- as.factor(prostate[,y]) #convert to factor for classification
aml <- h2o.automl(y = y, training_frame = prostate, max_runtime_secs = 30,
include_algos=c("DRF","XGBoost", "GBM"),
# this setting ensures the models are comparable for building a meta learner
seed = 2023, nfolds = 10,
keep_cross_validation_predictions = TRUE)
#######################################################
### PREPARE H2O Grid
#######################################################
# make sure equal number of "nfolds" is specified for different grids
grid <- h2o.grid(algorithm = "gbm", y = y, training_frame = prostate,
hyper_params = list(ntrees = seq(1,25,1)), grid_id = "ensemble_grid",
# this setting ensures the models are comparable for building a meta learner
seed = 2023, fold_assignment = "Modulo", nfolds = 10,
keep_cross_validation_predictions = TRUE)
#######################################################
### PREPARE ENSEMBLE MODELS
#######################################################
### get the models' IDs from the AutoML and grid searches.
### this is all that is needed before building the ensemble,
### i.e., to specify the model IDs that should be evaluated.
ids <- c(h2o.get_ids(aml), h2o.get_ids(grid))
top <- ensemble(models = ids, training_frame = prostate,
strategy = "top", top_rank = seq(0.05, 0.99, 0.01))
top$top_rank_id
search <- ensemble(models = ids, training_frame = prostate, strategy = "search")
search$top_rank_id
#######################################################
### EVALUATE THE MODELS
#######################################################
h2o.auc(aml@leader) # best model identified by h2o.automl
h2o.auc(h2o.getModel(grid@model_ids[[1]])) # best model identified by grid search
h2o.auc(top$model) # ensemble model with 'top' strategy
h2o.auc(search$model) # ensemble model with 'search' strategy
p <- h2o.predict_contributions(object = aml@leader,
newdata = prostate,
top_n=5)
View(as.data.frame(p))
# Compute SHAP and pick the top two highest regardless of the sign
pabs <- h2o.predict_contributions(aml@leader, prostate, top_n=5, compare_abs=TRUE)
# Compute SHAP and pick the top two lowest regardless of the sign
View(as.data.frame(pabs))
h2o.shap_summary_plot(
model = aml@leader,
newdata = prostate,
columns = NULL,
top_n_features = 5)
# h2o.shap_summary_plot(
# model = h2o.getModel("GBM_grid_1_AutoML_2_20230920_225003_model_7"),
# newdata = prostate,
# columns = NULL,
# top_n_features = 5)
# plt <- h2o.shap_summary_plot(
# model = aml@leader,
# newdata = prostate,
# columns = NULL #get SHAP for all columns
# #top_n_features = 5
# #sample_size = 100
# )
mdl1 <- h2o.shap_summary_plot(
model = h2o.getModel(aml@leaderboard[2,"model_id"]),
newdata = prostate,
columns = NULL #get SHAP for all columns
#top_n_features = 5
#sample_size = 100
)
mdl2 <- h2o.shap_summary_plot(
model = h2o.getModel(aml@leaderboard[2,"model_id"]),
newdata = prostate,
columns = NULL #get SHAP for all columns
#top_n_features = 5
#sample_size = 100
)
#get the data
dd <- mdl1$data
View(dd) #dd has 3040 rows, which is number of subjects * predictors of the model
#create the empty datasets for array
holder <- dd
holder <- holder[, c("Row.names", "contribution")]
d2 <- mdl2$data
View(d2) #dd has 3040 rows, which is number of subjects * predictors of the model
#id.x is the subject row number
#id.y I am not sure what this is
#feature is the predictor name
# 'normalized_value' is the original value of the observation, used for coloring
# 'original_value' is the original value of the observation, which serves the
# 'normalized_value' later on
# THEREFORE 'original_valie' and 'normalized_value' are identical across models
# for the same set of participants. The only thing that changes is the
# 'contribution' values
#
# create an array to store the shap values for each subject,
# and for each feature, presenting both 'contribution', 'original' values, and
#
# 1) sort shap data based on id.x
# 2) extract data of each subject and store it an array
# 3) the array should include:
# - number of models > number of features > feature contributions for each subject
# 4) For the overall importance of each feature, we can report distrbution
#
FEATURES <- 7
COLUMNS <- 2 # 'Row.names' and 'contribution'
shaparray <- array(NA, dim=c(data.frame(nrow(df), c("Row.names", "contribution")), FEATURES))
# > both 'top' and 'search' strategies had identical results, but outperform the grid search and AutoML search. Yet, this was a small dataset, and a quick test...