-
We are trying to translate mlr code to mlr3, but this seems to be an order of magnitude more complex than what we expected. Basically all function names are different, object types are different, and the functionality is split across several packages. This is the operation we would like to translate (see complete code):
It is explained in detail in: How to do ensemble ML with mlr3 and derive predictions? Thank you!
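For illustration, even a simple resampling setup looks quite different in the two frameworks. The following is only a rough sketch for comparison, not our actual code; df and the target column "y" are placeholders, and you would not normally attach both packages in one session:
# mlr (old) -- sketch
library(mlr)
task_old = makeClassifTask(data = df, target = "y")
lrn_old = makeLearner("classif.ranger", predict.type = "prob")
rdesc = makeResampleDesc("CV", iters = 4)
res_old = resample(lrn_old, task_old, rdesc)
# mlr3 -- sketch of the equivalent
library(mlr3)
library(mlr3learners)
task_new = as_task_classif(df, target = "y")
lrn_new = lrn("classif.ranger", predict_type = "prob")
res_new = resample(task_new, lrn_new, rsmp("cv", folds = 4))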
-
Hey, did you read the stacking section in the book?
-
@thengl well, it would be great if you could still use the resources to build your answer incrementally ;-)
-
Does this help?
-
For reference, I'll post here again the code I wrote during my stay at OpenGeoHub in September; it should address most of the questions discussed here:
library("mlr3verse")
library("mlr3spatiotempcv")
library("mlr3viz")
library(data.table)
set.seed(42)
logger = lgr::get_logger("bbotk")
logger$set_threshold("warn")
# parallelization --------------------------------------------------------------
future::plan("multisession", workers = 4)
# data -------------------------------------------------------------------------
# create dataset with blocking from example dataset
task = tsk("ecuador")
data_raw = task$backend$data(1:task$nrow, task$feature_names)
group = as.factor(sample(c("class1", "class2", "class3", "class4", "class5",
  "class6", "class7", "class8"),
  task$nrow, replace = TRUE))
task$cbind(data.table("group" = group))
# tell the task about the grouping
task$set_col_roles("group", roles = "group")
task$col_roles
# preprocessing ----------------------------------------------------------------
# list of PipeOps for preprocessing
mlr_pipeops
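# the dictionary can also be converted to a data.table for easier browsing (illustrative)
as.data.table(mlr_pipeops)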
# resampling -------------------------------------------------------------------
set.seed(42)
# spcv = rsmp("spcv_coords", folds = 4)$instantiate(task)
cv = rsmp("cv", folds = 4)$instantiate(task)
# bug in mlr3spatiotempcv
autoplot(cv, task, fold_id = 1)
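# optional sanity check (illustrative, not part of the original code): with the
# "group" column role set above, each group should appear in exactly one test fold
fold_groups = lapply(seq_len(cv$iters), function(i) unique(group[cv$test_set(i)]))
stopifnot(!any(duplicated(unlist(fold_groups))))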
# tuning -----------------------------------------------------------------------
# note: I would strongly suggest not using grid search
terminator = trm("evals", n_evals = 2)
tuner = tnr("random_search")
# the same logic can be applied to a random forest or any other learner
learner_xgb = lrn("classif.xgboost", predict_type = "prob")
# parallel predictions
learner_xgb$parallel_predict = TRUE
search_space = ps(
  max_depth = p_int(5, 10),
  eta = p_dbl(0.5, 0.8),
  subsample = p_dbl(0.9, 1),
  min_child_weight = p_int(8, 10),
  colsample_bytree = p_dbl(0.5, 1)
)
at_xgb = AutoTuner$new(
  learner = learner_xgb,
  resampling = rsmp("cv", folds = 4),
  measure = msr("classif.ce"),
  terminator = terminator,
  search_space = search_space,
  tuner = tuner,
  store_models = FALSE)
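# note (illustrative, not in the original code): recent mlr3tuning releases also ship
# an auto_tuner() helper that wraps the constructor call above; roughly equivalent,
# assuming a current package version:
# at_xgb = auto_tuner(
#   tuner = tuner, learner = learner_xgb, resampling = rsmp("cv", folds = 4),
#   measure = msr("classif.ce"), terminator = terminator,
#   search_space = search_space, store_models = FALSE)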
# stacking ---------------------------------------------------------------------
stacked_graph = gunion(list(
  po("learner_cv", at_xgb),
  po("learner_cv", lrn("classif.ranger", predict_type = "prob"))
)) %>>%
  po("featureunion") %>>% lrn("classif.log_reg", predict_type = "prob")
stacked_graph$plot()
stacked_graph$keep_results = TRUE
stacked_learner = as_learner(stacked_graph)
# train stacked learner on the full task
# stacked_learner$train(task)
# resample ---------------------------------------------------------------------
# Cross-validation for stacked learner
rr_res = resample(task, stacked_learner, cv)
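# inspect the cross-validation result (illustrative)
rr_res$aggregate(msr("classif.ce"))
rr_res$score(msr("classif.ce"))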
# benchmark --------------------------------------------------------------------
# benchmark ensemble model against SVM and KKNN
# note: the design needs Learner objects, hence the GraphLearner (stacked_learner), not the raw Graph
bmr = benchmark(data.table(
  task = list(task),
  learner = list(
    stacked_learner,
    lrn("classif.svm"),
    lrn("classif.kknn")
  ),
  resampling = list(cv)))
autoplot(bmr)
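# aggregated performance per learner (illustrative)
bmr$aggregate(msr("classif.ce"))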
# train & predict --------------------------------------------------------------
stacked_learner$train(task, row_ids = 1:100)
# get base learner Predictions
# requires $keep_results = TRUE to be set
# Note Patrick: we will most likely simplify this in the future
base_learner_preds = stacked_learner$graph$pipeops$featureunion$.result[[1]]$data()
# keep only the predicted probability columns
base_learner_preds[, grepl("prob.TRUE", colnames(base_learner_preds)), with = FALSE]
pred = stacked_learner$predict(task, row_ids = 101:200)
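# evaluate the hold-out predictions (illustrative)
pred$score(msr("classif.ce"))
pred$confusion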
# get base learners ------------------------------------------------------------
# extract learner ids before the featureunion PO - no easy way currently
pos = names(stacked_learner$graph$pipeops)
i1 = pos == "featureunion"
grp = cumsum(i1)
base_learner_ids = split(pos[!i1], grp[!i1])$`0`
# for newdata, use $predict_newdata()
# predict with the fitted base learners
# usually you should only have .tuned models and then the else block can be discarded
pred_base_learners = lapply(base_learner_ids, function(x) {
  if (grepl(".tuned", x, fixed = TRUE)) {
    stacked_learner$model[[x]]$model$learner$predict(task)
  } else {
    # for untuned models we need to call the S3 predict method because the object structure differs
    predict(stacked_learner$model[[x]]$model, task$data())
  }
})
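# for genuinely new data (a data.frame with the same feature columns), the equivalent
# call would be something like the following (sketch; `newdata` is a placeholder object):
# pred_new = stacked_learner$predict_newdata(newdata)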
-
@pat-s as mlr3ensembles is now on the roadmap, how about closing this here, with a link to the new repo?