diff --git a/DESCRIPTION b/DESCRIPTION index 6587b1f..64ecc80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,7 @@ Package: mlr3learners.lightgbm Title: mlr3: LightGBM learner -Version: 0.0.4.9006 +Version: 0.0.4.9007 +Date: 2020-04-05 Authors@R: c(person(given = "Lorenz A.", family = "Kapsner", @@ -12,33 +13,34 @@ Authors@R: role = "ctb", email = "patrick.schratz@gmail.com", comment = c(ORCID = "0000-0003-0748-6624"))) -Description: Adds `lgb.train()` from the lightgbm package to mlr3. +Description: Adds `lgb.train()` from the lightgbm package to + mlr3. License: LGPL-3 -LazyData: true URL: https://github.com/kapsner/mlr3learners.lightgbm -BugReports: https://github.com/kapsner/mlr3learners.lightgbm/issues -Encoding: UTF-8 -Date: 2020-04-04 -VignetteBuilder: knitr +BugReports: + https://github.com/kapsner/mlr3learners.lightgbm/issues Depends: R (>= 2.10) Imports: data.table, - R6, - paradox, - mlr3misc, - ggplot2, + MLmetrics, mlr3, + mlr3misc, + paradox, plyr, - MLmetrics + R6 Suggests: - testthat, - lintr, checkmate, - rmarkdown, - qpdf, - knitr, future, + knitr, + lintr, + mlbench, mlr3tuning, - mlbench + qpdf, + rmarkdown, + testthat +VignetteBuilder: + knitr +Encoding: UTF-8 +LazyData: true RoxygenNote: 7.1.0 diff --git a/NAMESPACE b/NAMESPACE index 1b5b292..9378cdb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,3 +10,4 @@ importFrom(R6,R6Class) importFrom(mlr3,LearnerClassif) importFrom(mlr3,LearnerRegr) importFrom(mlr3,mlr_learners) +importFrom(plyr,revalue) diff --git a/R/LearnerClassifLightGBM.R b/R/LearnerClassifLightGBM.R index 6a6b3a9..92071f1 100644 --- a/R/LearnerClassifLightGBM.R +++ b/R/LearnerClassifLightGBM.R @@ -6,6 +6,7 @@ #' @import data.table #' @import paradox #' @importFrom mlr3 mlr_learners LearnerClassif +#' @importFrom plyr revalue #' #' @export LearnerClassifLightGBM = R6::R6Class( @@ -21,177 +22,215 @@ LearnerClassifLightGBM = R6::R6Class( params = list( ####################################### # Config Parameters - ParamUty$new(id = "custom_eval", - default = NULL, - tags = c("config", "train")), - ParamLgl$new(id = "nrounds_by_cv", - default = TRUE, - tags = c("config", "train")), - ParamInt$new(id = "nfolds", - default = 5L, - lower = 3L, - tags = c("config", "train")), - ParamUty$new(id = "init_model", - default = NULL, - tags = c("config", "train")), + ParamUty$new( + id = "custom_eval", + default = NULL, + tags = c("config", "train")), + ParamLgl$new( + id = "nrounds_by_cv", + default = TRUE, + tags = c("config", "train")), + ParamInt$new( + id = "nfolds", + default = 5L, + lower = 3L, + tags = c("config", "train")), + ParamUty$new( + id = "init_model", + default = NULL, + tags = c("config", "train")), ####################################### ####################################### # Classification only - ParamFct$new(id = "objective", - default = "binary", - levels = c("binary", - "multiclass", - "multiclassova", - "cross_entropy", - "cross_entropy_lambda", - "rank_xendcg", - "lambdarank"), - tags = "train"), + ParamFct$new( + id = "objective", + default = "binary", + levels = c( + "binary", + "multiclass", + "multiclassova", + "cross_entropy", + "cross_entropy_lambda", + "rank_xendcg", + "lambdarank"), + tags = "train"), # Objective Parameters - #% constraints: num_class > 0 - ParamInt$new(id = "num_class", - default = 1L, - lower = 1L, - tags = c("train", - "multi-class")), - ParamLgl$new(id = "is_unbalance", - default = FALSE, - tags = c("train", - "binary", - "multiclassova")), - #% constraints: scale_pos_weight > 0.0 - ParamDbl$new(id = 
"scale_pos_weight", - default = 1.0, - lower = 0.0, - tags = c("train", - "binary", - "multiclassova")), - #% constraints: sigmoid > 0.0 - ParamDbl$new(id = "sigmoid", - default = 1.0, - lower = 0.0, - tags = c("train", - "binary", - "multiclassova", - "lambdarank")), - ParamInt$new(id = "lambdarank_truncation_level", - default = 20L, - lower = 1L, - tags = c("train", - "lambdarank")), - ParamLgl$new(id = "lambdarank_norm", - default = TRUE, - tags = c("train", - "lambdarank")), + # % constraints: num_class > 0 + ParamInt$new( + id = "num_class", + default = 1L, + lower = 1L, + tags = c( + "train", + "multi-class")), + ParamLgl$new( + id = "is_unbalance", + default = FALSE, + tags = c( + "train", + "binary", + "multiclassova")), + # % constraints: scale_pos_weight > 0.0 + ParamDbl$new( + id = "scale_pos_weight", + default = 1.0, + lower = 0.0, + tags = c( + "train", + "binary", + "multiclassova")), + # % constraints: sigmoid > 0.0 + ParamDbl$new( + id = "sigmoid", + default = 1.0, + lower = 0.0, + tags = c( + "train", + "binary", + "multiclassova", + "lambdarank")), + ParamInt$new( + id = "lambdarank_truncation_level", + default = 20L, + lower = 1L, + tags = c( + "train", + "lambdarank")), + ParamLgl$new( + id = "lambdarank_norm", + default = TRUE, + tags = c( + "train", + "lambdarank")), # Metric Parameters - ParamFct$new(id = "metric", - default = "", - levels = c("", "None", - "ndcg", "lambdarank", - "rank_xendcg", "xendcg", - "xe_ndcg", "xe_ndcg_mart", - "xendcg_mart", "map", - "mean_average_precision", - "cross_entropy", - "cross_entropy_lambda", - "kullback_leibler", - "xentropy", "xentlambda", - "kldiv", "multiclass", - "softmax", "multiclassova", - "multiclass_ova", "ova", - "ovr", "binary", - "binary_logloss", - "binary_error", "auc_mu", - "multi_logloss", "auc", - "multi_error"), - tags = "train"), - #% constraints: multi_error_top_k > 0 - ParamInt$new(id = "multi_error_top_k", - default = 1L, - lower = 1L, - tags = "train"), + ParamFct$new( + id = "metric", + default = "", + levels = c( + "", "None", + "ndcg", "lambdarank", + "rank_xendcg", "xendcg", + "xe_ndcg", "xe_ndcg_mart", + "xendcg_mart", "map", + "mean_average_precision", + "cross_entropy", + "cross_entropy_lambda", + "kullback_leibler", + "xentropy", "xentlambda", + "kldiv", "multiclass", + "softmax", "multiclassova", + "multiclass_ova", "ova", + "ovr", "binary", + "binary_logloss", + "binary_error", "auc_mu", + "multi_logloss", "auc", + "multi_error"), + tags = "train"), + # % constraints: multi_error_top_k > 0 + ParamInt$new( + id = "multi_error_top_k", + default = 1L, + lower = 1L, + tags = "train"), ####################################### ####################################### # Core Parameters - ParamFct$new(id = "boosting", - default = "gbdt", - levels = c("gbdt", - "rf", - "dart", - "goss"), - tags = "train"), - #% constraints: num_iterations >= 0 + ParamFct$new( + id = "boosting", + default = "gbdt", + levels = c( + "gbdt", + "rf", + "dart", + "goss"), + tags = "train"), + # % constraints: num_iterations >= 0 # Note: internally, LightGBM constructs # num_class * num_iterations # trees for multi-class classification problems - ParamInt$new(id = "num_iterations", - default = 100L, - lower = 0L, - tags = "train"), - #% constraints: learning_rate > 0.0 - ParamDbl$new(id = "learning_rate", - default = 0.1, - lower = 0.0, - tags = "train"), - #% constraints: 1 < num_leaves <= 131072 - ParamInt$new(id = "num_leaves", - default = 31L, - lower = 1L, - upper = 131072L, - tags = "train"), - ParamFct$new(id = 
"tree_learner", - default = "serial", - levels = c("serial", - "feature", - "data", - "voting"), - tags = "train"), - ParamInt$new(id = "num_threads", - default = 0L, - lower = 0L, - tags = "train"), - ParamFct$new(id = "device_type", - default = "cpu", - levels = c("cpu", "gpu"), - tags = "train"), - ParamUty$new(id = "seed", - default = "None", - tags = "train"), + ParamInt$new( + id = "num_iterations", + default = 100L, + lower = 0L, + tags = "train"), + # % constraints: learning_rate > 0.0 + ParamDbl$new( + id = "learning_rate", + default = 0.1, + lower = 0.0, + tags = "train"), + # % constraints: 1 < num_leaves <= 131072 + ParamInt$new( + id = "num_leaves", + default = 31L, + lower = 1L, + upper = 131072L, + tags = "train"), + ParamFct$new( + id = "tree_learner", + default = "serial", + levels = c( + "serial", + "feature", + "data", + "voting"), + tags = "train"), + ParamInt$new( + id = "num_threads", + default = 0L, + lower = 0L, + tags = "train"), + ParamFct$new( + id = "device_type", + default = "cpu", + levels = c("cpu", "gpu"), + tags = "train"), + ParamUty$new( + id = "seed", + default = "None", + tags = "train"), ####################################### # Learning Control Parameters - ParamLgl$new(id = "force_col_wise", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "force_row_wise", - default = FALSE, - tags = "train"), - ParamDbl$new(id = "histogram_pool_size", - default = -1.0, - tags = "train"), - #% <= 0 means no limit - ParamInt$new(id = "max_depth", - default = -1L, - tags = "train"), - #% constraints: min_data_in_leaf >= 0 - ParamInt$new(id = "min_data_in_leaf", - default = 20L, - lower = 0L, - tags = "train"), - #% constraints: min_sum_hessian_in_leaf >= 0.0 + ParamLgl$new( + id = "force_col_wise", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "force_row_wise", + default = FALSE, + tags = "train"), + ParamDbl$new( + id = "histogram_pool_size", + default = -1.0, + tags = "train"), + # % <= 0 means no limit + ParamInt$new( + id = "max_depth", + default = -1L, + tags = "train"), + # % constraints: min_data_in_leaf >= 0 + ParamInt$new( + id = "min_data_in_leaf", + default = 20L, + lower = 0L, + tags = "train"), + # % constraints: min_sum_hessian_in_leaf >= 0.0 # Note: to enable bagging, bagging_freq # should be set to a non # zero value as well - ParamDbl$new(id = "min_sum_hessian_in_leaf", - default = 1e-3, - lower = 0, - tags = "train"), - #% constraints: 0.0 < bagging_fraction <= 1.0 - ParamDbl$new(id = "bagging_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < pos_bagging_fraction <= 1.0 + ParamDbl$new( + id = "min_sum_hessian_in_leaf", + default = 1e-3, + lower = 0, + tags = "train"), + # % constraints: 0.0 < bagging_fraction <= 1.0 + ParamDbl$new( + id = "bagging_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < pos_bagging_fraction <= 1.0 # Note: to enable this, you need to set bagging_freq and # neg_bagging_fraction as well # Note: if both pos_bagging_fraction and @@ -199,223 +238,273 @@ LearnerClassifLightGBM = R6::R6Class( # are set to 1.0, balanced bagging is disabled # Note: if balanced bagging is enabled, # bagging_fraction will be ignored - ParamDbl$new(id = "pos_bagging_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < neg_bagging_fraction <= 1.0 - ParamDbl$new(id = "neg_bagging_fraction", - default = 1.0, - lower = 0, - upper = 1.0, - tags = "train"), + ParamDbl$new( + id = 
"pos_bagging_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < neg_bagging_fraction <= 1.0 + ParamDbl$new( + id = "neg_bagging_fraction", + default = 1.0, + lower = 0, + upper = 1.0, + tags = "train"), # Note: to enable bagging, bagging_fraction # should be set to value # smaller than 1.0 as well - ParamInt$new(id = "bagging_freq", - default = 0L, - lower = 0L, - tags = "train"), - ParamInt$new(id = "bagging_seed", - default = 3L, - tags = "train"), - #% constraints: 0.0 < feature_fraction <= 1.0 - ParamDbl$new(id = "feature_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < feature_fraction_bynode <= 1.0 + ParamInt$new( + id = "bagging_freq", + default = 0L, + lower = 0L, + tags = "train"), + ParamInt$new( + id = "bagging_seed", + default = 3L, + tags = "train"), + # % constraints: 0.0 < feature_fraction <= 1.0 + ParamDbl$new( + id = "feature_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < feature_fraction_bynode <= 1.0 # Note: unlike feature_fraction, this cannot # speed up training # Note: if both feature_fraction and # feature_fraction_bynode are # smaller than 1.0, the final fraction of # each node is - #% feature_fraction * feature_fraction_bynode - ParamDbl$new(id = "feature_fraction_bynode", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - ParamInt$new(id = "feature_fraction_seed", - default = 2L, - tags = "train"), - ParamLgl$new(id = "extra_trees", - default = FALSE, - tags = "train"), - ParamInt$new(id = "extra_seed", - default = 6L, - tags = "train"), + # % feature_fraction * feature_fraction_bynode + ParamDbl$new( + id = "feature_fraction_bynode", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + ParamInt$new( + id = "feature_fraction_seed", + default = 2L, + tags = "train"), + ParamLgl$new( + id = "extra_trees", + default = FALSE, + tags = "train"), + ParamInt$new( + id = "extra_seed", + default = 6L, + tags = "train"), # <= 0 means disable - ParamInt$new(id = "early_stopping_round", - default = 0L, - tags = "train"), - ParamLgl$new(id = "first_metric_only", - default = FALSE, - tags = "train"), + ParamInt$new( + id = "early_stopping_round", + default = 0L, + tags = "train"), + ParamLgl$new( + id = "first_metric_only", + default = FALSE, + tags = "train"), # <= 0 means no constraint - ParamDbl$new(id = "max_delta_step", - default = 0.0, - tags = "train"), - #% constraints: lambda_l1 >= 0.0 - ParamDbl$new(id = "lambda_l1", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: lambda_l2 >= 0.0 - ParamDbl$new(id = "lambda_l2", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: min_gain_to_split >= 0.0 - ParamDbl$new(id = "min_gain_to_split", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: 0.0 <= drop_rate <= 1.0 - ParamDbl$new(id = "drop_rate", - default = 0.1, - lower = 0.0, - upper = 1.0, - tags = c("train", "dart")), + ParamDbl$new( + id = "max_delta_step", + default = 0.0, + tags = "train"), + # % constraints: lambda_l1 >= 0.0 + ParamDbl$new( + id = "lambda_l1", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: lambda_l2 >= 0.0 + ParamDbl$new( + id = "lambda_l2", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: min_gain_to_split >= 0.0 + ParamDbl$new( + id = "min_gain_to_split", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: 0.0 <= drop_rate <= 1.0 + ParamDbl$new( + id 
= "drop_rate", + default = 0.1, + lower = 0.0, + upper = 1.0, + tags = c("train", "dart")), # <=0 means no limit - ParamInt$new(id = "max_drop", - default = 50L, - tags = c("train", "dart")), - #% constraints: 0.0 <= skip_drop <= 1.0 - ParamDbl$new(id = "skip_drop", - default = 0.5, - lower = 0.0, - upper = 1.0, - tags = c("train", "dart")), - ParamLgl$new(id = "xgboost_dart_mode", - default = FALSE, - tags = c("train", "dart")), - ParamLgl$new(id = "uniform_drop", - default = FALSE, - tags = c("train", "dart")), - ParamInt$new(id = "drop_seed", - default = 4L, - tags = c("train", "dart")), - #% constraints: 0.0 <= top_rate <= 1.0 - ParamDbl$new(id = "top_rate", - default = 0.2, - lower = 0.0, - upper = 1.0, - tags = c("train", "goss")), - #% constraints: 0.0 <= other_rate <= 1.0 - ParamDbl$new(id = "other_rate", - default = 0.1, - lower = 0.0, - upper = 1.0, - tags = c("train", "goss")), - #% constraints: min_data_per_group > 0 - ParamInt$new(id = "min_data_per_group", - default = 100L, - lower = 1L, - tags = "train"), - #% constraints: max_cat_threshold > 0 - ParamInt$new(id = "max_cat_threshold", - default = 32L, - lower = 1L, - tags = "train"), - #% constraints: cat_l2 >= 0.0 - ParamDbl$new(id = "cat_l2", - default = 10.0, - lower = 0.0, - tags = "train"), - #% constraints: cat_smooth >= 0.0 - ParamDbl$new(id = "cat_smooth", - default = 10.0, - lower = 0.0, - tags = "train"), - #% constraints: max_cat_to_onehot > 0 - ParamInt$new(id = "max_cat_to_onehot", - default = 4L, - lower = 1L, - tags = "train"), - #% constraints: top_k > 0 - ParamInt$new(id = "top_k", - default = 20L, - lower = 1L, - tags = "train"), - #% constraints: cegb_tradeoff >= 0.0 - ParamDbl$new(id = "cegb_tradeoff", - default = 1.0, - lower = 0.0, - tags = "train"), - #% constraints: cegb_penalty_split >= 0.0 - ParamDbl$new(id = "cegb_penalty_split", - default = 0.0, - lower = 0.0, - tags = "train"), + ParamInt$new( + id = "max_drop", + default = 50L, + tags = c("train", "dart")), + # % constraints: 0.0 <= skip_drop <= 1.0 + ParamDbl$new( + id = "skip_drop", + default = 0.5, + lower = 0.0, + upper = 1.0, + tags = c("train", "dart")), + ParamLgl$new( + id = "xgboost_dart_mode", + default = FALSE, + tags = c("train", "dart")), + ParamLgl$new( + id = "uniform_drop", + default = FALSE, + tags = c("train", "dart")), + ParamInt$new( + id = "drop_seed", + default = 4L, + tags = c("train", "dart")), + # % constraints: 0.0 <= top_rate <= 1.0 + ParamDbl$new( + id = "top_rate", + default = 0.2, + lower = 0.0, + upper = 1.0, + tags = c("train", "goss")), + # % constraints: 0.0 <= other_rate <= 1.0 + ParamDbl$new( + id = "other_rate", + default = 0.1, + lower = 0.0, + upper = 1.0, + tags = c("train", "goss")), + # % constraints: min_data_per_group > 0 + ParamInt$new( + id = "min_data_per_group", + default = 100L, + lower = 1L, + tags = "train"), + # % constraints: max_cat_threshold > 0 + ParamInt$new( + id = "max_cat_threshold", + default = 32L, + lower = 1L, + tags = "train"), + # % constraints: cat_l2 >= 0.0 + ParamDbl$new( + id = "cat_l2", + default = 10.0, + lower = 0.0, + tags = "train"), + # % constraints: cat_smooth >= 0.0 + ParamDbl$new( + id = "cat_smooth", + default = 10.0, + lower = 0.0, + tags = "train"), + # % constraints: max_cat_to_onehot > 0 + ParamInt$new( + id = "max_cat_to_onehot", + default = 4L, + lower = 1L, + tags = "train"), + # % constraints: top_k > 0 + ParamInt$new( + id = "top_k", + default = 20L, + lower = 1L, + tags = "train"), + # % constraints: cegb_tradeoff >= 0.0 + ParamDbl$new( + id = 
"cegb_tradeoff", + default = 1.0, + lower = 0.0, + tags = "train"), + # % constraints: cegb_penalty_split >= 0.0 + ParamDbl$new( + id = "cegb_penalty_split", + default = 0.0, + lower = 0.0, + tags = "train"), ####################################### # IO Parameters - ParamInt$new(id = "verbose", - default = 1L, - tags = "train"), - ParamUty$new(id = "input_model", - default = "", - tags = "train"), - ParamUty$new(id = "output_model", - default = "LightGBM_model.txt", - tags = "train"), - ParamInt$new(id = "snapshot_freq", - default = -1L, - tags = "train"), - #% constraints: max_bin > 1 - ParamInt$new(id = "max_bin", - default = 255L, - lower = 2L, - tags = "train"), - #% constraints: min_data_in_bin > 0 - ParamInt$new(id = "min_data_in_bin", - default = 3L, - lower = 1L, - tags = "train"), - #% constraints: bin_construct_sample_cnt > 0 - ParamInt$new(id = "bin_construct_sample_cnt", - default = 200000L, - lower = 1L, - tags = "train"), - ParamInt$new(id = "data_random_seed", - default = 1L, - tags = "train"), - ParamLgl$new(id = "is_enable_sparse", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "enable_bundle", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "use_missing", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "zero_as_missing", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "feature_pre_filter", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "pre_partition", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "two_round", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "header", - default = FALSE, - tags = "train"), - ParamUty$new(id = "group_column", - default = "", - tags = "train"), - ParamUty$new(id = "ignore_column", - default = "", - tags = "train"), - ParamUty$new(id = "categorical_feature", - default = "", - tags = "train"), + ParamInt$new( + id = "verbose", + default = 1L, + tags = "train"), + ParamUty$new( + id = "input_model", + default = "", + tags = "train"), + ParamUty$new( + id = "output_model", + default = "LightGBM_model.txt", + tags = "train"), + ParamInt$new( + id = "snapshot_freq", + default = -1L, + tags = "train"), + # % constraints: max_bin > 1 + ParamInt$new( + id = "max_bin", + default = 255L, + lower = 2L, + tags = "train"), + # % constraints: min_data_in_bin > 0 + ParamInt$new( + id = "min_data_in_bin", + default = 3L, + lower = 1L, + tags = "train"), + # % constraints: bin_construct_sample_cnt > 0 + ParamInt$new( + id = "bin_construct_sample_cnt", + default = 200000L, + lower = 1L, + tags = "train"), + ParamInt$new( + id = "data_random_seed", + default = 1L, + tags = "train"), + ParamLgl$new( + id = "is_enable_sparse", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "enable_bundle", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "use_missing", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "zero_as_missing", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "feature_pre_filter", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "pre_partition", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "two_round", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "header", + default = FALSE, + tags = "train"), + ParamUty$new( + id = "group_column", + default = "", + tags = "train"), + ParamUty$new( + id = "ignore_column", + default = "", + tags = "train"), + ParamUty$new( + id = "categorical_feature", + default = "", + tags = "train"), ####################################### ####################################### # 
Predict Parameters TODO are they needed? @@ -423,17 +512,20 @@ LearnerClassifLightGBM = R6::R6Class( ####################################### ####################################### # Objective Parameters - ParamInt$new(id = "objective_seed", - default = 5L, - tags = c("train", "rank_xendcg")), + ParamInt$new( + id = "objective_seed", + default = 5L, + tags = c("train", "rank_xendcg")), # moved num_class up to classification part # moved is_unbalance up to classification part # moved scale_pos_weight up to classification part # moved sigmoid up to classification part - ParamLgl$new(id = "boost_from_average", - default = TRUE, - tags = c("train", "regression", "binary", - "multiclassova", "cross-entropy")), + ParamLgl$new( + id = "boost_from_average", + default = TRUE, + tags = c( + "train", "regression", "binary", + "multiclassova", "cross-entropy")), # moved req_sqrt up to regression part # moved alpha up to regression part # moved fair_c up to regression part @@ -444,14 +536,16 @@ LearnerClassifLightGBM = R6::R6Class( # moved label_gain up to classification part ####################################### # Metric Parameters - #% constraints: metric_freq > 0 - ParamInt$new(id = "metric_freq", - default = 1L, - lower = 1L, - tags = "train"), - ParamLgl$new(id = "is_provide_training_metric", - default = FALSE, - tags = "train") + # % constraints: metric_freq > 0 + ParamInt$new( + id = "metric_freq", + default = 1L, + lower = 1L, + tags = "train"), + ParamLgl$new( + id = "is_provide_training_metric", + default = FALSE, + tags = "train") ) ) # custom defaults @@ -472,11 +566,12 @@ LearnerClassifLightGBM = R6::R6Class( ), predict_types = "prob", param_set = ps, - properties = c("weights", - "twoclass", - "multiclass", - "missings", - "importance"), + properties = c( + "weights", + "twoclass", + "multiclass", + "missings", + "importance"), man = "mlr3learners.lightgbm::mlr_learners_classif_lightgbm" ) }, @@ -513,6 +608,7 @@ LearnerClassifLightGBM = R6::R6Class( dtrain = NULL, .train = function(task) { # extract training data + data = task$data() # create training label label = data[, get(task$target_names)] @@ -555,15 +651,18 @@ LearnerClassifLightGBM = R6::R6Class( } # extract config-parameters feval = self$param_set$values[["custom_eval"]] - self$param_set$values[["custom_eval"]] = NULL nrounds_by_cv = self$param_set$values[["nrounds_by_cv"]] - self$param_set$values[["nrounds_by_cv"]] = NULL nfolds = self$param_set$values[["nfolds"]] - self$param_set$values[["nfolds"]] = NULL init_model = self$param_set$values[["init_model"]] - self$param_set$values[["init_model"]] = NULL + # get names of parameters to keep + keep_params = setdiff( + names(self$param_set$values), + names(self$param_set$get_values(tags = "config")) + ) # get training parameters pars = self$param_set$get_values(tags = "train") + # remove config parameters + pars = pars[keep_params] # train CV model, in case that nrounds_by_cv is true if (isTRUE(nrounds_by_cv)) { message( @@ -579,7 +678,7 @@ LearnerClassifLightGBM = R6::R6Class( , nfold = nfolds , stratified = TRUE , eval = feval - , init_model = init_model + , init_model = init_model ) message( sprintf( @@ -599,7 +698,7 @@ LearnerClassifLightGBM = R6::R6Class( , data = private$dtrain , params = pars , eval = feval - , init_model = init_model + , init_model = init_model ) # use the mlr3misc::invoke function (it's similar to do.call()) }, .predict = function(task) { @@ -619,10 +718,10 @@ LearnerClassifLightGBM = R6::R6Class( , reshape = TRUE ) if (self$param_set$values[["objective"]] 
%in% - c("multiclass", "multiclassova", "lambdarank")) { + c("multiclass", "multiclassova", "lambdarank")) { # process target variable c_names = as.character(unique(private$label_names)) - c_names = plyr::revalue( + c_names = revalue( x = c_names, replace = private$value_mapping_dtrain ) @@ -634,7 +733,7 @@ LearnerClassifLightGBM = R6::R6Class( "1" = p ) c_names = colnames(p) - c_names = plyr::revalue( + c_names = revalue( x = c_names, replace = private$value_mapping_dtrain ) @@ -669,7 +768,7 @@ LearnerClassifLightGBM = R6::R6Class( message(paste0("negative class: ", negative)) repl = c(0, 1) names(repl) = c(negative, positive) - vector = as.integer(plyr::revalue( + vector = as.integer(revalue( x = as.character(vector), replace = repl )) @@ -714,8 +813,9 @@ LearnerClassifLightGBM = R6::R6Class( # if error occured if (error) { stop( - paste0("Please provide a valid target variable ", - "for classification tasks") + paste0( + "Please provide a valid target variable ", + "for classification tasks") ) } } diff --git a/R/LearnerRegrLightGBM.R b/R/LearnerRegrLightGBM.R index 603f24b..c64734b 100644 --- a/R/LearnerRegrLightGBM.R +++ b/R/LearnerRegrLightGBM.R @@ -15,429 +15,519 @@ LearnerRegrLightGBM = R6::R6Class( initialize = function() { # initialize ParamSet ps = ParamSet$new( - # https://lightgbm.readthedocs.io/en/latest/Parameters.html# - params = list( - ####################################### - # Config Parameters - ParamUty$new(id = "custom_eval", - default = NULL, - tags = c("config", "train")), - ParamLgl$new(id = "nrounds_by_cv", - default = TRUE, - tags = c("config", "train")), - ParamInt$new(id = "nfolds", - default = 5L, - lower = 3L, - tags = c("config", "train")), - ParamUty$new(id = "init_model", - default = NULL, - tags = c("config", "train")), - ####################################### - ####################################### - # Regression only - ParamFct$new(id = "objective", - default = "regression", - levels = c("regression", - "regression_l1", - "huber", - "fair", - "poisson", - "quantile", - "mape", - "gamma", - "tweedie"), - tags = "train"), - ParamLgl$new(id = "reg_sqrt", - default = FALSE, - tags = c("train", - "regression")), - #% constraints: alpha > 0.0 - ParamDbl$new(id = "alpha", - default = 0.9, - lower = 0.0, - tags = c("train", - "huber", - "quantile")), - #% constraints: fair_c > 0.0 - ParamDbl$new(id = "fair_c", - default = 1.0, - lower = 0.0, - tags = c("train", - "fair")), - #% constraints: poisson_max_delta_step > 0.0 - ParamDbl$new(id = "poisson_max_delta_step", - default = 0.7, - lower = 0.0, - tags = c("train", - "poisson")), - #% constraints: 1.0 <= tweedie_variance_power < 2.0 - ParamDbl$new(id = "tweedie_variance_power", - default = 1.5, - lower = 1.0, - upper = 2.0, - tags = c("train", - "tweedie")), - # Metric Parameters - ParamFct$new(id = "metric", - default = "", - levels = c("", "None", - "l1", "mean_absolute_error", - "mae", "regression_l1", - "l2", "mean_squared_error", - "mse", "regression_l2", - "regression", "rmse", - "root_mean_squared_error", "l2_root", - "quantile", "lambdarank", - "mean_absolute_percentage_error", - "mean_average_precision", "mape", - "huber", "fair", - "poisson", "gamma", - "gamma_deviance", "tweedie"), - tags = "train"), - ####################################### - ####################################### - # Core Parameters - ParamFct$new(id = "boosting", - default = "gbdt", - levels = c("gbdt", - "rf", - "dart", - "goss"), - tags = "train"), - #% constraints: num_iterations >= 0 - # Note: internally, LightGBM 
constructs - # num_class * num_iterations - # trees for multi-class classification problems - ParamInt$new(id = "num_iterations", - default = 100L, - lower = 0L, - tags = "train"), - #% constraints: learning_rate > 0.0 - ParamDbl$new(id = "learning_rate", - default = 0.1, - lower = 0.0, - tags = "train"), - #% constraints: 1 < num_leaves <= 131072 - ParamInt$new(id = "num_leaves", - default = 31L, - lower = 1L, - upper = 131072L, - tags = "train"), - ParamFct$new(id = "tree_learner", - default = "serial", - levels = c("serial", - "feature", - "data", - "voting"), - tags = "train"), - ParamInt$new(id = "num_threads", - default = 0L, - lower = 0L, - tags = "train"), - ParamFct$new(id = "device_type", - default = "cpu", - levels = c("cpu", "gpu"), - tags = "train"), - ParamUty$new(id = "seed", - default = "None", - tags = "train"), - ####################################### - # Learning Control Parameters - ParamLgl$new(id = "force_col_wise", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "force_row_wise", - default = FALSE, - tags = "train"), - ParamDbl$new(id = "histogram_pool_size", - default = -1.0, - tags = "train"), - #% <= 0 means no limit - ParamInt$new(id = "max_depth", - default = -1L, - tags = "train"), - #% constraints: min_data_in_leaf >= 0 - ParamInt$new(id = "min_data_in_leaf", - default = 20L, - lower = 0L, - tags = "train"), - #% constraints: min_sum_hessian_in_leaf >= 0.0 - # Note: to enable bagging, bagging_freq - # should be set to a non - # zero value as well - ParamDbl$new(id = "min_sum_hessian_in_leaf", - default = 1e-3, - lower = 0, - tags = "train"), - #% constraints: 0.0 < bagging_fraction <= 1.0 - ParamDbl$new(id = "bagging_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < pos_bagging_fraction <= 1.0 - # Note: to enable this, you need to set bagging_freq and - # neg_bagging_fraction as well - # Note: if both pos_bagging_fraction and - # neg_bagging_fraction - # are set to 1.0, balanced bagging is disabled - # Note: if balanced bagging is enabled, - # bagging_fraction will be ignored - ParamDbl$new(id = "pos_bagging_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < neg_bagging_fraction <= 1.0 - ParamDbl$new(id = "neg_bagging_fraction", - default = 1.0, - lower = 0, - upper = 1.0, - tags = "train"), - # Note: to enable bagging, bagging_fraction - # should be set to value - # smaller than 1.0 as well - ParamInt$new(id = "bagging_freq", - default = 0L, - lower = 0L, - tags = "train"), - ParamInt$new(id = "bagging_seed", - default = 3L, - tags = "train"), - #% constraints: 0.0 < feature_fraction <= 1.0 - ParamDbl$new(id = "feature_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < feature_fraction_bynode <= 1.0 - # Note: unlike feature_fraction, this cannot - # speed up training - # Note: if both feature_fraction and - # feature_fraction_bynode are - # smaller than 1.0, the final fraction of - # each node is - #% feature_fraction * feature_fraction_bynode - ParamDbl$new(id = "feature_fraction_bynode", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - ParamInt$new(id = "feature_fraction_seed", - default = 2L, - tags = "train"), - ParamLgl$new(id = "extra_trees", - default = FALSE, - tags = "train"), - ParamInt$new(id = "extra_seed", - default = 6L, - tags = "train"), - # <= 0 means disable - ParamInt$new(id = "early_stopping_round", - default = 0L, - tags = "train"), - ParamLgl$new(id = 
"first_metric_only", - default = FALSE, - tags = "train"), - # <= 0 means no constraint - ParamDbl$new(id = "max_delta_step", - default = 0.0, - tags = "train"), - #% constraints: lambda_l1 >= 0.0 - ParamDbl$new(id = "lambda_l1", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: lambda_l2 >= 0.0 - ParamDbl$new(id = "lambda_l2", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: min_gain_to_split >= 0.0 - ParamDbl$new(id = "min_gain_to_split", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: 0.0 <= drop_rate <= 1.0 - ParamDbl$new(id = "drop_rate", - default = 0.1, - lower = 0.0, - upper = 1.0, - tags = c("train", "dart")), - # <=0 means no limit - ParamInt$new(id = "max_drop", - default = 50L, - tags = c("train", "dart")), - #% constraints: 0.0 <= skip_drop <= 1.0 - ParamDbl$new(id = "skip_drop", - default = 0.5, - lower = 0.0, - upper = 1.0, - tags = c("train", "dart")), - ParamLgl$new(id = "xgboost_dart_mode", - default = FALSE, - tags = c("train", "dart")), - ParamLgl$new(id = "uniform_drop", - default = FALSE, - tags = c("train", "dart")), - ParamInt$new(id = "drop_seed", - default = 4L, - tags = c("train", "dart")), - #% constraints: 0.0 <= top_rate <= 1.0 - ParamDbl$new(id = "top_rate", - default = 0.2, - lower = 0.0, - upper = 1.0, - tags = c("train", "goss")), - #% constraints: 0.0 <= other_rate <= 1.0 - ParamDbl$new(id = "other_rate", - default = 0.1, - lower = 0.0, - upper = 1.0, - tags = c("train", "goss")), - #% constraints: min_data_per_group > 0 - ParamInt$new(id = "min_data_per_group", - default = 100L, - lower = 1L, - tags = "train"), - #% constraints: max_cat_threshold > 0 - ParamInt$new(id = "max_cat_threshold", - default = 32L, - lower = 1L, - tags = "train"), - #% constraints: cat_l2 >= 0.0 - ParamDbl$new(id = "cat_l2", - default = 10.0, - lower = 0.0, - tags = "train"), - #% constraints: cat_smooth >= 0.0 - ParamDbl$new(id = "cat_smooth", - default = 10.0, - lower = 0.0, - tags = "train"), - #% constraints: max_cat_to_onehot > 0 - ParamInt$new(id = "max_cat_to_onehot", - default = 4L, - lower = 1L, - tags = "train"), - #% constraints: top_k > 0 - ParamInt$new(id = "top_k", - default = 20L, - lower = 1L, - tags = "train"), - #% constraints: cegb_tradeoff >= 0.0 - ParamDbl$new(id = "cegb_tradeoff", - default = 1.0, - lower = 0.0, - tags = "train"), - #% constraints: cegb_penalty_split >= 0.0 - ParamDbl$new(id = "cegb_penalty_split", - default = 0.0, - lower = 0.0, - tags = "train"), - ####################################### - # IO Parameters - ParamInt$new(id = "verbose", - default = 1L, - tags = "train"), - ParamUty$new(id = "input_model", - default = "", - tags = "train"), - ParamUty$new(id = "output_model", - default = "LightGBM_model.txt", - tags = "train"), - ParamInt$new(id = "snapshot_freq", - default = -1L, - tags = "train"), - #% constraints: max_bin > 1 - ParamInt$new(id = "max_bin", - default = 255L, - lower = 2L, - tags = "train"), - #% constraints: min_data_in_bin > 0 - ParamInt$new(id = "min_data_in_bin", - default = 3L, - lower = 1L, - tags = "train"), - #% constraints: bin_construct_sample_cnt > 0 - ParamInt$new(id = "bin_construct_sample_cnt", - default = 200000L, - lower = 1L, - tags = "train"), - ParamInt$new(id = "data_random_seed", - default = 1L, - tags = "train"), - ParamLgl$new(id = "is_enable_sparse", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "enable_bundle", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "use_missing", - default = TRUE, - tags = "train"), - 
ParamLgl$new(id = "zero_as_missing", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "feature_pre_filter", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "pre_partition", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "two_round", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "header", - default = FALSE, - tags = "train"), - ParamUty$new(id = "group_column", - default = "", - tags = "train"), - ParamUty$new(id = "ignore_column", - default = "", - tags = "train"), - ParamUty$new(id = "categorical_feature", - default = "", - tags = "train"), - ####################################### - ####################################### - # Predict Parameters TODO are they needed? - # Convert Parameters TODO are they needed? - ####################################### - ####################################### - # Objective Parameters - ParamInt$new(id = "objective_seed", - default = 5L, - tags = c("train", "rank_xendcg")), - # moved num_class up to classification part - # moved is_unbalance up to classification part - # moved scale_pos_weight up to classification part - # moved sigmoid up to classification part - ParamLgl$new(id = "boost_from_average", - default = TRUE, - tags = c("train", "regression", "binary", - "multiclassova", "cross-entropy")), - # moved req_sqrt up to regression part - # moved alpha up to regression part - # moved fair_c up to regression part - # moved poisson_max_delta_step up to regression part - # moved tweedie_variance_power up to regression part - # moved lambdarank_truncation_level up to classification part - # moved lambdarank_norm up to classification part - # moved label_gain up to classification part - ####################################### - # Metric Parameters - #% constraints: metric_freq > 0 - ParamInt$new(id = "metric_freq", - default = 1L, - lower = 1L, - tags = "train"), - ParamLgl$new(id = "is_provide_training_metric", - default = FALSE, - tags = "train") - ) + # https://lightgbm.readthedocs.io/en/latest/Parameters.html# + params = list( + ####################################### + # Config Parameters + ParamUty$new( + id = "custom_eval", + default = NULL, + tags = c("config", "train")), + ParamLgl$new( + id = "nrounds_by_cv", + default = TRUE, + tags = c("config", "train")), + ParamInt$new( + id = "nfolds", + default = 5L, + lower = 3L, + tags = c("config", "train")), + ParamUty$new( + id = "init_model", + default = NULL, + tags = c("config", "train")), + ####################################### + ####################################### + # Regression only + ParamFct$new( + id = "objective", + default = "regression", + levels = c( + "regression", + "regression_l1", + "huber", + "fair", + "poisson", + "quantile", + "mape", + "gamma", + "tweedie"), + tags = "train"), + ParamLgl$new( + id = "reg_sqrt", + default = FALSE, + tags = c( + "train", + "regression")), + # % constraints: alpha > 0.0 + ParamDbl$new( + id = "alpha", + default = 0.9, + lower = 0.0, + tags = c( + "train", + "huber", + "quantile")), + # % constraints: fair_c > 0.0 + ParamDbl$new( + id = "fair_c", + default = 1.0, + lower = 0.0, + tags = c( + "train", + "fair")), + # % constraints: poisson_max_delta_step > 0.0 + ParamDbl$new( + id = "poisson_max_delta_step", + default = 0.7, + lower = 0.0, + tags = c( + "train", + "poisson")), + # % constraints: 1.0 <= tweedie_variance_power < 2.0 + ParamDbl$new( + id = "tweedie_variance_power", + default = 1.5, + lower = 1.0, + upper = 2.0, + tags = c( + "train", + "tweedie")), + # Metric Parameters + ParamFct$new( + id = 
"metric", + default = "", + levels = c( + "", "None", + "l1", "mean_absolute_error", + "mae", "regression_l1", + "l2", "mean_squared_error", + "mse", "regression_l2", + "regression", "rmse", + "root_mean_squared_error", "l2_root", + "quantile", "lambdarank", + "mean_absolute_percentage_error", + "mean_average_precision", "mape", + "huber", "fair", + "poisson", "gamma", + "gamma_deviance", "tweedie"), + tags = "train"), + ####################################### + ####################################### + # Core Parameters + ParamFct$new( + id = "boosting", + default = "gbdt", + levels = c( + "gbdt", + "rf", + "dart", + "goss"), + tags = "train"), + # % constraints: num_iterations >= 0 + # Note: internally, LightGBM constructs + # num_class * num_iterations + # trees for multi-class classification problems + ParamInt$new( + id = "num_iterations", + default = 100L, + lower = 0L, + tags = "train"), + # % constraints: learning_rate > 0.0 + ParamDbl$new( + id = "learning_rate", + default = 0.1, + lower = 0.0, + tags = "train"), + # % constraints: 1 < num_leaves <= 131072 + ParamInt$new( + id = "num_leaves", + default = 31L, + lower = 1L, + upper = 131072L, + tags = "train"), + ParamFct$new( + id = "tree_learner", + default = "serial", + levels = c( + "serial", + "feature", + "data", + "voting"), + tags = "train"), + ParamInt$new( + id = "num_threads", + default = 0L, + lower = 0L, + tags = "train"), + ParamFct$new( + id = "device_type", + default = "cpu", + levels = c("cpu", "gpu"), + tags = "train"), + ParamUty$new( + id = "seed", + default = "None", + tags = "train"), + ####################################### + # Learning Control Parameters + ParamLgl$new( + id = "force_col_wise", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "force_row_wise", + default = FALSE, + tags = "train"), + ParamDbl$new( + id = "histogram_pool_size", + default = -1.0, + tags = "train"), + # % <= 0 means no limit + ParamInt$new( + id = "max_depth", + default = -1L, + tags = "train"), + # % constraints: min_data_in_leaf >= 0 + ParamInt$new( + id = "min_data_in_leaf", + default = 20L, + lower = 0L, + tags = "train"), + # % constraints: min_sum_hessian_in_leaf >= 0.0 + # Note: to enable bagging, bagging_freq + # should be set to a non + # zero value as well + ParamDbl$new( + id = "min_sum_hessian_in_leaf", + default = 1e-3, + lower = 0, + tags = "train"), + # % constraints: 0.0 < bagging_fraction <= 1.0 + ParamDbl$new( + id = "bagging_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < pos_bagging_fraction <= 1.0 + # Note: to enable this, you need to set bagging_freq and + # neg_bagging_fraction as well + # Note: if both pos_bagging_fraction and + # neg_bagging_fraction + # are set to 1.0, balanced bagging is disabled + # Note: if balanced bagging is enabled, + # bagging_fraction will be ignored + ParamDbl$new( + id = "pos_bagging_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < neg_bagging_fraction <= 1.0 + ParamDbl$new( + id = "neg_bagging_fraction", + default = 1.0, + lower = 0, + upper = 1.0, + tags = "train"), + # Note: to enable bagging, bagging_fraction + # should be set to value + # smaller than 1.0 as well + ParamInt$new( + id = "bagging_freq", + default = 0L, + lower = 0L, + tags = "train"), + ParamInt$new( + id = "bagging_seed", + default = 3L, + tags = "train"), + # % constraints: 0.0 < feature_fraction <= 1.0 + ParamDbl$new( + id = "feature_fraction", + default = 1.0, + lower = 0.0, + upper = 
1.0, + tags = "train"), + # % constraints: 0.0 < feature_fraction_bynode <= 1.0 + # Note: unlike feature_fraction, this cannot + # speed up training + # Note: if both feature_fraction and + # feature_fraction_bynode are + # smaller than 1.0, the final fraction of + # each node is + # % feature_fraction * feature_fraction_bynode + ParamDbl$new( + id = "feature_fraction_bynode", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + ParamInt$new( + id = "feature_fraction_seed", + default = 2L, + tags = "train"), + ParamLgl$new( + id = "extra_trees", + default = FALSE, + tags = "train"), + ParamInt$new( + id = "extra_seed", + default = 6L, + tags = "train"), + # <= 0 means disable + ParamInt$new( + id = "early_stopping_round", + default = 0L, + tags = "train"), + ParamLgl$new( + id = "first_metric_only", + default = FALSE, + tags = "train"), + # <= 0 means no constraint + ParamDbl$new( + id = "max_delta_step", + default = 0.0, + tags = "train"), + # % constraints: lambda_l1 >= 0.0 + ParamDbl$new( + id = "lambda_l1", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: lambda_l2 >= 0.0 + ParamDbl$new( + id = "lambda_l2", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: min_gain_to_split >= 0.0 + ParamDbl$new( + id = "min_gain_to_split", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: 0.0 <= drop_rate <= 1.0 + ParamDbl$new( + id = "drop_rate", + default = 0.1, + lower = 0.0, + upper = 1.0, + tags = c("train", "dart")), + # <=0 means no limit + ParamInt$new( + id = "max_drop", + default = 50L, + tags = c("train", "dart")), + # % constraints: 0.0 <= skip_drop <= 1.0 + ParamDbl$new( + id = "skip_drop", + default = 0.5, + lower = 0.0, + upper = 1.0, + tags = c("train", "dart")), + ParamLgl$new( + id = "xgboost_dart_mode", + default = FALSE, + tags = c("train", "dart")), + ParamLgl$new( + id = "uniform_drop", + default = FALSE, + tags = c("train", "dart")), + ParamInt$new( + id = "drop_seed", + default = 4L, + tags = c("train", "dart")), + # % constraints: 0.0 <= top_rate <= 1.0 + ParamDbl$new( + id = "top_rate", + default = 0.2, + lower = 0.0, + upper = 1.0, + tags = c("train", "goss")), + # % constraints: 0.0 <= other_rate <= 1.0 + ParamDbl$new( + id = "other_rate", + default = 0.1, + lower = 0.0, + upper = 1.0, + tags = c("train", "goss")), + # % constraints: min_data_per_group > 0 + ParamInt$new( + id = "min_data_per_group", + default = 100L, + lower = 1L, + tags = "train"), + # % constraints: max_cat_threshold > 0 + ParamInt$new( + id = "max_cat_threshold", + default = 32L, + lower = 1L, + tags = "train"), + # % constraints: cat_l2 >= 0.0 + ParamDbl$new( + id = "cat_l2", + default = 10.0, + lower = 0.0, + tags = "train"), + # % constraints: cat_smooth >= 0.0 + ParamDbl$new( + id = "cat_smooth", + default = 10.0, + lower = 0.0, + tags = "train"), + # % constraints: max_cat_to_onehot > 0 + ParamInt$new( + id = "max_cat_to_onehot", + default = 4L, + lower = 1L, + tags = "train"), + # % constraints: top_k > 0 + ParamInt$new( + id = "top_k", + default = 20L, + lower = 1L, + tags = "train"), + # % constraints: cegb_tradeoff >= 0.0 + ParamDbl$new( + id = "cegb_tradeoff", + default = 1.0, + lower = 0.0, + tags = "train"), + # % constraints: cegb_penalty_split >= 0.0 + ParamDbl$new( + id = "cegb_penalty_split", + default = 0.0, + lower = 0.0, + tags = "train"), + ####################################### + # IO Parameters + ParamInt$new( + id = "verbose", + default = 1L, + tags = "train"), + ParamUty$new( + id = "input_model", + 
default = "", + tags = "train"), + ParamUty$new( + id = "output_model", + default = "LightGBM_model.txt", + tags = "train"), + ParamInt$new( + id = "snapshot_freq", + default = -1L, + tags = "train"), + # % constraints: max_bin > 1 + ParamInt$new( + id = "max_bin", + default = 255L, + lower = 2L, + tags = "train"), + # % constraints: min_data_in_bin > 0 + ParamInt$new( + id = "min_data_in_bin", + default = 3L, + lower = 1L, + tags = "train"), + # % constraints: bin_construct_sample_cnt > 0 + ParamInt$new( + id = "bin_construct_sample_cnt", + default = 200000L, + lower = 1L, + tags = "train"), + ParamInt$new( + id = "data_random_seed", + default = 1L, + tags = "train"), + ParamLgl$new( + id = "is_enable_sparse", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "enable_bundle", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "use_missing", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "zero_as_missing", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "feature_pre_filter", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "pre_partition", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "two_round", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "header", + default = FALSE, + tags = "train"), + ParamUty$new( + id = "group_column", + default = "", + tags = "train"), + ParamUty$new( + id = "ignore_column", + default = "", + tags = "train"), + ParamUty$new( + id = "categorical_feature", + default = "", + tags = "train"), + ####################################### + ####################################### + # Predict Parameters TODO are they needed? + # Convert Parameters TODO are they needed? + ####################################### + ####################################### + # Objective Parameters + ParamInt$new( + id = "objective_seed", + default = 5L, + tags = c("train", "rank_xendcg")), + # moved num_class up to classification part + # moved is_unbalance up to classification part + # moved scale_pos_weight up to classification part + # moved sigmoid up to classification part + ParamLgl$new( + id = "boost_from_average", + default = TRUE, + tags = c( + "train", "regression", "binary", + "multiclassova", "cross-entropy")), + # moved req_sqrt up to regression part + # moved alpha up to regression part + # moved fair_c up to regression part + # moved poisson_max_delta_step up to regression part + # moved tweedie_variance_power up to regression part + # moved lambdarank_truncation_level up to classification part + # moved lambdarank_norm up to classification part + # moved label_gain up to classification part + ####################################### + # Metric Parameters + # % constraints: metric_freq > 0 + ParamInt$new( + id = "metric_freq", + default = 1L, + lower = 1L, + tags = "train"), + ParamLgl$new( + id = "is_provide_training_metric", + default = FALSE, + tags = "train") ) + ) # custom defaults ps$values = list( # FIXME: Add this change to the description of the help page @@ -460,9 +550,10 @@ LearnerRegrLightGBM = R6::R6Class( ), predict_types = "response", param_set = ps, - properties = c("weights", - "missings", - "importance"), + properties = c( + "weights", + "missings", + "importance"), man = "mlr3learners.lightgbm::mlr_learners_regr_lightgbm" ) }, @@ -485,7 +576,8 @@ LearnerRegrLightGBM = R6::R6Class( private$dtrain$get_colnames(), function(x) { return(0) - }, USE.NAMES = TRUE, simplify = FALSE) + }, + USE.NAMES = TRUE, simplify = FALSE) } return(unlist(ret)) } @@ -498,6 +590,7 @@ 
LearnerRegrLightGBM = R6::R6Class( dtrain = NULL, .train = function(task) { # extract training data + data = task$data() # prepare data for lightgbm data = lightgbm::lgb.prepare(data) @@ -518,13 +611,18 @@ LearnerRegrLightGBM = R6::R6Class( } # extract config-parameters feval = self$param_set$values[["custom_eval"]] - self$param_set$values[["custom_eval"]] = NULL nrounds_by_cv = self$param_set$values[["nrounds_by_cv"]] - self$param_set$values[["nrounds_by_cv"]] = NULL nfolds = self$param_set$values[["nfolds"]] - self$param_set$values[["nfolds"]] = NULL + init_model = self$param_set$values[["init_model"]] + # get names of parameters to keep + keep_params = setdiff( + names(self$param_set$values), + names(self$param_set$get_values(tags = "config")) + ) # get training parameters pars = self$param_set$get_values(tags = "train") + # remove config parameters + pars = pars[keep_params] # train CV model, in case that nrounds_by_cv is true if (isTRUE(nrounds_by_cv)) { message( @@ -540,6 +638,7 @@ LearnerRegrLightGBM = R6::R6Class( , nfold = nfolds , stratified = TRUE , eval = feval + , init_model = init_model ) message( sprintf( @@ -559,6 +658,7 @@ LearnerRegrLightGBM = R6::R6Class( , data = private$dtrain , params = pars , eval = feval + , init_model = init_model ) # use the mlr3misc::invoke function (it's similar to do.call()) }, .predict = function(task) { diff --git a/R/lgbparams.R b/R/lgbparams.R index cd273ff..a610da8 100644 --- a/R/lgbparams.R +++ b/R/lgbparams.R @@ -5,239 +5,291 @@ lgbparams = function() { params = list( ####################################### # Config Parameters - ParamUty$new(id = "custom_eval", - default = NULL, - tags = c("config", "train")), - ParamLgl$new(id = "nrounds_by_cv", - default = TRUE, - tags = c("config", "train")), - ParamInt$new(id = "nfolds", - default = 5L, - lower = 3L, - tags = c("config", "train")), - ParamUty$new(id = "init_model", - default = NULL, - tags = c("config", "train")), + ParamUty$new( + id = "custom_eval", + default = NULL, + tags = c("config", "train")), + ParamLgl$new( + id = "nrounds_by_cv", + default = TRUE, + tags = c("config", "train")), + ParamInt$new( + id = "nfolds", + default = 5L, + lower = 3L, + tags = c("config", "train")), + ParamUty$new( + id = "init_model", + default = NULL, + tags = c("config", "train")), ####################################### ####################################### # Regression only - ParamFct$new(id = "objective", - default = "regression", - levels = c("regression", - "regression_l1", - "huber", - "fair", - "poisson", - "quantile", - "mape", - "gamma", - "tweedie"), - tags = "train"), - ParamLgl$new(id = "reg_sqrt", - default = FALSE, - tags = c("train", - "regression")), - #% constraints: alpha > 0.0 - ParamDbl$new(id = "alpha", - default = 0.9, - lower = 0.0, - tags = c("train", - "huber", - "quantile")), - #% constraints: fair_c > 0.0 - ParamDbl$new(id = "fair_c", - default = 1.0, - lower = 0.0, - tags = c("train", - "fair")), - #% constraints: poisson_max_delta_step > 0.0 - ParamDbl$new(id = "poisson_max_delta_step", - default = 0.7, - lower = 0.0, - tags = c("train", - "poisson")), - #% constraints: 1.0 <= tweedie_variance_power < 2.0 - ParamDbl$new(id = "tweedie_variance_power", - default = 1.5, - lower = 1.0, - upper = 2.0, - tags = c("train", - "tweedie")), + ParamFct$new( + id = "objective", + default = "regression", + levels = c( + "regression", + "regression_l1", + "huber", + "fair", + "poisson", + "quantile", + "mape", + "gamma", + "tweedie"), + tags = "train"), + ParamLgl$new( + id = 
"reg_sqrt", + default = FALSE, + tags = c( + "train", + "regression")), + # % constraints: alpha > 0.0 + ParamDbl$new( + id = "alpha", + default = 0.9, + lower = 0.0, + tags = c( + "train", + "huber", + "quantile")), + # % constraints: fair_c > 0.0 + ParamDbl$new( + id = "fair_c", + default = 1.0, + lower = 0.0, + tags = c( + "train", + "fair")), + # % constraints: poisson_max_delta_step > 0.0 + ParamDbl$new( + id = "poisson_max_delta_step", + default = 0.7, + lower = 0.0, + tags = c( + "train", + "poisson")), + # % constraints: 1.0 <= tweedie_variance_power < 2.0 + ParamDbl$new( + id = "tweedie_variance_power", + default = 1.5, + lower = 1.0, + upper = 2.0, + tags = c( + "train", + "tweedie")), # Metric Parameters - ParamFct$new(id = "metric", - default = "", - levels = c("", "None", - "l1", "mean_absolute_error", - "mae", "regression_l1", - "l2", "mean_squared_error", - "mse", "regression_l2", - "regression", "rmse", - "root_mean_squared_error", "l2_root", - "quantile", "lambdarank", - "mean_absolute_percentage_error", - "mean_average_precision", "mape", - "huber", "fair", - "poisson", "gamma", - "gamma_deviance", "tweedie"), - tags = "train"), + ParamFct$new( + id = "metric", + default = "", + levels = c( + "", "None", + "l1", "mean_absolute_error", + "mae", "regression_l1", + "l2", "mean_squared_error", + "mse", "regression_l2", + "regression", "rmse", + "root_mean_squared_error", "l2_root", + "quantile", "lambdarank", + "mean_absolute_percentage_error", + "mean_average_precision", "mape", + "huber", "fair", + "poisson", "gamma", + "gamma_deviance", "tweedie"), + tags = "train"), ####################################### ####################################### # Classification only - ParamFct$new(id = "objective", - default = "binary", - levels = c("binary", - "multiclass", - "multiclassova", - "cross_entropy", - "cross_entropy_lambda", - "rank_xendcg", - "lambdarank"), - tags = "train"), + ParamFct$new( + id = "objective", + default = "binary", + levels = c( + "binary", + "multiclass", + "multiclassova", + "cross_entropy", + "cross_entropy_lambda", + "rank_xendcg", + "lambdarank"), + tags = "train"), # Objective Parameters - #% constraints: num_class > 0 - ParamInt$new(id = "num_class", - default = 1L, - lower = 1L, - tags = c("train", - "multi-class")), - ParamLgl$new(id = "is_unbalance", - default = FALSE, - tags = c("train", - "binary", - "multiclassova")), - #% constraints: scale_pos_weight > 0.0 - ParamDbl$new(id = "scale_pos_weight", - default = 1.0, - lower = 0.0, - tags = c("train", - "binary", - "multiclassova")), - #% constraints: sigmoid > 0.0 - ParamDbl$new(id = "sigmoid", - default = 1.0, - lower = 0.0, - tags = c("train", - "binary", - "multiclassova", - "lambdarank")), - ParamInt$new(id = "lambdarank_truncation_level", - default = 20L, - lower = 1L, - tags = c("train", - "lambdarank")), - ParamLgl$new(id = "lambdarank_norm", - default = TRUE, - tags = c("train", - "lambdarank")), + # % constraints: num_class > 0 + ParamInt$new( + id = "num_class", + default = 1L, + lower = 1L, + tags = c( + "train", + "multi-class")), + ParamLgl$new( + id = "is_unbalance", + default = FALSE, + tags = c( + "train", + "binary", + "multiclassova")), + # % constraints: scale_pos_weight > 0.0 + ParamDbl$new( + id = "scale_pos_weight", + default = 1.0, + lower = 0.0, + tags = c( + "train", + "binary", + "multiclassova")), + # % constraints: sigmoid > 0.0 + ParamDbl$new( + id = "sigmoid", + default = 1.0, + lower = 0.0, + tags = c( + "train", + "binary", + "multiclassova", + "lambdarank")), + 
ParamInt$new( + id = "lambdarank_truncation_level", + default = 20L, + lower = 1L, + tags = c( + "train", + "lambdarank")), + ParamLgl$new( + id = "lambdarank_norm", + default = TRUE, + tags = c( + "train", + "lambdarank")), # Metric Parameters - ParamFct$new(id = "metric", - default = "", - levels = c("", "None", - "ndcg", "lambdarank", - "rank_xendcg", "xendcg", - "xe_ndcg", "xe_ndcg_mart", - "xendcg_mart", "map", - "mean_average_precision", - "cross_entropy", - "cross_entropy_lambda", - "kullback_leibler", - "xentropy", "xentlambda", - "kldiv", "multiclass", - "softmax", "multiclassova", - "multiclass_ova", "ova", - "ovr", "binary", - "binary_logloss", - "binary_error", "auc_mu", - "multi_logloss", "auc", - "multi_error"), - tags = "train"), - #% constraints: multi_error_top_k > 0 - ParamInt$new(id = "multi_error_top_k", - default = 1L, - lower = 1L, - tags = "train"), + ParamFct$new( + id = "metric", + default = "", + levels = c( + "", "None", + "ndcg", "lambdarank", + "rank_xendcg", "xendcg", + "xe_ndcg", "xe_ndcg_mart", + "xendcg_mart", "map", + "mean_average_precision", + "cross_entropy", + "cross_entropy_lambda", + "kullback_leibler", + "xentropy", "xentlambda", + "kldiv", "multiclass", + "softmax", "multiclassova", + "multiclass_ova", "ova", + "ovr", "binary", + "binary_logloss", + "binary_error", "auc_mu", + "multi_logloss", "auc", + "multi_error"), + tags = "train"), + # % constraints: multi_error_top_k > 0 + ParamInt$new( + id = "multi_error_top_k", + default = 1L, + lower = 1L, + tags = "train"), ####################################### ####################################### # Core Parameters - ParamFct$new(id = "boosting", - default = "gbdt", - levels = c("gbdt", - "rf", - "dart", - "goss"), - tags = "train"), - #% constraints: num_iterations >= 0 + ParamFct$new( + id = "boosting", + default = "gbdt", + levels = c( + "gbdt", + "rf", + "dart", + "goss"), + tags = "train"), + # % constraints: num_iterations >= 0 # Note: internally, LightGBM constructs # num_class * num_iterations # trees for multi-class classification problems - ParamInt$new(id = "num_iterations", - default = 100L, - lower = 0L, - tags = "train"), - #% constraints: learning_rate > 0.0 - ParamDbl$new(id = "learning_rate", - default = 0.1, - lower = 0.0, - tags = "train"), - #% constraints: 1 < num_leaves <= 131072 - ParamInt$new(id = "num_leaves", - default = 31L, - lower = 1L, - upper = 131072L, - tags = "train"), - ParamFct$new(id = "tree_learner", - default = "serial", - levels = c("serial", - "feature", - "data", - "voting"), - tags = "train"), - ParamInt$new(id = "num_threads", - default = 0L, - lower = 0L, - tags = "train"), - ParamFct$new(id = "device_type", - default = "cpu", - levels = c("cpu", "gpu"), - tags = "train"), - ParamUty$new(id = "seed", - default = "None", - tags = "train"), + ParamInt$new( + id = "num_iterations", + default = 100L, + lower = 0L, + tags = "train"), + # % constraints: learning_rate > 0.0 + ParamDbl$new( + id = "learning_rate", + default = 0.1, + lower = 0.0, + tags = "train"), + # % constraints: 1 < num_leaves <= 131072 + ParamInt$new( + id = "num_leaves", + default = 31L, + lower = 1L, + upper = 131072L, + tags = "train"), + ParamFct$new( + id = "tree_learner", + default = "serial", + levels = c( + "serial", + "feature", + "data", + "voting"), + tags = "train"), + ParamInt$new( + id = "num_threads", + default = 0L, + lower = 0L, + tags = "train"), + ParamFct$new( + id = "device_type", + default = "cpu", + levels = c("cpu", "gpu"), + tags = "train"), + ParamUty$new( + id = 
"seed", + default = "None", + tags = "train"), ####################################### # Learning Control Parameters - ParamLgl$new(id = "force_col_wise", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "force_row_wise", - default = FALSE, - tags = "train"), - ParamDbl$new(id = "histogram_pool_size", - default = -1.0, - tags = "train"), - #% <= 0 means no limit - ParamInt$new(id = "max_depth", - default = -1L, - tags = "train"), - #% constraints: min_data_in_leaf >= 0 - ParamInt$new(id = "min_data_in_leaf", - default = 20L, - lower = 0L, - tags = "train"), - #% constraints: min_sum_hessian_in_leaf >= 0.0 + ParamLgl$new( + id = "force_col_wise", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "force_row_wise", + default = FALSE, + tags = "train"), + ParamDbl$new( + id = "histogram_pool_size", + default = -1.0, + tags = "train"), + # % <= 0 means no limit + ParamInt$new( + id = "max_depth", + default = -1L, + tags = "train"), + # % constraints: min_data_in_leaf >= 0 + ParamInt$new( + id = "min_data_in_leaf", + default = 20L, + lower = 0L, + tags = "train"), + # % constraints: min_sum_hessian_in_leaf >= 0.0 # Note: to enable bagging, bagging_freq # should be set to a non # zero value as well - ParamDbl$new(id = "min_sum_hessian_in_leaf", - default = 1e-3, - lower = 0, - tags = "train"), - #% constraints: 0.0 < bagging_fraction <= 1.0 - ParamDbl$new(id = "bagging_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < pos_bagging_fraction <= 1.0 + ParamDbl$new( + id = "min_sum_hessian_in_leaf", + default = 1e-3, + lower = 0, + tags = "train"), + # % constraints: 0.0 < bagging_fraction <= 1.0 + ParamDbl$new( + id = "bagging_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < pos_bagging_fraction <= 1.0 # Note: to enable this, you need to set bagging_freq and # neg_bagging_fraction as well # Note: if both pos_bagging_fraction and @@ -245,223 +297,273 @@ lgbparams = function() { # are set to 1.0, balanced bagging is disabled # Note: if balanced bagging is enabled, # bagging_fraction will be ignored - ParamDbl$new(id = "pos_bagging_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < neg_bagging_fraction <= 1.0 - ParamDbl$new(id = "neg_bagging_fraction", - default = 1.0, - lower = 0, - upper = 1.0, - tags = "train"), + ParamDbl$new( + id = "pos_bagging_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < neg_bagging_fraction <= 1.0 + ParamDbl$new( + id = "neg_bagging_fraction", + default = 1.0, + lower = 0, + upper = 1.0, + tags = "train"), # Note: to enable bagging, bagging_fraction # should be set to value # smaller than 1.0 as well - ParamInt$new(id = "bagging_freq", - default = 0L, - lower = 0L, - tags = "train"), - ParamInt$new(id = "bagging_seed", - default = 3L, - tags = "train"), - #% constraints: 0.0 < feature_fraction <= 1.0 - ParamDbl$new(id = "feature_fraction", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - #% constraints: 0.0 < feature_fraction_bynode <= 1.0 + ParamInt$new( + id = "bagging_freq", + default = 0L, + lower = 0L, + tags = "train"), + ParamInt$new( + id = "bagging_seed", + default = 3L, + tags = "train"), + # % constraints: 0.0 < feature_fraction <= 1.0 + ParamDbl$new( + id = "feature_fraction", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + # % constraints: 0.0 < feature_fraction_bynode <= 1.0 # Note: unlike 
feature_fraction, this cannot # speed up training # Note: if both feature_fraction and # feature_fraction_bynode are # smaller than 1.0, the final fraction of # each node is - #% feature_fraction * feature_fraction_bynode - ParamDbl$new(id = "feature_fraction_bynode", - default = 1.0, - lower = 0.0, - upper = 1.0, - tags = "train"), - ParamInt$new(id = "feature_fraction_seed", - default = 2L, - tags = "train"), - ParamLgl$new(id = "extra_trees", - default = FALSE, - tags = "train"), - ParamInt$new(id = "extra_seed", - default = 6L, - tags = "train"), + # % feature_fraction * feature_fraction_bynode + ParamDbl$new( + id = "feature_fraction_bynode", + default = 1.0, + lower = 0.0, + upper = 1.0, + tags = "train"), + ParamInt$new( + id = "feature_fraction_seed", + default = 2L, + tags = "train"), + ParamLgl$new( + id = "extra_trees", + default = FALSE, + tags = "train"), + ParamInt$new( + id = "extra_seed", + default = 6L, + tags = "train"), # <= 0 means disable - ParamInt$new(id = "early_stopping_round", - default = 0L, - tags = "train"), - ParamLgl$new(id = "first_metric_only", - default = FALSE, - tags = "train"), + ParamInt$new( + id = "early_stopping_round", + default = 0L, + tags = "train"), + ParamLgl$new( + id = "first_metric_only", + default = FALSE, + tags = "train"), # <= 0 means no constraint - ParamDbl$new(id = "max_delta_step", - default = 0.0, - tags = "train"), - #% constraints: lambda_l1 >= 0.0 - ParamDbl$new(id = "lambda_l1", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: lambda_l2 >= 0.0 - ParamDbl$new(id = "lambda_l2", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: min_gain_to_split >= 0.0 - ParamDbl$new(id = "min_gain_to_split", - default = 0.0, - lower = 0.0, - tags = "train"), - #% constraints: 0.0 <= drop_rate <= 1.0 - ParamDbl$new(id = "drop_rate", - default = 0.1, - lower = 0.0, - upper = 1.0, - tags = c("train", "dart")), + ParamDbl$new( + id = "max_delta_step", + default = 0.0, + tags = "train"), + # % constraints: lambda_l1 >= 0.0 + ParamDbl$new( + id = "lambda_l1", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: lambda_l2 >= 0.0 + ParamDbl$new( + id = "lambda_l2", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: min_gain_to_split >= 0.0 + ParamDbl$new( + id = "min_gain_to_split", + default = 0.0, + lower = 0.0, + tags = "train"), + # % constraints: 0.0 <= drop_rate <= 1.0 + ParamDbl$new( + id = "drop_rate", + default = 0.1, + lower = 0.0, + upper = 1.0, + tags = c("train", "dart")), # <=0 means no limit - ParamInt$new(id = "max_drop", - default = 50L, - tags = c("train", "dart")), - #% constraints: 0.0 <= skip_drop <= 1.0 - ParamDbl$new(id = "skip_drop", - default = 0.5, - lower = 0.0, - upper = 1.0, - tags = c("train", "dart")), - ParamLgl$new(id = "xgboost_dart_mode", - default = FALSE, - tags = c("train", "dart")), - ParamLgl$new(id = "uniform_drop", - default = FALSE, - tags = c("train", "dart")), - ParamInt$new(id = "drop_seed", - default = 4L, - tags = c("train", "dart")), - #% constraints: 0.0 <= top_rate <= 1.0 - ParamDbl$new(id = "top_rate", - default = 0.2, - lower = 0.0, - upper = 1.0, - tags = c("train", "goss")), - #% constraints: 0.0 <= other_rate <= 1.0 - ParamDbl$new(id = "other_rate", - default = 0.1, - lower = 0.0, - upper = 1.0, - tags = c("train", "goss")), - #% constraints: min_data_per_group > 0 - ParamInt$new(id = "min_data_per_group", - default = 100L, - lower = 1L, - tags = "train"), - #% constraints: max_cat_threshold > 0 - ParamInt$new(id = 
"max_cat_threshold", - default = 32L, - lower = 1L, - tags = "train"), - #% constraints: cat_l2 >= 0.0 - ParamDbl$new(id = "cat_l2", - default = 10.0, - lower = 0.0, - tags = "train"), - #% constraints: cat_smooth >= 0.0 - ParamDbl$new(id = "cat_smooth", - default = 10.0, - lower = 0.0, - tags = "train"), - #% constraints: max_cat_to_onehot > 0 - ParamInt$new(id = "max_cat_to_onehot", - default = 4L, - lower = 1L, - tags = "train"), - #% constraints: top_k > 0 - ParamInt$new(id = "top_k", - default = 20L, - lower = 1L, - tags = "train"), - #% constraints: cegb_tradeoff >= 0.0 - ParamDbl$new(id = "cegb_tradeoff", - default = 1.0, - lower = 0.0, - tags = "train"), - #% constraints: cegb_penalty_split >= 0.0 - ParamDbl$new(id = "cegb_penalty_split", - default = 0.0, - lower = 0.0, - tags = "train"), + ParamInt$new( + id = "max_drop", + default = 50L, + tags = c("train", "dart")), + # % constraints: 0.0 <= skip_drop <= 1.0 + ParamDbl$new( + id = "skip_drop", + default = 0.5, + lower = 0.0, + upper = 1.0, + tags = c("train", "dart")), + ParamLgl$new( + id = "xgboost_dart_mode", + default = FALSE, + tags = c("train", "dart")), + ParamLgl$new( + id = "uniform_drop", + default = FALSE, + tags = c("train", "dart")), + ParamInt$new( + id = "drop_seed", + default = 4L, + tags = c("train", "dart")), + # % constraints: 0.0 <= top_rate <= 1.0 + ParamDbl$new( + id = "top_rate", + default = 0.2, + lower = 0.0, + upper = 1.0, + tags = c("train", "goss")), + # % constraints: 0.0 <= other_rate <= 1.0 + ParamDbl$new( + id = "other_rate", + default = 0.1, + lower = 0.0, + upper = 1.0, + tags = c("train", "goss")), + # % constraints: min_data_per_group > 0 + ParamInt$new( + id = "min_data_per_group", + default = 100L, + lower = 1L, + tags = "train"), + # % constraints: max_cat_threshold > 0 + ParamInt$new( + id = "max_cat_threshold", + default = 32L, + lower = 1L, + tags = "train"), + # % constraints: cat_l2 >= 0.0 + ParamDbl$new( + id = "cat_l2", + default = 10.0, + lower = 0.0, + tags = "train"), + # % constraints: cat_smooth >= 0.0 + ParamDbl$new( + id = "cat_smooth", + default = 10.0, + lower = 0.0, + tags = "train"), + # % constraints: max_cat_to_onehot > 0 + ParamInt$new( + id = "max_cat_to_onehot", + default = 4L, + lower = 1L, + tags = "train"), + # % constraints: top_k > 0 + ParamInt$new( + id = "top_k", + default = 20L, + lower = 1L, + tags = "train"), + # % constraints: cegb_tradeoff >= 0.0 + ParamDbl$new( + id = "cegb_tradeoff", + default = 1.0, + lower = 0.0, + tags = "train"), + # % constraints: cegb_penalty_split >= 0.0 + ParamDbl$new( + id = "cegb_penalty_split", + default = 0.0, + lower = 0.0, + tags = "train"), ####################################### # IO Parameters - ParamInt$new(id = "verbose", - default = 1L, - tags = "train"), - ParamUty$new(id = "input_model", - default = "", - tags = "train"), - ParamUty$new(id = "output_model", - default = "LightGBM_model.txt", - tags = "train"), - ParamInt$new(id = "snapshot_freq", - default = -1L, - tags = "train"), - #% constraints: max_bin > 1 - ParamInt$new(id = "max_bin", - default = 255L, - lower = 2L, - tags = "train"), - #% constraints: min_data_in_bin > 0 - ParamInt$new(id = "min_data_in_bin", - default = 3L, - lower = 1L, - tags = "train"), - #% constraints: bin_construct_sample_cnt > 0 - ParamInt$new(id = "bin_construct_sample_cnt", - default = 200000L, - lower = 1L, - tags = "train"), - ParamInt$new(id = "data_random_seed", - default = 1L, - tags = "train"), - ParamLgl$new(id = "is_enable_sparse", - default = TRUE, - tags = "train"), - 
ParamLgl$new(id = "enable_bundle", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "use_missing", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "zero_as_missing", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "feature_pre_filter", - default = TRUE, - tags = "train"), - ParamLgl$new(id = "pre_partition", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "two_round", - default = FALSE, - tags = "train"), - ParamLgl$new(id = "header", - default = FALSE, - tags = "train"), - ParamUty$new(id = "group_column", - default = "", - tags = "train"), - ParamUty$new(id = "ignore_column", - default = "", - tags = "train"), - ParamUty$new(id = "categorical_feature", - default = "", - tags = "train"), + ParamInt$new( + id = "verbose", + default = 1L, + tags = "train"), + ParamUty$new( + id = "input_model", + default = "", + tags = "train"), + ParamUty$new( + id = "output_model", + default = "LightGBM_model.txt", + tags = "train"), + ParamInt$new( + id = "snapshot_freq", + default = -1L, + tags = "train"), + # % constraints: max_bin > 1 + ParamInt$new( + id = "max_bin", + default = 255L, + lower = 2L, + tags = "train"), + # % constraints: min_data_in_bin > 0 + ParamInt$new( + id = "min_data_in_bin", + default = 3L, + lower = 1L, + tags = "train"), + # % constraints: bin_construct_sample_cnt > 0 + ParamInt$new( + id = "bin_construct_sample_cnt", + default = 200000L, + lower = 1L, + tags = "train"), + ParamInt$new( + id = "data_random_seed", + default = 1L, + tags = "train"), + ParamLgl$new( + id = "is_enable_sparse", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "enable_bundle", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "use_missing", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "zero_as_missing", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "feature_pre_filter", + default = TRUE, + tags = "train"), + ParamLgl$new( + id = "pre_partition", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "two_round", + default = FALSE, + tags = "train"), + ParamLgl$new( + id = "header", + default = FALSE, + tags = "train"), + ParamUty$new( + id = "group_column", + default = "", + tags = "train"), + ParamUty$new( + id = "ignore_column", + default = "", + tags = "train"), + ParamUty$new( + id = "categorical_feature", + default = "", + tags = "train"), ####################################### ####################################### # Predict Parameters TODO are they needed? 
@@ -469,17 +571,20 @@ lgbparams = function() { ####################################### ####################################### # Objective Parameters - ParamInt$new(id = "objective_seed", - default = 5L, - tags = c("train", "rank_xendcg")), + ParamInt$new( + id = "objective_seed", + default = 5L, + tags = c("train", "rank_xendcg")), # moved num_class up to classification part # moved is_unbalance up to classification part # moved scale_pos_weight up to classification part # moved sigmoid up to classification part - ParamLgl$new(id = "boost_from_average", - default = TRUE, - tags = c("train", "regression", "binary", - "multiclassova", "cross-entropy")), + ParamLgl$new( + id = "boost_from_average", + default = TRUE, + tags = c( + "train", "regression", "binary", + "multiclassova", "cross-entropy")), # moved req_sqrt up to regression part # moved alpha up to regression part # moved fair_c up to regression part @@ -490,14 +595,16 @@ lgbparams = function() { # moved label_gain up to classification part ####################################### # Metric Parameters - #% constraints: metric_freq > 0 - ParamInt$new(id = "metric_freq", - default = 1L, - lower = 1L, - tags = "train"), - ParamLgl$new(id = "is_provide_training_metric", - default = FALSE, - tags = "train") + # % constraints: metric_freq > 0 + ParamInt$new( + id = "metric_freq", + default = 1L, + lower = 1L, + tags = "train"), + ParamLgl$new( + id = "is_provide_training_metric", + default = FALSE, + tags = "train") ) ) return(ps) diff --git a/R/zzz.R b/R/zzz.R index cba01a7..4a6ef50 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -18,7 +18,7 @@ register_mlr3 = function(libname, pkgname) { .onLoad = function(libname, pkgname) { # nolint register_mlr3() setHook(packageEvent("mlr3", "onLoad"), function(...) register_mlr3(), - action = "append") + action = "append") } .onUnload = function(libpath) { # nolint @@ -26,6 +26,6 @@ register_mlr3 = function(libname, pkgname) { hooks = getHook(event) pkgname = vapply(hooks, function(x) environment(x)$pkgname, NA_character_) setHook(event, hooks[pkgname != "mlr3learners.lightgbm"], - action = "replace") + action = "replace") } # nocov end diff --git a/data-raw/devstuffs.R b/data-raw/devstuffs.R index 5d2846e..1462afb 100644 --- a/data-raw/devstuffs.R +++ b/data-raw/devstuffs.R @@ -12,21 +12,22 @@ my_desc$set("Package", packagename) # Set author names 2 my_desc$set_authors(c( person("Lorenz A.", "Kapsner", - email = "lorenz.kapsner@gmail.com", - role = c("cre", "aut"), - comment = c(ORCID = "0000-0003-1866-860X")), - person(given = "Patrick", - family = "Schratz", - role = "ctb", - email = "patrick.schratz@gmail.com", - comment = c(ORCID = "0000-0003-0748-6624")) + email = "lorenz.kapsner@gmail.com", + role = c("cre", "aut"), + comment = c(ORCID = "0000-0003-1866-860X")), + person( + given = "Patrick", + family = "Schratz", + role = "ctb", + email = "patrick.schratz@gmail.com", + comment = c(ORCID = "0000-0003-0748-6624")) )) # Remove some author fields my_desc$del("Maintainer") # Set the version -my_desc$set_version("0.0.4.9006") +my_desc$set_version("0.0.4.9007") # The title of your package my_desc$set(Title = "mlr3: LightGBM learner") @@ -51,7 +52,7 @@ my_desc$set("License", "LGPL-3") my_desc$write(file = "DESCRIPTION") # License -#usethis::use_lgpl_license(name = "Lorenz A. Kapsner") +# usethis::use_lgpl_license(name = "Lorenz A. 
Kapsner") # add Imports and Depends @@ -70,11 +71,10 @@ my_desc$write(file = "DESCRIPTION") usethis::use_package("R", min_version = "2.10", type = "Depends") # Imports -usethis::use_package("data.table", type="Imports") +usethis::use_package("data.table", type = "Imports") usethis::use_package("R6", type = "Imports") usethis::use_package("paradox", type = "Imports") usethis::use_package("mlr3misc", type = "Imports") -usethis::use_package("ggplot2", type = "Imports") usethis::use_package("mlr3", type = "Imports") usethis::use_package("plyr", type = "Imports") usethis::use_package("MLmetrics", type = "Imports") @@ -121,8 +121,9 @@ usethis::use_package("mlbench", type = "Suggests") # usethis::use_git_ignore("!/man-roxygen/") # code coverage -#covr::package_coverage() +# covr::package_coverage() # lint package -#lintr::lint_package() +# lintr::lint_package() +usethis::use_tidy_description() diff --git a/data-raw/test.R b/data-raw/test.R index 3327a1e..9ef1f3f 100644 --- a/data-raw/test.R +++ b/data-raw/test.R @@ -27,7 +27,7 @@ lightgbm = reticulate::import("lightgbm") pars = ps$get_values(tags = "train") # Get formula, data, classwt, cutoff for the LightGBM -data = task$data() #the data is avail +data = task$data() # the data is avail levs = levels(data[[task$target_names]]) n = length(levs) @@ -53,7 +53,7 @@ if (n > 2) { } } else { pars[["objective"]] = "binary" - if (is.null(pars[["metric"]])) { + if (is.null(pars[["metric"]])) { pars[["metric"]] = c("auc", "binary_error") } } @@ -80,9 +80,10 @@ mymodel = lightgbm$train( newdata = task$data(cols = task$feature_names) -p = mlr3misc::invoke(.f = mymodel$predict, - data = newdata, - is_reshape = T) +p = mlr3misc::invoke( + .f = mymodel$predict, + data = newdata, + is_reshape = T) colnames(p) = as.character(unique(x_label)) PredictionClassif$new(task = task, prob = p) @@ -95,10 +96,12 @@ imp = data.table::data.table( imp -ggplot2::ggplot(data = NULL, - ggplot2::aes(x = reorder(imp$Feature, imp$Value), - y = imp$Value, - fill = imp$Value)) + +ggplot2::ggplot( + data = NULL, + ggplot2::aes( + x = reorder(imp$Feature, imp$Value), + y = imp$Value, + fill = imp$Value)) + ggplot2::geom_col() + ggplot2::coord_flip() + ggplot2::scale_fill_gradientn(colours = grDevices::rainbow(n = nrow(imp))) + diff --git a/inst/paramtest/test_paramtest_classif.lightgbm.R b/inst/paramtest/test_paramtest_classif.lightgbm.R index 9438a9e..41b76f8 100644 --- a/inst/paramtest/test_paramtest_classif.lightgbm.R +++ b/inst/paramtest/test_paramtest_classif.lightgbm.R @@ -11,7 +11,6 @@ test_that("classif.lightgbm", { "eval", "record", "eval_freq", - "init_model", "colnames", "early_stopping_rounds", "callbacks", diff --git a/man/mlr3learners.lightgbm-package.Rd b/man/mlr3learners.lightgbm-package.Rd index 0c0d916..377f37b 100644 --- a/man/mlr3learners.lightgbm-package.Rd +++ b/man/mlr3learners.lightgbm-package.Rd @@ -6,7 +6,8 @@ \alias{mlr3learners.lightgbm-package} \title{mlr3learners.lightgbm: mlr3: LightGBM learner} \description{ -Adds `lgb.train()` from the lightgbm package to mlr3. +Adds `lgb.train()` from the lightgbm package to + mlr3. 
} \seealso{ Useful links: diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 0149d90..a4792c1 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -1,4 +1,4 @@ library(checkmate) library(mlr3) lapply(list.files(system.file("testthat", package = "mlr3"), - pattern = "^helper.*\\.[rR]", full.names = TRUE), source) + pattern = "^helper.*\\.[rR]", full.names = TRUE), source) diff --git a/tests/testthat/test-learner_regression.R b/tests/testthat/test-learner_regression.R index ded1c7d..82679bc 100644 --- a/tests/testthat/test-learner_regression.R +++ b/tests/testthat/test-learner_regression.R @@ -3,7 +3,6 @@ context("Test Regression") test_that( desc = "Learner Regression", code = { - library(mlbench) data("BostonHousing2") dataset = data.table::as.data.table(BostonHousing2) diff --git a/tests/testthat/test_learner_classif_lightgbm.R b/tests/testthat/test_learner_classif_lightgbm.R index 668b6d5..44e7845 100644 --- a/tests/testthat/test_learner_classif_lightgbm.R +++ b/tests/testthat/test_learner_classif_lightgbm.R @@ -3,7 +3,6 @@ context("LearnerClassifLightGBM") test_that( desc = "LearnerClassifLightGBM", code = { - learner = LearnerClassifLightGBM$new() expect_learner(learner) learner$param_set$values = mlr3misc::insert_named( diff --git a/tests/testthat/test_learner_regr_lightgbm.R b/tests/testthat/test_learner_regr_lightgbm.R index 65a1e2c..ffa5da2 100644 --- a/tests/testthat/test_learner_regr_lightgbm.R +++ b/tests/testthat/test_learner_regr_lightgbm.R @@ -5,7 +5,6 @@ context("LearnerRegrLightGBM") test_that( desc = "LearnerRegrLightGBM", code = { - learner = LearnerRegrLightGBM$new() expect_learner(learner) learner$param_set$values = mlr3misc::insert_named(