diff --git a/.editorconfig b/.editorconfig index f5ef1a534..1b8da5d5c 100644 --- a/.editorconfig +++ b/.editorconfig @@ -15,7 +15,7 @@ indent_size = 2 indent_size = 4 [*.{cpp,hpp}] -indent_size = 4 +indent_size = 2 [{NEWS.md,DESCRIPTION,LICENSE}] max_line_length = 80 diff --git a/DESCRIPTION b/DESCRIPTION index 9258055c7..26b5b06cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mlr3proba Title: Probabilistic Supervised Learning for 'mlr3' -Version: 0.6.6 +Version: 0.6.8 Authors@R: c(person(given = "Raphael", family = "Sonabend", @@ -64,11 +64,9 @@ Imports: paradox (>= 1.0.0), R6, Rcpp (>= 1.0.4), - survival, - survivalmodels (>= 0.1.12) + survival Suggests: bujar, - cubature, GGally, knitr, lgr, @@ -145,6 +143,7 @@ Collate: 'PipeOpPredRegrSurv.R' 'PipeOpPredSurvRegr.R' 'PipeOpProbregrCompositor.R' + 'PipeOpResponseCompositor.R' 'PipeOpSurvAvg.R' 'PipeOpTaskRegrSurv.R' 'PipeOpTaskSurvClassifDiscTime.R' @@ -176,7 +175,6 @@ Collate: 'histogram.R' 'integrated_scores.R' 'mlr3proba-package.R' - 'partition.R' 'pecs.R' 'pipelines.R' 'plot.R' diff --git a/NAMESPACE b/NAMESPACE index 673c263ba..aeb2060ed 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,7 +25,6 @@ S3method(check_prediction_data,PredictionDataSurv) S3method(filter_prediction_data,PredictionDataSurv) S3method(is_missing_prediction_data,PredictionDataDens) S3method(is_missing_prediction_data,PredictionDataSurv) -S3method(partition,TaskSurv) S3method(pecs,PredictionSurv) S3method(pecs,list) S3method(plot,LearnerSurv) @@ -77,6 +76,7 @@ export(PipeOpPredRegrSurv) export(PipeOpPredSurvRegr) export(PipeOpPredTransformer) export(PipeOpProbregr) +export(PipeOpResponseCompositor) export(PipeOpSurvAvg) export(PipeOpTaskRegrSurv) export(PipeOpTaskSurvClassifDiscTime) @@ -95,7 +95,9 @@ export(as_prediction_surv) export(as_task_dens) export(as_task_surv) export(assert_surv) +export(assert_surv_matrix) export(breslow) +export(get_mortality) export(pecs) export(pipeline_survtoclassif_disctime) export(pipeline_survtoregr) diff --git a/NEWS.md b/NEWS.md index e5e720a95..293fcf698 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# mlr3proba 0.6.8 + +- `Rcpp` code optimizations +- Fixed ERV scoring to comply with `mlr3` dev version (no bugs before) +- Skipping `survtoregr` pipelines due to bugs (to be refactored in the future) + +# mlr3proba 0.6.7 + +- Deprecate `crank` to `distr` composition in `distrcompose` pipeop (only from `lp` => `distr` works now) +- Add `get_mortality()` function (from `survivalmodels::surv_to_risk()` +- Add Rcpp function `assert_surv_matrix()` +- Update and simplify `crankcompose` pipeop and respective pipeline (no `response` is created anymore) +- Add `responsecompositor` pipeline with `rmst` and `median` + # mlr3proba 0.6.6 - Small fixes and refactoring to the discrete-time pipeops diff --git a/R/LearnerDens.R b/R/LearnerDens.R index b3ee721b3..6018e6896 100644 --- a/R/LearnerDens.R +++ b/R/LearnerDens.R @@ -36,14 +36,14 @@ LearnerDens = R6::R6Class("LearnerDens", #' @description Creates a new instance of this [R6][R6::R6Class] class. initialize = function(id, param_set = ps(), predict_types = "cdf", feature_types = character(), - properties = character(), data_formats = "data.table", + properties = character(), packages = character(), label = NA_character_, man = NA_character_) { super$initialize( id = id, task_type = "dens", param_set = param_set, predict_types = predict_types, feature_types = feature_types, properties = properties, - data_formats = data_formats, packages = c("mlr3proba", packages), label = label, man = man) + packages = c("mlr3proba", packages), label = label, man = man) } ) ) diff --git a/R/MeasureSurvCindex.R b/R/MeasureSurvCindex.R index f7d63e166..65cd8134c 100644 --- a/R/MeasureSurvCindex.R +++ b/R/MeasureSurvCindex.R @@ -56,7 +56,7 @@ #' library(mlr3) #' task = tsk("rats") #' learner = lrn("surv.coxph") -#' part = partition(task) # train/test split, stratified on `status` by default +#' part = partition(task) # train/test split #' learner$train(task, part$train) #' p = learner$predict(task, part$test) #' diff --git a/R/MeasureSurvRCLL.R b/R/MeasureSurvRCLL.R index 471496f45..cb961acdb 100644 --- a/R/MeasureSurvRCLL.R +++ b/R/MeasureSurvRCLL.R @@ -16,6 +16,10 @@ #' density function and \eqn{S} the survival function. #' RCLL is proper given that censoring and survival distribution are independent, see Rindt et al. (2022). #' +#' **Note**: Even though RCLL is a proper scoring rule, the calculation of \eqn{f(t)} (which in our case is discrete, i.e. it is a *probability mass function*) for time points in the test set that don't exist in the predicted survival matrix (`distr`), results in 0 values, which are substituted by `"eps"` in our implementation, therefore skewing the result towards \eqn{-log(eps)}. +#' This problem is also discussed in Rindt et al. (2022), where the authors perform interpolation to get non-zero values for the \eqn{f(t)}. +#' Until this is handled in `mlr3proba` some way, we advise against using this measure for model evaluation. +#' #' @section Parameter details: #' - `na.rm` (`logical(1)`)\cr #' If `TRUE` (default) then removes any NAs in individual score calculations. diff --git a/R/PipeOpCrankCompositor.R b/R/PipeOpCrankCompositor.R index 4a8f97716..19bcba3cd 100644 --- a/R/PipeOpCrankCompositor.R +++ b/R/PipeOpCrankCompositor.R @@ -16,14 +16,11 @@ #' ``` #' #' @section Input and Output Channels: -#' [PipeOpCrankCompositor] has one input channel named "input", which takes -#' `NULL` during training and [PredictionSurv] during prediction. +#' [PipeOpCrankCompositor] has one input channel named `"input"`, which takes `NULL` during training and [PredictionSurv] during prediction. #' -#' [PipeOpCrankCompositor] has one output channel named "output", producing `NULL` during training -#' and a [PredictionSurv] during prediction. +#' [PipeOpCrankCompositor] has one output channel named `"output"`, producing `NULL` during training and a [PredictionSurv] during prediction. #' -#' The output during prediction is the [PredictionSurv] from the "pred" input but with the `crank` -#' predict type overwritten by the given estimation method. +#' The output during prediction is the [PredictionSurv] from the input but with the `crank` predict type overwritten by the given estimation method. #' #' @section State: #' The `$state` is left empty (`list()`). @@ -31,42 +28,24 @@ #' @section Parameters: #' * `method` :: `character(1)` \cr #' Determines what method should be used to produce a continuous ranking from the distribution. -#' One of `sum_haz`, `median`, `mode`, or `mean` corresponding to the -#' respective functions in the predicted survival distribution. Note that -#' for models with a proportional hazards form, the ranking implied by -#' `mean` and `median` will be identical (but not the value of `crank` -#' itself). `sum_haz` (default) uses [survivalmodels::surv_to_risk()]. -#' * `which` :: `numeric(1)`\cr -#' If `method = "mode"` then specifies which mode to use if multi-modal, default is the first. -#' * `response` :: `logical(1)`\cr -#' If `TRUE` then the `response` predict type is estimated with the same values as `crank`. +#' Currently only `mort` is supported, which is the sum of the cumulative hazard, also called *expected/ensemble mortality*, see Ishwaran et al. (2008). +#' For more details, see [get_mortality()]. #' * `overwrite` :: `logical(1)` \cr -#' If `FALSE` (default) then if the "pred" input already has a `crank`, the compositor only -#' composes a `response` type if `response = TRUE` and does not already exist. If `TRUE` then -#' both the `crank` and `response` are overwritten. -#' -#' @section Internals: -#' The `median`, `mode`, or `mean` will use analytical expressions if possible but if not they are -#' calculated using methods from [distr6]. `mean` requires \CRANpkg{cubature}. +#' If `FALSE` (default) and the prediction already has a `crank` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `crank` will be overwritten. #' #' @seealso [pipeline_crankcompositor] #' @family survival compositors #' @examples #' \dontrun{ #' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { -#' library(mlr3) #' library(mlr3pipelines) #' task = tsk("rats") #' -#' learn = lrn("surv.coxph")$train(task)$predict(task) -#' poc = po("crankcompose", param_vals = list(method = "sum_haz")) -#' poc$predict(list(learn))[[1]] -#' -#' if (requireNamespace("cubature", quietly = TRUE)) { -#' learn = lrn("surv.coxph")$train(task)$predict(task) -#' poc = po("crankcompose", param_vals = list(method = "sum_haz")) -#' poc$predict(list(learn))[[1]] -#' } +#' # change the crank prediction type of a Cox's model predictions +#' pred = lrn("surv.coxph")$train(task)$predict(task) +#' poc = po("crankcompose", param_vals = list(overwrite = TRUE)) +#' poc$predict(list(pred))[[1L]] #' } #' } #' @export @@ -77,13 +56,10 @@ PipeOpCrankCompositor = R6Class("PipeOpCrankCompositor", #' Creates a new instance of this [R6][R6::R6Class] class. initialize = function(id = "crankcompose", param_vals = list()) { param_set = ps( - method = p_fct(default = "sum_haz", levels = c("sum_haz", "mean", "median", "mode"), - tags = "predict"), - which = p_int(1L, default = 1L, tags = "predict", depends = quote(method == "mode")), - response = p_lgl(default = FALSE, tags = "predict"), + method = p_fct(default = "mort", levels = c("mort"), tags = "predict"), overwrite = p_lgl(default = FALSE, tags = "predict") ) - param_set$set_values(method = "sum_haz", response = FALSE, overwrite = FALSE) + param_set$set_values(method = "mort", overwrite = FALSE) super$initialize( id = id, @@ -91,7 +67,7 @@ PipeOpCrankCompositor = R6Class("PipeOpCrankCompositor", param_vals = param_vals, input = data.table(name = "input", train = "NULL", predict = "PredictionSurv"), output = data.table(name = "output", train = "NULL", predict = "PredictionSurv"), - packages = c("mlr3proba", "distr6") + packages = c("mlr3proba") ) } ), @@ -103,83 +79,47 @@ PipeOpCrankCompositor = R6Class("PipeOpCrankCompositor", }, .predict = function(inputs) { - - inpred = inputs[[1L]] - - response = self$param_set$values$response - b_response = !anyMissing(inpred$response) - if (!length(response)) response = FALSE - + pred = inputs[[1L]] overwrite = self$param_set$values$overwrite - if (!length(overwrite)) overwrite = FALSE + # it's impossible for a learner not to predict crank in mlr3proba, + # but let's check either way: + has_crank = !all(is.na(pred$crank)) - # if crank and response already exist and not overwriting then return prediction - if (!overwrite && (!response || (response && b_response))) { - return(list(inpred)) + if (!overwrite & has_crank) { + # return prediction as is + return(list(pred)) } else { - assert("distr" %in% inpred$predict_types) - method = self$param_set$values$method - if (length(method) == 0L) method = "sum_haz" - if (method == "sum_haz") { - if (inherits(inpred$data$distr, "matrix") || - !requireNamespace("survivalmodels", quietly = TRUE)) { - comp = survivalmodels::surv_to_risk(inpred$data$distr) - } else { - comp = as.numeric( - colSums(inpred$distr$cumHazard(sort(unique(inpred$truth[, 1])))) - ) - } - } else if (method == "mean") { - comp = try(inpred$distr$mean(), silent = TRUE) - if (inherits(comp, "try-error")) { - requireNamespace("cubature") - comp = try(inpred$distr$mean(cubature = TRUE), silent = TRUE) - } - if (inherits(comp, "try-error")) { - comp = numeric(length(inpred$crank)) - } - } else { - comp = switch(method, - median = inpred$distr$median(), - mode = inpred$distr$mode(self$param_set$values$which)) - } + # compose crank from distr prediction + assert("distr" %in% pred$predict_types) - comp = as.numeric(comp) - - # if crank exists and not overwriting then return predicted crank, otherwise compose - if (!overwrite) { - crank = inpred$crank + # get survival matrix + if (inherits(pred$data$distr, "array")) { + surv = pred$data$distr + if (length(dim(surv)) == 3L) { + # survival 3d array, extract median + surv = .ext_surv_mat(arr = surv, which.curve = 0.5) + } } else { - crank = -comp - # missing imputed with median - crank[is.na(crank)] = stats::median(crank[!is.na(crank)]) - crank[crank == Inf] = 1e3 - crank[crank == -Inf] = -1e3 + stop("Distribution prediction does not have a survival matrix or array + in the $data$distr slot") } - # i) not overwriting or requesting response, and already predicted - if (b_response && (!overwrite || !response)) { - response = inpred$response - # ii) not requesting response and doesn't exist - } else if (!response) { - response = NULL - # iii) requesting response and happy to overwrite - # iv) requesting response and doesn't exist - } else { - response = comp - response[is.na(response)] = 0 - response[response == Inf | response == -Inf] = 0 + method = self$param_set$values$method + if (method == "mort") { + crank = get_mortality(surv) } - if (!anyMissing(inpred$lp)) { - lp = inpred$lp - } else { - lp = NULL - } + # update only `crank` + p = PredictionSurv$new( + row_ids = pred$row_ids, + truth = pred$truth, + crank = crank, + distr = pred$distr, + lp = pred$lp, + response = pred$response + ) - return(list(PredictionSurv$new( - row_ids = inpred$row_ids, truth = inpred$truth, crank = crank, - distr = inpred$distr, lp = lp, response = response))) + return(list(p)) } } ) diff --git a/R/PipeOpDistrCompositor.R b/R/PipeOpDistrCompositor.R index 113a49732..aca38240f 100644 --- a/R/PipeOpDistrCompositor.R +++ b/R/PipeOpDistrCompositor.R @@ -3,16 +3,12 @@ #' @template param_pipelines #' #' @description -#' Estimates (or 'composes') a survival distribution from a predicted baseline `distr` and a -#' `crank` or `lp` from two [PredictionSurv]s. +#' Estimates (or 'composes') a survival distribution from a predicted baseline +#' survival distribution (`distr`) and a linear predictor (`lp`) from two [PredictionSurv]s. #' #' Compositor Assumptions: #' * The baseline `distr` is a discrete estimator, e.g. [surv.kaplan][LearnerSurvKaplan]. #' * The composed `distr` is of a linear form -#' * If `lp` is missing then `crank` is equivalent -#' -#' These assumptions are strong and may not be reasonable. Future updates will upgrade this -#' compositor to be more flexible. #' #' @section Dictionary: #' This [PipeOp][mlr3pipelines::PipeOp] can be instantiated via the @@ -25,15 +21,16 @@ #' ``` #' #' @section Input and Output Channels: -#' [PipeOpDistrCompositor] has two input channels, "base" and "pred". Both input channels take -#' `NULL` during training and [PredictionSurv] during prediction. +#' [PipeOpDistrCompositor] has two input channels, `"base"` and `"pred"`. +#' Both input channels take `NULL` during training and [PredictionSurv] during prediction. #' -#' [PipeOpDistrCompositor] has one output channel named "output", producing `NULL` during training -#' and a [PredictionSurv] during prediction. +#' [PipeOpDistrCompositor] has one output channel named `"output"`, producing +#' `NULL` during training and a [PredictionSurv] during prediction. #' -#' The output during prediction is the [PredictionSurv] from the "pred" input but with an extra -#' (or overwritten) column for `distr` predict type; which is composed from the `distr` of "base" -#' and `lp` or `crank` of "pred". +#' The output during prediction is the [PredictionSurv] from the `"pred"` input +#' but with an extra (or overwritten) column for the `distr` predict type; which +#' is composed from the `distr` of `"base"` and the `lp` of `"pred"`. +#' If no `lp` predictions have been made or exist, then the `"pred"` is returned unchanged. #' #' @section State: #' The `$state` is left empty (`list()`). @@ -46,8 +43,8 @@ #' Default `aft`. #' * `overwrite` :: `logical(1)` \cr #' If `FALSE` (default) then if the "pred" input already has a `distr`, the compositor does -#' nothing and returns the given [PredictionSurv]. If `TRUE` then the `distr` is overwritten -#' with the `distr` composed from `lp`/`crank` - this is useful for changing the prediction +#' nothing and returns the given [PredictionSurv]. If `TRUE`, then the `distr` is overwritten +#' with the `distr` composed from `lp` - this is useful for changing the prediction #' `distr` from one model form to another. #' #' @section Internals: @@ -56,8 +53,7 @@ #' \deqn{ph: S(t) = S_0(t)^{exp(lp)}}{ph: S(t) = S0(t)^exp(lp)} #' \deqn{po: S(t) = \frac{S_0(t)}{exp(-lp) + (1-exp(-lp)) S_0(t)}}{po: S(t) = S0(t) / [exp(-lp) + S0(t) (1-exp(-lp))]} # nolint #' where \eqn{S_0}{S0} is the estimated baseline survival distribution, and \eqn{lp} is the -#' predicted linear predictor. If the input model does not predict a linear predictor then `crank` -#' is assumed to be the `lp` - **this may be a strong and unreasonable assumption.** +#' predicted linear predictor. #' #' @seealso [pipeline_distrcompositor] #' @export @@ -71,6 +67,7 @@ #' #' base = lrn("surv.kaplan")$train(task)$predict(task) #' pred = lrn("surv.coxph")$train(task)$predict(task) +#' # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: #' pod = po("distrcompose", param_vals = list(form = "aft", overwrite = TRUE)) #' pod$predict(list(base = base, pred = pred))[[1]] #' } @@ -106,31 +103,27 @@ PipeOpDistrCompositor = R6Class("PipeOpDistrCompositor", .predict = function(inputs) { base = inputs$base - inpred = inputs$pred + pred = inputs$pred - overwrite = self$param_set$values$overwrite - if (!length(overwrite)) overwrite = FALSE + # if no `lp` predictions, we return the survival prediction object unchanged + if (is.null(pred$lp)) { + return(list(pred)) + } - if ("distr" %in% inpred$predict_types & !overwrite) { - return(list(inpred)) + overwrite = assert_logical(self$param_set$values$overwrite) + if ("distr" %in% pred$predict_types & !overwrite) { + return(list(pred)) } else { assert("distr" %in% base$predict_types) - row_ids = inpred$row_ids - truth = inpred$truth - - walk(inputs, function(x) assert_true(identical(truth, x$truth))) + # check: targets are the same + assert_true(identical(base$truth, pred$truth)) form = self$param_set$values$form - if (length(form) == 0L) form = "aft" - nr = length(inpred$data$row_ids) + nr = length(pred$data$row_ids) - # assumes PH-style lp where high value = high risk - if (anyMissing(inpred$lp)) { - lp = inpred$crank - } else { - lp = inpred$lp - } + # we need 'lp' predictions + lp = pred$lp if (inherits(base$data$distr, "Distribution")) { base = distr6::as.MixtureDistribution(base$distr) @@ -138,15 +131,17 @@ PipeOpDistrCompositor = R6Class("PipeOpDistrCompositor", nc = length(times) survmat = matrix(1 - base$cdf(times), nrow = nr, ncol = nc, byrow = TRUE) } else { - base = colMeans(base$data$distr) - times = as.numeric(names(base)) + # average survival probability across observations (on the test set) + avg_surv = colMeans(base$data$distr) + times = as.numeric(names(avg_surv)) nc = length(times) - survmat = matrix(base, nrow = nr, ncol = nc, byrow = TRUE) + survmat = matrix(avg_surv, nrow = nr, ncol = nc, byrow = TRUE) } timesmat = matrix(times, nrow = nr, ncol = nc, byrow = TRUE) lpmat = matrix(lp, nrow = nr, ncol = nc) + # compose survival distribution if (form == "ph") { cdf = 1 - (survmat^exp(lpmat)) } else if (form == "aft") { @@ -159,17 +154,17 @@ PipeOpDistrCompositor = R6Class("PipeOpDistrCompositor", cdf[survmat == 1] = 0 } - distr = .surv_return(times, 1 - cdf)$distr + distr = .surv_return(times = times, surv = 1 - cdf)$distr - if (anyMissing(inpred$lp)) { - lp = NULL - } else { - lp = inpred$lp - } + p = PredictionSurv$new( + row_ids = pred$row_ids, + truth = pred$truth, + crank = pred$crank, + lp = pred$lp, + distr = distr # overwrite only the distribution + ) - return(list(PredictionSurv$new( - row_ids = row_ids, truth = truth, - crank = inpred$crank, distr = distr, lp = lp))) + return(list(p)) } } ) diff --git a/R/PipeOpResponseCompositor.R b/R/PipeOpResponseCompositor.R new file mode 100644 index 000000000..d00fbce97 --- /dev/null +++ b/R/PipeOpResponseCompositor.R @@ -0,0 +1,191 @@ +#' @title PipeOpResponseCompositor +#' @name mlr_pipeops_responsecompose +#' @template param_pipelines +#' +#' @description +#' Uses a predicted survival distribution (`distr`) in a [PredictionSurv] to estimate (or 'compose') an expected survival time (`response`) prediction. +#' Practically, this `PipeOp` summarizes an observation's survival curve/distribution to a single number which can be either the restricted mean survival time or the median survival time. +#' +#' @section Dictionary: +#' This [PipeOp][mlr3pipelines::PipeOp] can be instantiated via the +#' [dictionary][mlr3misc::Dictionary] [mlr3pipelines::mlr_pipeops] or with the associated sugar +#' function [mlr3pipelines::po()]: +#' ``` +#' PipeOpResponseCompositor$new() +#' mlr_pipeops$get("responsecompose") +#' po("responsecompose") +#' ``` +#' +#' @section Input and Output Channels: +#' [PipeOpResponseCompositor] has one input channel named `"input"`, which takes +#' `NULL` during training and [PredictionSurv] during prediction. +#' +#' [PipeOpResponseCompositor] has one output channel named `"output"`, producing `NULL` during training +#' and a [PredictionSurv] during prediction. +#' +#' The output during prediction is the [PredictionSurv] from the input but with the `response` +#' predict type overwritten by the given method. +#' +#' @section State: +#' The `$state` is left empty (`list()`). +#' +#' @section Parameters: +#' - `method` :: `character(1)` \cr +#' Determines what method should be used to produce a survival time (response) from the survival distribution. +#' Available methods are `"rmst"` and `"median"`, corresponding to the *restricted mean survival time* and the *median survival time* respectively. +#' - `cutoff_time` :: `numeric(1)` \cr +#' Determines the time point up to which we calculate the restricted mean survival time (works only for the `"rmst"` method). +#' If `NULL` (default), all the available time points in the predicted survival distribution will be used. +#' * `add_crank` :: `logical(1)` \cr +#' If `TRUE` then `crank` predict type will be set as `-response` (as higher survival times correspond to lower risk). +#' Works only if `overwrite` is `TRUE`. +#' * `overwrite` :: `logical(1)` \cr +#' If `FALSE` (default) and the prediction already has a `response` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `response` (and the `crank`, if `add_crank` is `TRUE`) will be overwritten. +#' +#' @section Internals: +#' The restricted mean survival time is the default/preferred method and is calculated as follows: +#' \deqn{T_{i,rmst} \approx \sum_{t_j \in [0,\tau]} (t_j - t_{j-1}) S_i(t_j)} +#' +#' where \eqn{T} is the expected survival time, \eqn{\tau} is the time cutoff and \eqn{S_i(t_j)} are the predicted survival probabilities of observation \eqn{i} for all the \eqn{t_j} time points. +#' +#' The \eqn{T_{i,median}} survival time is just the first time point for which the survival probability is less than \eqn{0.5}. +#' If no such time point exists (e.g. when the survival distribution is not proper due to high censoring) we return the last time point. +#' This is **not a good estimate to use in general**, only a reasonable substitute in such cases. +#' +#' @references +#' `r format_bib("zhao_2016")` +#' +#' @seealso [pipeline_crankcompositor] +#' @family survival compositors +#' @examples +#' \dontrun{ +#' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { +#' library(mlr3pipelines) +#' task = tsk("rats") +#' +#' # add survival time prediction type to the predictions of a Cox model +#' # Median survival time as response +#' pred = lrn("surv.coxph")$train(task)$predict(task) +#' por = po("responsecompose", param_vals = list(method = "median", overwrite = TRUE)) +#' por$predict(list(pred))[[1L]] +#' # mostly improper survival distributions, "median" sets the survival time +#' # to the last time point +#' +#' # RMST (default) as response, while also changing the crank = -response +#' por = po("responsecompose", param_vals = list(overwrite = TRUE, add_crank = TRUE)) +#' por$predict(list(pred))[[1L]] +#' } +#' } +#' @export +PipeOpResponseCompositor = R6Class("PipeOpResponseCompositor", + inherit = mlr3pipelines::PipeOp, + public = list( + #' @description + #' Creates a new instance of this [R6][R6::R6Class] class. + initialize = function(id = "responsecompose", param_vals = list()) { + param_set = ps( + method = p_fct(default = "rmst", levels = c("rmst", "median"), tags = "predict"), + cutoff_time = p_dbl(0, default = NULL, special_vals = list(NULL), tags = "predict"), + add_crank = p_lgl(default = FALSE, tags = "predict"), + overwrite = p_lgl(default = FALSE, tags = "predict") + ) + + param_set$set_values(method = "rmst", add_crank = FALSE, overwrite = FALSE) + + super$initialize( + id = id, + param_set = param_set, + param_vals = param_vals, + input = data.table(name = "input", train = "NULL", predict = "PredictionSurv"), + output = data.table(name = "output", train = "NULL", predict = "PredictionSurv"), + packages = c("mlr3proba") + ) + } + ), + + private = list( + .train = function(inputs) { + self$state = list() + list(NULL) + }, + + .predict = function(inputs) { + pred = inputs[[1L]] + overwrite = self$param_set$values$overwrite + has_response = !is.null(pred$response) + + if (!overwrite & has_response) { + # return prediction as is + return(list(pred)) + } else { + # compose response (survival time) from distr prediction + assert("distr" %in% pred$predict_types) + + # get survival matrix + if (inherits(pred$data$distr, "array")) { + surv = pred$data$distr + if (length(dim(surv)) == 3L) { + # survival 3d array, extract median + surv = .ext_surv_mat(arr = surv, which.curve = 0.5) + } + } else { + stop("Distribution prediction does not have a survival matrix or array + in the $data$distr slot") + } + + # time points + times = as.numeric(colnames(surv)) + + method = self$param_set$values$method + if (method == "rmst") { + cutoff_time = self$param_set$values$cutoff_time + within_range = !is.null(cutoff_time) && + cutoff_time <= max(times) && + cutoff_time >= min(times) + if (within_range) { + # subset survival matrix and times + surv = surv[, times <= cutoff_time, drop = FALSE] + times = times[times <= cutoff_time] + } + + # calculate the restricted mean survival time + weights = c(times[1], diff(times)) # time intervals as weights + response = apply(surv, 1, function(surv_probs) { + sum(weights * surv_probs) + }) + } else { # median + t_max = tail(times, 1) # last time point + response = apply(surv, 1, function(surv_probs) { + # first survival probability that is < 0.5 + median_time_index = match(TRUE, surv_probs < 0.5) + if (!is.na(median_time_index)) times[median_time_index] else t_max + }) + } + + add_crank = self$param_set$values$add_crank + if (add_crank) { + # higher risk, lower survival time + crank = -response + } else { + # otherwise, leave crank unchanged + crank = pred$crank + } + + # update only `response` (and maybe `crank`) + p = PredictionSurv$new( + row_ids = pred$row_ids, + truth = pred$truth, + crank = crank, + distr = pred$distr, + lp = pred$lp, + response = response + ) + + return(list(p)) + } + } + ) +) + +register_pipeop("responsecompose", PipeOpResponseCompositor) diff --git a/R/PipeOpSurvAvg.R b/R/PipeOpSurvAvg.R index fddaf95c8..12289e3b8 100644 --- a/R/PipeOpSurvAvg.R +++ b/R/PipeOpSurvAvg.R @@ -72,21 +72,22 @@ PipeOpSurvAvg = R6Class("PipeOpSurvAvg", private = list( weighted_avg_predictions = function(inputs, weights, row_ids, truth) { response_matrix = map(inputs, "response") - if (some(response_matrix, anyMissing)) { + + if (some(response_matrix, is.null)) { response = NULL } else { response = c(simplify2array(response_matrix) %*% weights) } crank_matrix = map(inputs, "crank") - if (some(crank_matrix, anyMissing)) { + if (some(crank_matrix, is.null)) { crank = NULL } else { crank = c(simplify2array(crank_matrix) %*% weights) } lp_matrix = map(inputs, "lp") - if (some(lp_matrix, anyMissing)) { + if (some(lp_matrix, is.null)) { lp = NULL } else { lp = c(simplify2array(lp_matrix) %*% weights) diff --git a/R/PredictionSurv.R b/R/PredictionSurv.R index 9732ccc5f..83e72662f 100644 --- a/R/PredictionSurv.R +++ b/R/PredictionSurv.R @@ -90,8 +90,8 @@ PredictionSurv = R6Class("PredictionSurv", distr = private$.simplify_distr(distr) } - pdata = list(row_ids = row_ids, truth = truth, crank = crank, distr = distr, lp = lp, - response = response) + pdata = list(row_ids = row_ids, truth = truth, crank = crank, distr = distr, + lp = lp, response = response) pdata = discard(pdata, is.null) class(pdata) = c("PredictionDataSurv", "PredictionData") @@ -108,7 +108,7 @@ PredictionSurv = R6Class("PredictionSurv", active = list( #' @field truth (`Surv`)\cr - #' True (observed) outcome. + #' True (observed) outcome. truth = function() { self$data$truth }, @@ -126,19 +126,19 @@ PredictionSurv = R6Class("PredictionSurv", return(self$data$distr) } - private$.distrify_survarray(self$data$distr %??% NA_real_) + private$.distrify_survarray(self$data$distr) }, #' @field lp (`numeric()`)\cr #' Access the stored predicted linear predictor. lp = function() { - self$data$lp %??% rep(NA_real_, length(self$data$row_ids)) + self$data$lp }, #' @field response (`numeric()`)\cr #' Access the stored predicted survival time. response = function() { - self$data$response %??% rep(NA_real_, length(self$data$row_ids)) + self$data$response } ), diff --git a/R/RcppExports.R b/R/RcppExports.R index a9a0a21ad..34d44a152 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,10 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +c_assert_surv <- function(mat) { + .Call(`_mlr3proba_c_assert_surv`, mat) +} + .c_get_unique_times <- function(true_times, req_times) { .Call(`_mlr3proba_c_get_unique_times`, true_times, req_times) } diff --git a/R/assertions.R b/R/assertions.R index 6030e997c..3a9dc7a69 100644 --- a/R/assertions.R +++ b/R/assertions.R @@ -10,3 +10,44 @@ assert_surv = function(x, len = NULL, any.missing = TRUE, null.ok = FALSE, .var. assert_class(x, "Surv", null.ok = null.ok, .var.name = .var.name) assert_matrix(x, any.missing = any.missing, nrows = len, null.ok = null.ok, .var.name = .var.name) } + +#' @title Assert survival matrix +#' +#' @description Asserts if the given input matrix is a (discrete) survival +#' probabilities matrix using [Rcpp] code. +#' The following checks are performed: +#' +#' 1. All values are probabilities, i.e. \eqn{S(t) \in [0,1]} +#' 2. Column names correspond to time-points and should therefore be coercable to +#' `numeric` and increasing +#' 3. Per row/observation, the survival probabilities decrease non-strictly, i.e. +#' \eqn{S(t) \ge S(t+1)} +#' +#' @param x (`matrix()`)\cr +#' A matrix of (predicted) survival probabilities. +#' Rows are observations, columns are (increasing) time points. +#' +#' @return if the assertion fails an error occurs, otherwise `NULL` is returned +#' invisibly. +#' +#' @examples +#' x = matrix(data = c(1,0.6,0.4,0.8,0.8,0.7), nrow = 2, ncol = 3, byrow = TRUE) +#' colnames(x) = c(12, 34, 42) +#' x +#' +#' assert_surv_matrix(x) +#' +#' @export +assert_surv_matrix = function(x) { + times = assert_numeric(as.numeric(colnames(x)), any.missing = FALSE) + + if (is.null(times) || !identical(order(times), seq(ncol(x)))) { + stop("Survival matrix column names must be increasing numeric (time points)") + } + + if (!c_assert_surv(x)) { + stop("Survival probabilities must be (non-strictly) decreasing and between [0, 1]") + } + + invisible(NULL) +} diff --git a/R/bibentries.R b/R/bibentries.R index f33899ba0..11c5ccb87 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -1,679 +1,679 @@ bibentries = c( # nolint start aalen_1978 = bibentry("article", - title = "Nonparametric Inference for a Family of Counting Processes", - author = "Odd Aalen", - year = 1978, - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 6, - number = 4, - pages = "701--726", - url = "https://www.jstor.org/stable/2958850" + title = "Nonparametric Inference for a Family of Counting Processes", + author = "Odd Aalen", + year = 1978, + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 6, + number = 4, + pages = "701--726", + url = "https://www.jstor.org/stable/2958850" ), begg_2000 = bibentry("article", - title = "Comparing tumour staging and grading systems: a case study and a review of the issues, using thymoma as a model", - author = "Colin B. Begg and Laura D. Cramer and E. S. Venkatraman and Juan Rosai", - year = 2000, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 19, - number = 15, - pages = "1997--2014", - doi = "10.1002/1097-0258(20000815)19:15<1997::aid-sim511>3.0.co;2-c" + title = "Comparing tumour staging and grading systems: a case study and a review of the issues, using thymoma as a model", + author = "Colin B. Begg and Laura D. Cramer and E. S. Venkatraman and Juan Rosai", + year = 2000, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 19, + number = 15, + pages = "1997--2014", + doi = "10.1002/1097-0258(20000815)19:15<1997::aid-sim511>3.0.co;2-c" ), breiman_1984 = bibentry("book", - title = "Classification And Regression Trees", - author = "Leo Breiman and Jerome H. Friedman and Richard A. Olshen and Charles J. Stone", - year = 1984, - publisher = "Routledge", - doi = "10.1201/9781315139470" + title = "Classification And Regression Trees", + author = "Leo Breiman and Jerome H. Friedman and Richard A. Olshen and Charles J. Stone", + year = 1984, + publisher = "Routledge", + doi = "10.1201/9781315139470" ), breiman_2001 = bibentry("article", - title = "Random Forests", - author = "Breiman, Leo", - year = 2001, - journal = "Machine Learning", - volume = 45, - number = 1, - pages = "5--32", - doi = "10.1023/A:1010933404324", + title = "Random Forests", + author = "Breiman, Leo", + year = 2001, + journal = "Machine Learning", + volume = 45, + number = 1, + pages = "5--32", + doi = "10.1023/A:1010933404324", ), buehlmann_2003 = bibentry("article", - title = "Boosting With the L2 Loss", - author = "Peter B\u00fchlmann and Bin Yu", - year = 2003, - month = "jun", - journal = "Journal of the American Statistical Association", - publisher = "Informa {UK} Limited", - volume = 98, - number = 462, - pages = "324--339", - doi = "10.1198/016214503000125" + title = "Boosting With the L2 Loss", + author = "Peter B\u00fchlmann and Bin Yu", + year = 2003, + month = "jun", + journal = "Journal of the American Statistical Association", + publisher = "Informa {UK} Limited", + volume = 98, + number = 462, + pages = "324--339", + doi = "10.1198/016214503000125" ), buehlmann_2006 = bibentry("article", - title = "Boosting for High-Dimensional Linear Models", - author = "Peter B\u00fchlmann", - year = 2006, - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 34, - number = 2, - pages = "559--583", - url = "https://www.jstor.org/stable/25463430" + title = "Boosting for High-Dimensional Linear Models", + author = "Peter B\u00fchlmann", + year = 2006, + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 34, + number = 2, + pages = "559--583", + url = "https://www.jstor.org/stable/25463430" ), buehlmann_2007 = bibentry("article", - title = "Boosting Algorithms: Regularization, Prediction and Model Fitting", - author = "Peter B\u00fchlmann and Torsten Hothorn", - year = 2007, - month = "nov", - journal = "Statistical Science", - publisher = "Institute of Mathematical Statistics", - volume = 22, - number = 4, - pages = "477--505", - doi = "10.1214/07-sts242" + title = "Boosting Algorithms: Regularization, Prediction and Model Fitting", + author = "Peter B\u00fchlmann and Torsten Hothorn", + year = 2007, + month = "nov", + journal = "Statistical Science", + publisher = "Institute of Mathematical Statistics", + volume = 22, + number = 4, + pages = "477--505", + doi = "10.1214/07-sts242" ), burges_2010 = bibentry("techreport", - title = "From RankNet to LambdaRank to LambdaMART: An Overview", - author = "Burges, Chris J.C.", - year = 2010, - month = "jun", - number = "MSR-TR-2010-82", - url = "https://www.microsoft.com/en-us/research/publication/from-ranknet-to-lambdarank-to-lambdamart-an-overview/", + title = "From RankNet to LambdaRank to LambdaMART: An Overview", + author = "Burges, Chris J.C.", + year = 2010, + month = "jun", + number = "MSR-TR-2010-82", + url = "https://www.microsoft.com/en-us/research/publication/from-ranknet-to-lambdarank-to-lambdamart-an-overview/", institution = "Microsoft Research" ), chambless_2006 = bibentry("article", - title = "Estimation of time-dependent area under the {ROC} curve for long-term risk prediction", - author = "Lloyd E. Chambless and Guoqing Diao", - year = 2006, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 25, - number = 20, - pages = "3474--3486", - doi = "10.1002/sim.2299" + title = "Estimation of time-dependent area under the {ROC} curve for long-term risk prediction", + author = "Lloyd E. Chambless and Guoqing Diao", + year = 2006, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 25, + number = 20, + pages = "3474--3486", + doi = "10.1002/sim.2299" ), cox_1972 = bibentry("article", - title = "Regression Models and Life-Tables", - author = "D. R. Cox", - year = 1972, - month = "jan", - journal = "Journal of the Royal Statistical Society: Series B (Methodological)", - publisher = "Wiley", - volume = 34, - number = 2, - pages = "187--202", - doi = "10.1111/j.2517-6161.1972.tb00899.x" + title = "Regression Models and Life-Tables", + author = "D. R. Cox", + year = 1972, + month = "jan", + journal = "Journal of the Royal Statistical Society: Series B (Methodological)", + publisher = "Wiley", + volume = 34, + number = 2, + pages = "187--202", + doi = "10.1111/j.2517-6161.1972.tb00899.x" ), freund_1996 = bibentry("inproceedings", - title = "Experiments with a new boosting algorithm", - author = "Freund, Yoav and Schapire, Robert E and others", - year = 1996, - booktitle = "In: Thirteenth International Conference on ML", - volume = 96, - pages = "148--156", + title = "Experiments with a new boosting algorithm", + author = "Freund, Yoav and Schapire, Robert E and others", + year = 1996, + booktitle = "In: Thirteenth International Conference on ML", + volume = 96, + pages = "148--156", organization = "Citeseer" ), freund_1997 = bibentry("article", - title = "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting", - author = "Yoav Freund and Robert E Schapire", - year = 1997, - month = "aug", - journal = "Journal of Computer and System Sciences", - publisher = "Elsevier {BV}", - volume = 55, - number = 1, - pages = "119--139", - doi = "10.1006/jcss.1997.1504" + title = "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting", + author = "Yoav Freund and Robert E Schapire", + year = 1997, + month = "aug", + journal = "Journal of Computer and System Sciences", + publisher = "Elsevier {BV}", + volume = 55, + number = 1, + pages = "119--139", + doi = "10.1006/jcss.1997.1504" ), friedman_2000 = bibentry("article", - title = "Additive logistic regression: a statistical view of boosting (With discussion and a rejoinder by the authors)", - author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", - year = 2000, - month = "apr", - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 28, - number = 2, - pages = "337--407", - doi = "10.1214/aos/1016218223" + title = "Additive logistic regression: a statistical view of boosting (With discussion and a rejoinder by the authors)", + author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", + year = 2000, + month = "apr", + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 28, + number = 2, + pages = "337--407", + doi = "10.1214/aos/1016218223" ), friedman_2001 = bibentry("article", - title = "Greedy Function Approximation: A Gradient Boosting Machine", - author = "Jerome H. Friedman", - year = 2001, - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 29, - number = 5, - pages = "1189--1232", - url = "https://www.jstor.org/stable/2699986" + title = "Greedy Function Approximation: A Gradient Boosting Machine", + author = "Jerome H. Friedman", + year = 2001, + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 29, + number = 5, + pages = "1189--1232", + url = "https://www.jstor.org/stable/2699986" ), friedman_2002 = bibentry("article", - title = "Stochastic gradient boosting", - author = "Jerome H. Friedman", - year = 2002, - month = "feb", - journal = "Computational Statistics & Data Analysis", - publisher = "Elsevier {BV}", - volume = 38, - number = 4, - pages = "367--378", - doi = "10.1016/s0167-9473(01)00065-2" + title = "Stochastic gradient boosting", + author = "Jerome H. Friedman", + year = 2002, + month = "feb", + journal = "Computational Statistics & Data Analysis", + publisher = "Elsevier {BV}", + volume = 38, + number = 4, + pages = "367--378", + doi = "10.1016/s0167-9473(01)00065-2" ), friedman_2010 = bibentry("article", - title = "Regularization Paths for Generalized Linear Models via Coordinate Descent", - author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", - year = 2010, - journal = "Journal of Statistical Software", - publisher = "Foundation for Open Access Statistic", - volume = 33, - number = 1, - doi = "10.18637/jss.v033.i01" + title = "Regularization Paths for Generalized Linear Models via Coordinate Descent", + author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", + year = 2010, + journal = "Journal of Statistical Software", + publisher = "Foundation for Open Access Statistic", + volume = 33, + number = 1, + doi = "10.18637/jss.v033.i01" ), goeman_2009 = bibentry("article", - title = "L1Penalized Estimation in the Cox Proportional Hazards Model", - author = "Jelle J. Goeman", - year = 2009, - month = "nov", - journal = "Biometrical Journal", - publisher = "Wiley", - pages = "NA--NA", - doi = "10.1002/bimj.200900028" + title = "L1Penalized Estimation in the Cox Proportional Hazards Model", + author = "Jelle J. Goeman", + year = 2009, + month = "nov", + journal = "Biometrical Journal", + publisher = "Wiley", + pages = "NA--NA", + doi = "10.1002/bimj.200900028" ), goenen_2005 = bibentry("article", - title = "Concordance probability and discriminatory power in proportional hazards regression", - author = "Mithat Goenen and Glenn Heller", - year = 2005, - month = "dec", - journal = "Biometrika", - publisher = "Oxford University Press ({OUP})", - volume = 92, - number = 4, - pages = "965--970", - doi = "10.1093/biomet/92.4.965" + title = "Concordance probability and discriminatory power in proportional hazards regression", + author = "Mithat Goenen and Glenn Heller", + year = 2005, + month = "dec", + journal = "Biometrika", + publisher = "Oxford University Press ({OUP})", + volume = 92, + number = 4, + pages = "965--970", + doi = "10.1093/biomet/92.4.965" ), graf_1999 = bibentry("article", - title = "Assessment and comparison of prognostic classification schemes for survival data", - author = "Erika Graf and Claudia Schmoor and Willi Sauerbrei and Martin Schumacher", - year = 1999, - month = "sep", - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 18, - number = "17-18", - pages = "2529--2545", - doi = "10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5" + title = "Assessment and comparison of prognostic classification schemes for survival data", + author = "Erika Graf and Claudia Schmoor and Willi Sauerbrei and Martin Schumacher", + year = 1999, + month = "sep", + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 18, + number = "17-18", + pages = "2529--2545", + doi = "10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5" ), harrell_1982 = bibentry("article", - title = "Evaluating the yield of medical tests", - author = "Harrell, Frank E and Califf, Robert M and Pryor, David B and Lee, Kerry L and Rosati, Robert A", - year = 1982, - journal = "Jama", - publisher = "American Medical Association", - volume = 247, - number = 18, - pages = "2543--2546" + title = "Evaluating the yield of medical tests", + author = "Harrell, Frank E and Califf, Robert M and Pryor, David B and Lee, Kerry L and Rosati, Robert A", + year = 1982, + journal = "Jama", + publisher = "American Medical Association", + volume = 247, + number = 18, + pages = "2543--2546" ), hofner_2012 = bibentry("article", - title = "Model-based boosting in R: a hands-on tutorial using the R package mboost", - author = "Benjamin Hofner and Andreas Mayr and Nikolay Robinzonov and Matthias Schmid", - year = 2012, - month = "dec", - journal = "Computational Statistics", - publisher = "Springer Science and Business Media {LLC}", - volume = 29, - number = "1-2", - pages = "3--35", - doi = "10.1007/s00180-012-0382-5" + title = "Model-based boosting in R: a hands-on tutorial using the R package mboost", + author = "Benjamin Hofner and Andreas Mayr and Nikolay Robinzonov and Matthias Schmid", + year = 2012, + month = "dec", + journal = "Computational Statistics", + publisher = "Springer Science and Business Media {LLC}", + volume = 29, + number = "1-2", + pages = "3--35", + doi = "10.1007/s00180-012-0382-5" ), hothorn_2006 = bibentry("article", - title = "Unbiased Recursive Partitioning: A Conditional Inference Framework", - author = "Torsten Hothorn and Kurt Hornik and Achim Zeileis", - year = 2006, - month = "sep", - journal = "Journal of Computational and Graphical Statistics", - publisher = "Informa {UK} Limited", - volume = 15, - number = 3, - pages = "651--674", - doi = "10.1198/106186006x133933" + title = "Unbiased Recursive Partitioning: A Conditional Inference Framework", + author = "Torsten Hothorn and Kurt Hornik and Achim Zeileis", + year = 2006, + month = "sep", + journal = "Journal of Computational and Graphical Statistics", + publisher = "Informa {UK} Limited", + volume = 15, + number = 3, + pages = "651--674", + doi = "10.1198/106186006x133933" ), hothorn_2010 = bibentry("article", - title = "Model-based boosting 2.0", - author = "Hothorn, Torsten and B\u00fchlmann, Peter and Kneib, Thomas and Schmid, Matthias and Hofner, Benjamin", - year = 2010, - journal = "Journal of Machine Learning Research", - volume = 11, - number = "Aug", - pages = "2109--2113", - url = "https://www.jmlr.org/papers/v11/hothorn10a.html" + title = "Model-based boosting 2.0", + author = "Hothorn, Torsten and B\u00fchlmann, Peter and Kneib, Thomas and Schmid, Matthias and Hofner, Benjamin", + year = 2010, + journal = "Journal of Machine Learning Research", + volume = 11, + number = "Aug", + pages = "2109--2113", + url = "https://www.jmlr.org/papers/v11/hothorn10a.html" ), hung_2010 = bibentry("article", - title = "Estimation methods for time-dependent AUC models with survival data", - author = "Hung Hung and Chin-Tsang Chiang", - year = 2010, - journal = "The Canadian Journal of Statistics / La Revue Canadienne de Statistique", - publisher = "Statistical Society of Canada, Wiley", - volume = 38, - number = 1, - pages = "8--26", - url = "https://www.jstor.org/stable/27805213" + title = "Estimation methods for time-dependent AUC models with survival data", + author = "Hung Hung and Chin-Tsang Chiang", + year = 2010, + journal = "The Canadian Journal of Statistics / La Revue Canadienne de Statistique", + publisher = "Statistical Society of Canada, Wiley", + volume = 38, + number = 1, + pages = "8--26", + url = "https://www.jstor.org/stable/27805213" ), ishwaran_2008 = bibentry("article", - title = "Random survival forests", - author = "Ishwaran, Hemant and Kogalur, Udaya B and Blackstone, Eugene H and Lauer, Michael S and others", - year = 2008, - journal = "The annals of applied statistics", - publisher = "Institute of Mathematical Statistics", - volume = 2, - number = 3, - pages = "841--860" + title = "Random survival forests", + author = "Ishwaran, Hemant and Kogalur, Udaya B and Blackstone, Eugene H and Lauer, Michael S and others", + year = 2008, + journal = "The Annals of applied statistics", + publisher = "Institute of Mathematical Statistics", + volume = 2, + number = 3, + pages = "841--860" ), kalbfleisch_2002 = bibentry("book", - title = "The Statistical Analysis of Failure Time Data", - author = "John D. Kalbfleisch and Ross L. Prentice", - year = 2002, - month = "aug", - publisher = "John Wiley & Sons, Inc.", - doi = "10.1002/9781118032985" + title = "The Statistical Analysis of Failure Time Data", + author = "John D. Kalbfleisch and Ross L. Prentice", + year = 2002, + month = "aug", + publisher = "John Wiley & Sons, Inc.", + doi = "10.1002/9781118032985" ), kaplan_1958 = bibentry("article", - title = "Nonparametric Estimation from Incomplete Observations", - author = "E. L. Kaplan and Paul Meier", - year = 1958, - month = "jun", - journal = "Journal of the American Statistical Association", - publisher = "Informa {UK} Limited", - volume = 53, - number = 282, - pages = "457--481", - doi = "10.1080/01621459.1958.10501452" + title = "Nonparametric Estimation from Incomplete Observations", + author = "E. L. Kaplan and Paul Meier", + year = 1958, + month = "jun", + journal = "Journal of the American Statistical Association", + publisher = "Informa {UK} Limited", + volume = 53, + number = 282, + pages = "457--481", + doi = "10.1080/01621459.1958.10501452" ), kneib_2008 = bibentry("article", - title = "Variable Selection and Model Choice in Geoadditive Regression Models", - author = "Thomas Kneib and Torsten Hothorn and Gerhard Tutz", - year = 2008, - month = "aug", - journal = "Biometrics", - publisher = "Wiley", - volume = 65, - number = 2, - pages = "626--634", - doi = "10.1111/j.1541-0420.2008.01112.x" + title = "Variable Selection and Model Choice in Geoadditive Regression Models", + author = "Thomas Kneib and Torsten Hothorn and Gerhard Tutz", + year = 2008, + month = "aug", + journal = "Biometrics", + publisher = "Wiley", + volume = 65, + number = 2, + pages = "626--634", + doi = "10.1111/j.1541-0420.2008.01112.x" ), kriegler_2007 = bibentry("phdthesis", - title = "Cost-Sensitive Stochastic Gradient Boosting Within a Quantitative Regression Framework", - author = "Kriegler, Brian", - year = 2007, - url = "https://dl.acm.org/citation.cfm?id=1354603", - school = "University of California at Los Angeles" + title = "Cost-Sensitive Stochastic Gradient Boosting Within a Quantitative Regression Framework", + author = "Kriegler, Brian", + year = 2007, + url = "https://dl.acm.org/citation.cfm?id=1354603", + school = "University of California at Los Angeles" ), nagelkerke_1991 = bibentry("article", - title = "A note on a general definition of the coefficient of determination", - author = "Nagelkerke, Nico JD and others", - year = 1991, - journal = "Biometrika", - publisher = "Oxford University Press", - volume = 78, - number = 3, - pages = "691--692" + title = "A note on a general definition of the coefficient of determination", + author = "Nagelkerke, Nico JD and others", + year = 1991, + journal = "Biometrika", + publisher = "Oxford University Press", + volume = 78, + number = 3, + pages = "691--692" ), nelson_1969 = bibentry("article", - title = "Hazard Plotting for Incomplete Failure Data", - author = "Wayne Nelson", - year = 1969, - month = "jan", - journal = "Journal of Quality Technology", - publisher = "Informa {UK} Limited", - volume = 1, - number = 1, - pages = "27--52", - doi = "10.1080/00224065.1969.11980344" + title = "Hazard Plotting for Incomplete Failure Data", + author = "Wayne Nelson", + year = 1969, + month = "jan", + journal = "Journal of Quality Technology", + publisher = "Informa {UK} Limited", + volume = 1, + number = 1, + pages = "27--52", + doi = "10.1080/00224065.1969.11980344" ), nelson_1972 = bibentry("article", - title = "Theory and Applications of Hazard Plotting for Censored Failure Data", - author = "Wayne Nelson", - year = 1972, - month = "nov", - journal = "Technometrics", - publisher = "Informa {UK} Limited", - volume = 14, - number = 4, - pages = "945--966", - doi = "10.1080/00401706.1972.10488991" + title = "Theory and Applications of Hazard Plotting for Censored Failure Data", + author = "Wayne Nelson", + year = 1972, + month = "nov", + journal = "Technometrics", + publisher = "Informa {UK} Limited", + volume = 14, + number = 4, + pages = "945--966", + doi = "10.1080/00401706.1972.10488991" ), oquigley_2005 = bibentry("article", - title = "Explained randomness in proportional hazards models", - author = "John O'Quigley and Ronghui Xu and Janez Stare", - year = 2005, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 24, - number = 3, - pages = "479--489", - doi = "10.1002/sim.1946" + title = "Explained randomness in proportional hazards models", + author = "John O'Quigley and Ronghui Xu and Janez Stare", + year = 2005, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 24, + number = 3, + pages = "479--489", + doi = "10.1002/sim.1946" ), ridgeway_1999 = bibentry("article", - title = "The state of boosting", - author = "Ridgeway, Greg", - year = 1999, - journal = "Computing Science and Statistics", - number = 31, - pages = "172--181" + title = "The state of boosting", + author = "Ridgeway, Greg", + year = 1999, + journal = "Computing Science and Statistics", + number = 31, + pages = "172--181" ), royston_2002 = bibentry("article", - title = "Flexible parametric proportional-hazards and proportional-odds models for censored survival data, with application to prognostic modelling and estimation of treatment effects", - author = "Patrick Royston and Mahesh K. B. Parmar", - year = 2002, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 21, - number = 15, - pages = "2175--2197", - doi = "10.1002/sim.1203" + title = "Flexible parametric proportional-hazards and proportional-odds models for censored survival data, with application to prognostic modelling and estimation of treatment effects", + author = "Patrick Royston and Mahesh K. B. Parmar", + year = 2002, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 21, + number = 15, + pages = "2175--2197", + doi = "10.1002/sim.1203" ), schmid_2008 = bibentry("article", - title = "Boosting additive models using component-wise P-Splines", - author = "Matthias Schmid and Torsten Hothorn", - year = 2008, - month = "dec", - journal = "Computational Statistics & Data Analysis", - publisher = "Elsevier {BV}", - volume = 53, - number = 2, - pages = "298--311", - doi = "10.1016/j.csda.2008.09.009" + title = "Boosting additive models using component-wise P-Splines", + author = "Matthias Schmid and Torsten Hothorn", + year = 2008, + month = "dec", + journal = "Computational Statistics & Data Analysis", + publisher = "Elsevier {BV}", + volume = 53, + number = 2, + pages = "298--311", + doi = "10.1016/j.csda.2008.09.009" ), song_2008 = bibentry("article", - title = "A semiparametric approach for the covariate specific ROC curve with survival outcome", - author = "Song, Xiao and Zhou, Xiao-Hua", - year = 2008, - journal = "Statistica Sinica", - volume = 18, - number = 3, - pages = "947--65", - url = "https://www.jstor.org/stable/24308524" + title = "A semiparametric approach for the covariate specific ROC curve with survival outcome", + author = "Song, Xiao and Zhou, Xiao-Hua", + year = 2008, + journal = "Statistica Sinica", + volume = 18, + number = 3, + pages = "947--65", + url = "https://www.jstor.org/stable/24308524" ), uno_2007 = bibentry("article", - title = "Evaluating Prediction Rules fort-Year Survivors With Censored Regression Models", - author = "Hajime Uno and Tianxi Cai and Lu Tian and L. J Wei", - year = 2007, - month = "jun", - journal = "Journal of the American Statistical Association", - publisher = "Informa {UK} Limited", - volume = 102, - number = 478, - pages = "527--537", - doi = "10.1198/016214507000000149" + title = "Evaluating Prediction Rules fort-Year Survivors With Censored Regression Models", + author = "Hajime Uno and Tianxi Cai and Lu Tian and L. J Wei", + year = 2007, + month = "jun", + journal = "Journal of the American Statistical Association", + publisher = "Informa {UK} Limited", + volume = 102, + number = 478, + pages = "527--537", + doi = "10.1198/016214507000000149" ), uno_2011 = bibentry("article", - title = "On the C-statistics for evaluating overall adequacy of risk prediction procedures with censored survival data", - author = "Hajime Uno and Tianxi Cai and Michael J. Pencina and Ralph B. D'Agostino and L. J. Wei", - year = 2011, - journal = "Statistics in Medicine", - publisher = "Wiley", - pages = "n/a--n/a", - doi = "10.1002/sim.4154" + title = "On the C-statistics for evaluating overall adequacy of risk prediction procedures with censored survival data", + author = "Hajime Uno and Tianxi Cai and Michael J. Pencina and Ralph B. D'Agostino and L. J. Wei", + year = 2011, + journal = "Statistics in Medicine", + publisher = "Wiley", + pages = "n/a--n/a", + doi = "10.1002/sim.4154" ), vanbelle_2010 = bibentry("article", - title = "Improved performance on high-dimensional survival data by application of Survival-{SVM}", - author = "V. Van Belle and K. Pelckmans and S. Van Huffel and J. A. K. Suykens", - year = 2010, - month = "nov", - journal = "Bioinformatics", - publisher = "Oxford University Press ({OUP})", - volume = 27, - number = 1, - pages = "87--94", - doi = "10.1093/bioinformatics/btq617" + title = "Improved performance on high-dimensional survival data by application of Survival-{SVM}", + author = "V. Van Belle and K. Pelckmans and S. Van Huffel and J. A. K. Suykens", + year = 2010, + month = "nov", + journal = "Bioinformatics", + publisher = "Oxford University Press ({OUP})", + volume = 27, + number = 1, + pages = "87--94", + doi = "10.1093/bioinformatics/btq617" ), vanbelle_2011 = bibentry("article", - title = "Support vector methods for survival analysis: a comparison between ranking and regression approaches", - author = "Vanya Van Belle and Kristiaan Pelckmans and Sabine Van Huffel and Johan A.K. Suykens", - year = 2011, - month = "oct", - journal = "Artificial Intelligence in Medicine", - publisher = "Elsevier {BV}", - volume = 53, - number = 2, - pages = "107--118", - doi = "10.1016/j.artmed.2011.06.006" + title = "Support vector methods for survival analysis: a comparison between ranking and regression approaches", + author = "Vanya Van Belle and Kristiaan Pelckmans and Sabine Van Huffel and Johan A.K. Suykens", + year = 2011, + month = "oct", + journal = "Artificial Intelligence in Medicine", + publisher = "Elsevier {BV}", + volume = 53, + number = 2, + pages = "107--118", + doi = "10.1016/j.artmed.2011.06.006" ), wright_2017 = bibentry("article", - title = "ranger: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}", - author = "Wright, Marvin N. and Ziegler, Andreas", - year = 2017, - journal = "Journal of Statistical Software", - volume = 77, - number = 1, - pages = "1--17", - doi = "10.18637/jss.v077.i01" + title = "ranger: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}", + author = "Wright, Marvin N. and Ziegler, Andreas", + year = 2017, + journal = "Journal of Statistical Software", + volume = 77, + number = 1, + pages = "1--17", + doi = "10.18637/jss.v077.i01" ), xu_1999 = bibentry("article", - title = "A R2 type measure of dependence for proportional hazards models", - author = "Ronghui Xu and John O'Quigley", - year = 1999, - month = "jan", - journal = "Journal of Nonparametric Statistics", - publisher = "Informa {UK} Limited", - volume = 12, - number = 1, - pages = "83--107", - doi = "10.1080/10485259908832799" + title = "A R2 type measure of dependence for proportional hazards models", + author = "Ronghui Xu and John O'Quigley", + year = 1999, + month = "jan", + journal = "Journal of Nonparametric Statistics", + publisher = "Informa {UK} Limited", + volume = 12, + number = 1, + pages = "83--107", + doi = "10.1080/10485259908832799" ), jaeger_2019 = bibentry("article", - title = "Oblique random survival forests", - volume = "13", - url = "https://projecteuclid.org/euclid.aoas/1571277776", - doi = "10.1214/19-AOAS1261", - language = "EN", - number = "3", - urldate = "2019-10-19", - journal = "The Annals of Applied Statistics", - author = "Jaeger, Byron C. and Long, D. Leann and Long, Dustin M. and Sims, Mario and Szychowski, Jeff M. and Min, Yuan-I. and Mcclure, Leslie A. and Howard, George and Simon, Noah", - month = "sep", - year = "2019", - pages = "1847--1883" + title = "Oblique random survival forests", + volume = "13", + url = "https://projecteuclid.org/euclid.aoas/1571277776", + doi = "10.1214/19-AOAS1261", + language = "EN", + number = "3", + urldate = "2019-10-19", + journal = "The Annals of Applied Statistics", + author = "Jaeger, Byron C. and Long, D. Leann and Long, Dustin M. and Sims, Mario and Szychowski, Jeff M. and Min, Yuan-I. and Mcclure, Leslie A. and Howard, George and Simon, Noah", + month = "sep", + year = "2019", + pages = "1847--1883" ), silverman_1986 = bibentry("book", - address = "London", - author = "Silverman, B. W.", - biburl = "https://www.bibsonomy.org/bibtex/25c3b630fb0a76da55942a77551fde8a2/pierpaolo.pk81", + address = "London", + author = "Silverman, B. W.", + biburl = "https://www.bibsonomy.org/bibtex/25c3b630fb0a76da55942a77551fde8a2/pierpaolo.pk81", description = "WSD", - publisher = "Chapman & Hall", - timestamp = "2007-02-27T16:22:14.000+0100", - title = "Density Estimation for Statistics and Data Analysis", - year = 1986 + publisher = "Chapman & Hall", + timestamp = "2007-02-27T16:22:14.000+0100", + title = "Density Estimation for Statistics and Data Analysis", + year = 1986 ), chen_2016 = bibentry("article", - title = "{XGBoost}: {A} {Scalable} {Tree} {Boosting} {System}", - url = "https://arxiv.org/abs/1603.02754", - journal = "Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD '16", - author = "Chen, Tianqi and Guestrin, Carlos", - year = "2016", - note = "arXiv: 1603.02754", - pages = "785--794" + title = "{XGBoost}: {A} {Scalable} {Tree} {Boosting} {System}", + url = "https://arxiv.org/abs/1603.02754", + journal = "Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD '16", + author = "Chen, Tianqi and Guestrin, Carlos", + year = "2016", + note = "arXiv: 1603.02754", + pages = "785--794" ), binder_2009 = bibentry("article", - author = "Binder, H. and Allignol, A. and Schumacher, M. and Beyersmann, J.", - year = "2009", - title = "Boosting for high-dimensional time-to-event data with competing risks", - journal = "Bioinformatics", - pages = "890--896", - volume = "25" + author = "Binder, H. and Allignol, A. and Schumacher, M. and Beyersmann, J.", + year = "2009", + title = "Boosting for high-dimensional time-to-event data with competing risks", + journal = "Bioinformatics", + pages = "890--896", + volume = "25" ), schemper_2000 = bibentry("article", - author = "Schemper, Michael and Henderson, Robin", - doi = "10.1002/sim.1486", - journal = "Biometrics", - pages = "249--255", - pmid = "12854094", - title = "Predictive Accuracy and Explained Variation in Cox Regression", - volume = "56", - year = "2000" + author = "Schemper, Michael and Henderson, Robin", + doi = "10.1002/sim.1486", + journal = "Biometrics", + pages = "249--255", + pmid = "12854094", + title = "Predictive Accuracy and Explained Variation in Cox Regression", + volume = "56", + year = "2000" ), schmid_2011 = bibentry("article", - author = "Schmid, Matthias and Hielscher, Thomas and Augustin, Thomas and Gefeller, Olaf", - doi = "10.1111/j.1541-0420.2010.01459.x", - journal = "Biometrics", - number = "2", - pages = "524--535", - pmid = "20618308", - title = "A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error", - volume = "67", - year = "2011" + author = "Schmid, Matthias and Hielscher, Thomas and Augustin, Thomas and Gefeller, Olaf", + doi = "10.1111/j.1541-0420.2010.01459.x", + journal = "Biometrics", + number = "2", + pages = "524--535", + pmid = "20618308", + title = "A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error", + volume = "67", + year = "2011" ), vanhouwelingen_2000 = bibentry("article", - author = "Van Houwelingen, Hans C.", - doi = "10.1002/1097-0258(20001230)19:24<3401::AID-SIM554>3.0.CO;2-2", - journal = "Statistics in Medicine", - number = "24", - pages = "3401--3415", - pmid = "11122504", - title = "Validation, calibration, revision and combination of prognostic survival models", - volume = "19", - year = "2000" + author = "Van Houwelingen, Hans C.", + doi = "10.1002/1097-0258(20001230)19:24<3401::AID-SIM554>3.0.CO;2-2", + journal = "Statistics in Medicine", + number = "24", + pages = "3401--3415", + pmid = "11122504", + title = "Validation, calibration, revision and combination of prognostic survival models", + volume = "19", + year = "2000" ), peto_1972 = bibentry("article", - author = "Peto, Richard and Peto, Julian", - journal = "Journal of the Royal Statistical Society: Series A (General)", - number = "2", - pages = "185--198", - publisher = "Wiley Online Library", - title = "Asymptotically efficient rank invariant test procedures", - volume = "135", - year = "1972" + author = "Peto, Richard and Peto, Julian", + journal = "Journal of the Royal Statistical Society: Series A (General)", + number = "2", + pages = "185--198", + publisher = "Wiley Online Library", + title = "Asymptotically efficient rank invariant test procedures", + volume = "135", + year = "1972" ), schemper_2009 = bibentry("article", - author = "Schemper, Michael and Wakounig, Samo and Heinze, Georg", - doi = "10.1002/sim.3623", - journal = "Statistics in Medicine", - month = "aug", - number = "19", - pages = "2473--2489", - publisher = "John Wiley & Sons, Ltd", - title = "The estimation of average hazard ratios by weighted Cox regression", - volume = "28", - year = "2009" + author = "Schemper, Michael and Wakounig, Samo and Heinze, Georg", + doi = "10.1002/sim.3623", + journal = "Statistics in Medicine", + month = "aug", + number = "19", + pages = "2473--2489", + publisher = "John Wiley & Sons, Ltd", + title = "The estimation of average hazard ratios by weighted Cox regression", + volume = "28", + year = "2009" ), vock_2016 = bibentry("article", - author = "Vock, David M and Wolfson, Julian and Bandyopadhyay, Sunayan and Adomavicius, Gediminas and Johnson, Paul E and Vazquez-Benitez, Gabriela and O'Connor, Patrick J", - doi = "https://doi.org/10.1016/j.jbi.2016.03.009", - journal = "Journal of Biomedical Informatics", - pages = "119--131", - title = "Adapting machine learning techniques to censored time-to-event health record data: A general-purpose approach using inverse probability of censoring weighting", - url = "https://www.sciencedirect.com/science/article/pii/S1532046416000496", - volume = "61", - year = "2016" + author = "Vock, David M and Wolfson, Julian and Bandyopadhyay, Sunayan and Adomavicius, Gediminas and Johnson, Paul E and Vazquez-Benitez, Gabriela and O'Connor, Patrick J", + doi = "https://doi.org/10.1016/j.jbi.2016.03.009", + journal = "Journal of Biomedical Informatics", + pages = "119--131", + title = "Adapting machine learning techniques to censored time-to-event health record data: A general-purpose approach using inverse probability of censoring weighting", + url = "https://www.sciencedirect.com/science/article/pii/S1532046416000496", + volume = "61", + year = "2016" ), jackson_2014 = bibentry("article", - author = "Jackson, Dan and White, Ian R. and Seaman, Shaun and Evans, Hannah and Baisley, Kathy and Carpenter, James", - doi = "10.1002/sim.6274", - journal = "Statistics in Medicine", - month = "nov", - number = "27", - pages = "4681--4694", - publisher = "John Wiley and Sons Ltd", - title = "Relaxing the independent censoring assumption in the Cox proportional hazards model using multiple imputation", - volume = "33", - year = "2014" + author = "Jackson, Dan and White, Ian R. and Seaman, Shaun and Evans, Hannah and Baisley, Kathy and Carpenter, James", + doi = "10.1002/sim.6274", + journal = "Statistics in Medicine", + month = "nov", + number = "27", + pages = "4681--4694", + publisher = "John Wiley and Sons Ltd", + title = "Relaxing the independent censoring assumption in the Cox proportional hazards model using multiple imputation", + volume = "33", + year = "2014" ), klein_2003 = bibentry("book", - author = "Klein, John P and Moeschberger, Melvin L", - edition = "2", - isbn = "0387216456", - publisher = "Springer Science & Business Media", - title = "Survival analysis: techniques for censored and truncated data", - year = "2003" + author = "Klein, John P and Moeschberger, Melvin L", + edition = "2", + isbn = "0387216456", + publisher = "Springer Science & Business Media", + title = "Survival analysis: techniques for censored and truncated data", + year = "2003" ), buckley_1979 = bibentry("article", - author = "Buckley, Jonathan and James, Ian", - doi = "10.2307/2335161", - journal = "Biometrika", - month = "apr", - number = "3", - pages = "429--436", - publisher = "Oxford University Press, Biometrika Trust", - title = "Linear Regression with Censored Data", - url = "https://www.jstor.org/stable/2335161", - volume = "66", - year = "1979" + author = "Buckley, Jonathan and James, Ian", + doi = "10.2307/2335161", + journal = "Biometrika", + month = "apr", + number = "3", + pages = "429--436", + publisher = "Oxford University Press, Biometrika Trust", + title = "Linear Regression with Censored Data", + url = "https://www.jstor.org/stable/2335161", + volume = "66", + year = "1979" ), haider_2020 = bibentry("article", - author = "Haider, Humza and Hoehn, Bret and Davis, Sarah and Greiner, Russell", - journal = "Journal of Machine Learning Research", - volume = "21", - number = "85", - pages = "1--63", - title = "Effective Ways to Build and Evaluate Individual Survival Distributions", - url = "https://jmlr.org/papers/v21/18-772.html", - year = "2020" + author = "Haider, Humza and Hoehn, Bret and Davis, Sarah and Greiner, Russell", + journal = "Journal of Machine Learning Research", + volume = "21", + number = "85", + pages = "1--63", + title = "Effective Ways to Build and Evaluate Individual Survival Distributions", + url = "https://jmlr.org/papers/v21/18-772.html", + year = "2020" ), dcalib = bibentry("article", - author = "Humza Haider and Bret Hoehn and Sarah Davis and Russell Greiner", - title = "Effective Ways to Build and Evaluate Individual Survival Distributions", - journal = "Journal of Machine Learning Research", - year = "2020", - volume = "21", - number = "85", - pages = "1-63", - url = "https://jmlr.org/papers/v21/18-772.html" + author = "Humza Haider and Bret Hoehn and Sarah Davis and Russell Greiner", + title = "Effective Ways to Build and Evaluate Individual Survival Distributions", + journal = "Journal of Machine Learning Research", + year = "2020", + volume = "21", + number = "85", + pages = "1-63", + url = "https://jmlr.org/papers/v21/18-772.html" ), breslow_1972 = bibentry("article", - author = "Norman Breslow", - title = "Discussion of 'Regression Models and Life-Tables' by D.R. Cox", - journal = "Journal of the Royal Statistical Society: Series B", - year = "1972", - volume = "34", - number = "2", - pages = "216-217" + author = "Norman Breslow", + title = "Discussion of 'Regression Models and Life-Tables' by D.R. Cox", + journal = "Journal of the Royal Statistical Society: Series B", + year = "1972", + volume = "34", + number = "2", + pages = "216-217" ), lin_2007 = bibentry("article", - author = "Lin, D. Y.", - title = "On the Breslow estimator", - journal = "Lifetime Data Analysis", - year = "2007", - volume = "13", - number = "4", - pages = "471-480", - publisher = "Springer", - doi = "10.1007/s10985-007-9048-y" + author = "Lin, D. Y.", + title = "On the Breslow estimator", + journal = "Lifetime Data Analysis", + year = "2007", + volume = "13", + number = "4", + pages = "471-480", + publisher = "Springer", + doi = "10.1007/s10985-007-9048-y" ), avati_2020 = bibentry("article", - author = "Avati, Anand and Duan, Tony and Zhou, Sharon and Jung, Kenneth and Shah, Nigam H and Ng, Andrew Y", - title = "Countdown Regression: Sharp and Calibrated Survival Predictions", - journal = "Proceedings of The 35th Uncertainty in Artificial Intelligence Conference", - year = "2020", - volume = "115", - number = "4", - series = "Proceedings of Machine Learning Research", - pages = "145--155", - publisher = "PMLR", - url = "https://proceedings.mlr.press/v115/avati20a.html" + author = "Avati, Anand and Duan, Tony and Zhou, Sharon and Jung, Kenneth and Shah, Nigam H and Ng, Andrew Y", + title = "Countdown Regression: Sharp and Calibrated Survival Predictions", + journal = "Proceedings of The 35th Uncertainty in Artificial Intelligence Conference", + year = "2020", + volume = "115", + number = "4", + series = "Proceedings of Machine Learning Research", + pages = "145--155", + publisher = "PMLR", + url = "https://proceedings.mlr.press/v115/avati20a.html" ), rindt_2022 = bibentry("article", - author = "Rindt, David and Hu, Robert and Steinsaltz, David and Sejdinovic, Dino", - title = "Survival regression with proper scoring rules and monotonic neural networks", - journal = "Proceedings of The 25th International Conference on Artificial Intelligence and Statistics", - year = "2022", - volume = "151", - number = "4", - series = "Proceedings of Machine Learning Research", - pages = "1190--1205", - publisher = "PMLR", - url = "https://proceedings.mlr.press/v151/rindt22a.html" + author = "Rindt, David and Hu, Robert and Steinsaltz, David and Sejdinovic, Dino", + title = "Survival regression with proper scoring rules and monotonic neural networks", + journal = "Proceedings of The 25th International Conference on Artificial Intelligence and Statistics", + year = "2022", + volume = "151", + number = "4", + series = "Proceedings of Machine Learning Research", + pages = "1190--1205", + publisher = "PMLR", + url = "https://proceedings.mlr.press/v151/rindt22a.html" ), grambsch_1994 = bibentry("article", - author = "Grambsch, Patricia and Therneau, Terry", - title = "Proportional hazards tests and diagnostics based on weighted residuals", - journal = "Biometrika", - year = "1994", - volume = "81", - number = "3", - pages = "515--526", - doi = "10.1093/biomet/81.3.515", - url = "https://doi.org/10.1093/biomet/81.3.515" + author = "Grambsch, Patricia and Therneau, Terry", + title = "Proportional hazards tests and diagnostics based on weighted residuals", + journal = "Biometrika", + year = "1994", + volume = "81", + number = "3", + pages = "515--526", + doi = "10.1093/biomet/81.3.515", + url = "https://doi.org/10.1093/biomet/81.3.515" ), tutz_2016 = bibentry("book", author = "Tutz, Gerhard and Schmid, Matthias", @@ -683,5 +683,31 @@ bibentries = c( # nolint start series = "Springer Series in Statistics", isbn = "978-3-319-28156-8 978-3-319-28158-2", url = "http://link.springer.com/10.1007/978-3-319-28158-2" + ), + sonabend_2022 = bibentry("article", + author = "Sonabend, Raphael and Bender, Andreas and Vollmer, Sebastian", + doi = "10.1093/BIOINFORMATICS/BTAC451", + editor = "Lu, Zhiyong", + isbn = "451/6640155", + issn = "1367-4803", + journal = "Bioinformatics", + month = "jul", + title = "Avoiding C-hacking when evaluating survival distribution predictions with discrimination measures", + url = "https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btac451/6640155", + year = "2022" + ), + zhao_2016 = bibentry("article", + author = "Zhao, Lihui and Claggett, Brian and Tian, Lu and Uno, Hajime and Pfeffer, Marc A. and Solomon, Scott D. and Trippa, Lorenzo and Wei, L. J.", + doi = "10.1111/BIOM.12384", + issn = "1541-0420", + journal = "Biometrics", + month = "mar", + number = "1", + pages = "215--221", + publisher = "John Wiley & Sons, Ltd", + title = "On the restricted mean survival time curve in survival analysis", + url = "https://onlinelibrary.wiley.com/doi/full/10.1111/biom.12384", + volume = "72", + year = "2016" ) ) # nolint end diff --git a/R/partition.R b/R/partition.R deleted file mode 100644 index 2ad3dff31..000000000 --- a/R/partition.R +++ /dev/null @@ -1,9 +0,0 @@ -#' @export -partition.TaskSurv = function(task, ratio = 0.67, stratify = TRUE, ...) { # nolint - if (stratify) { - task = task$clone() - task$set_col_roles(task$target_names[2L], add_to = "stratum") - } - - NextMethod("partition") -} diff --git a/R/pipelines.R b/R/pipelines.R index 638c89d97..8587f6d44 100644 --- a/R/pipelines.R +++ b/R/pipelines.R @@ -108,49 +108,46 @@ pipeline_survbagging = function(learner, iterations = 10, frac = 0.7, avg = TRUE #' @templateVar pipeop [PipeOpCrankCompositor] #' @templateVar id crankcompositor #' @template param_pipeline_learner +#' #' @param method `character(1)`\cr -#' One of `sum_haz` (default), `mean`, `mode`, or `median`; -#' abbreviations allowed. Used to determine how `crank` is estimated from -#' the predicted `distr`. -#' @param which `integer(1)`\cr -#' If `method = "mode"` then specifies which mode to use if multi-modal, default -#' is the first. -#' @param response `logical(1)`\cr -#' If `TRUE` then the `response` predict type is also estimated with the same values as `crank`. +#' Determines what method should be used to produce a continuous ranking from the distribution. +#' Currently only `mort` is supported, which is the sum of the cumulative hazard, also called *expected/ensemble mortality*, see Ishwaran et al. (2008). +#' For more details, see [get_mortality()]. #' @param overwrite `logical(1)`\cr -#' If `TRUE` then existing `response` and `crank` predict types are overwritten. +#' If `FALSE` (default) and the prediction already has a `crank` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `crank` will be overwritten. +#' #' @examples #' \dontrun{ #' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { #' library("mlr3") #' library("mlr3pipelines") #' -#' task = tsk("rats") -#' pipe = ppl( +#' task = tsk("lung") +#' part = partition(task) +#' +#' # change the crank prediction type of a Cox's model predictions +#' grlrn = ppl( #' "crankcompositor", #' learner = lrn("surv.coxph"), -#' method = "sum_haz" +#' method = "mort", +#' overwrite = TRUE, +#' graph_learner = TRUE #' ) -#' pipe$train(task) -#' pipe$predict(task) +#' grlrn$train(task, part$train) +#' grlrn$predict(task, part$test) #' } #' } -pipeline_crankcompositor = function(learner, - method = c("sum_haz", "mean", "median", "mode"), which = NULL, - response = FALSE, overwrite = FALSE, graph_learner = FALSE) { - - if (testCharacter(learner)) { - warning("Passing a learner id is now deprecated. In the future please pass a constructed - learner or graph instead.") - learner = lrn(learner) - } +pipeline_crankcompositor = function(learner, method = c("mort"), + overwrite = FALSE, graph_learner = FALSE) { + assert_learner(learner, task_type = "surv") + assert_choice(method, choices = c("mort")) + assert_logical(overwrite) + assert_logical(graph_learner) pred = mlr3pipelines::as_graph(learner) - pv = list(method = match.arg(method), response = response, overwrite = overwrite) - if (!is.null(which)) { - pv$which = which - } + pv = list(method = method, overwrite = overwrite) compositor = mlr3pipelines::po("crankcompose", param_vals = pv) gr = mlr3pipelines::`%>>%`(pred, compositor) @@ -162,6 +159,71 @@ pipeline_crankcompositor = function(learner, gr } +#' @template pipeline +#' @templateVar title Estimate Survival Time/Response Predict Type +#' @templateVar pipeop [PipeOpResponseCompositor] +#' @templateVar id responsecompositor +#' @template param_pipeline_learner +#' +#' @param method (`character(1)`)\cr +#' Determines what method should be used to produce a survival time (response) from the survival distribution. +#' Available methods are `"rmst"` and `"median"`, corresponding to the *restricted mean survival time* and the *median survival time* respectively. +#' @param cutoff_time (`numeric(1)`)\cr +#' Determines the time point up to which we calculate the restricted mean survival time (works only for the `"rmst"` method). +#' If `NULL` (default), all the available time points in the predicted survival distribution will be used. +#' @param add_crank (`logical(1)`)\cr +#' If `TRUE` then `crank` predict type will be set as `-response` (as higher survival times correspond to lower risk). +#' Works only if `overwrite` is `TRUE`. +#' @param overwrite (`logical(1)`)\cr +#' If `FALSE` (default) and the prediction already has a `response` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `response` (and the `crank`, if `add_crank` is `TRUE`) will be overwritten. +#' +#' @examples +#' \dontrun{ +#' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { +#' library("mlr3") +#' library("mlr3pipelines") +#' +#' task = tsk("lung") +#' part = partition(task) +#' +#' # add survival time prediction type to the predictions of a Cox model +#' grlrn = ppl( +#' "responsecompositor", +#' learner = lrn("surv.coxph"), +#' method = "rmst", +#' overwrite = TRUE, +#' graph_learner = TRUE +#' ) +#' grlrn$train(task, part$train) +#' grlrn$predict(task, part$test) +#' } +#' } +pipeline_responsecompositor = function(learner, method = "rmst", cutoff_time = NULL, + add_crank = FALSE, overwrite = FALSE, + graph_learner = FALSE) { + assert_learner(learner, task_type = "surv") + assert_choice(method, choices = c("rmst", "median")) + assert_number(cutoff_time, null.ok = TRUE, lower = 0) + assert_logical(add_crank) + assert_logical(overwrite) + assert_logical(graph_learner) + + pred = mlr3pipelines::as_graph(learner) + + pv = list(method = method, cutoff_time = cutoff_time, add_crank = add_crank, + overwrite = overwrite) + compositor = mlr3pipelines::po("responsecompose", param_vals = pv) + + gr = mlr3pipelines::`%>>%`(pred, compositor) + + if (graph_learner) { + gr = mlr3pipelines::GraphLearner$new(gr) + } + + gr +} + #' @template pipeline #' @templateVar title Estimate Survival distr Predict Type #' @templateVar pipeop [PipeOpDistrCompositor] or [PipeOpBreslow] @@ -185,20 +247,21 @@ pipeline_crankcompositor = function(learner, #' to another. #' @examples #' \dontrun{ -#' if (requireNamespace("mlr3pipelines", quietly = TRUE) && -#' requireNamespace("rpart", quietly = TRUE)) { -#' library("mlr3") +#' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { #' library("mlr3pipelines") #' +#' # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: #' task = tsk("rats") -#' pipe = ppl( +#' grlrn = ppl( #' "distrcompositor", -#' learner = lrn("surv.rpart"), +#' learner = lrn("surv.coxph"), #' estimator = "kaplan", -#' form = "ph" +#' form = "aft", +#' overwrite = TRUE, +#' graph_learner = TRUE #' ) -#' pipe$train(task) -#' pipe$predict(task) +#' grlrn$train(task) +#' grlrn$predict(task) #' } #' } pipeline_distrcompositor = function(learner, estimator = "kaplan", form = "aft", @@ -310,8 +373,6 @@ pipeline_probregr = function(learner, learner_se = NULL, dist = "Uniform", #' \item A [LearnerRegr] is fit and predicted on the new `TaskRegr`. #' \item [PipeOpPredRegrSurv] transforms the resulting [PredictionRegr][mlr3::PredictionRegr] #' to [PredictionSurv]. -#' \item Optionally: [PipeOpDistrCompositor] is used to compose a `distr` predict_type from the -#' predicted `response` predict_type. #' } #' \item Survival to Probabilistic Regression #' \enumerate{ @@ -357,7 +418,7 @@ pipeline_probregr = function(learner, learner_se = NULL, dist = "Uniform", #' Regression learner to fit to the transformed [TaskRegr][mlr3::TaskRegr]. If `regr_se_learner` is #' `NULL` in method `2`, then `regr_learner` must have `se` predict_type. #' @param distrcompose `logical(1)`\cr -#' For methods `1` and `3` if `TRUE` (default) then [PipeOpDistrCompositor] is utilised to +#' For method `3` if `TRUE` (default) then [PipeOpDistrCompositor] is utilised to #' transform the deterministic predictions to a survival distribution. #' @param distr_estimator [LearnerSurv]\cr #' For methods `1` and `3` if `distrcompose = TRUE` then specifies the learner to estimate the @@ -398,7 +459,6 @@ pipeline_probregr = function(learner, learner_se = NULL, dist = "Uniform", #' "survtoregr", #' method = 1, #' regr_learner = lrn("regr.featureless"), -#' distrcompose = TRUE, #' survregr_params = list(method = "delete") #' ) #' pipe$train(task) @@ -448,16 +508,6 @@ pipeline_survtoregr = function(method = 1, regr_learner = lrn("regr.featureless" add_edge("trafotask_survregr", "regr_learner")$ add_edge("regr_learner", "trafopred_regrsurv", dst_channel = "pred")$ add_edge("task_surv", "trafopred_regrsurv", dst_channel = "task") - - if (distrcompose) { - assert("distr" %in% distr_estimator$predict_types) - - gr$add_pipeop(mlr3pipelines::po("learner", distr_estimator, id = "distr_estimator"))$ - add_pipeop(mlr3pipelines::po("distrcompose", param_vals = distrcompose_params))$ - add_edge("trafopred_regrsurv", dst_id = "distrcompose", dst_channel = "pred")$ - add_edge("distr_estimator", dst_id = "distrcompose", dst_channel = "base") - } - } else if (method == 2) { gr = mlr3pipelines::Graph$new()$ @@ -485,7 +535,6 @@ pipeline_survtoregr = function(method = 1, regr_learner = lrn("regr.featureless" add_edge("regr_learner", "compose_probregr", dst_channel = "input_response") } else if (method == 3) { - assert("lp" %in% surv_learner$predict_types) gr = mlr3pipelines::Graph$new()$ @@ -614,6 +663,7 @@ register_graph("survaverager", pipeline_survaverager) register_graph("survbagging", pipeline_survbagging) register_graph("crankcompositor", pipeline_crankcompositor) register_graph("distrcompositor", pipeline_distrcompositor) +register_graph("responsecompositor", pipeline_responsecompositor) register_graph("probregr", pipeline_probregr) register_graph("survtoregr", pipeline_survtoregr) register_graph("survtoclassif_disctime", pipeline_survtoclassif_disctime) diff --git a/R/scoring_rule_erv.R b/R/scoring_rule_erv.R index b680b8b06..e956b120d 100644 --- a/R/scoring_rule_erv.R +++ b/R/scoring_rule_erv.R @@ -7,18 +7,20 @@ if (!is.null(ps$se) && ps$se) { stop("Only one of `ERV` and `se` can be TRUE") } - measure$param_set$values$ERV = FALSE + + measure$param_set$set_values(ERV = FALSE) # compute score for the learner - learner_score = measure$score(prediction, task = task, - train_set = train_set) + learner_score = measure$score(prediction, task = task, train_set = train_set) # compute score for the baseline (Kaplan-Meier) + # train KM km = lrn("surv.kaplan")$train(task = task, row_ids = train_set) - km = km$predict(as_task_surv(data.frame( - as.matrix(prediction$truth)), event = "status")) - base_score = measure$score(km, task = task, train_set = train_set) + # predict KM on the test set (= not train ids) + test_set = setdiff(task$row_ids, train_set) + km_pred = km$predict(task, row_ids = test_set) + base_score = measure$score(km_pred, task = task, train_set = train_set) - measure$param_set$values$ERV = TRUE + measure$param_set$set_values(ERV = TRUE) # return percentage decrease 1 - (learner_score / base_score) diff --git a/R/surv_return.R b/R/surv_return.R index 775734300..6f7a857ed 100644 --- a/R/surv_return.R +++ b/R/surv_return.R @@ -3,38 +3,25 @@ #' @description Internal helper function to easily return the correct survival predict types. #' #' @param times (`numeric()`) \cr Vector of survival times. -#' @param surv (`matrix()|array()`)\cr Matrix or array of predicted survival -#' probabilities, rows (1st dimension) are observations, columns (2nd dimension) -#' are times and in the case of an array there should be one more dimension. -#' Number of columns should be equal to length of `times`. In case a `numeric()` -#' vector is provided, it is converted to a single row (one observation) matrix. -#' @param crank (`numeric()`)\cr Relative risk/continuous ranking. Higher value is associated -#' with higher risk. If `NULL` then either set as `-response` if available or -#' `lp` if available (this assumes that the `lp` prediction comes from a PH type -#' model - in case of an AFT model the user should provide `-lp`). -#' In case neither `response` or `lp` are provided, then `crank` is calculated -#' as the sum of the cumulative hazard function (expected mortality) derived from -#' the predicted survival function (`surv`). In case `surv` is a 3d array, we use -#' the `which.curve` parameter to decide which survival matrix (index in the 3rd -#' dimension) will be chosen for the calculation of `crank`. +#' @param surv (`matrix()|array()`)\cr Matrix or array of predicted survival probabilities, rows (1st dimension) are observations, columns (2nd dimension) are times and in the case of an array there should be one more dimension. +#' Number of columns should be equal to length of `times`. +#' In case a `numeric()` vector is provided, it is converted to a single row (one observation) matrix. +#' @param crank (`numeric()`)\cr Relative risk/continuous ranking. +#' Higher value is associated with higher risk. +#' If `NULL` then either set as `-response` if available or `lp` if available (this assumes that the `lp` prediction comes from a PH type model - in case of an AFT model the user should provide `-lp`). +#' In case neither `response` or `lp` are provided, then `crank` is calculated as the sum of the cumulative hazard function (**expected mortality**) derived from the predicted survival function (`surv`), see [get_mortality]. +#' In case `surv` is a 3d array, we use the `which.curve` parameter to decide which survival matrix (index in the 3rd dimension) will be chosen for the calculation of `crank`. #' @param lp (`numeric()`)\cr Predicted linear predictor, used to impute `crank` if `NULL`. #' @param response (`numeric()`)\cr Predicted survival time, passed through function without #' modification. -#' @param which.curve Which curve (3rd dimension) should the `crank` be -#' calculated for, in case `surv` is an `array`? If between (0,1) it is taken as -#' the quantile of the curves otherwise if greater than 1 it is taken as the -#' curve index. It can also be 'mean' and the survival probabilities are averaged -#' across the 3rd dimension. Default value (`NULL`) is the **0.5 quantile** which -#' is the median across the 3rd dimension of the survival array. -#' -#' @details -#' Uses [survivalmodels::surv_to_risk] to reduce survival matrices to relative -#' risks / rankings if `crank` is NULL. +#' @param which.curve Which curve (3rd dimension) should the `crank` be calculated for, in case `surv` is an `array`? +#' If between (0,1) it is taken as the quantile of the curves otherwise if greater than 1 it is taken as the curve index. +#' It can also be 'mean' and the survival probabilities are averaged across the 3rd dimension. +#' Default value (`NULL`) is the **0.5 quantile** which is the median across the 3rd dimension of the survival array. #' #' @references -#' Sonabend, R., Bender, A., & Vollmer, S. (2022). Avoiding C-hacking when -#' evaluating survival distribution predictions with discrimination measures. -#' Bioinformatics. https://doi.org/10.1093/BIOINFORMATICS/BTAC451 +#' `r format_bib("sonabend_2022")` +#' #' @export .surv_return = function(times = NULL, surv = NULL, crank = NULL, lp = NULL, response = NULL, which.curve = NULL) { @@ -64,10 +51,10 @@ crank = lp } else if (!is.null(surv)) { if (inherits(surv, "matrix")) { - crank = survivalmodels::surv_to_risk(surv) + crank = get_mortality(surv) } else { # array surv_mat = .ext_surv_mat(surv, which.curve) - crank = survivalmodels::surv_to_risk(surv_mat) + crank = get_mortality(surv_mat) } } } @@ -118,3 +105,53 @@ array(arr[, , which.curve], c(nrow(arr), ncol(arr)), dimnames(arr)[1:2]) } } + +#' @title Calculate the expected mortality risks from a survival matrix +#' +#' @description Many methods can be used to reduce a discrete survival +#' distribution prediction (i.e. matrix) to a relative risk / ranking +#' prediction, see Sonabend et al. (2022). +#' +#' This function calculates a relative risk score as the sum of the +#' predicted cumulative hazard function, also called **ensemble/expected mortality**. +#' This risk score can be loosely interpreted as the expected number of deaths for +#' patients with similar characteristics, see Ishwaran et al. (2008) and has no +#' model or survival distribution assumptions. +#' +#' @param x (`matrix()`) \cr A survival matrix where rows are the +#' (predicted) observations and columns the time-points. +#' For more details, see [assert_surv_matrix]. +#' +#' @return a `numeric` vector of the mortality risk scores, one per row of the +#' input survival matrix. +#' +#' @references +#' `r format_bib("sonabend_2022", "ishwaran_2008")` +#' +#' @examples +#' n = 10 # number of observations +#' k = 50 # time points +#' +#' # Create the matrix with random values between 0 and 1 +#' mat = matrix(runif(n * k, min = 0, max = 1), nrow = n, ncol = k) +#' +#' # transform it to a survival matrix +#' surv_mat = t(apply(mat, 1L, function(row) sort(row, decreasing = TRUE))) +#' colnames(surv_mat) = 1:k # time points +#' +#' # get mortality scores (the larger, the more risk) +#' mort = get_mortality(surv_mat) +#' mort +#' +#' @export +get_mortality = function(x) { + assert_surv_matrix(x) + + # H(t) = -log(S(t)) + cumhaz = -log(x) + + # Ignore S(t) = 0 => -log(S(t)) = Inf + cumhaz[is.infinite(cumhaz)] = 0 + + rowSums(cumhaz) +} diff --git a/R/zzz.R b/R/zzz.R index 20a0b3e83..bad8118b7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -81,7 +81,7 @@ unregister_reflections = function() { # task package = NULL # silence data.table notes - x$task_types[package != "mlr3proba"] + x$task_types = x$task_types[package != "mlr3proba"] x$task_col_roles$surv = NULL x$task_col_roles$dens = NULL x$task_col_roles$classif = setdiff(x$task_col_roles$classif, "original_ids") diff --git a/man/assert_surv_matrix.Rd b/man/assert_surv_matrix.Rd new file mode 100644 index 000000000..349250639 --- /dev/null +++ b/man/assert_surv_matrix.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assertions.R +\name{assert_surv_matrix} +\alias{assert_surv_matrix} +\title{Assert survival matrix} +\usage{ +assert_surv_matrix(x) +} +\arguments{ +\item{x}{(\code{matrix()})\cr +A matrix of (predicted) survival probabilities. +Rows are observations, columns are (increasing) time points.} +} +\value{ +if the assertion fails an error occurs, otherwise \code{NULL} is returned +invisibly. +} +\description{ +Asserts if the given input matrix is a (discrete) survival +probabilities matrix using \link{Rcpp} code. +The following checks are performed: +\enumerate{ +\item All values are probabilities, i.e. \eqn{S(t) \in [0,1]} +\item Column names correspond to time-points and should therefore be coercable to +\code{numeric} and increasing +\item Per row/observation, the survival probabilities decrease non-strictly, i.e. +\eqn{S(t) \ge S(t+1)} +} +} +\examples{ +x = matrix(data = c(1,0.6,0.4,0.8,0.8,0.7), nrow = 2, ncol = 3, byrow = TRUE) +colnames(x) = c(12, 34, 42) +x + +assert_surv_matrix(x) + +} diff --git a/man/dot-surv_return.Rd b/man/dot-surv_return.Rd index f2020ee44..fafca5b72 100644 --- a/man/dot-surv_return.Rd +++ b/man/dot-surv_return.Rd @@ -16,43 +16,32 @@ \arguments{ \item{times}{(\code{numeric()}) \cr Vector of survival times.} -\item{surv}{(\code{matrix()|array()})\cr Matrix or array of predicted survival -probabilities, rows (1st dimension) are observations, columns (2nd dimension) -are times and in the case of an array there should be one more dimension. -Number of columns should be equal to length of \code{times}. In case a \code{numeric()} -vector is provided, it is converted to a single row (one observation) matrix.} +\item{surv}{(\code{matrix()|array()})\cr Matrix or array of predicted survival probabilities, rows (1st dimension) are observations, columns (2nd dimension) are times and in the case of an array there should be one more dimension. +Number of columns should be equal to length of \code{times}. +In case a \code{numeric()} vector is provided, it is converted to a single row (one observation) matrix.} -\item{crank}{(\code{numeric()})\cr Relative risk/continuous ranking. Higher value is associated -with higher risk. If \code{NULL} then either set as \code{-response} if available or -\code{lp} if available (this assumes that the \code{lp} prediction comes from a PH type -model - in case of an AFT model the user should provide \code{-lp}). -In case neither \code{response} or \code{lp} are provided, then \code{crank} is calculated -as the sum of the cumulative hazard function (expected mortality) derived from -the predicted survival function (\code{surv}). In case \code{surv} is a 3d array, we use -the \code{which.curve} parameter to decide which survival matrix (index in the 3rd -dimension) will be chosen for the calculation of \code{crank}.} +\item{crank}{(\code{numeric()})\cr Relative risk/continuous ranking. +Higher value is associated with higher risk. +If \code{NULL} then either set as \code{-response} if available or \code{lp} if available (this assumes that the \code{lp} prediction comes from a PH type model - in case of an AFT model the user should provide \code{-lp}). +In case neither \code{response} or \code{lp} are provided, then \code{crank} is calculated as the sum of the cumulative hazard function (\strong{expected mortality}) derived from the predicted survival function (\code{surv}), see \link{get_mortality}. +In case \code{surv} is a 3d array, we use the \code{which.curve} parameter to decide which survival matrix (index in the 3rd dimension) will be chosen for the calculation of \code{crank}.} \item{lp}{(\code{numeric()})\cr Predicted linear predictor, used to impute \code{crank} if \code{NULL}.} \item{response}{(\code{numeric()})\cr Predicted survival time, passed through function without modification.} -\item{which.curve}{Which curve (3rd dimension) should the \code{crank} be -calculated for, in case \code{surv} is an \code{array}? If between (0,1) it is taken as -the quantile of the curves otherwise if greater than 1 it is taken as the -curve index. It can also be 'mean' and the survival probabilities are averaged -across the 3rd dimension. Default value (\code{NULL}) is the \strong{0.5 quantile} which -is the median across the 3rd dimension of the survival array.} +\item{which.curve}{Which curve (3rd dimension) should the \code{crank} be calculated for, in case \code{surv} is an \code{array}? +If between (0,1) it is taken as the quantile of the curves otherwise if greater than 1 it is taken as the curve index. +It can also be 'mean' and the survival probabilities are averaged across the 3rd dimension. +Default value (\code{NULL}) is the \strong{0.5 quantile} which is the median across the 3rd dimension of the survival array.} } \description{ Internal helper function to easily return the correct survival predict types. } -\details{ -Uses \link[survivalmodels:surv_to_risk]{survivalmodels::surv_to_risk} to reduce survival matrices to relative -risks / rankings if \code{crank} is NULL. -} \references{ -Sonabend, R., Bender, A., & Vollmer, S. (2022). Avoiding C-hacking when -evaluating survival distribution predictions with discrimination measures. -Bioinformatics. https://doi.org/10.1093/BIOINFORMATICS/BTAC451 +Sonabend, Raphael, Bender, Andreas, Vollmer, Sebastian (2022). +\dQuote{Avoiding C-hacking when evaluating survival distribution predictions with discrimination measures.} +\emph{Bioinformatics}. +ISSN 1367-4803, \doi{10.1093/BIOINFORMATICS/BTAC451}, \url{https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btac451/6640155}. } diff --git a/man/get_mortality.Rd b/man/get_mortality.Rd new file mode 100644 index 000000000..49cf04ac9 --- /dev/null +++ b/man/get_mortality.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/surv_return.R +\name{get_mortality} +\alias{get_mortality} +\title{Calculate the expected mortality risks from a survival matrix} +\usage{ +get_mortality(x) +} +\arguments{ +\item{x}{(\code{matrix()}) \cr A survival matrix where rows are the +(predicted) observations and columns the time-points. +For more details, see \link{assert_surv_matrix}.} +} +\value{ +a \code{numeric} vector of the mortality risk scores, one per row of the +input survival matrix. +} +\description{ +Many methods can be used to reduce a discrete survival +distribution prediction (i.e. matrix) to a relative risk / ranking +prediction, see Sonabend et al. (2022). + +This function calculates a relative risk score as the sum of the +predicted cumulative hazard function, also called \strong{ensemble/expected mortality}. +This risk score can be loosely interpreted as the expected number of deaths for +patients with similar characteristics, see Ishwaran et al. (2008) and has no +model or survival distribution assumptions. +} +\examples{ +n = 10 # number of observations +k = 50 # time points + +# Create the matrix with random values between 0 and 1 +mat = matrix(runif(n * k, min = 0, max = 1), nrow = n, ncol = k) + +# transform it to a survival matrix +surv_mat = t(apply(mat, 1L, function(row) sort(row, decreasing = TRUE))) +colnames(surv_mat) = 1:k # time points + +# get mortality scores (the larger, the more risk) +mort = get_mortality(surv_mat) +mort + +} +\references{ +Sonabend, Raphael, Bender, Andreas, Vollmer, Sebastian (2022). +\dQuote{Avoiding C-hacking when evaluating survival distribution predictions with discrimination measures.} +\emph{Bioinformatics}. +ISSN 1367-4803, \doi{10.1093/BIOINFORMATICS/BTAC451}, \url{https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btac451/6640155}. + +Ishwaran, Hemant, Kogalur, B U, Blackstone, H E, Lauer, S M, others (2008). +\dQuote{Random survival forests.} +\emph{The Annals of applied statistics}, \bold{2}(3), 841--860. +} diff --git a/man/mlr_graphs_crankcompositor.Rd b/man/mlr_graphs_crankcompositor.Rd index 2aed5581c..14922ccc0 100644 --- a/man/mlr_graphs_crankcompositor.Rd +++ b/man/mlr_graphs_crankcompositor.Rd @@ -7,9 +7,7 @@ \usage{ pipeline_crankcompositor( learner, - method = c("sum_haz", "mean", "median", "mode"), - which = NULL, - response = FALSE, + method = c("mort"), overwrite = FALSE, graph_learner = FALSE ) @@ -21,19 +19,13 @@ be wrapped in \link[mlr3pipelines:Graph]{mlr3pipelines::Graph} or a \code{Graph} \link{LearnerSurv}.} \item{method}{\code{character(1)}\cr -One of \code{sum_haz} (default), \code{mean}, \code{mode}, or \code{median}; -abbreviations allowed. Used to determine how \code{crank} is estimated from -the predicted \code{distr}.} - -\item{which}{\code{integer(1)}\cr -If \code{method = "mode"} then specifies which mode to use if multi-modal, default -is the first.} - -\item{response}{\code{logical(1)}\cr -If \code{TRUE} then the \code{response} predict type is also estimated with the same values as \code{crank}.} +Determines what method should be used to produce a continuous ranking from the distribution. +Currently only \code{mort} is supported, which is the sum of the cumulative hazard, also called \emph{expected/ensemble mortality}, see Ishwaran et al. (2008). +For more details, see \code{\link[=get_mortality]{get_mortality()}}.} \item{overwrite}{\code{logical(1)}\cr -If \code{TRUE} then existing \code{response} and \code{crank} predict types are overwritten.} +If \code{FALSE} (default) and the prediction already has a \code{crank} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{crank} will be overwritten.} \item{graph_learner}{\code{logical(1)}\cr If \code{TRUE} returns wraps the \link[mlr3pipelines:Graph]{Graph} as a @@ -51,14 +43,19 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { library("mlr3") library("mlr3pipelines") - task = tsk("rats") - pipe = ppl( + task = tsk("lung") + part = partition(task) + + # change the crank prediction type of a Cox's model predictions + grlrn = ppl( "crankcompositor", learner = lrn("surv.coxph"), - method = "sum_haz" + method = "mort", + overwrite = TRUE, + graph_learner = TRUE ) - pipe$train(task) - pipe$predict(task) + grlrn$train(task, part$train) + grlrn$predict(task, part$test) } } } @@ -66,6 +63,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other pipelines: \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, diff --git a/man/mlr_graphs_distrcompositor.Rd b/man/mlr_graphs_distrcompositor.Rd index e75fee5cf..d25347f8d 100644 --- a/man/mlr_graphs_distrcompositor.Rd +++ b/man/mlr_graphs_distrcompositor.Rd @@ -46,20 +46,21 @@ Wrapper around \link{PipeOpDistrCompositor} or \link{PipeOpBreslow} to simplify } \examples{ \dontrun{ -if (requireNamespace("mlr3pipelines", quietly = TRUE) && - requireNamespace("rpart", quietly = TRUE)) { - library("mlr3") +if (requireNamespace("mlr3pipelines", quietly = TRUE)) { library("mlr3pipelines") + # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: task = tsk("rats") - pipe = ppl( + grlrn = ppl( "distrcompositor", - learner = lrn("surv.rpart"), + learner = lrn("surv.coxph"), estimator = "kaplan", - form = "ph" + form = "aft", + overwrite = TRUE, + graph_learner = TRUE ) - pipe$train(task) - pipe$predict(task) + grlrn$train(task) + grlrn$predict(task) } } } @@ -67,6 +68,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE) && Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, diff --git a/man/mlr_graphs_probregr.Rd b/man/mlr_graphs_probregr.Rd index 4a0573d2d..25db8a1e6 100644 --- a/man/mlr_graphs_probregr.Rd +++ b/man/mlr_graphs_probregr.Rd @@ -70,6 +70,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE) && Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, diff --git a/man/mlr_graphs_responsecompositor.Rd b/man/mlr_graphs_responsecompositor.Rd new file mode 100644 index 000000000..0bc4b237b --- /dev/null +++ b/man/mlr_graphs_responsecompositor.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipelines.R +\name{mlr_graphs_responsecompositor} +\alias{mlr_graphs_responsecompositor} +\alias{pipeline_responsecompositor} +\title{Estimate Survival Time/Response Predict Type Pipeline} +\usage{ +pipeline_responsecompositor( + learner, + method = "rmst", + cutoff_time = NULL, + add_crank = FALSE, + overwrite = FALSE, + graph_learner = FALSE +) +} +\arguments{ +\item{learner}{\verb{[mlr3::Learner]|[mlr3pipelines::PipeOp]|[mlr3pipelines::Graph]} \cr +Either a \code{Learner} which will be wrapped in \link[mlr3pipelines:mlr_pipeops_learner]{mlr3pipelines::PipeOpLearner}, a \code{PipeOp} which will +be wrapped in \link[mlr3pipelines:Graph]{mlr3pipelines::Graph} or a \code{Graph} itself. Underlying \code{Learner} should be +\link{LearnerSurv}.} + +\item{method}{(\code{character(1)})\cr +Determines what method should be used to produce a survival time (response) from the survival distribution. +Available methods are \code{"rmst"} and \code{"median"}, corresponding to the \emph{restricted mean survival time} and the \emph{median survival time} respectively.} + +\item{cutoff_time}{(\code{numeric(1)})\cr +Determines the time point up to which we calculate the restricted mean survival time (works only for the \code{"rmst"} method). +If \code{NULL} (default), all the available time points in the predicted survival distribution will be used.} + +\item{add_crank}{(\code{logical(1)})\cr +If \code{TRUE} then \code{crank} predict type will be set as \code{-response} (as higher survival times correspond to lower risk). +Works only if \code{overwrite} is \code{TRUE}.} + +\item{overwrite}{(\code{logical(1)})\cr +If \code{FALSE} (default) and the prediction already has a \code{response} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{response} (and the \code{crank}, if \code{add_crank} is \code{TRUE}) will be overwritten.} + +\item{graph_learner}{\code{logical(1)}\cr +If \code{TRUE} returns wraps the \link[mlr3pipelines:Graph]{Graph} as a +\link[mlr3pipelines:mlr_learners_graph]{GraphLearner} otherwise (default) returns as a \code{Graph}.} +} +\value{ +\link[mlr3pipelines:Graph]{mlr3pipelines::Graph} or \link[mlr3pipelines:mlr_learners_graph]{mlr3pipelines::GraphLearner} +} +\description{ +Wrapper around \link{PipeOpResponseCompositor} to simplify \link[mlr3pipelines:Graph]{Graph} creation. +} +\examples{ +\dontrun{ +if (requireNamespace("mlr3pipelines", quietly = TRUE)) { + library("mlr3") + library("mlr3pipelines") + + task = tsk("lung") + part = partition(task) + + # add survival time prediction type to the predictions of a Cox model + grlrn = ppl( + "responsecompositor", + learner = lrn("surv.coxph"), + method = "rmst", + overwrite = TRUE, + graph_learner = TRUE + ) + grlrn$train(task, part$train) + grlrn$predict(task, part$test) +} +} +} +\seealso{ +Other pipelines: +\code{\link{mlr_graphs_crankcompositor}}, +\code{\link{mlr_graphs_distrcompositor}}, +\code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_survaverager}}, +\code{\link{mlr_graphs_survbagging}}, +\code{\link{mlr_graphs_survtoclassif_disctime}}, +\code{\link{mlr_graphs_survtoregr}} +} +\concept{pipelines} diff --git a/man/mlr_graphs_survaverager.Rd b/man/mlr_graphs_survaverager.Rd index e85c9dae0..7384fbf13 100644 --- a/man/mlr_graphs_survaverager.Rd +++ b/man/mlr_graphs_survaverager.Rd @@ -47,6 +47,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, \code{\link{mlr_graphs_survtoregr}} diff --git a/man/mlr_graphs_survbagging.Rd b/man/mlr_graphs_survbagging.Rd index 3b9eaba82..c6ceb8f6e 100644 --- a/man/mlr_graphs_survbagging.Rd +++ b/man/mlr_graphs_survbagging.Rd @@ -78,6 +78,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, \code{\link{mlr_graphs_survtoregr}} diff --git a/man/mlr_graphs_survtoclassif_disctime.Rd b/man/mlr_graphs_survtoclassif_disctime.Rd index c3b3e64c3..d1c09d1b3 100644 --- a/man/mlr_graphs_survtoclassif_disctime.Rd +++ b/man/mlr_graphs_survtoclassif_disctime.Rd @@ -84,6 +84,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoregr}} diff --git a/man/mlr_graphs_survtoregr.Rd b/man/mlr_graphs_survtoregr.Rd index ef9d99047..746725f5d 100644 --- a/man/mlr_graphs_survtoregr.Rd +++ b/man/mlr_graphs_survtoregr.Rd @@ -28,7 +28,7 @@ Regression learner to fit to the transformed \link[mlr3:TaskRegr]{TaskRegr}. If \code{NULL} in method \code{2}, then \code{regr_learner} must have \code{se} predict_type.} \item{distrcompose}{\code{logical(1)}\cr -For methods \code{1} and \code{3} if \code{TRUE} (default) then \link{PipeOpDistrCompositor} is utilised to +For method \code{3} if \code{TRUE} (default) then \link{PipeOpDistrCompositor} is utilised to transform the deterministic predictions to a survival distribution.} \item{distr_estimator}{\link{LearnerSurv}\cr @@ -79,8 +79,6 @@ Three reduction strategies are implemented, these are: \item A \link{LearnerRegr} is fit and predicted on the new \code{TaskRegr}. \item \link{PipeOpPredRegrSurv} transforms the resulting \link[mlr3:PredictionRegr]{PredictionRegr} to \link{PredictionSurv}. -\item Optionally: \link{PipeOpDistrCompositor} is used to compose a \code{distr} predict_type from the -predicted \code{response} predict_type. } \item Survival to Probabilistic Regression \enumerate{ @@ -134,7 +132,6 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { "survtoregr", method = 1, regr_learner = lrn("regr.featureless"), - distrcompose = TRUE, survregr_params = list(method = "delete") ) pipe$train(task) @@ -169,6 +166,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}} diff --git a/man/mlr_measures_surv.cindex.Rd b/man/mlr_measures_surv.cindex.Rd index f4fc2880d..5697b9764 100644 --- a/man/mlr_measures_surv.cindex.Rd +++ b/man/mlr_measures_surv.cindex.Rd @@ -92,7 +92,7 @@ Weighting applied to tied rankings, default is to give them half (0.5) weighting library(mlr3) task = tsk("rats") learner = lrn("surv.coxph") -part = partition(task) # train/test split, stratified on `status` by default +part = partition(task) # train/test split learner$train(task, part$train) p = learner$predict(task, part$test) diff --git a/man/mlr_measures_surv.rcll.Rd b/man/mlr_measures_surv.rcll.Rd index 405555ff9..1a1a4b47a 100644 --- a/man/mlr_measures_surv.rcll.Rd +++ b/man/mlr_measures_surv.rcll.Rd @@ -13,6 +13,10 @@ The RCLL, in the context of probabilistic predictions, is defined by where \eqn{\Delta} is the censoring indicator, \eqn{f} the probability density function and \eqn{S} the survival function. RCLL is proper given that censoring and survival distribution are independent, see Rindt et al. (2022). + +\strong{Note}: Even though RCLL is a proper scoring rule, the calculation of \eqn{f(t)} (which in our case is discrete, i.e. it is a \emph{probability mass function}) for time points in the test set that don't exist in the predicted survival matrix (\code{distr}), results in 0 values, which are substituted by \code{"eps"} in our implementation, therefore skewing the result towards \eqn{-log(eps)}. +This problem is also discussed in Rindt et al. (2022), where the authors perform interpolation to get non-zero values for the \eqn{f(t)}. +Until this is handled in \code{mlr3proba} some way, we advise against using this measure for model evaluation. } \section{Dictionary}{ diff --git a/man/mlr_pipeops_compose_breslow_distr.Rd b/man/mlr_pipeops_compose_breslow_distr.Rd index 211ba6adf..c8f3138fa 100644 --- a/man/mlr_pipeops_compose_breslow_distr.Rd +++ b/man/mlr_pipeops_compose_breslow_distr.Rd @@ -75,7 +75,8 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other survival compositors: \code{\link{mlr_pipeops_crankcompose}}, -\code{\link{mlr_pipeops_distrcompose}} +\code{\link{mlr_pipeops_distrcompose}}, +\code{\link{mlr_pipeops_responsecompose}} } \concept{survival compositors} \section{Super class}{ diff --git a/man/mlr_pipeops_crankcompose.Rd b/man/mlr_pipeops_crankcompose.Rd index f6c931a42..adedaf7f7 100644 --- a/man/mlr_pipeops_crankcompose.Rd +++ b/man/mlr_pipeops_crankcompose.Rd @@ -21,14 +21,11 @@ po("crankcompose") \section{Input and Output Channels}{ -\link{PipeOpCrankCompositor} has one input channel named "input", which takes -\code{NULL} during training and \link{PredictionSurv} during prediction. +\link{PipeOpCrankCompositor} has one input channel named \code{"input"}, which takes \code{NULL} during training and \link{PredictionSurv} during prediction. -\link{PipeOpCrankCompositor} has one output channel named "output", producing \code{NULL} during training -and a \link{PredictionSurv} during prediction. +\link{PipeOpCrankCompositor} has one output channel named \code{"output"}, producing \code{NULL} during training and a \link{PredictionSurv} during prediction. -The output during prediction is the \link{PredictionSurv} from the "pred" input but with the \code{crank} -predict type overwritten by the given estimation method. +The output during prediction is the \link{PredictionSurv} from the input but with the \code{crank} predict type overwritten by the given estimation method. } \section{State}{ @@ -41,44 +38,24 @@ The \verb{$state} is left empty (\code{list()}). \itemize{ \item \code{method} :: \code{character(1)} \cr Determines what method should be used to produce a continuous ranking from the distribution. -One of \code{sum_haz}, \code{median}, \code{mode}, or \code{mean} corresponding to the -respective functions in the predicted survival distribution. Note that -for models with a proportional hazards form, the ranking implied by -\code{mean} and \code{median} will be identical (but not the value of \code{crank} -itself). \code{sum_haz} (default) uses \code{\link[survivalmodels:surv_to_risk]{survivalmodels::surv_to_risk()}}. -\item \code{which} :: \code{numeric(1)}\cr -If \code{method = "mode"} then specifies which mode to use if multi-modal, default is the first. -\item \code{response} :: \code{logical(1)}\cr -If \code{TRUE} then the \code{response} predict type is estimated with the same values as \code{crank}. +Currently only \code{mort} is supported, which is the sum of the cumulative hazard, also called \emph{expected/ensemble mortality}, see Ishwaran et al. (2008). +For more details, see \code{\link[=get_mortality]{get_mortality()}}. \item \code{overwrite} :: \code{logical(1)} \cr -If \code{FALSE} (default) then if the "pred" input already has a \code{crank}, the compositor only -composes a \code{response} type if \code{response = TRUE} and does not already exist. If \code{TRUE} then -both the \code{crank} and \code{response} are overwritten. +If \code{FALSE} (default) and the prediction already has a \code{crank} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{crank} will be overwritten. } } -\section{Internals}{ - -The \code{median}, \code{mode}, or \code{mean} will use analytical expressions if possible but if not they are -calculated using methods from \link{distr6}. \code{mean} requires \CRANpkg{cubature}. -} - \examples{ \dontrun{ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { - library(mlr3) library(mlr3pipelines) task = tsk("rats") - learn = lrn("surv.coxph")$train(task)$predict(task) - poc = po("crankcompose", param_vals = list(method = "sum_haz")) - poc$predict(list(learn))[[1]] - - if (requireNamespace("cubature", quietly = TRUE)) { - learn = lrn("surv.coxph")$train(task)$predict(task) - poc = po("crankcompose", param_vals = list(method = "sum_haz")) - poc$predict(list(learn))[[1]] - } + # change the crank prediction type of a Cox's model predictions + pred = lrn("surv.coxph")$train(task)$predict(task) + poc = po("crankcompose", param_vals = list(overwrite = TRUE)) + poc$predict(list(pred))[[1L]] } } } @@ -87,7 +64,8 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other survival compositors: \code{\link{mlr_pipeops_compose_breslow_distr}}, -\code{\link{mlr_pipeops_distrcompose}} +\code{\link{mlr_pipeops_distrcompose}}, +\code{\link{mlr_pipeops_responsecompose}} } \concept{survival compositors} \section{Super class}{ diff --git a/man/mlr_pipeops_distrcompose.Rd b/man/mlr_pipeops_distrcompose.Rd index b5ac7b05e..5d43c3504 100644 --- a/man/mlr_pipeops_distrcompose.Rd +++ b/man/mlr_pipeops_distrcompose.Rd @@ -5,18 +5,14 @@ \alias{PipeOpDistrCompositor} \title{PipeOpDistrCompositor} \description{ -Estimates (or 'composes') a survival distribution from a predicted baseline \code{distr} and a -\code{crank} or \code{lp} from two \link{PredictionSurv}s. +Estimates (or 'composes') a survival distribution from a predicted baseline +survival distribution (\code{distr}) and a linear predictor (\code{lp}) from two \link{PredictionSurv}s. Compositor Assumptions: \itemize{ \item The baseline \code{distr} is a discrete estimator, e.g. \link[=LearnerSurvKaplan]{surv.kaplan}. \item The composed \code{distr} is of a linear form -\item If \code{lp} is missing then \code{crank} is equivalent } - -These assumptions are strong and may not be reasonable. Future updates will upgrade this -compositor to be more flexible. } \section{Dictionary}{ @@ -32,15 +28,16 @@ po("distrcompose") \section{Input and Output Channels}{ -\link{PipeOpDistrCompositor} has two input channels, "base" and "pred". Both input channels take -\code{NULL} during training and \link{PredictionSurv} during prediction. +\link{PipeOpDistrCompositor} has two input channels, \code{"base"} and \code{"pred"}. +Both input channels take \code{NULL} during training and \link{PredictionSurv} during prediction. -\link{PipeOpDistrCompositor} has one output channel named "output", producing \code{NULL} during training -and a \link{PredictionSurv} during prediction. +\link{PipeOpDistrCompositor} has one output channel named \code{"output"}, producing +\code{NULL} during training and a \link{PredictionSurv} during prediction. -The output during prediction is the \link{PredictionSurv} from the "pred" input but with an extra -(or overwritten) column for \code{distr} predict type; which is composed from the \code{distr} of "base" -and \code{lp} or \code{crank} of "pred". +The output during prediction is the \link{PredictionSurv} from the \code{"pred"} input +but with an extra (or overwritten) column for the \code{distr} predict type; which +is composed from the \code{distr} of \code{"base"} and the \code{lp} of \code{"pred"}. +If no \code{lp} predictions have been made or exist, then the \code{"pred"} is returned unchanged. } \section{State}{ @@ -58,8 +55,8 @@ accelerated-failure time, \code{aft}, proportional hazards, \code{ph}, or propor Default \code{aft}. \item \code{overwrite} :: \code{logical(1)} \cr If \code{FALSE} (default) then if the "pred" input already has a \code{distr}, the compositor does -nothing and returns the given \link{PredictionSurv}. If \code{TRUE} then the \code{distr} is overwritten -with the \code{distr} composed from \code{lp}/\code{crank} - this is useful for changing the prediction +nothing and returns the given \link{PredictionSurv}. If \code{TRUE}, then the \code{distr} is overwritten +with the \code{distr} composed from \code{lp} - this is useful for changing the prediction \code{distr} from one model form to another. } } @@ -71,8 +68,7 @@ The respective \code{form}s above have respective survival distributions: \deqn{ph: S(t) = S_0(t)^{exp(lp)}}{ph: S(t) = S0(t)^exp(lp)} \deqn{po: S(t) = \frac{S_0(t)}{exp(-lp) + (1-exp(-lp)) S_0(t)}}{po: S(t) = S0(t) / [exp(-lp) + S0(t) (1-exp(-lp))]} # nolint where \eqn{S_0}{S0} is the estimated baseline survival distribution, and \eqn{lp} is the -predicted linear predictor. If the input model does not predict a linear predictor then \code{crank} -is assumed to be the \code{lp} - \strong{this may be a strong and unreasonable assumption.} +predicted linear predictor. } \examples{ @@ -84,6 +80,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { base = lrn("surv.kaplan")$train(task)$predict(task) pred = lrn("surv.coxph")$train(task)$predict(task) + # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: pod = po("distrcompose", param_vals = list(form = "aft", overwrite = TRUE)) pod$predict(list(base = base, pred = pred))[[1]] } @@ -94,7 +91,8 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other survival compositors: \code{\link{mlr_pipeops_compose_breslow_distr}}, -\code{\link{mlr_pipeops_crankcompose}} +\code{\link{mlr_pipeops_crankcompose}}, +\code{\link{mlr_pipeops_responsecompose}} } \concept{survival compositors} \section{Super class}{ diff --git a/man/mlr_pipeops_responsecompose.Rd b/man/mlr_pipeops_responsecompose.Rd new file mode 100644 index 000000000..dea59706c --- /dev/null +++ b/man/mlr_pipeops_responsecompose.Rd @@ -0,0 +1,166 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpResponseCompositor.R +\name{mlr_pipeops_responsecompose} +\alias{mlr_pipeops_responsecompose} +\alias{PipeOpResponseCompositor} +\title{PipeOpResponseCompositor} +\description{ +Uses a predicted survival distribution (\code{distr}) in a \link{PredictionSurv} to estimate (or 'compose') an expected survival time (\code{response}) prediction. +Practically, this \code{PipeOp} summarizes an observation's survival curve/distribution to a single number which can be either the restricted mean survival time or the median survival time. +} +\section{Dictionary}{ + +This \link[mlr3pipelines:PipeOp]{PipeOp} can be instantiated via the +\link[mlr3misc:Dictionary]{dictionary} \link[mlr3pipelines:mlr_pipeops]{mlr3pipelines::mlr_pipeops} or with the associated sugar +function \code{\link[mlr3pipelines:po]{mlr3pipelines::po()}}: + +\if{html}{\out{
}}\preformatted{PipeOpResponseCompositor$new() +mlr_pipeops$get("responsecompose") +po("responsecompose") +}\if{html}{\out{
}} +} + +\section{Input and Output Channels}{ + +\link{PipeOpResponseCompositor} has one input channel named \code{"input"}, which takes +\code{NULL} during training and \link{PredictionSurv} during prediction. + +\link{PipeOpResponseCompositor} has one output channel named \code{"output"}, producing \code{NULL} during training +and a \link{PredictionSurv} during prediction. + +The output during prediction is the \link{PredictionSurv} from the input but with the \code{response} +predict type overwritten by the given method. +} + +\section{State}{ + +The \verb{$state} is left empty (\code{list()}). +} + +\section{Parameters}{ + +\itemize{ +\item \code{method} :: \code{character(1)} \cr +Determines what method should be used to produce a survival time (response) from the survival distribution. +Available methods are \code{"rmst"} and \code{"median"}, corresponding to the \emph{restricted mean survival time} and the \emph{median survival time} respectively. +\item \code{cutoff_time} :: \code{numeric(1)} \cr +Determines the time point up to which we calculate the restricted mean survival time (works only for the \code{"rmst"} method). +If \code{NULL} (default), all the available time points in the predicted survival distribution will be used. +} +\itemize{ +\item \code{add_crank} :: \code{logical(1)} \cr +If \code{TRUE} then \code{crank} predict type will be set as \code{-response} (as higher survival times correspond to lower risk). +Works only if \code{overwrite} is \code{TRUE}. +\item \code{overwrite} :: \code{logical(1)} \cr +If \code{FALSE} (default) and the prediction already has a \code{response} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{response} (and the \code{crank}, if \code{add_crank} is \code{TRUE}) will be overwritten. +} +} + +\section{Internals}{ + +The restricted mean survival time is the default/preferred method and is calculated as follows: +\deqn{T_{i,rmst} \approx \sum_{t_j \in [0,\tau]} (t_j - t_{j-1}) S_i(t_j)} + +where \eqn{T} is the expected survival time, \eqn{\tau} is the time cutoff and \eqn{S_i(t_j)} are the predicted survival probabilities of observation \eqn{i} for all the \eqn{t_j} time points. + +The \eqn{T_{i,median}} survival time is just the first time point for which the survival probability is less than \eqn{0.5}. +If no such time point exists (e.g. when the survival distribution is not proper due to high censoring) we return the last time point. +This is \strong{not a good estimate to use in general}, only a reasonable substitute in such cases. +} + +\examples{ +\dontrun{ +if (requireNamespace("mlr3pipelines", quietly = TRUE)) { + library(mlr3pipelines) + task = tsk("rats") + + # add survival time prediction type to the predictions of a Cox model + # Median survival time as response + pred = lrn("surv.coxph")$train(task)$predict(task) + por = po("responsecompose", param_vals = list(method = "median", overwrite = TRUE)) + por$predict(list(pred))[[1L]] + # mostly improper survival distributions, "median" sets the survival time + # to the last time point + + # RMST (default) as response, while also changing the crank = -response + por = po("responsecompose", param_vals = list(overwrite = TRUE, add_crank = TRUE)) + por$predict(list(pred))[[1L]] +} +} +} +\references{ +Zhao, Lihui, Claggett, Brian, Tian, Lu, Uno, Hajime, Pfeffer, A. M, Solomon, D. S, Trippa, Lorenzo, Wei, J. L (2016). +\dQuote{On the restricted mean survival time curve in survival analysis.} +\emph{Biometrics}, \bold{72}(1), 215--221. +ISSN 1541-0420, \doi{10.1111/BIOM.12384}, \url{https://onlinelibrary.wiley.com/doi/full/10.1111/biom.12384}. +} +\seealso{ +\link{pipeline_crankcompositor} + +Other survival compositors: +\code{\link{mlr_pipeops_compose_breslow_distr}}, +\code{\link{mlr_pipeops_crankcompose}}, +\code{\link{mlr_pipeops_distrcompose}} +} +\concept{survival compositors} +\section{Super class}{ +\code{\link[mlr3pipelines:PipeOp]{mlr3pipelines::PipeOp}} -> \code{PipeOpResponseCompositor} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-PipeOpResponseCompositor-new}{\code{PipeOpResponseCompositor$new()}} +\item \href{#method-PipeOpResponseCompositor-clone}{\code{PipeOpResponseCompositor$clone()}} +} +} +\if{html}{\out{ +
Inherited methods + +
+}} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-PipeOpResponseCompositor-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new instance of this \link[R6:R6Class]{R6} class. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{PipeOpResponseCompositor$new(id = "responsecompose", param_vals = list())}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{id}}{(\code{character(1)})\cr +Identifier of the resulting object.} + +\item{\code{param_vals}}{(\code{list()})\cr +List of hyperparameter settings, overwriting the hyperparameter settings that would +otherwise be set during construction.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-PipeOpResponseCompositor-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{PipeOpResponseCompositor$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index ca6655b0f..741098bfd 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -122,4 +122,6 @@ reference: - .surv_return - breslow - assert_surv + - assert_surv_matrix + - get_mortality - MeasureSurvAUC diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a89dfb32e..ef1d8c4b1 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -10,6 +10,17 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// c_assert_surv +bool c_assert_surv(const NumericMatrix& mat); +RcppExport SEXP _mlr3proba_c_assert_surv(SEXP matSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericMatrix& >::type mat(matSEXP); + rcpp_result_gen = Rcpp::wrap(c_assert_surv(mat)); + return rcpp_result_gen; +END_RCPP +} // c_get_unique_times NumericVector c_get_unique_times(NumericVector true_times, NumericVector req_times); RcppExport SEXP _mlr3proba_c_get_unique_times(SEXP true_timesSEXP, SEXP req_timesSEXP) { @@ -23,43 +34,43 @@ BEGIN_RCPP END_RCPP } // c_score_intslogloss -NumericMatrix c_score_intslogloss(NumericVector truth, NumericVector unique_times, NumericMatrix cdf, double eps); +NumericMatrix c_score_intslogloss(const NumericVector& truth, const NumericVector& unique_times, const NumericMatrix& cdf, double eps); RcppExport SEXP _mlr3proba_c_score_intslogloss(SEXP truthSEXP, SEXP unique_timesSEXP, SEXP cdfSEXP, SEXP epsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericVector >::type truth(truthSEXP); - Rcpp::traits::input_parameter< NumericVector >::type unique_times(unique_timesSEXP); - Rcpp::traits::input_parameter< NumericMatrix >::type cdf(cdfSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type truth(truthSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type unique_times(unique_timesSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type cdf(cdfSEXP); Rcpp::traits::input_parameter< double >::type eps(epsSEXP); rcpp_result_gen = Rcpp::wrap(c_score_intslogloss(truth, unique_times, cdf, eps)); return rcpp_result_gen; END_RCPP } // c_score_graf_schmid -NumericMatrix c_score_graf_schmid(NumericVector truth, NumericVector unique_times, NumericMatrix cdf, int power); +NumericMatrix c_score_graf_schmid(const NumericVector& truth, const NumericVector& unique_times, const NumericMatrix& cdf, int power); RcppExport SEXP _mlr3proba_c_score_graf_schmid(SEXP truthSEXP, SEXP unique_timesSEXP, SEXP cdfSEXP, SEXP powerSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericVector >::type truth(truthSEXP); - Rcpp::traits::input_parameter< NumericVector >::type unique_times(unique_timesSEXP); - Rcpp::traits::input_parameter< NumericMatrix >::type cdf(cdfSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type truth(truthSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type unique_times(unique_timesSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type cdf(cdfSEXP); Rcpp::traits::input_parameter< int >::type power(powerSEXP); rcpp_result_gen = Rcpp::wrap(c_score_graf_schmid(truth, unique_times, cdf, power)); return rcpp_result_gen; END_RCPP } // c_weight_survival_score -NumericMatrix c_weight_survival_score(NumericMatrix score, NumericMatrix truth, NumericVector unique_times, NumericMatrix cens, bool proper, double eps); +NumericMatrix c_weight_survival_score(const NumericMatrix& score, const NumericMatrix& truth, const NumericVector& unique_times, const NumericMatrix& cens, bool proper, double eps); RcppExport SEXP _mlr3proba_c_weight_survival_score(SEXP scoreSEXP, SEXP truthSEXP, SEXP unique_timesSEXP, SEXP censSEXP, SEXP properSEXP, SEXP epsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericMatrix >::type score(scoreSEXP); - Rcpp::traits::input_parameter< NumericMatrix >::type truth(truthSEXP); - Rcpp::traits::input_parameter< NumericVector >::type unique_times(unique_timesSEXP); - Rcpp::traits::input_parameter< NumericMatrix >::type cens(censSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type score(scoreSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type truth(truthSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type unique_times(unique_timesSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type cens(censSEXP); Rcpp::traits::input_parameter< bool >::type proper(properSEXP); Rcpp::traits::input_parameter< double >::type eps(epsSEXP); rcpp_result_gen = Rcpp::wrap(c_weight_survival_score(score, truth, unique_times, cens, proper, eps)); @@ -67,30 +78,30 @@ BEGIN_RCPP END_RCPP } // c_concordance -float c_concordance(NumericVector time, NumericVector status, NumericVector crank, double t_max, std::string weight_meth, NumericMatrix cens, NumericMatrix surv, float tiex); +float c_concordance(const NumericVector& time, const NumericVector& status, const NumericVector& crank, double t_max, const std::string& weight_meth, const NumericMatrix& cens, const NumericMatrix& surv, float tiex); RcppExport SEXP _mlr3proba_c_concordance(SEXP timeSEXP, SEXP statusSEXP, SEXP crankSEXP, SEXP t_maxSEXP, SEXP weight_methSEXP, SEXP censSEXP, SEXP survSEXP, SEXP tiexSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericVector >::type time(timeSEXP); - Rcpp::traits::input_parameter< NumericVector >::type status(statusSEXP); - Rcpp::traits::input_parameter< NumericVector >::type crank(crankSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type time(timeSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type status(statusSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type crank(crankSEXP); Rcpp::traits::input_parameter< double >::type t_max(t_maxSEXP); - Rcpp::traits::input_parameter< std::string >::type weight_meth(weight_methSEXP); - Rcpp::traits::input_parameter< NumericMatrix >::type cens(censSEXP); - Rcpp::traits::input_parameter< NumericMatrix >::type surv(survSEXP); + Rcpp::traits::input_parameter< const std::string& >::type weight_meth(weight_methSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type cens(censSEXP); + Rcpp::traits::input_parameter< const NumericMatrix& >::type surv(survSEXP); Rcpp::traits::input_parameter< float >::type tiex(tiexSEXP); rcpp_result_gen = Rcpp::wrap(c_concordance(time, status, crank, t_max, weight_meth, cens, surv, tiex)); return rcpp_result_gen; END_RCPP } // c_gonen -double c_gonen(NumericVector crank, float tiex); +double c_gonen(const NumericVector& crank, float tiex); RcppExport SEXP _mlr3proba_c_gonen(SEXP crankSEXP, SEXP tiexSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericVector >::type crank(crankSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type crank(crankSEXP); Rcpp::traits::input_parameter< float >::type tiex(tiexSEXP); rcpp_result_gen = Rcpp::wrap(c_gonen(crank, tiex)); return rcpp_result_gen; @@ -98,6 +109,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { + {"_mlr3proba_c_assert_surv", (DL_FUNC) &_mlr3proba_c_assert_surv, 1}, {"_mlr3proba_c_get_unique_times", (DL_FUNC) &_mlr3proba_c_get_unique_times, 2}, {"_mlr3proba_c_score_intslogloss", (DL_FUNC) &_mlr3proba_c_score_intslogloss, 4}, {"_mlr3proba_c_score_graf_schmid", (DL_FUNC) &_mlr3proba_c_score_graf_schmid, 4}, diff --git a/src/survival_assert.cpp b/src/survival_assert.cpp new file mode 100644 index 000000000..26c0be371 --- /dev/null +++ b/src/survival_assert.cpp @@ -0,0 +1,26 @@ +#include +using namespace Rcpp; + +// [[Rcpp::export]] +bool c_assert_surv(const NumericMatrix& mat) { + for (int i = 0; i < mat.nrow(); i++) { + if (mat(i, 0) < 0 || mat(i, 0) > 1) { + // check first element + return false; + } + + for (int j = 1; j < mat.ncol(); j++) { + // S(t) in [0,1] + if (mat(i, j) < 0 || mat(i, j) > 1) { + return false; + } + + // S(t) should not increase! + if (mat(i, j) > mat(i, j - 1)) { + return false; + } + } + } + + return true; +} diff --git a/src/survival_scores.cpp b/src/survival_scores.cpp index 038789fd2..c3134e748 100644 --- a/src/survival_scores.cpp +++ b/src/survival_scores.cpp @@ -1,7 +1,5 @@ -#include -#include -#include #include +#include using namespace Rcpp; using namespace std; @@ -15,45 +13,47 @@ NumericVector c_get_unique_times(NumericVector true_times, NumericVector req_tim std::sort(req_times.begin(), req_times.end()); double mintime = true_times(0); - double maxtime = true_times(true_times.length()-1); + double maxtime = true_times(true_times.length() - 1); for (int i = 0; i < req_times.length(); i++) { - if (req_times[i] < mintime || req_times[i] > maxtime || ((i > 1) && req_times[i] == req_times[i-1])) { - req_times.erase (i); - i--; - } + if (req_times[i] < mintime || req_times[i] > maxtime || ((i > 1) && req_times[i] == req_times[i - 1])) { + req_times.erase(i); + i--; + } } if (req_times.length() == 0) { - Rcpp::stop("Requested times are all outside the observed range."); - } else { - for (int i = 0; i < true_times.length(); i++) { - for (int j = 0; j < req_times.length(); j++) { - if(true_times[i] <= req_times[j] && - (i == true_times.length() - 1 || true_times[i + 1] > req_times[j])) { - break; - } else if(j == req_times.length() - 1) { - true_times.erase(i); - i--; - break; - } - } + Rcpp::stop("Requested times are all outside the observed range."); + } + for (int i = 0; i < true_times.length(); i++) { + for (int j = 0; j < req_times.length(); j++) { + if (true_times[i] <= req_times[j] && + (i == true_times.length() - 1 || true_times[i + 1] > req_times[j])) { + break; + } else if (j == req_times.length() - 1) { + true_times.erase(i); + i--; + break; } + } } return true_times; } // [[Rcpp::export]] -NumericMatrix c_score_intslogloss(NumericVector truth, NumericVector unique_times, NumericMatrix cdf, double eps) { +NumericMatrix c_score_intslogloss(const NumericVector& truth, + const NumericVector& unique_times, + const NumericMatrix& cdf, + double eps) { const int nr_obs = truth.length(); const int nc_times = unique_times.length(); NumericMatrix ll(nr_obs, nc_times); for (int i = 0; i < nr_obs; i++) { for (int j = 0; j < nc_times; j++) { - double tmp = (truth[i] > unique_times[j]) ? 1 - cdf(j, i) : cdf(j, i); - ll(i, j) = -log(max(tmp, eps)); + const double tmp = (truth[i] > unique_times[j]) ? 1 - cdf(j, i) : cdf(j, i); + ll(i, j) = -log(max(tmp, eps)); } } @@ -61,15 +61,17 @@ NumericMatrix c_score_intslogloss(NumericVector truth, NumericVector unique_time } // [[Rcpp::export]] -NumericMatrix c_score_graf_schmid(NumericVector truth, NumericVector unique_times, - NumericMatrix cdf, int power = 2){ +NumericMatrix c_score_graf_schmid(const NumericVector& truth, + const NumericVector& unique_times, + const NumericMatrix& cdf, + int power = 2) { const int nr_obs = truth.length(); const int nc_times = unique_times.length(); NumericMatrix igs(nr_obs, nc_times); for (int i = 0; i < nr_obs; i++) { for (int j = 0; j < nc_times; j++) { - double tmp = (truth[i] > unique_times[j]) ? cdf(j, i) : 1 - cdf(j, i); + const double tmp = (truth[i] > unique_times[j]) ? cdf(j, i) : 1 - cdf(j, i); igs(i, j) = std::pow(tmp, power); } } @@ -78,23 +80,24 @@ NumericMatrix c_score_graf_schmid(NumericVector truth, NumericVector unique_time } // [[Rcpp::export(.c_weight_survival_score)]] -NumericMatrix c_weight_survival_score(NumericMatrix score, NumericMatrix truth, - NumericVector unique_times, NumericMatrix cens, - bool proper, double eps){ - NumericVector times = truth(_,0); - NumericVector status = truth(_,1); +NumericMatrix c_weight_survival_score(const NumericMatrix& score, + const NumericMatrix& truth, + const NumericVector& unique_times, + const NumericMatrix& cens, + bool proper, double eps) { + NumericVector times = truth(_, 0); + NumericVector status = truth(_, 1); - NumericVector cens_times = cens(_,0); - NumericVector cens_surv = cens(_,1); + NumericVector cens_times = cens(_, 0); + NumericVector cens_surv = cens(_, 1); const int nr = score.nrow(); const int nc = score.ncol(); - double k = 0; NumericMatrix mat(nr, nc); for (int i = 0; i < nr; i++) { - k = 0; + double k = 0.0; // if censored and proper then zero-out and remove if (proper && status[i] == 0) { mat(i, _) = NumericVector(nc); @@ -105,7 +108,8 @@ NumericMatrix c_weight_survival_score(NumericMatrix score, NumericMatrix truth, // if alive and not proper then IPC weights are current time if (!proper && times[i] > unique_times[j]) { for (int l = 0; l < cens_times.length(); l++) { - if(unique_times[j] >= cens_times[l] && (l == cens_times.length()-1 || unique_times[j] < cens_times[l+1])) { + if (unique_times[j] >= cens_times[l] && + (l == cens_times.length() - 1 || unique_times[j] < cens_times[l + 1])) { mat(i, j) = score(i, j) / cens_surv[l]; break; } @@ -124,12 +128,14 @@ NumericMatrix c_weight_survival_score(NumericMatrix score, NumericMatrix truth, if ((times[i] < cens_times[l]) && l == 0) { k = 1; break; - } else if (times[i] >= cens_times[l] && (l == cens_times.length()-1 || times[i] < cens_times[l+1])) { + } else if (times[i] >= cens_times[l] && + (l == cens_times.length() - 1 || + times[i] < cens_times[l + 1])) { k = cens_surv[l]; - // k == 0 only if last obsv censored, therefore mat is set to 0 anyway - // This division by eps can cause inflation of the score, due to a - // very large value for a particular (i-obs, j-time) - // use 't_max' to filter 'cens' in that case + // k == 0 only if last obsv censored, therefore mat is set to 0 + // anyway This division by eps can cause inflation of the score, + // due to a very large value for a particular (i-obs, j-time) use + // 't_max' to filter 'cens' in that case if (k == 0) { k = eps; } @@ -148,12 +154,17 @@ NumericMatrix c_weight_survival_score(NumericMatrix score, NumericMatrix truth, } // [[Rcpp::export]] -float c_concordance(NumericVector time, NumericVector status, NumericVector crank, - double t_max, std::string weight_meth, NumericMatrix cens, - NumericMatrix surv, float tiex) { - double num = 0; - double den = 0; - double weight = -1; +float c_concordance(const NumericVector& time, + const NumericVector& status, + const NumericVector& crank, + double t_max, + const std::string& weight_meth, + const NumericMatrix& cens, + const NumericMatrix& surv, + float tiex) { + double num = 0.0; + double den = 0.0; + double weight = -1.0; NumericVector cens_times; NumericVector cens_surv; @@ -164,27 +175,28 @@ float c_concordance(NumericVector time, NumericVector status, NumericVector cran int sl = 0; if (weight_meth == "G2" || weight_meth == "G" || weight_meth == "SG") { - cens_times = cens(_,0); - cens_surv = cens(_,1); + cens_times = cens(_, 0); + cens_surv = cens(_, 1); cl = cens_times.length(); } if (weight_meth == "S" || weight_meth == "SG") { - surv_times = surv(_,0); - surv_surv = surv(_,1); + surv_times = surv(_, 0); + surv_surv = surv(_, 1); sl = surv_times.length(); } for (int i = 0; i < time.length() - 1; i++) { weight = -1; - if(status[i] == 1) { + if (status[i] == 1) { for (int j = i + 1; j < time.length(); j++) { if (time[i] < time[j] && time[i] < t_max) { if (weight == -1) { if (weight_meth == "I") { weight = 1; - } else if (weight_meth == "G2" || weight_meth == "G" || weight_meth == "SG") { + } else if (weight_meth == "G2" || weight_meth == "G" || + weight_meth == "SG") { for (int l = 0; l < cl; l++) { - if(time[i] >= cens_times[l] && ((l == cl -1) || time[i] < cens_times[l + 1])) { + if (time[i] >= cens_times[l] && ((l == cl - 1) || time[i] < cens_times[l + 1])) { if (weight_meth == "G" || weight_meth == "SG") { weight = pow(cens_surv[l], -1); } else { @@ -197,7 +209,8 @@ float c_concordance(NumericVector time, NumericVector status, NumericVector cran if (weight_meth == "SG" || weight_meth == "S") { for (int l = 0; l < sl; l++) { - if(time[i] >= surv_times[l] && (l == sl - 1 || time[i] < surv_times[l + 1])) { + if (time[i] >= surv_times[l] && + (l == sl - 1 || time[i] < surv_times[l + 1])) { if (weight_meth == "S") { weight = surv_surv[l]; } else { @@ -221,26 +234,26 @@ float c_concordance(NumericVector time, NumericVector status, NumericVector cran } } - if (den == 0){ + if (den == 0) { Rcpp::stop("Unable to calculate concordance index. No events, or all survival times are identical."); } - return num/den; + return num / den; } // [[Rcpp::export]] -double c_gonen(NumericVector crank, float tiex) { - // NOTE: we assume crank to be sorted! - const int n = crank.length(); - double ghci = 0.0; - - for (int i = 0; i < n - 1; i++) { - double ci = crank[i]; - for (int j = i + 1; j < n; j++) { - double cj = crank[j]; - ghci += ((ci < cj) ? 1 : tiex) / (1 + exp(ci - cj)); - } +double c_gonen(const NumericVector& crank, float tiex) { + // NOTE: we assume crank to be sorted! + const int n = crank.length(); + double ghci = 0.0; + + for (int i = 0; i < n - 1; i++) { + const double ci = crank[i]; + for (int j = i + 1; j < n; j++) { + const double cj = crank[j]; + ghci += ((ci < cj) ? 1 : tiex) / (1 + exp(ci - cj)); } + } - return (2 * ghci) / (n * (n - 1)); + return (2 * ghci) / (n * (n - 1)); } diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 9b11647a0..ecee56752 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -1,3 +1,3 @@ lg = lgr::get_logger("mlr3") old_threshold = lg$threshold -lg$set_threshold("error") +lg$set_threshold("warn") diff --git a/tests/testthat/test_asserts.R b/tests/testthat/test_asserts.R new file mode 100644 index 000000000..fa873e6b4 --- /dev/null +++ b/tests/testthat/test_asserts.R @@ -0,0 +1,17 @@ +test_that("surv matrix asserts", { + x = matrix(data = c(1,0.6,0.4,0.8,0.8,0.7), nrow = 2, ncol = 3, byrow = TRUE) + # no time points specified + expect_error(assert_surv_matrix(x), "column names must be increasing") + + # time points specified, but not numeric + colnames(x) = letters[1:3] + expect_error(suppressWarnings(assert_surv_matrix(x)), "Contains missing values") + + # time points specified and numeric + colnames(x) = c(12, 34, 42) + expect_silent(assert_surv_matrix(x)) + + # S(t) >= S(t+1) + x[2,3] = 0.81 + expect_error(assert_surv_matrix(x), "Survival probabilities must be") +}) diff --git a/tests/testthat/test_breslow.R b/tests/testthat/test_breslow.R index c97f60171..23578a0a5 100644 --- a/tests/testthat/test_breslow.R +++ b/tests/testthat/test_breslow.R @@ -61,7 +61,7 @@ test_that(".cbhaz_breslow works", { expect_numeric(cbh13, len = 5L, lower = 0, upper = Inf, sorted = TRUE) }) -test_that("breslow works", { +test_that("breslow() works", { set.seed(42L) t = tsk("rats")$filter(sample(50)) p = partition(t, ratio = 0.8) @@ -124,3 +124,43 @@ test_that("breslow works", { expect_matrix(surv, nrows = length(lp_test), ncols = length(cbhaz)) expect_true(all(surv >= 0, surv <= 1)) }) + +test_that("breslowcompose PipeOp works", { + # learner is needed + expect_error(mlr3pipelines::po("breslowcompose"), "is missing") + + # learner needs to be of survival type + expect_error(mlr3pipelines::po("breslowcompose", learner = lrn("classif.featureless")), + "must have task type") + # learner needs to have lp predictions + expect_error(mlr3pipelines::po("breslowcompose", learner = lrn("surv.kaplan")), + "must provide lp") + + # learner with lp predictions + learner = lrn("surv.coxph") + b1 = mlr3pipelines::po("breslowcompose", learner = learner, breslow.overwrite = TRUE) + b2 = mlr3pipelines::po("breslowcompose", learner = learner) + + expect_pipeop(b1) + expect_pipeop(b2) + expect_equal(b1$id, learner$id) + expect_equal(b2$id, learner$id) + expect_true(b1$param_set$values$breslow.overwrite) + expect_false(b2$param_set$values$breslow.overwrite) + expect_learner(b1$learner) + expect_error({b1$learner = lrn("surv.kaplan")}) # read-only + + task = tsk("lung") + cox_pred = lrn("surv.coxph")$train(task)$predict(task) + + expect_silent(b1$train(list(task))) + expect_silent(b2$train(list(task))) + p1 = b1$predict(list(task))[[1L]] + p2 = b2$predict(list(task))[[1L]] + + expect_equal(p1$lp, p2$lp) + surv_mat1 = p1$data$distr + surv_mat2 = p2$data$distr + expect_false(all(surv_mat1 == surv_mat2)) # distr predictions changed (a bit) + expect_true(all(surv_mat2 == cox_pred$data$distr)) # distr was not overwritten +}) diff --git a/tests/testthat/test_crankcompose.R b/tests/testthat/test_crankcompose.R new file mode 100644 index 000000000..e65a8bd94 --- /dev/null +++ b/tests/testthat/test_crankcompose.R @@ -0,0 +1,52 @@ +test_that("basic properties", { + expect_pipeop(PipeOpCrankCompositor$new()) + expect_equal(PipeOpCrankCompositor$new()$param_set$values$method, "mort") + + # check that during construction, initial values are not overwritten + values = PipeOpCrankCompositor$new()$param_set$values + values2 = PipeOpCrankCompositor$new(param_vals = list(method = "mort"))$param_set$values + expect_equal(values, values2) +}) + +set.seed(42) +task = tgen("simsurv")$generate(20L) +pcox = lrn("surv.coxph")$train(task)$predict(task) + +test_that("no params", { + po = PipeOpCrankCompositor$new(param_vals = list()) + pred_km = lrn("surv.kaplan")$train(task)$predict(task) + p = po$predict(list(pred_km))[[1L]] + expect_prediction_surv(p) + # by default, no overwrite happens + expect_identical(p$crank, pred_km$crank) +}) + +test_that("overwrite crank", { + # no overwrite + poc = mlr3pipelines::po("crankcompose") + p1 = poc$predict(list(pcox))[[1L]] + expect_identical(p1$crank, pcox$crank) + + # overwrite crank + poc = mlr3pipelines::po("crankcompose", param_vals = list(overwrite = TRUE)) + p2 = poc$predict(list(pcox))[[1L]] + expect_false(all(p2$crank == pcox$crank)) + + # even if prediction doesn't have crank somehow, pipeop will add it even if no overwrite + por = mlr3pipelines::po("crankcompose", param_vals = list(overwrite = FALSE)) + p2$data$crank = NULL + expect_true(all(is.na(p2$crank))) # NAs produce so that c-index can be calculated + p3 = poc$predict(list(p2))[[1L]] + expect_false(any(is.na(p3$crank))) +}) + +test_that("pipeline works", { + pipe = mlr3pipelines::ppl("crankcompositor", learner = lrn("surv.kaplan")) + expect_class(pipe, "Graph") + + grlrn = mlr3pipelines::ppl("crankcompositor", learner = lrn("surv.kaplan"), graph_learner = TRUE) + expect_class(grlrn, "GraphLearner") + p = grlrn$train(task)$predict(task) + expect_prediction_surv(p) + expect_true("crank" %in% p$predict_types) +}) diff --git a/tests/testthat/test_pipeop_trafotask_survclassif_disctime.R b/tests/testthat/test_discetetime.R similarity index 100% rename from tests/testthat/test_pipeop_trafotask_survclassif_disctime.R rename to tests/testthat/test_discetetime.R diff --git a/tests/testthat/test_distrcompose.R b/tests/testthat/test_distrcompose.R new file mode 100644 index 000000000..64257aa15 --- /dev/null +++ b/tests/testthat/test_distrcompose.R @@ -0,0 +1,69 @@ +test_that("basic properties", { + expect_pipeop(PipeOpDistrCompositor$new()) + + # check that during construction, initial values are not overwritten + values = PipeOpDistrCompositor$new()$param_set$values + values2 = PipeOpDistrCompositor$new(param_vals = list(overwrite = FALSE))$param_set$values + expect_equal(values, values2) +}) + +set.seed(42L) +task = tsk("rats")$filter(sample(300, 110L)) +cox_pred = lrn("surv.coxph")$train(task)$predict(task) + +test_that("no params", { + base = lrn("surv.kaplan")$train(task)$predict(task) + pred = lrn("surv.kaplan")$train(task)$predict(task) + pod = mlr3pipelines::po("distrcompose", param_vals = list()) + expect_silent(pod$predict(list(base = base, pred = pred))) +}) + +test_that("overwrite = FALSE", { + gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.kaplan"), overwrite = FALSE) + expect_class(gr, "Graph") + expect_silent(gr$train(task)) + expect_identical( + gr$predict(task)[[1L]]$data$distr, + lrn("surv.kaplan")$train(task)$predict(task)$data$distr + ) + + # breslow + gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.coxph"), + estimator = "breslow", overwrite = FALSE) + expect_silent(gr$train(task)) + expect_identical(gr$predict(task)[[1L]]$data$distr, cox_pred$data$distr) +}) + +test_that("overwrite = TRUE", { + gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.kaplan"), overwrite = TRUE, form = "ph") + expect_class(gr, "Graph") + expect_silent(gr$train(task)) + p = gr$predict(task)[[1L]] + expect_prediction_surv(p) + expect_true("distr" %in% p$predict_types) + + grlrn = mlr3pipelines::ppl("distrcompositor", learner = lrn("surv.coxph"), + overwrite = TRUE, form = "po", graph_learner = TRUE) + expect_class(grlrn, "GraphLearner") + p = grlrn$train(task)$predict(task) + expect_prediction_surv(p) + expect_false(all(p$data$distr == cox_pred$data$distr)) # PO distr != Cox's Breslow default + + grlrn = mlr3pipelines::ppl("distrcompositor", learner = lrn("surv.coxph"), + overwrite = TRUE, form = "aft", graph_learner = TRUE) + p = grlrn$train(task)$predict(task) + expect_false(all(p$data$distr == cox_pred$data$distr)) # AFT distr != Cox's Breslow default + + # our breslow seems different from Cox's breslow + gr = mlr3pipelines::ppl("distrcompositor", learner = lrn("surv.coxph"), + estimator = "breslow", overwrite = TRUE, graph_learner = TRUE) + p = grlrn$train(task)$predict(task) + expect_false(all(p$data$distr == cox_pred$data$distr)) # distr predictions changed (a bit) +}) + +test_that("composition from crank doesn't work", { + grlrn = mlr3pipelines::ppl("distrcompositor", learner = lrn("surv.rpart"), graph_learner = TRUE) + p = grlrn$train(task)$predict(task) + # rpart has only crank prediction, so no distr composition can be made + expect_false("distr" %in% p$predict_types) +}) diff --git a/tests/testthat/test_mlr_learners_surv_coxph.R b/tests/testthat/test_mlr_learners_surv_coxph.R index f3aa944b9..dafc5f41f 100644 --- a/tests/testthat/test_mlr_learners_surv_coxph.R +++ b/tests/testthat/test_mlr_learners_surv_coxph.R @@ -4,7 +4,9 @@ test_that("autotest", { expect_learner(learner) ## no idea why weights check here fails, we test the same task ## in the below test and it works! - result = run_autotest(learner, exclude = "weights", check_replicable = FALSE, N = 10L) + result = suppressWarnings( + run_autotest(learner, exclude = "weights", check_replicable = FALSE, N = 10L) + ) expect_true(result, info = result$error) }) }) diff --git a/tests/testthat/test_mlr_measures.R b/tests/testthat/test_mlr_measures.R index 71c81c5e7..e5c1f7420 100644 --- a/tests/testthat/test_mlr_measures.R +++ b/tests/testthat/test_mlr_measures.R @@ -163,22 +163,21 @@ test_that("t_max, p_max", { test_that("ERV works as expected", { set.seed(1L) - t = tsk("rats")$filter(sample(1:300, 50)) + t = tsk("rats") + part = partition(t, 0.8) l = lrn("surv.kaplan") - p = l$train(t)$predict(t) + p = l$train(t, part$train)$predict(t, part$test) m = msr("surv.graf", ERV = TRUE) - expect_equal(as.numeric(p$score(m, task = t, train_set = t$row_ids)), 0) - expect_equal(as.numeric(resample(t, l, rsmp("holdout"))$aggregate(m)), 0) + # KM is the baseline score, so ERV score = 0 + expect_equal(as.numeric(p$score(m, task = t, train_set = part$train)), 0) - set.seed(1L) - t = tsk("rats")$filter(sample(1:300, 100)) l = lrn("surv.coxph") - p = suppressWarnings(l$train(t)$predict(t)) + p = l$train(t, part$train)$predict(t, part$test) m = msr("surv.graf", ERV = TRUE) - expect_gt(as.numeric(p$score(m, task = t, train_set = t$row_ids)), 0) - expect_gt(suppressWarnings(as.numeric(resample(t, l, rsmp("holdout"))$ - aggregate(m))), 0) + # Cox should do a little better than the KM baseline (ERV score > 0) + expect_gt(as.numeric(p$score(m, task = t, train_set = part$train)), 0) + # some checks set.seed(1L) t = tsk("rats")$filter(sample(1:300, 50)) l = lrn("surv.kaplan") diff --git a/tests/testthat/test_partition.R b/tests/testthat/test_partition.R deleted file mode 100644 index 1bd99339a..000000000 --- a/tests/testthat/test_partition.R +++ /dev/null @@ -1,17 +0,0 @@ -test_that("partition w/ stratification works", { - task = tsk("rats") - sets = partition(task) - - ratio = function(status) { - tab = table(status) - unname(tab[1L] / tab[2L]) - } - - all = ratio(task$status()) - train = ratio(task$status(sets$train)) - test = ratio(task$status(sets$test)) - - expect_numeric(all, lower = 6, upper = 6.2) - expect_numeric(train, lower = 6, upper = 6.2) - expect_numeric(test, lower = 6, upper = 6.2) -}) diff --git a/tests/testthat/test_pipelines.R b/tests/testthat/test_pipelines.R index fae131dd9..4d2444b4d 100644 --- a/tests/testthat/test_pipelines.R +++ b/tests/testthat/test_pipelines.R @@ -1,29 +1,6 @@ task = tsk("rats")$filter(sample(300, 50L)) task_regr = tgen("friedman1")$generate(20L) -test_that("crankcompositor", { - pipe = mlr3pipelines::ppl("crankcompositor", learner = lrn("surv.kaplan")) - expect_class(pipe, "Graph") - pipe = mlr3pipelines::ppl("crankcompositor", learner = lrn("surv.kaplan"), graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - pipe$train(task) - p = pipe$predict(task) - expect_prediction_surv(p) - expect_true("crank" %in% p$predict_types) -}) - -test_that("distrcompositor", { - pipe = mlr3pipelines::ppl("distrcompositor", learner = lrn("surv.rpart")) - expect_class(pipe, "Graph") - - pipe = mlr3pipelines::ppl("distrcompositor", learner = lrn("surv.rpart"), graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - - p = pipe$train(task)$predict(task) - expect_prediction_surv(p) - expect_true("distr" %in% p$predict_types) -}) - test_that("survaverager", { pipe = mlr3pipelines::ppl("survaverager", learners = list(lrn("surv.kaplan"), lrn("surv.kaplan", id = "k2"))) @@ -47,67 +24,6 @@ test_that("survbagging", { expect_prediction_surv(p) }) -test_that("resample survtoregr", { - pipe = mlr3pipelines::ppl("survtoregr", method = 1, distrcompose = FALSE, graph_learner = TRUE) - rr = resample(task, pipe, rsmp("cv", folds = 2L)) - expect_numeric(rr$aggregate()) -}) - -test_that("survtoregr 1", { - pipe = mlr3pipelines::ppl("survtoregr", method = 1, distrcompose = FALSE) - expect_class(pipe, "Graph") - pipe = mlr3pipelines::ppl("survtoregr", method = 1, distrcompose = FALSE, graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - pipe$train(task) - p = pipe$predict(task) - expect_prediction_surv(p) - - pipe = mlr3pipelines::ppl("survtoregr", method = 1, distrcompose = TRUE, graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - pipe$train(task) - p = pipe$predict(task) - expect_prediction_surv(p) - expect_true("distr" %in% p$predict_types) -}) - -test_that("survtoregr 2", { - pipe = mlr3pipelines::ppl("survtoregr", method = 2) - expect_class(pipe, "Graph") - pipe = mlr3pipelines::ppl("survtoregr", method = 2, graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - pipe$train(task) - p = pipe$predict(task) - expect_prediction_surv(p) - expect_true("distr" %in% p$predict_types) - - pipe = mlr3pipelines::ppl("survtoregr", method = 2, regr_se_learner = lrn("regr.featureless"), - graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - pipe$train(task) - p = pipe$predict(task) - expect_prediction_surv(p) - expect_true("distr" %in% p$predict_types) -}) - -test_that("survtoregr 3", { - pipe = mlr3pipelines::ppl("survtoregr", method = 3, distrcompose = FALSE) - expect_class(pipe, "Graph") - pipe = mlr3pipelines::ppl("survtoregr", method = 3, distrcompose = FALSE, - graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - suppressWarnings(pipe$train(task)) # suppress loglik warning - p = pipe$predict(task) - expect_prediction_surv(p) - - pipe = mlr3pipelines::ppl("survtoregr", method = 3, distrcompose = TRUE, - graph_learner = TRUE) - expect_class(pipe, "GraphLearner") - suppressWarnings(pipe$train(task)) # suppress loglik warning - p = pipe$predict(task) - expect_prediction_surv(p) - expect_true("distr" %in% p$predict_types) -}) - skip_if_not_installed("mlr3learners") test_that("survtoclassif_disctime", { diff --git a/tests/testthat/test_pipeop_crankcompose.R b/tests/testthat/test_pipeop_crankcompose.R deleted file mode 100644 index 95672941a..000000000 --- a/tests/testthat/test_pipeop_crankcompose.R +++ /dev/null @@ -1,88 +0,0 @@ -test_that("PipeOpCrankCompositor - basic properties", { - expect_pipeop(PipeOpCrankCompositor$new()) - expect_equal(PipeOpCrankCompositor$new()$param_set$values$method, "sum_haz") - - # check that during construction, initial values are not overwritten - values = PipeOpCrankCompositor$new()$param_set$values - values2 = PipeOpCrankCompositor$new(param_vals = list(method = "sum_haz"))$param_set$values - expect_equal(values, values2) -}) - -set.seed(2218L) -task = tgen("simsurv")$generate(20L) - -test_that("PipeOpCrankCompositor - estimate", { - gr = mlr3pipelines::ppl("crankcompositor", lrn("surv.coxph"), method = "mode", which = 1) - suppressWarnings(gr$train(task)) - p = gr$predict(task)[[1L]] - expect_prediction_surv(p) - expect_true("crank" %in% p$predict_types) -}) - -test_that("no params", { - po = PipeOpCrankCompositor$new(param_vals = list()) - p = po$predict( - list(lrn("surv.kaplan")$train(task)$predict(task)))$output - expect_prediction_surv(p) - expect_equal(p$lp, rep(NA_real_, 20L)) -}) - -test_that("response", { - po = PipeOpCrankCompositor$new( - param_vals = list(response = TRUE, method = "median") - ) - p = po$predict( - list(lrn("surv.coxph")$train(task)$predict(task)) - )$output - ignore = is.na(unlist(as.numeric(p$distr$median()))) - expect_equal(p$response[ignore], numeric(sum(ignore))) - expect_equal(p$response[!ignore], unlist(as.numeric(p$distr$median()))[!ignore]) - - p = pipeline_crankcompositor(lrn("surv.coxph"), response = TRUE, - graph_learner = TRUE, method = "median")$train(task)$predict(task) - ignore = is.na(unlist(as.numeric(p$distr$median()))) - expect_equal(p$response[ignore], numeric(sum(ignore))) - expect_equal(p$response[!ignore], unlist(as.numeric(p$distr$median()))[!ignore]) -}) - -test_that("overwrite crank", { - pl = mlr3pipelines::ppl("crankcompositor", - lrn("surv.kaplan"), - method = "median", - graph_learner = TRUE) - p1 = pl$train(task)$predict(task) - p2 = lrn("surv.kaplan")$train(task)$predict(task) - expect_true(all(p1$crank == p2$crank)) - - pl = mlr3pipelines::ppl("crankcompositor", - lrn("surv.kaplan"), - graph_learner = TRUE, - overwrite = TRUE) - p1 = pl$train(task)$predict(task) - p2 = lrn("surv.kaplan")$train(task)$predict(task) - expect_false(all(p1$crank == p2$crank)) -}) - -test_that("overwrite response", { - p = lrn("surv.kaplan")$train(task)$predict(task) - p1 = PredictionSurv$new(task = task, crank = p$crank, distr = p$distr, response = rexp(20, 0.5)) - po = PipeOpCrankCompositor$new(param_vals = list(response = TRUE, overwrite = FALSE)) - p2 = po$predict(list(p1))[[1L]] - expect_true(all(p1$response == p2$response)) - - p1 = PredictionSurv$new(task = task, crank = p$crank, distr = p$distr, response = rexp(20, 0.5)) - po = PipeOpCrankCompositor$new(param_vals = list(response = TRUE, overwrite = TRUE)) - p2 = po$predict(list(p1))[[1L]] - expect_false(all(p1$response == p2$response)) -}) - -test_that("neg crank", { - pl = mlr3pipelines::ppl("crankcompositor", - lrn("surv.kaplan"), - method = "sum_haz", - graph_learner = TRUE, - overwrite = TRUE) - p = pl$train(task)$predict(task) - expect_true(all(p$crank < 0)) - expect_gte(p$score(), 0.5) -}) diff --git a/tests/testthat/test_pipeop_distrcompose.R b/tests/testthat/test_pipeop_distrcompose.R deleted file mode 100644 index 242dff761..000000000 --- a/tests/testthat/test_pipeop_distrcompose.R +++ /dev/null @@ -1,91 +0,0 @@ -test_that("PipeOpDistrCompositor - basic properties", { - expect_pipeop(PipeOpDistrCompositor$new()) - - # check that during construction, initial values are not overwritten - values = PipeOpDistrCompositor$new()$param_set$values - values2 = PipeOpDistrCompositor$new(param_vals = list(overwrite = FALSE))$param_set$values - expect_equal(values, values2) -}) - -set.seed(42L) -task = tsk("rats")$filter(sample(300, 110L)) -cox_pred = lrn("surv.coxph")$train(task)$predict(task) - -test_that("PipeOpDistrCompositor - overwrite = FALSE", { - gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.kaplan", id = "k2"), overwrite = FALSE) - expect_silent(gr$train(task)) - expect_equal( - gr$predict(task)[[1L]]$data$distr, - lrn("surv.kaplan", id = "k2")$train(task)$predict(task)$data$distr - ) - - # breslow - gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.coxph"), - estimator = "breslow", overwrite = FALSE) - expect_silent(gr$train(task)) - expect_equal(gr$predict(task)[[1L]]$data$distr, cox_pred$data$distr) -}) - -test_that("PipeOpDistrCompositor - overwrite = TRUE", { - gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.kaplan", id = "k2"), overwrite = TRUE, form = "ph") - expect_silent(gr$train(task)) - p = gr$predict(task)[[1L]] - expect_prediction_surv(p) - expect_true("distr" %in% p$predict_types) - - gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.kaplan", id = "k2"), overwrite = TRUE, form = "po", - graph_learner = TRUE) - expect_prediction_surv(gr$train(task)$predict(task)) - - # breslow - gr = mlr3pipelines::ppl("distrcompositor", lrn("surv.coxph"), estimator = "breslow", - overwrite = TRUE, graph_learner = TRUE) - expect_silent(gr$train(task)) - surv_mat1 = gr$predict(task)$data$distr - surv_mat2 = cox_pred$data$distr - expect_false(all(surv_mat1 == surv_mat2)) # distr predictions changed (a bit) -}) - -test_that("no params", { - base = lrn("surv.kaplan")$train(task)$predict(task) - pred = lrn("surv.kaplan", id = "k2")$train(task)$predict(task) - pod = mlr3pipelines::po("distrcompose", param_vals = list()) - expect_silent(pod$predict(list(base = base, pred = pred))) -}) - -test_that("breslow PipeOp works", { - # learner is needed - expect_error(po("breslowcompose"), "is missing") - - # learner needs to be of survival type - expect_error(po("breslowcompose", learner = lrn("classif.featureless")), - "must have task type") - # learner needs to have lp predictions - expect_error(po("breslowcompose", learner = lrn("surv.kaplan")), - "must provide lp") - - # learner with lp predictions - learner = lrn("surv.coxph") - b1 = po("breslowcompose", learner = learner, breslow.overwrite = TRUE) - b2 = po("breslowcompose", learner = learner) - - expect_pipeop(b1) - expect_pipeop(b2) - expect_equal(b1$id, learner$id) - expect_equal(b2$id, learner$id) - expect_true(b1$param_set$values$breslow.overwrite) - expect_false(b2$param_set$values$breslow.overwrite) - expect_learner(b1$learner) - expect_error({b1$learner = lrn("surv.kaplan")}) # read-only - - expect_silent(b1$train(list(task))) - expect_silent(b2$train(list(task))) - p1 = b1$predict(list(task))[[1L]] - p2 = b2$predict(list(task))[[1L]] - - expect_equal(p1$lp, p2$lp) - surv_mat1 = p1$data$distr - surv_mat2 = p2$data$distr - expect_false(all(surv_mat1 == surv_mat2)) # distr predictions changed (a bit) - expect_true(all(surv_mat2 == cox_pred$data$distr)) # distr was not overwritten -}) diff --git a/tests/testthat/test_responsecompose.R b/tests/testthat/test_responsecompose.R new file mode 100644 index 000000000..8886426f6 --- /dev/null +++ b/tests/testthat/test_responsecompose.R @@ -0,0 +1,82 @@ +test_that("basic properties", { + expect_pipeop(PipeOpResponseCompositor$new()) + expect_equal(PipeOpResponseCompositor$new()$param_set$values$method, "rmst") + expect_false(PipeOpResponseCompositor$new()$param_set$values$add_crank) + expect_false(PipeOpResponseCompositor$new()$param_set$values$overwrite) + + # check that during construction, initial values are not overwritten + values = PipeOpResponseCompositor$new()$param_set$values + values2 = PipeOpResponseCompositor$new(param_vals = list(method = "rmst"))$param_set$values + expect_equal(values, values2) + + # parameter checks + expect_error(PipeOpResponseCompositor$new(param_vals = list(method = "wrong"))) + expect_error(PipeOpResponseCompositor$new(param_vals = list(add_crank = "not_a_bool"))) +}) + +set.seed(42) +task = tgen("coxed", T = 99)$generate(20L) +pcox = lrn("surv.coxph")$train(task)$predict(task) +pcox$data$response = rexp(20) # hack: add survival time predictions to cox model! + +test_that("overwrite", { + # no overwrite + por = mlr3pipelines::po("responsecompose") + p1 = por$predict(list(pcox))[[1L]] + expect_identical(p1$response, pcox$response) + + # overwrite response + por = mlr3pipelines::po("responsecompose", overwrite = TRUE) + p2 = por$predict(list(pcox))[[1L]] + expect_false(all(p2$response == pcox$response)) + + # even if prediction doesn't have response, pipeop will add them even if no overwrite + por = mlr3pipelines::po("responsecompose") + pkm = lrn("surv.kaplan")$train(task)$predict(task) + expect_null(pkm$response) + p3 = por$predict(list(pkm))[[1L]] + expect_false(is.null(p3$response)) + expect_identical(pkm$crank, p3$crank) +}) + +test_that("different methods, different responses", { + por = mlr3pipelines::po("responsecompose", overwrite = TRUE, method = "rmst") + p1 = por$predict(list(pcox))[[1L]] + por$param_set$set_values(method = "median") + p2 = por$predict(list(pcox))[[1L]] + expect_false(all(p1$response == p2$response)) +}) + +test_that("different cutoffs, different rmst", { + por1 = mlr3pipelines::po("responsecompose", overwrite = TRUE, method = "rmst") + por2 = mlr3pipelines::po("responsecompose", overwrite = TRUE, method = "rmst", + cutoff_time = 100) # t_max = 99 in the generated data + por3 = mlr3pipelines::po("responsecompose", overwrite = TRUE, method = "rmst", + cutoff_time = 65) + por4 = mlr3pipelines::po("responsecompose", overwrite = TRUE, method = "rmst", + cutoff_time = 25) + p1 = por1$predict(list(pcox))[[1L]] + p2 = por2$predict(list(pcox))[[1L]] + p3 = por3$predict(list(pcox))[[1L]] + p4 = por4$predict(list(pcox))[[1L]] + expect_identical(p1$response, p2$response) + expect_false(all(p2$response == p3$response)) + expect_false(all(p2$response == p4$response)) + expect_false(all(p3$response == p4$response)) + expect_gte(max(p2$response), max(p3$response)) + expect_gte(max(p3$response), max(p4$response)) +}) + +test_that("crank is added", { + por = mlr3pipelines::po("responsecompose", overwrite = FALSE, add_crank = TRUE) + p1 = por$predict(list(pcox))[[1L]] + # same crank or response + expect_identical(p1$response, pcox$response) + + por = mlr3pipelines::po("responsecompose", overwrite = TRUE, add_crank = TRUE) + p2 = por$predict(list(pcox))[[1L]] + # response changed + expect_false(all(pcox$response == p2$response)) + # crank is -response + expect_identical(p2$response, -p2$crank) +}) diff --git a/tests/testthat/test_pipeop_survavg.R b/tests/testthat/test_survavg.R similarity index 54% rename from tests/testthat/test_pipeop_survavg.R rename to tests/testthat/test_survavg.R index b0b642f51..765f1c0d6 100644 --- a/tests/testthat/test_pipeop_survavg.R +++ b/tests/testthat/test_survavg.R @@ -3,23 +3,22 @@ test_that("basic properties", { expect_pipeop(PipeOpSurvAvg$new(param_vals = list())) }) -task = tsk("rats")$filter(sample(300, 5L)) +task = tsk("lung")$filter(1:50) p1 = lrn("surv.kaplan")$train(task)$predict(task) -p2 = lrn("surv.kaplan")$train(task)$predict(task) - +p2 = lrn("surv.coxph")$train(task)$predict(task) test_that("equal weights", { poc = mlr3pipelines::po("survavg") expect_silent({ - p = poc$predict(list(p1, p2))$output + p = poc$predict(list(p1, p2))[[1L]] }) expect_prediction_surv(p) - expect_equal(p$crank, (p1$crank + p2$crank) / 2) + expect_equal(p$crank, unname((p1$crank + p2$crank)) / 2) expect_equal(as.numeric(p$distr$cdf(100)), as.numeric((p1$distr$cdf(100) + p2$distr$cdf(100)) / 2)) - expect_equal(p$lp, rep(NA_real_, 5L)) - expect_equal(p$response, rep(NA_real_, 5L)) + expect_null(p$lp) # Cox's lp's were deleted + expect_null(p$response) }) test_that("unequal weights", { @@ -29,7 +28,7 @@ test_that("unequal weights", { }) expect_prediction_surv(p) - expect_equal(p$crank, (p1$crank * 0.2 + p2$crank * 0.8)) + expect_equal(p$crank, unname(p1$crank * 0.2 + p2$crank * 0.8)) expect_equal(as.numeric(p$distr$cdf(100)), as.numeric((p1$distr$cdf(100) * 0.2 + p2$distr$cdf(100) * 0.8))) }) @@ -37,28 +36,29 @@ test_that("unequal weights", { test_that("lp", { poc = mlr3pipelines::po("survavg") expect_silent({ - p = poc$predict(list(p1, p1))$output + p = poc$predict(list(p2, p2))[[1L]] }) - expect_equal(p$lp, (p1$lp + p1$lp) / 2) + expect_equal(p$lp, unname((p2$lp + p2$lp) / 2)) }) test_that("response", { poc = mlr3pipelines::po("survavg") - p3 = mlr3pipelines::ppl("crankcompositor", lrn("surv.kaplan"), - response = TRUE, graph_learner = TRUE)$train(task)$predict(task) + por = mlr3pipelines::ppl("responsecompositor", learner = lrn("surv.coxph"), + graph_learner = TRUE) + p3 = por$train(task)$predict(task) expect_silent({ - p = poc$predict(list(p3, p3))$output + p = poc$predict(list(p3, p3))[[1L]] }) - expect_equal(p$response, (p3$response + p3$response) / 2) + expect_equal(p$response, unname((p3$response + p3$response) / 2)) }) -test_that("surv_averager", { +test_that("pipeline surv_averager", { poc = mlr3pipelines::po("survavg", weights = c(0.2, 0.8)) p = poc$predict(list(p1, p2))[[1L]] - p2 = mlr3pipelines::ppl("survaverager", list(lrn("surv.kaplan"), lrn("surv.kaplan", id = "k2")), - list(weights = c(0.2, 0.8)), - graph_learner = TRUE)$ + p2 = mlr3pipelines::ppl("survaverager", + learners = list(lrn("surv.kaplan"), lrn("surv.coxph")), + list(weights = c(0.2, 0.8)), graph_learner = TRUE)$ train(task)$predict(task) expect_equal(p$crank, p2$crank) diff --git a/tests/testthat/test_survtoregr.R b/tests/testthat/test_survtoregr.R new file mode 100644 index 000000000..74ad389c9 --- /dev/null +++ b/tests/testthat/test_survtoregr.R @@ -0,0 +1,54 @@ +skip("Due to bugs in survtoregr methods") +test_that("resample survtoregr", { + grlrn = mlr3pipelines::ppl("survtoregr", method = 1, distrcompose = FALSE, graph_learner = TRUE) + rr = resample(task, grlrn, rsmp("cv", folds = 2L)) + expect_numeric(rr$aggregate()) +}) + +test_that("survtoregr 1", { + pipe = mlr3pipelines::ppl("survtoregr", method = 1) + expect_class(pipe, "Graph") + grlrn = mlr3pipelines::ppl("survtoregr", method = 1, graph_learner = TRUE) + expect_class(grlrn, "GraphLearner") + p = grlrn$train(task)$predict(task) + expect_prediction_surv(p) + expect_true("response" %in% p$predict_types) +}) + +test_that("survtoregr 2", { + pipe = mlr3pipelines::ppl("survtoregr", method = 2) + expect_class(pipe, "Graph") + pipe = mlr3pipelines::ppl("survtoregr", method = 2, graph_learner = TRUE) + expect_class(pipe, "GraphLearner") + pipe$train(task) + p = pipe$predict(task) + expect_prediction_surv(p) + expect_true("distr" %in% p$predict_types) + + pipe = mlr3pipelines::ppl("survtoregr", method = 2, regr_se_learner = lrn("regr.featureless"), + graph_learner = TRUE) + expect_class(pipe, "GraphLearner") + pipe$train(task) + p = pipe$predict(task) + expect_prediction_surv(p) + expect_true("distr" %in% p$predict_types) +}) + +test_that("survtoregr 3", { + pipe = mlr3pipelines::ppl("survtoregr", method = 3, distrcompose = FALSE) + expect_class(pipe, "Graph") + pipe = mlr3pipelines::ppl("survtoregr", method = 3, distrcompose = FALSE, + graph_learner = TRUE) + expect_class(pipe, "GraphLearner") + suppressWarnings(pipe$train(task)) # suppress loglik warning + p = pipe$predict(task) + expect_prediction_surv(p) + + pipe = mlr3pipelines::ppl("survtoregr", method = 3, distrcompose = TRUE, + graph_learner = TRUE) + expect_class(pipe, "GraphLearner") + suppressWarnings(pipe$train(task)) # suppress loglik warning + p = pipe$predict(task) + expect_prediction_surv(p) + expect_true("distr" %in% p$predict_types) +}) diff --git a/tests/testthat/test_unload.R b/tests/testthat/test_unload.R index 3ff3ecedb..db25bceaf 100644 --- a/tests/testthat/test_unload.R +++ b/tests/testthat/test_unload.R @@ -1,4 +1,4 @@ -skip("To deal with this properly on the mlr3 workshop") +skip("Test manually it works and update as needed") test_that("unloading leaves no trace", { library(mlr3proba) @@ -13,15 +13,27 @@ test_that("unloading leaves no trace", { proba_tgen_keys = tgen_dt[task_type %in% c("surv", "dens"), list(key)][[1]] # UPDATE below list when new pipeops are implemented - proba_pipeops = c("survavg", "trafopred_classifsurv", "trafopred_survregr", - "trafotask_regrsurv", "crankcompose", "compose_probregr", - "distrcompose", "trafopred_regrsurv", "trafotask_survregr", - "trafotask_survclassif", "breslowcompose") + proba_pipeops = c( + # miscellaneous + "survavg", "compose_probregr", + # compose prediction types + "crankcompose", "distrcompose", "responsecompose", "breslowcompose", + # transform prediction type + "trafopred_classifsurv_disctime", "trafopred_survregr", "trafopred_regrsurv", + # transform task type + "trafotask_regrsurv", "trafotask_survregr", "trafotask_survclassif_disctime" + ) expect_in(proba_pipeops, mlr_pipeops$keys()) # UPDATE below list when new pipelines are implemented - proba_graphs = c("survaverager", "survbagging", "crankcompositor", - "distrcompositor", "probregr", "survtoregr", "survtoclassif") + proba_graphs = c( + # miscellaneous + "survaverager", "survbagging", "probregr", + # compose prediction types + "crankcompositor", "distrcompositor", "responsecompositor", + # transform surv to other tasks + "survtoregr", "survtoclassif_disctime" + ) expect_in(proba_graphs, mlr_graphs$keys()) unloadNamespace("mlr3proba")