diff --git a/.editorconfig b/.editorconfig index f5ef1a534..1b8da5d5c 100644 --- a/.editorconfig +++ b/.editorconfig @@ -15,7 +15,7 @@ indent_size = 2 indent_size = 4 [*.{cpp,hpp}] -indent_size = 4 +indent_size = 2 [{NEWS.md,DESCRIPTION,LICENSE}] max_line_length = 80 diff --git a/DESCRIPTION b/DESCRIPTION index 9258055c7..26b5b06cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mlr3proba Title: Probabilistic Supervised Learning for 'mlr3' -Version: 0.6.6 +Version: 0.6.8 Authors@R: c(person(given = "Raphael", family = "Sonabend", @@ -64,11 +64,9 @@ Imports: paradox (>= 1.0.0), R6, Rcpp (>= 1.0.4), - survival, - survivalmodels (>= 0.1.12) + survival Suggests: bujar, - cubature, GGally, knitr, lgr, @@ -145,6 +143,7 @@ Collate: 'PipeOpPredRegrSurv.R' 'PipeOpPredSurvRegr.R' 'PipeOpProbregrCompositor.R' + 'PipeOpResponseCompositor.R' 'PipeOpSurvAvg.R' 'PipeOpTaskRegrSurv.R' 'PipeOpTaskSurvClassifDiscTime.R' @@ -176,7 +175,6 @@ Collate: 'histogram.R' 'integrated_scores.R' 'mlr3proba-package.R' - 'partition.R' 'pecs.R' 'pipelines.R' 'plot.R' diff --git a/NAMESPACE b/NAMESPACE index 673c263ba..aeb2060ed 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,7 +25,6 @@ S3method(check_prediction_data,PredictionDataSurv) S3method(filter_prediction_data,PredictionDataSurv) S3method(is_missing_prediction_data,PredictionDataDens) S3method(is_missing_prediction_data,PredictionDataSurv) -S3method(partition,TaskSurv) S3method(pecs,PredictionSurv) S3method(pecs,list) S3method(plot,LearnerSurv) @@ -77,6 +76,7 @@ export(PipeOpPredRegrSurv) export(PipeOpPredSurvRegr) export(PipeOpPredTransformer) export(PipeOpProbregr) +export(PipeOpResponseCompositor) export(PipeOpSurvAvg) export(PipeOpTaskRegrSurv) export(PipeOpTaskSurvClassifDiscTime) @@ -95,7 +95,9 @@ export(as_prediction_surv) export(as_task_dens) export(as_task_surv) export(assert_surv) +export(assert_surv_matrix) export(breslow) +export(get_mortality) export(pecs) export(pipeline_survtoclassif_disctime) export(pipeline_survtoregr) diff --git a/NEWS.md b/NEWS.md index e5e720a95..293fcf698 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# mlr3proba 0.6.8 + +- `Rcpp` code optimizations +- Fixed ERV scoring to comply with `mlr3` dev version (no bugs before) +- Skipping `survtoregr` pipelines due to bugs (to be refactored in the future) + +# mlr3proba 0.6.7 + +- Deprecate `crank` to `distr` composition in `distrcompose` pipeop (only from `lp` => `distr` works now) +- Add `get_mortality()` function (from `survivalmodels::surv_to_risk()` +- Add Rcpp function `assert_surv_matrix()` +- Update and simplify `crankcompose` pipeop and respective pipeline (no `response` is created anymore) +- Add `responsecompositor` pipeline with `rmst` and `median` + # mlr3proba 0.6.6 - Small fixes and refactoring to the discrete-time pipeops diff --git a/R/LearnerDens.R b/R/LearnerDens.R index b3ee721b3..6018e6896 100644 --- a/R/LearnerDens.R +++ b/R/LearnerDens.R @@ -36,14 +36,14 @@ LearnerDens = R6::R6Class("LearnerDens", #' @description Creates a new instance of this [R6][R6::R6Class] class. initialize = function(id, param_set = ps(), predict_types = "cdf", feature_types = character(), - properties = character(), data_formats = "data.table", + properties = character(), packages = character(), label = NA_character_, man = NA_character_) { super$initialize( id = id, task_type = "dens", param_set = param_set, predict_types = predict_types, feature_types = feature_types, properties = properties, - data_formats = data_formats, packages = c("mlr3proba", packages), label = label, man = man) + packages = c("mlr3proba", packages), label = label, man = man) } ) ) diff --git a/R/MeasureSurvCindex.R b/R/MeasureSurvCindex.R index f7d63e166..65cd8134c 100644 --- a/R/MeasureSurvCindex.R +++ b/R/MeasureSurvCindex.R @@ -56,7 +56,7 @@ #' library(mlr3) #' task = tsk("rats") #' learner = lrn("surv.coxph") -#' part = partition(task) # train/test split, stratified on `status` by default +#' part = partition(task) # train/test split #' learner$train(task, part$train) #' p = learner$predict(task, part$test) #' diff --git a/R/MeasureSurvRCLL.R b/R/MeasureSurvRCLL.R index 471496f45..cb961acdb 100644 --- a/R/MeasureSurvRCLL.R +++ b/R/MeasureSurvRCLL.R @@ -16,6 +16,10 @@ #' density function and \eqn{S} the survival function. #' RCLL is proper given that censoring and survival distribution are independent, see Rindt et al. (2022). #' +#' **Note**: Even though RCLL is a proper scoring rule, the calculation of \eqn{f(t)} (which in our case is discrete, i.e. it is a *probability mass function*) for time points in the test set that don't exist in the predicted survival matrix (`distr`), results in 0 values, which are substituted by `"eps"` in our implementation, therefore skewing the result towards \eqn{-log(eps)}. +#' This problem is also discussed in Rindt et al. (2022), where the authors perform interpolation to get non-zero values for the \eqn{f(t)}. +#' Until this is handled in `mlr3proba` some way, we advise against using this measure for model evaluation. +#' #' @section Parameter details: #' - `na.rm` (`logical(1)`)\cr #' If `TRUE` (default) then removes any NAs in individual score calculations. diff --git a/R/PipeOpCrankCompositor.R b/R/PipeOpCrankCompositor.R index 4a8f97716..19bcba3cd 100644 --- a/R/PipeOpCrankCompositor.R +++ b/R/PipeOpCrankCompositor.R @@ -16,14 +16,11 @@ #' ``` #' #' @section Input and Output Channels: -#' [PipeOpCrankCompositor] has one input channel named "input", which takes -#' `NULL` during training and [PredictionSurv] during prediction. +#' [PipeOpCrankCompositor] has one input channel named `"input"`, which takes `NULL` during training and [PredictionSurv] during prediction. #' -#' [PipeOpCrankCompositor] has one output channel named "output", producing `NULL` during training -#' and a [PredictionSurv] during prediction. +#' [PipeOpCrankCompositor] has one output channel named `"output"`, producing `NULL` during training and a [PredictionSurv] during prediction. #' -#' The output during prediction is the [PredictionSurv] from the "pred" input but with the `crank` -#' predict type overwritten by the given estimation method. +#' The output during prediction is the [PredictionSurv] from the input but with the `crank` predict type overwritten by the given estimation method. #' #' @section State: #' The `$state` is left empty (`list()`). @@ -31,42 +28,24 @@ #' @section Parameters: #' * `method` :: `character(1)` \cr #' Determines what method should be used to produce a continuous ranking from the distribution. -#' One of `sum_haz`, `median`, `mode`, or `mean` corresponding to the -#' respective functions in the predicted survival distribution. Note that -#' for models with a proportional hazards form, the ranking implied by -#' `mean` and `median` will be identical (but not the value of `crank` -#' itself). `sum_haz` (default) uses [survivalmodels::surv_to_risk()]. -#' * `which` :: `numeric(1)`\cr -#' If `method = "mode"` then specifies which mode to use if multi-modal, default is the first. -#' * `response` :: `logical(1)`\cr -#' If `TRUE` then the `response` predict type is estimated with the same values as `crank`. +#' Currently only `mort` is supported, which is the sum of the cumulative hazard, also called *expected/ensemble mortality*, see Ishwaran et al. (2008). +#' For more details, see [get_mortality()]. #' * `overwrite` :: `logical(1)` \cr -#' If `FALSE` (default) then if the "pred" input already has a `crank`, the compositor only -#' composes a `response` type if `response = TRUE` and does not already exist. If `TRUE` then -#' both the `crank` and `response` are overwritten. -#' -#' @section Internals: -#' The `median`, `mode`, or `mean` will use analytical expressions if possible but if not they are -#' calculated using methods from [distr6]. `mean` requires \CRANpkg{cubature}. +#' If `FALSE` (default) and the prediction already has a `crank` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `crank` will be overwritten. #' #' @seealso [pipeline_crankcompositor] #' @family survival compositors #' @examples #' \dontrun{ #' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { -#' library(mlr3) #' library(mlr3pipelines) #' task = tsk("rats") #' -#' learn = lrn("surv.coxph")$train(task)$predict(task) -#' poc = po("crankcompose", param_vals = list(method = "sum_haz")) -#' poc$predict(list(learn))[[1]] -#' -#' if (requireNamespace("cubature", quietly = TRUE)) { -#' learn = lrn("surv.coxph")$train(task)$predict(task) -#' poc = po("crankcompose", param_vals = list(method = "sum_haz")) -#' poc$predict(list(learn))[[1]] -#' } +#' # change the crank prediction type of a Cox's model predictions +#' pred = lrn("surv.coxph")$train(task)$predict(task) +#' poc = po("crankcompose", param_vals = list(overwrite = TRUE)) +#' poc$predict(list(pred))[[1L]] #' } #' } #' @export @@ -77,13 +56,10 @@ PipeOpCrankCompositor = R6Class("PipeOpCrankCompositor", #' Creates a new instance of this [R6][R6::R6Class] class. initialize = function(id = "crankcompose", param_vals = list()) { param_set = ps( - method = p_fct(default = "sum_haz", levels = c("sum_haz", "mean", "median", "mode"), - tags = "predict"), - which = p_int(1L, default = 1L, tags = "predict", depends = quote(method == "mode")), - response = p_lgl(default = FALSE, tags = "predict"), + method = p_fct(default = "mort", levels = c("mort"), tags = "predict"), overwrite = p_lgl(default = FALSE, tags = "predict") ) - param_set$set_values(method = "sum_haz", response = FALSE, overwrite = FALSE) + param_set$set_values(method = "mort", overwrite = FALSE) super$initialize( id = id, @@ -91,7 +67,7 @@ PipeOpCrankCompositor = R6Class("PipeOpCrankCompositor", param_vals = param_vals, input = data.table(name = "input", train = "NULL", predict = "PredictionSurv"), output = data.table(name = "output", train = "NULL", predict = "PredictionSurv"), - packages = c("mlr3proba", "distr6") + packages = c("mlr3proba") ) } ), @@ -103,83 +79,47 @@ PipeOpCrankCompositor = R6Class("PipeOpCrankCompositor", }, .predict = function(inputs) { - - inpred = inputs[[1L]] - - response = self$param_set$values$response - b_response = !anyMissing(inpred$response) - if (!length(response)) response = FALSE - + pred = inputs[[1L]] overwrite = self$param_set$values$overwrite - if (!length(overwrite)) overwrite = FALSE + # it's impossible for a learner not to predict crank in mlr3proba, + # but let's check either way: + has_crank = !all(is.na(pred$crank)) - # if crank and response already exist and not overwriting then return prediction - if (!overwrite && (!response || (response && b_response))) { - return(list(inpred)) + if (!overwrite & has_crank) { + # return prediction as is + return(list(pred)) } else { - assert("distr" %in% inpred$predict_types) - method = self$param_set$values$method - if (length(method) == 0L) method = "sum_haz" - if (method == "sum_haz") { - if (inherits(inpred$data$distr, "matrix") || - !requireNamespace("survivalmodels", quietly = TRUE)) { - comp = survivalmodels::surv_to_risk(inpred$data$distr) - } else { - comp = as.numeric( - colSums(inpred$distr$cumHazard(sort(unique(inpred$truth[, 1])))) - ) - } - } else if (method == "mean") { - comp = try(inpred$distr$mean(), silent = TRUE) - if (inherits(comp, "try-error")) { - requireNamespace("cubature") - comp = try(inpred$distr$mean(cubature = TRUE), silent = TRUE) - } - if (inherits(comp, "try-error")) { - comp = numeric(length(inpred$crank)) - } - } else { - comp = switch(method, - median = inpred$distr$median(), - mode = inpred$distr$mode(self$param_set$values$which)) - } + # compose crank from distr prediction + assert("distr" %in% pred$predict_types) - comp = as.numeric(comp) - - # if crank exists and not overwriting then return predicted crank, otherwise compose - if (!overwrite) { - crank = inpred$crank + # get survival matrix + if (inherits(pred$data$distr, "array")) { + surv = pred$data$distr + if (length(dim(surv)) == 3L) { + # survival 3d array, extract median + surv = .ext_surv_mat(arr = surv, which.curve = 0.5) + } } else { - crank = -comp - # missing imputed with median - crank[is.na(crank)] = stats::median(crank[!is.na(crank)]) - crank[crank == Inf] = 1e3 - crank[crank == -Inf] = -1e3 + stop("Distribution prediction does not have a survival matrix or array + in the $data$distr slot") } - # i) not overwriting or requesting response, and already predicted - if (b_response && (!overwrite || !response)) { - response = inpred$response - # ii) not requesting response and doesn't exist - } else if (!response) { - response = NULL - # iii) requesting response and happy to overwrite - # iv) requesting response and doesn't exist - } else { - response = comp - response[is.na(response)] = 0 - response[response == Inf | response == -Inf] = 0 + method = self$param_set$values$method + if (method == "mort") { + crank = get_mortality(surv) } - if (!anyMissing(inpred$lp)) { - lp = inpred$lp - } else { - lp = NULL - } + # update only `crank` + p = PredictionSurv$new( + row_ids = pred$row_ids, + truth = pred$truth, + crank = crank, + distr = pred$distr, + lp = pred$lp, + response = pred$response + ) - return(list(PredictionSurv$new( - row_ids = inpred$row_ids, truth = inpred$truth, crank = crank, - distr = inpred$distr, lp = lp, response = response))) + return(list(p)) } } ) diff --git a/R/PipeOpDistrCompositor.R b/R/PipeOpDistrCompositor.R index 113a49732..aca38240f 100644 --- a/R/PipeOpDistrCompositor.R +++ b/R/PipeOpDistrCompositor.R @@ -3,16 +3,12 @@ #' @template param_pipelines #' #' @description -#' Estimates (or 'composes') a survival distribution from a predicted baseline `distr` and a -#' `crank` or `lp` from two [PredictionSurv]s. +#' Estimates (or 'composes') a survival distribution from a predicted baseline +#' survival distribution (`distr`) and a linear predictor (`lp`) from two [PredictionSurv]s. #' #' Compositor Assumptions: #' * The baseline `distr` is a discrete estimator, e.g. [surv.kaplan][LearnerSurvKaplan]. #' * The composed `distr` is of a linear form -#' * If `lp` is missing then `crank` is equivalent -#' -#' These assumptions are strong and may not be reasonable. Future updates will upgrade this -#' compositor to be more flexible. #' #' @section Dictionary: #' This [PipeOp][mlr3pipelines::PipeOp] can be instantiated via the @@ -25,15 +21,16 @@ #' ``` #' #' @section Input and Output Channels: -#' [PipeOpDistrCompositor] has two input channels, "base" and "pred". Both input channels take -#' `NULL` during training and [PredictionSurv] during prediction. +#' [PipeOpDistrCompositor] has two input channels, `"base"` and `"pred"`. +#' Both input channels take `NULL` during training and [PredictionSurv] during prediction. #' -#' [PipeOpDistrCompositor] has one output channel named "output", producing `NULL` during training -#' and a [PredictionSurv] during prediction. +#' [PipeOpDistrCompositor] has one output channel named `"output"`, producing +#' `NULL` during training and a [PredictionSurv] during prediction. #' -#' The output during prediction is the [PredictionSurv] from the "pred" input but with an extra -#' (or overwritten) column for `distr` predict type; which is composed from the `distr` of "base" -#' and `lp` or `crank` of "pred". +#' The output during prediction is the [PredictionSurv] from the `"pred"` input +#' but with an extra (or overwritten) column for the `distr` predict type; which +#' is composed from the `distr` of `"base"` and the `lp` of `"pred"`. +#' If no `lp` predictions have been made or exist, then the `"pred"` is returned unchanged. #' #' @section State: #' The `$state` is left empty (`list()`). @@ -46,8 +43,8 @@ #' Default `aft`. #' * `overwrite` :: `logical(1)` \cr #' If `FALSE` (default) then if the "pred" input already has a `distr`, the compositor does -#' nothing and returns the given [PredictionSurv]. If `TRUE` then the `distr` is overwritten -#' with the `distr` composed from `lp`/`crank` - this is useful for changing the prediction +#' nothing and returns the given [PredictionSurv]. If `TRUE`, then the `distr` is overwritten +#' with the `distr` composed from `lp` - this is useful for changing the prediction #' `distr` from one model form to another. #' #' @section Internals: @@ -56,8 +53,7 @@ #' \deqn{ph: S(t) = S_0(t)^{exp(lp)}}{ph: S(t) = S0(t)^exp(lp)} #' \deqn{po: S(t) = \frac{S_0(t)}{exp(-lp) + (1-exp(-lp)) S_0(t)}}{po: S(t) = S0(t) / [exp(-lp) + S0(t) (1-exp(-lp))]} # nolint #' where \eqn{S_0}{S0} is the estimated baseline survival distribution, and \eqn{lp} is the -#' predicted linear predictor. If the input model does not predict a linear predictor then `crank` -#' is assumed to be the `lp` - **this may be a strong and unreasonable assumption.** +#' predicted linear predictor. #' #' @seealso [pipeline_distrcompositor] #' @export @@ -71,6 +67,7 @@ #' #' base = lrn("surv.kaplan")$train(task)$predict(task) #' pred = lrn("surv.coxph")$train(task)$predict(task) +#' # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: #' pod = po("distrcompose", param_vals = list(form = "aft", overwrite = TRUE)) #' pod$predict(list(base = base, pred = pred))[[1]] #' } @@ -106,31 +103,27 @@ PipeOpDistrCompositor = R6Class("PipeOpDistrCompositor", .predict = function(inputs) { base = inputs$base - inpred = inputs$pred + pred = inputs$pred - overwrite = self$param_set$values$overwrite - if (!length(overwrite)) overwrite = FALSE + # if no `lp` predictions, we return the survival prediction object unchanged + if (is.null(pred$lp)) { + return(list(pred)) + } - if ("distr" %in% inpred$predict_types & !overwrite) { - return(list(inpred)) + overwrite = assert_logical(self$param_set$values$overwrite) + if ("distr" %in% pred$predict_types & !overwrite) { + return(list(pred)) } else { assert("distr" %in% base$predict_types) - row_ids = inpred$row_ids - truth = inpred$truth - - walk(inputs, function(x) assert_true(identical(truth, x$truth))) + # check: targets are the same + assert_true(identical(base$truth, pred$truth)) form = self$param_set$values$form - if (length(form) == 0L) form = "aft" - nr = length(inpred$data$row_ids) + nr = length(pred$data$row_ids) - # assumes PH-style lp where high value = high risk - if (anyMissing(inpred$lp)) { - lp = inpred$crank - } else { - lp = inpred$lp - } + # we need 'lp' predictions + lp = pred$lp if (inherits(base$data$distr, "Distribution")) { base = distr6::as.MixtureDistribution(base$distr) @@ -138,15 +131,17 @@ PipeOpDistrCompositor = R6Class("PipeOpDistrCompositor", nc = length(times) survmat = matrix(1 - base$cdf(times), nrow = nr, ncol = nc, byrow = TRUE) } else { - base = colMeans(base$data$distr) - times = as.numeric(names(base)) + # average survival probability across observations (on the test set) + avg_surv = colMeans(base$data$distr) + times = as.numeric(names(avg_surv)) nc = length(times) - survmat = matrix(base, nrow = nr, ncol = nc, byrow = TRUE) + survmat = matrix(avg_surv, nrow = nr, ncol = nc, byrow = TRUE) } timesmat = matrix(times, nrow = nr, ncol = nc, byrow = TRUE) lpmat = matrix(lp, nrow = nr, ncol = nc) + # compose survival distribution if (form == "ph") { cdf = 1 - (survmat^exp(lpmat)) } else if (form == "aft") { @@ -159,17 +154,17 @@ PipeOpDistrCompositor = R6Class("PipeOpDistrCompositor", cdf[survmat == 1] = 0 } - distr = .surv_return(times, 1 - cdf)$distr + distr = .surv_return(times = times, surv = 1 - cdf)$distr - if (anyMissing(inpred$lp)) { - lp = NULL - } else { - lp = inpred$lp - } + p = PredictionSurv$new( + row_ids = pred$row_ids, + truth = pred$truth, + crank = pred$crank, + lp = pred$lp, + distr = distr # overwrite only the distribution + ) - return(list(PredictionSurv$new( - row_ids = row_ids, truth = truth, - crank = inpred$crank, distr = distr, lp = lp))) + return(list(p)) } } ) diff --git a/R/PipeOpResponseCompositor.R b/R/PipeOpResponseCompositor.R new file mode 100644 index 000000000..d00fbce97 --- /dev/null +++ b/R/PipeOpResponseCompositor.R @@ -0,0 +1,191 @@ +#' @title PipeOpResponseCompositor +#' @name mlr_pipeops_responsecompose +#' @template param_pipelines +#' +#' @description +#' Uses a predicted survival distribution (`distr`) in a [PredictionSurv] to estimate (or 'compose') an expected survival time (`response`) prediction. +#' Practically, this `PipeOp` summarizes an observation's survival curve/distribution to a single number which can be either the restricted mean survival time or the median survival time. +#' +#' @section Dictionary: +#' This [PipeOp][mlr3pipelines::PipeOp] can be instantiated via the +#' [dictionary][mlr3misc::Dictionary] [mlr3pipelines::mlr_pipeops] or with the associated sugar +#' function [mlr3pipelines::po()]: +#' ``` +#' PipeOpResponseCompositor$new() +#' mlr_pipeops$get("responsecompose") +#' po("responsecompose") +#' ``` +#' +#' @section Input and Output Channels: +#' [PipeOpResponseCompositor] has one input channel named `"input"`, which takes +#' `NULL` during training and [PredictionSurv] during prediction. +#' +#' [PipeOpResponseCompositor] has one output channel named `"output"`, producing `NULL` during training +#' and a [PredictionSurv] during prediction. +#' +#' The output during prediction is the [PredictionSurv] from the input but with the `response` +#' predict type overwritten by the given method. +#' +#' @section State: +#' The `$state` is left empty (`list()`). +#' +#' @section Parameters: +#' - `method` :: `character(1)` \cr +#' Determines what method should be used to produce a survival time (response) from the survival distribution. +#' Available methods are `"rmst"` and `"median"`, corresponding to the *restricted mean survival time* and the *median survival time* respectively. +#' - `cutoff_time` :: `numeric(1)` \cr +#' Determines the time point up to which we calculate the restricted mean survival time (works only for the `"rmst"` method). +#' If `NULL` (default), all the available time points in the predicted survival distribution will be used. +#' * `add_crank` :: `logical(1)` \cr +#' If `TRUE` then `crank` predict type will be set as `-response` (as higher survival times correspond to lower risk). +#' Works only if `overwrite` is `TRUE`. +#' * `overwrite` :: `logical(1)` \cr +#' If `FALSE` (default) and the prediction already has a `response` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `response` (and the `crank`, if `add_crank` is `TRUE`) will be overwritten. +#' +#' @section Internals: +#' The restricted mean survival time is the default/preferred method and is calculated as follows: +#' \deqn{T_{i,rmst} \approx \sum_{t_j \in [0,\tau]} (t_j - t_{j-1}) S_i(t_j)} +#' +#' where \eqn{T} is the expected survival time, \eqn{\tau} is the time cutoff and \eqn{S_i(t_j)} are the predicted survival probabilities of observation \eqn{i} for all the \eqn{t_j} time points. +#' +#' The \eqn{T_{i,median}} survival time is just the first time point for which the survival probability is less than \eqn{0.5}. +#' If no such time point exists (e.g. when the survival distribution is not proper due to high censoring) we return the last time point. +#' This is **not a good estimate to use in general**, only a reasonable substitute in such cases. +#' +#' @references +#' `r format_bib("zhao_2016")` +#' +#' @seealso [pipeline_crankcompositor] +#' @family survival compositors +#' @examples +#' \dontrun{ +#' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { +#' library(mlr3pipelines) +#' task = tsk("rats") +#' +#' # add survival time prediction type to the predictions of a Cox model +#' # Median survival time as response +#' pred = lrn("surv.coxph")$train(task)$predict(task) +#' por = po("responsecompose", param_vals = list(method = "median", overwrite = TRUE)) +#' por$predict(list(pred))[[1L]] +#' # mostly improper survival distributions, "median" sets the survival time +#' # to the last time point +#' +#' # RMST (default) as response, while also changing the crank = -response +#' por = po("responsecompose", param_vals = list(overwrite = TRUE, add_crank = TRUE)) +#' por$predict(list(pred))[[1L]] +#' } +#' } +#' @export +PipeOpResponseCompositor = R6Class("PipeOpResponseCompositor", + inherit = mlr3pipelines::PipeOp, + public = list( + #' @description + #' Creates a new instance of this [R6][R6::R6Class] class. + initialize = function(id = "responsecompose", param_vals = list()) { + param_set = ps( + method = p_fct(default = "rmst", levels = c("rmst", "median"), tags = "predict"), + cutoff_time = p_dbl(0, default = NULL, special_vals = list(NULL), tags = "predict"), + add_crank = p_lgl(default = FALSE, tags = "predict"), + overwrite = p_lgl(default = FALSE, tags = "predict") + ) + + param_set$set_values(method = "rmst", add_crank = FALSE, overwrite = FALSE) + + super$initialize( + id = id, + param_set = param_set, + param_vals = param_vals, + input = data.table(name = "input", train = "NULL", predict = "PredictionSurv"), + output = data.table(name = "output", train = "NULL", predict = "PredictionSurv"), + packages = c("mlr3proba") + ) + } + ), + + private = list( + .train = function(inputs) { + self$state = list() + list(NULL) + }, + + .predict = function(inputs) { + pred = inputs[[1L]] + overwrite = self$param_set$values$overwrite + has_response = !is.null(pred$response) + + if (!overwrite & has_response) { + # return prediction as is + return(list(pred)) + } else { + # compose response (survival time) from distr prediction + assert("distr" %in% pred$predict_types) + + # get survival matrix + if (inherits(pred$data$distr, "array")) { + surv = pred$data$distr + if (length(dim(surv)) == 3L) { + # survival 3d array, extract median + surv = .ext_surv_mat(arr = surv, which.curve = 0.5) + } + } else { + stop("Distribution prediction does not have a survival matrix or array + in the $data$distr slot") + } + + # time points + times = as.numeric(colnames(surv)) + + method = self$param_set$values$method + if (method == "rmst") { + cutoff_time = self$param_set$values$cutoff_time + within_range = !is.null(cutoff_time) && + cutoff_time <= max(times) && + cutoff_time >= min(times) + if (within_range) { + # subset survival matrix and times + surv = surv[, times <= cutoff_time, drop = FALSE] + times = times[times <= cutoff_time] + } + + # calculate the restricted mean survival time + weights = c(times[1], diff(times)) # time intervals as weights + response = apply(surv, 1, function(surv_probs) { + sum(weights * surv_probs) + }) + } else { # median + t_max = tail(times, 1) # last time point + response = apply(surv, 1, function(surv_probs) { + # first survival probability that is < 0.5 + median_time_index = match(TRUE, surv_probs < 0.5) + if (!is.na(median_time_index)) times[median_time_index] else t_max + }) + } + + add_crank = self$param_set$values$add_crank + if (add_crank) { + # higher risk, lower survival time + crank = -response + } else { + # otherwise, leave crank unchanged + crank = pred$crank + } + + # update only `response` (and maybe `crank`) + p = PredictionSurv$new( + row_ids = pred$row_ids, + truth = pred$truth, + crank = crank, + distr = pred$distr, + lp = pred$lp, + response = response + ) + + return(list(p)) + } + } + ) +) + +register_pipeop("responsecompose", PipeOpResponseCompositor) diff --git a/R/PipeOpSurvAvg.R b/R/PipeOpSurvAvg.R index fddaf95c8..12289e3b8 100644 --- a/R/PipeOpSurvAvg.R +++ b/R/PipeOpSurvAvg.R @@ -72,21 +72,22 @@ PipeOpSurvAvg = R6Class("PipeOpSurvAvg", private = list( weighted_avg_predictions = function(inputs, weights, row_ids, truth) { response_matrix = map(inputs, "response") - if (some(response_matrix, anyMissing)) { + + if (some(response_matrix, is.null)) { response = NULL } else { response = c(simplify2array(response_matrix) %*% weights) } crank_matrix = map(inputs, "crank") - if (some(crank_matrix, anyMissing)) { + if (some(crank_matrix, is.null)) { crank = NULL } else { crank = c(simplify2array(crank_matrix) %*% weights) } lp_matrix = map(inputs, "lp") - if (some(lp_matrix, anyMissing)) { + if (some(lp_matrix, is.null)) { lp = NULL } else { lp = c(simplify2array(lp_matrix) %*% weights) diff --git a/R/PredictionSurv.R b/R/PredictionSurv.R index 9732ccc5f..83e72662f 100644 --- a/R/PredictionSurv.R +++ b/R/PredictionSurv.R @@ -90,8 +90,8 @@ PredictionSurv = R6Class("PredictionSurv", distr = private$.simplify_distr(distr) } - pdata = list(row_ids = row_ids, truth = truth, crank = crank, distr = distr, lp = lp, - response = response) + pdata = list(row_ids = row_ids, truth = truth, crank = crank, distr = distr, + lp = lp, response = response) pdata = discard(pdata, is.null) class(pdata) = c("PredictionDataSurv", "PredictionData") @@ -108,7 +108,7 @@ PredictionSurv = R6Class("PredictionSurv", active = list( #' @field truth (`Surv`)\cr - #' True (observed) outcome. + #' True (observed) outcome. truth = function() { self$data$truth }, @@ -126,19 +126,19 @@ PredictionSurv = R6Class("PredictionSurv", return(self$data$distr) } - private$.distrify_survarray(self$data$distr %??% NA_real_) + private$.distrify_survarray(self$data$distr) }, #' @field lp (`numeric()`)\cr #' Access the stored predicted linear predictor. lp = function() { - self$data$lp %??% rep(NA_real_, length(self$data$row_ids)) + self$data$lp }, #' @field response (`numeric()`)\cr #' Access the stored predicted survival time. response = function() { - self$data$response %??% rep(NA_real_, length(self$data$row_ids)) + self$data$response } ), diff --git a/R/RcppExports.R b/R/RcppExports.R index a9a0a21ad..34d44a152 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,10 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +c_assert_surv <- function(mat) { + .Call(`_mlr3proba_c_assert_surv`, mat) +} + .c_get_unique_times <- function(true_times, req_times) { .Call(`_mlr3proba_c_get_unique_times`, true_times, req_times) } diff --git a/R/assertions.R b/R/assertions.R index 6030e997c..3a9dc7a69 100644 --- a/R/assertions.R +++ b/R/assertions.R @@ -10,3 +10,44 @@ assert_surv = function(x, len = NULL, any.missing = TRUE, null.ok = FALSE, .var. assert_class(x, "Surv", null.ok = null.ok, .var.name = .var.name) assert_matrix(x, any.missing = any.missing, nrows = len, null.ok = null.ok, .var.name = .var.name) } + +#' @title Assert survival matrix +#' +#' @description Asserts if the given input matrix is a (discrete) survival +#' probabilities matrix using [Rcpp] code. +#' The following checks are performed: +#' +#' 1. All values are probabilities, i.e. \eqn{S(t) \in [0,1]} +#' 2. Column names correspond to time-points and should therefore be coercable to +#' `numeric` and increasing +#' 3. Per row/observation, the survival probabilities decrease non-strictly, i.e. +#' \eqn{S(t) \ge S(t+1)} +#' +#' @param x (`matrix()`)\cr +#' A matrix of (predicted) survival probabilities. +#' Rows are observations, columns are (increasing) time points. +#' +#' @return if the assertion fails an error occurs, otherwise `NULL` is returned +#' invisibly. +#' +#' @examples +#' x = matrix(data = c(1,0.6,0.4,0.8,0.8,0.7), nrow = 2, ncol = 3, byrow = TRUE) +#' colnames(x) = c(12, 34, 42) +#' x +#' +#' assert_surv_matrix(x) +#' +#' @export +assert_surv_matrix = function(x) { + times = assert_numeric(as.numeric(colnames(x)), any.missing = FALSE) + + if (is.null(times) || !identical(order(times), seq(ncol(x)))) { + stop("Survival matrix column names must be increasing numeric (time points)") + } + + if (!c_assert_surv(x)) { + stop("Survival probabilities must be (non-strictly) decreasing and between [0, 1]") + } + + invisible(NULL) +} diff --git a/R/bibentries.R b/R/bibentries.R index f33899ba0..11c5ccb87 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -1,679 +1,679 @@ bibentries = c( # nolint start aalen_1978 = bibentry("article", - title = "Nonparametric Inference for a Family of Counting Processes", - author = "Odd Aalen", - year = 1978, - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 6, - number = 4, - pages = "701--726", - url = "https://www.jstor.org/stable/2958850" + title = "Nonparametric Inference for a Family of Counting Processes", + author = "Odd Aalen", + year = 1978, + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 6, + number = 4, + pages = "701--726", + url = "https://www.jstor.org/stable/2958850" ), begg_2000 = bibentry("article", - title = "Comparing tumour staging and grading systems: a case study and a review of the issues, using thymoma as a model", - author = "Colin B. Begg and Laura D. Cramer and E. S. Venkatraman and Juan Rosai", - year = 2000, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 19, - number = 15, - pages = "1997--2014", - doi = "10.1002/1097-0258(20000815)19:15<1997::aid-sim511>3.0.co;2-c" + title = "Comparing tumour staging and grading systems: a case study and a review of the issues, using thymoma as a model", + author = "Colin B. Begg and Laura D. Cramer and E. S. Venkatraman and Juan Rosai", + year = 2000, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 19, + number = 15, + pages = "1997--2014", + doi = "10.1002/1097-0258(20000815)19:15<1997::aid-sim511>3.0.co;2-c" ), breiman_1984 = bibentry("book", - title = "Classification And Regression Trees", - author = "Leo Breiman and Jerome H. Friedman and Richard A. Olshen and Charles J. Stone", - year = 1984, - publisher = "Routledge", - doi = "10.1201/9781315139470" + title = "Classification And Regression Trees", + author = "Leo Breiman and Jerome H. Friedman and Richard A. Olshen and Charles J. Stone", + year = 1984, + publisher = "Routledge", + doi = "10.1201/9781315139470" ), breiman_2001 = bibentry("article", - title = "Random Forests", - author = "Breiman, Leo", - year = 2001, - journal = "Machine Learning", - volume = 45, - number = 1, - pages = "5--32", - doi = "10.1023/A:1010933404324", + title = "Random Forests", + author = "Breiman, Leo", + year = 2001, + journal = "Machine Learning", + volume = 45, + number = 1, + pages = "5--32", + doi = "10.1023/A:1010933404324", ), buehlmann_2003 = bibentry("article", - title = "Boosting With the L2 Loss", - author = "Peter B\u00fchlmann and Bin Yu", - year = 2003, - month = "jun", - journal = "Journal of the American Statistical Association", - publisher = "Informa {UK} Limited", - volume = 98, - number = 462, - pages = "324--339", - doi = "10.1198/016214503000125" + title = "Boosting With the L2 Loss", + author = "Peter B\u00fchlmann and Bin Yu", + year = 2003, + month = "jun", + journal = "Journal of the American Statistical Association", + publisher = "Informa {UK} Limited", + volume = 98, + number = 462, + pages = "324--339", + doi = "10.1198/016214503000125" ), buehlmann_2006 = bibentry("article", - title = "Boosting for High-Dimensional Linear Models", - author = "Peter B\u00fchlmann", - year = 2006, - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 34, - number = 2, - pages = "559--583", - url = "https://www.jstor.org/stable/25463430" + title = "Boosting for High-Dimensional Linear Models", + author = "Peter B\u00fchlmann", + year = 2006, + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 34, + number = 2, + pages = "559--583", + url = "https://www.jstor.org/stable/25463430" ), buehlmann_2007 = bibentry("article", - title = "Boosting Algorithms: Regularization, Prediction and Model Fitting", - author = "Peter B\u00fchlmann and Torsten Hothorn", - year = 2007, - month = "nov", - journal = "Statistical Science", - publisher = "Institute of Mathematical Statistics", - volume = 22, - number = 4, - pages = "477--505", - doi = "10.1214/07-sts242" + title = "Boosting Algorithms: Regularization, Prediction and Model Fitting", + author = "Peter B\u00fchlmann and Torsten Hothorn", + year = 2007, + month = "nov", + journal = "Statistical Science", + publisher = "Institute of Mathematical Statistics", + volume = 22, + number = 4, + pages = "477--505", + doi = "10.1214/07-sts242" ), burges_2010 = bibentry("techreport", - title = "From RankNet to LambdaRank to LambdaMART: An Overview", - author = "Burges, Chris J.C.", - year = 2010, - month = "jun", - number = "MSR-TR-2010-82", - url = "https://www.microsoft.com/en-us/research/publication/from-ranknet-to-lambdarank-to-lambdamart-an-overview/", + title = "From RankNet to LambdaRank to LambdaMART: An Overview", + author = "Burges, Chris J.C.", + year = 2010, + month = "jun", + number = "MSR-TR-2010-82", + url = "https://www.microsoft.com/en-us/research/publication/from-ranknet-to-lambdarank-to-lambdamart-an-overview/", institution = "Microsoft Research" ), chambless_2006 = bibentry("article", - title = "Estimation of time-dependent area under the {ROC} curve for long-term risk prediction", - author = "Lloyd E. Chambless and Guoqing Diao", - year = 2006, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 25, - number = 20, - pages = "3474--3486", - doi = "10.1002/sim.2299" + title = "Estimation of time-dependent area under the {ROC} curve for long-term risk prediction", + author = "Lloyd E. Chambless and Guoqing Diao", + year = 2006, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 25, + number = 20, + pages = "3474--3486", + doi = "10.1002/sim.2299" ), cox_1972 = bibentry("article", - title = "Regression Models and Life-Tables", - author = "D. R. Cox", - year = 1972, - month = "jan", - journal = "Journal of the Royal Statistical Society: Series B (Methodological)", - publisher = "Wiley", - volume = 34, - number = 2, - pages = "187--202", - doi = "10.1111/j.2517-6161.1972.tb00899.x" + title = "Regression Models and Life-Tables", + author = "D. R. Cox", + year = 1972, + month = "jan", + journal = "Journal of the Royal Statistical Society: Series B (Methodological)", + publisher = "Wiley", + volume = 34, + number = 2, + pages = "187--202", + doi = "10.1111/j.2517-6161.1972.tb00899.x" ), freund_1996 = bibentry("inproceedings", - title = "Experiments with a new boosting algorithm", - author = "Freund, Yoav and Schapire, Robert E and others", - year = 1996, - booktitle = "In: Thirteenth International Conference on ML", - volume = 96, - pages = "148--156", + title = "Experiments with a new boosting algorithm", + author = "Freund, Yoav and Schapire, Robert E and others", + year = 1996, + booktitle = "In: Thirteenth International Conference on ML", + volume = 96, + pages = "148--156", organization = "Citeseer" ), freund_1997 = bibentry("article", - title = "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting", - author = "Yoav Freund and Robert E Schapire", - year = 1997, - month = "aug", - journal = "Journal of Computer and System Sciences", - publisher = "Elsevier {BV}", - volume = 55, - number = 1, - pages = "119--139", - doi = "10.1006/jcss.1997.1504" + title = "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting", + author = "Yoav Freund and Robert E Schapire", + year = 1997, + month = "aug", + journal = "Journal of Computer and System Sciences", + publisher = "Elsevier {BV}", + volume = 55, + number = 1, + pages = "119--139", + doi = "10.1006/jcss.1997.1504" ), friedman_2000 = bibentry("article", - title = "Additive logistic regression: a statistical view of boosting (With discussion and a rejoinder by the authors)", - author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", - year = 2000, - month = "apr", - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 28, - number = 2, - pages = "337--407", - doi = "10.1214/aos/1016218223" + title = "Additive logistic regression: a statistical view of boosting (With discussion and a rejoinder by the authors)", + author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", + year = 2000, + month = "apr", + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 28, + number = 2, + pages = "337--407", + doi = "10.1214/aos/1016218223" ), friedman_2001 = bibentry("article", - title = "Greedy Function Approximation: A Gradient Boosting Machine", - author = "Jerome H. Friedman", - year = 2001, - journal = "The Annals of Statistics", - publisher = "Institute of Mathematical Statistics", - volume = 29, - number = 5, - pages = "1189--1232", - url = "https://www.jstor.org/stable/2699986" + title = "Greedy Function Approximation: A Gradient Boosting Machine", + author = "Jerome H. Friedman", + year = 2001, + journal = "The Annals of Statistics", + publisher = "Institute of Mathematical Statistics", + volume = 29, + number = 5, + pages = "1189--1232", + url = "https://www.jstor.org/stable/2699986" ), friedman_2002 = bibentry("article", - title = "Stochastic gradient boosting", - author = "Jerome H. Friedman", - year = 2002, - month = "feb", - journal = "Computational Statistics & Data Analysis", - publisher = "Elsevier {BV}", - volume = 38, - number = 4, - pages = "367--378", - doi = "10.1016/s0167-9473(01)00065-2" + title = "Stochastic gradient boosting", + author = "Jerome H. Friedman", + year = 2002, + month = "feb", + journal = "Computational Statistics & Data Analysis", + publisher = "Elsevier {BV}", + volume = 38, + number = 4, + pages = "367--378", + doi = "10.1016/s0167-9473(01)00065-2" ), friedman_2010 = bibentry("article", - title = "Regularization Paths for Generalized Linear Models via Coordinate Descent", - author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", - year = 2010, - journal = "Journal of Statistical Software", - publisher = "Foundation for Open Access Statistic", - volume = 33, - number = 1, - doi = "10.18637/jss.v033.i01" + title = "Regularization Paths for Generalized Linear Models via Coordinate Descent", + author = "Jerome Friedman and Trevor Hastie and Robert Tibshirani", + year = 2010, + journal = "Journal of Statistical Software", + publisher = "Foundation for Open Access Statistic", + volume = 33, + number = 1, + doi = "10.18637/jss.v033.i01" ), goeman_2009 = bibentry("article", - title = "L1Penalized Estimation in the Cox Proportional Hazards Model", - author = "Jelle J. Goeman", - year = 2009, - month = "nov", - journal = "Biometrical Journal", - publisher = "Wiley", - pages = "NA--NA", - doi = "10.1002/bimj.200900028" + title = "L1Penalized Estimation in the Cox Proportional Hazards Model", + author = "Jelle J. Goeman", + year = 2009, + month = "nov", + journal = "Biometrical Journal", + publisher = "Wiley", + pages = "NA--NA", + doi = "10.1002/bimj.200900028" ), goenen_2005 = bibentry("article", - title = "Concordance probability and discriminatory power in proportional hazards regression", - author = "Mithat Goenen and Glenn Heller", - year = 2005, - month = "dec", - journal = "Biometrika", - publisher = "Oxford University Press ({OUP})", - volume = 92, - number = 4, - pages = "965--970", - doi = "10.1093/biomet/92.4.965" + title = "Concordance probability and discriminatory power in proportional hazards regression", + author = "Mithat Goenen and Glenn Heller", + year = 2005, + month = "dec", + journal = "Biometrika", + publisher = "Oxford University Press ({OUP})", + volume = 92, + number = 4, + pages = "965--970", + doi = "10.1093/biomet/92.4.965" ), graf_1999 = bibentry("article", - title = "Assessment and comparison of prognostic classification schemes for survival data", - author = "Erika Graf and Claudia Schmoor and Willi Sauerbrei and Martin Schumacher", - year = 1999, - month = "sep", - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 18, - number = "17-18", - pages = "2529--2545", - doi = "10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5" + title = "Assessment and comparison of prognostic classification schemes for survival data", + author = "Erika Graf and Claudia Schmoor and Willi Sauerbrei and Martin Schumacher", + year = 1999, + month = "sep", + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 18, + number = "17-18", + pages = "2529--2545", + doi = "10.1002/(sici)1097-0258(19990915/30)18:17/18<2529::aid-sim274>3.0.co;2-5" ), harrell_1982 = bibentry("article", - title = "Evaluating the yield of medical tests", - author = "Harrell, Frank E and Califf, Robert M and Pryor, David B and Lee, Kerry L and Rosati, Robert A", - year = 1982, - journal = "Jama", - publisher = "American Medical Association", - volume = 247, - number = 18, - pages = "2543--2546" + title = "Evaluating the yield of medical tests", + author = "Harrell, Frank E and Califf, Robert M and Pryor, David B and Lee, Kerry L and Rosati, Robert A", + year = 1982, + journal = "Jama", + publisher = "American Medical Association", + volume = 247, + number = 18, + pages = "2543--2546" ), hofner_2012 = bibentry("article", - title = "Model-based boosting in R: a hands-on tutorial using the R package mboost", - author = "Benjamin Hofner and Andreas Mayr and Nikolay Robinzonov and Matthias Schmid", - year = 2012, - month = "dec", - journal = "Computational Statistics", - publisher = "Springer Science and Business Media {LLC}", - volume = 29, - number = "1-2", - pages = "3--35", - doi = "10.1007/s00180-012-0382-5" + title = "Model-based boosting in R: a hands-on tutorial using the R package mboost", + author = "Benjamin Hofner and Andreas Mayr and Nikolay Robinzonov and Matthias Schmid", + year = 2012, + month = "dec", + journal = "Computational Statistics", + publisher = "Springer Science and Business Media {LLC}", + volume = 29, + number = "1-2", + pages = "3--35", + doi = "10.1007/s00180-012-0382-5" ), hothorn_2006 = bibentry("article", - title = "Unbiased Recursive Partitioning: A Conditional Inference Framework", - author = "Torsten Hothorn and Kurt Hornik and Achim Zeileis", - year = 2006, - month = "sep", - journal = "Journal of Computational and Graphical Statistics", - publisher = "Informa {UK} Limited", - volume = 15, - number = 3, - pages = "651--674", - doi = "10.1198/106186006x133933" + title = "Unbiased Recursive Partitioning: A Conditional Inference Framework", + author = "Torsten Hothorn and Kurt Hornik and Achim Zeileis", + year = 2006, + month = "sep", + journal = "Journal of Computational and Graphical Statistics", + publisher = "Informa {UK} Limited", + volume = 15, + number = 3, + pages = "651--674", + doi = "10.1198/106186006x133933" ), hothorn_2010 = bibentry("article", - title = "Model-based boosting 2.0", - author = "Hothorn, Torsten and B\u00fchlmann, Peter and Kneib, Thomas and Schmid, Matthias and Hofner, Benjamin", - year = 2010, - journal = "Journal of Machine Learning Research", - volume = 11, - number = "Aug", - pages = "2109--2113", - url = "https://www.jmlr.org/papers/v11/hothorn10a.html" + title = "Model-based boosting 2.0", + author = "Hothorn, Torsten and B\u00fchlmann, Peter and Kneib, Thomas and Schmid, Matthias and Hofner, Benjamin", + year = 2010, + journal = "Journal of Machine Learning Research", + volume = 11, + number = "Aug", + pages = "2109--2113", + url = "https://www.jmlr.org/papers/v11/hothorn10a.html" ), hung_2010 = bibentry("article", - title = "Estimation methods for time-dependent AUC models with survival data", - author = "Hung Hung and Chin-Tsang Chiang", - year = 2010, - journal = "The Canadian Journal of Statistics / La Revue Canadienne de Statistique", - publisher = "Statistical Society of Canada, Wiley", - volume = 38, - number = 1, - pages = "8--26", - url = "https://www.jstor.org/stable/27805213" + title = "Estimation methods for time-dependent AUC models with survival data", + author = "Hung Hung and Chin-Tsang Chiang", + year = 2010, + journal = "The Canadian Journal of Statistics / La Revue Canadienne de Statistique", + publisher = "Statistical Society of Canada, Wiley", + volume = 38, + number = 1, + pages = "8--26", + url = "https://www.jstor.org/stable/27805213" ), ishwaran_2008 = bibentry("article", - title = "Random survival forests", - author = "Ishwaran, Hemant and Kogalur, Udaya B and Blackstone, Eugene H and Lauer, Michael S and others", - year = 2008, - journal = "The annals of applied statistics", - publisher = "Institute of Mathematical Statistics", - volume = 2, - number = 3, - pages = "841--860" + title = "Random survival forests", + author = "Ishwaran, Hemant and Kogalur, Udaya B and Blackstone, Eugene H and Lauer, Michael S and others", + year = 2008, + journal = "The Annals of applied statistics", + publisher = "Institute of Mathematical Statistics", + volume = 2, + number = 3, + pages = "841--860" ), kalbfleisch_2002 = bibentry("book", - title = "The Statistical Analysis of Failure Time Data", - author = "John D. Kalbfleisch and Ross L. Prentice", - year = 2002, - month = "aug", - publisher = "John Wiley & Sons, Inc.", - doi = "10.1002/9781118032985" + title = "The Statistical Analysis of Failure Time Data", + author = "John D. Kalbfleisch and Ross L. Prentice", + year = 2002, + month = "aug", + publisher = "John Wiley & Sons, Inc.", + doi = "10.1002/9781118032985" ), kaplan_1958 = bibentry("article", - title = "Nonparametric Estimation from Incomplete Observations", - author = "E. L. Kaplan and Paul Meier", - year = 1958, - month = "jun", - journal = "Journal of the American Statistical Association", - publisher = "Informa {UK} Limited", - volume = 53, - number = 282, - pages = "457--481", - doi = "10.1080/01621459.1958.10501452" + title = "Nonparametric Estimation from Incomplete Observations", + author = "E. L. Kaplan and Paul Meier", + year = 1958, + month = "jun", + journal = "Journal of the American Statistical Association", + publisher = "Informa {UK} Limited", + volume = 53, + number = 282, + pages = "457--481", + doi = "10.1080/01621459.1958.10501452" ), kneib_2008 = bibentry("article", - title = "Variable Selection and Model Choice in Geoadditive Regression Models", - author = "Thomas Kneib and Torsten Hothorn and Gerhard Tutz", - year = 2008, - month = "aug", - journal = "Biometrics", - publisher = "Wiley", - volume = 65, - number = 2, - pages = "626--634", - doi = "10.1111/j.1541-0420.2008.01112.x" + title = "Variable Selection and Model Choice in Geoadditive Regression Models", + author = "Thomas Kneib and Torsten Hothorn and Gerhard Tutz", + year = 2008, + month = "aug", + journal = "Biometrics", + publisher = "Wiley", + volume = 65, + number = 2, + pages = "626--634", + doi = "10.1111/j.1541-0420.2008.01112.x" ), kriegler_2007 = bibentry("phdthesis", - title = "Cost-Sensitive Stochastic Gradient Boosting Within a Quantitative Regression Framework", - author = "Kriegler, Brian", - year = 2007, - url = "https://dl.acm.org/citation.cfm?id=1354603", - school = "University of California at Los Angeles" + title = "Cost-Sensitive Stochastic Gradient Boosting Within a Quantitative Regression Framework", + author = "Kriegler, Brian", + year = 2007, + url = "https://dl.acm.org/citation.cfm?id=1354603", + school = "University of California at Los Angeles" ), nagelkerke_1991 = bibentry("article", - title = "A note on a general definition of the coefficient of determination", - author = "Nagelkerke, Nico JD and others", - year = 1991, - journal = "Biometrika", - publisher = "Oxford University Press", - volume = 78, - number = 3, - pages = "691--692" + title = "A note on a general definition of the coefficient of determination", + author = "Nagelkerke, Nico JD and others", + year = 1991, + journal = "Biometrika", + publisher = "Oxford University Press", + volume = 78, + number = 3, + pages = "691--692" ), nelson_1969 = bibentry("article", - title = "Hazard Plotting for Incomplete Failure Data", - author = "Wayne Nelson", - year = 1969, - month = "jan", - journal = "Journal of Quality Technology", - publisher = "Informa {UK} Limited", - volume = 1, - number = 1, - pages = "27--52", - doi = "10.1080/00224065.1969.11980344" + title = "Hazard Plotting for Incomplete Failure Data", + author = "Wayne Nelson", + year = 1969, + month = "jan", + journal = "Journal of Quality Technology", + publisher = "Informa {UK} Limited", + volume = 1, + number = 1, + pages = "27--52", + doi = "10.1080/00224065.1969.11980344" ), nelson_1972 = bibentry("article", - title = "Theory and Applications of Hazard Plotting for Censored Failure Data", - author = "Wayne Nelson", - year = 1972, - month = "nov", - journal = "Technometrics", - publisher = "Informa {UK} Limited", - volume = 14, - number = 4, - pages = "945--966", - doi = "10.1080/00401706.1972.10488991" + title = "Theory and Applications of Hazard Plotting for Censored Failure Data", + author = "Wayne Nelson", + year = 1972, + month = "nov", + journal = "Technometrics", + publisher = "Informa {UK} Limited", + volume = 14, + number = 4, + pages = "945--966", + doi = "10.1080/00401706.1972.10488991" ), oquigley_2005 = bibentry("article", - title = "Explained randomness in proportional hazards models", - author = "John O'Quigley and Ronghui Xu and Janez Stare", - year = 2005, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 24, - number = 3, - pages = "479--489", - doi = "10.1002/sim.1946" + title = "Explained randomness in proportional hazards models", + author = "John O'Quigley and Ronghui Xu and Janez Stare", + year = 2005, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 24, + number = 3, + pages = "479--489", + doi = "10.1002/sim.1946" ), ridgeway_1999 = bibentry("article", - title = "The state of boosting", - author = "Ridgeway, Greg", - year = 1999, - journal = "Computing Science and Statistics", - number = 31, - pages = "172--181" + title = "The state of boosting", + author = "Ridgeway, Greg", + year = 1999, + journal = "Computing Science and Statistics", + number = 31, + pages = "172--181" ), royston_2002 = bibentry("article", - title = "Flexible parametric proportional-hazards and proportional-odds models for censored survival data, with application to prognostic modelling and estimation of treatment effects", - author = "Patrick Royston and Mahesh K. B. Parmar", - year = 2002, - journal = "Statistics in Medicine", - publisher = "Wiley", - volume = 21, - number = 15, - pages = "2175--2197", - doi = "10.1002/sim.1203" + title = "Flexible parametric proportional-hazards and proportional-odds models for censored survival data, with application to prognostic modelling and estimation of treatment effects", + author = "Patrick Royston and Mahesh K. B. Parmar", + year = 2002, + journal = "Statistics in Medicine", + publisher = "Wiley", + volume = 21, + number = 15, + pages = "2175--2197", + doi = "10.1002/sim.1203" ), schmid_2008 = bibentry("article", - title = "Boosting additive models using component-wise P-Splines", - author = "Matthias Schmid and Torsten Hothorn", - year = 2008, - month = "dec", - journal = "Computational Statistics & Data Analysis", - publisher = "Elsevier {BV}", - volume = 53, - number = 2, - pages = "298--311", - doi = "10.1016/j.csda.2008.09.009" + title = "Boosting additive models using component-wise P-Splines", + author = "Matthias Schmid and Torsten Hothorn", + year = 2008, + month = "dec", + journal = "Computational Statistics & Data Analysis", + publisher = "Elsevier {BV}", + volume = 53, + number = 2, + pages = "298--311", + doi = "10.1016/j.csda.2008.09.009" ), song_2008 = bibentry("article", - title = "A semiparametric approach for the covariate specific ROC curve with survival outcome", - author = "Song, Xiao and Zhou, Xiao-Hua", - year = 2008, - journal = "Statistica Sinica", - volume = 18, - number = 3, - pages = "947--65", - url = "https://www.jstor.org/stable/24308524" + title = "A semiparametric approach for the covariate specific ROC curve with survival outcome", + author = "Song, Xiao and Zhou, Xiao-Hua", + year = 2008, + journal = "Statistica Sinica", + volume = 18, + number = 3, + pages = "947--65", + url = "https://www.jstor.org/stable/24308524" ), uno_2007 = bibentry("article", - title = "Evaluating Prediction Rules fort-Year Survivors With Censored Regression Models", - author = "Hajime Uno and Tianxi Cai and Lu Tian and L. J Wei", - year = 2007, - month = "jun", - journal = "Journal of the American Statistical Association", - publisher = "Informa {UK} Limited", - volume = 102, - number = 478, - pages = "527--537", - doi = "10.1198/016214507000000149" + title = "Evaluating Prediction Rules fort-Year Survivors With Censored Regression Models", + author = "Hajime Uno and Tianxi Cai and Lu Tian and L. J Wei", + year = 2007, + month = "jun", + journal = "Journal of the American Statistical Association", + publisher = "Informa {UK} Limited", + volume = 102, + number = 478, + pages = "527--537", + doi = "10.1198/016214507000000149" ), uno_2011 = bibentry("article", - title = "On the C-statistics for evaluating overall adequacy of risk prediction procedures with censored survival data", - author = "Hajime Uno and Tianxi Cai and Michael J. Pencina and Ralph B. D'Agostino and L. J. Wei", - year = 2011, - journal = "Statistics in Medicine", - publisher = "Wiley", - pages = "n/a--n/a", - doi = "10.1002/sim.4154" + title = "On the C-statistics for evaluating overall adequacy of risk prediction procedures with censored survival data", + author = "Hajime Uno and Tianxi Cai and Michael J. Pencina and Ralph B. D'Agostino and L. J. Wei", + year = 2011, + journal = "Statistics in Medicine", + publisher = "Wiley", + pages = "n/a--n/a", + doi = "10.1002/sim.4154" ), vanbelle_2010 = bibentry("article", - title = "Improved performance on high-dimensional survival data by application of Survival-{SVM}", - author = "V. Van Belle and K. Pelckmans and S. Van Huffel and J. A. K. Suykens", - year = 2010, - month = "nov", - journal = "Bioinformatics", - publisher = "Oxford University Press ({OUP})", - volume = 27, - number = 1, - pages = "87--94", - doi = "10.1093/bioinformatics/btq617" + title = "Improved performance on high-dimensional survival data by application of Survival-{SVM}", + author = "V. Van Belle and K. Pelckmans and S. Van Huffel and J. A. K. Suykens", + year = 2010, + month = "nov", + journal = "Bioinformatics", + publisher = "Oxford University Press ({OUP})", + volume = 27, + number = 1, + pages = "87--94", + doi = "10.1093/bioinformatics/btq617" ), vanbelle_2011 = bibentry("article", - title = "Support vector methods for survival analysis: a comparison between ranking and regression approaches", - author = "Vanya Van Belle and Kristiaan Pelckmans and Sabine Van Huffel and Johan A.K. Suykens", - year = 2011, - month = "oct", - journal = "Artificial Intelligence in Medicine", - publisher = "Elsevier {BV}", - volume = 53, - number = 2, - pages = "107--118", - doi = "10.1016/j.artmed.2011.06.006" + title = "Support vector methods for survival analysis: a comparison between ranking and regression approaches", + author = "Vanya Van Belle and Kristiaan Pelckmans and Sabine Van Huffel and Johan A.K. Suykens", + year = 2011, + month = "oct", + journal = "Artificial Intelligence in Medicine", + publisher = "Elsevier {BV}", + volume = 53, + number = 2, + pages = "107--118", + doi = "10.1016/j.artmed.2011.06.006" ), wright_2017 = bibentry("article", - title = "ranger: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}", - author = "Wright, Marvin N. and Ziegler, Andreas", - year = 2017, - journal = "Journal of Statistical Software", - volume = 77, - number = 1, - pages = "1--17", - doi = "10.18637/jss.v077.i01" + title = "ranger: A Fast Implementation of Random Forests for High Dimensional Data in {C++} and {R}", + author = "Wright, Marvin N. and Ziegler, Andreas", + year = 2017, + journal = "Journal of Statistical Software", + volume = 77, + number = 1, + pages = "1--17", + doi = "10.18637/jss.v077.i01" ), xu_1999 = bibentry("article", - title = "A R2 type measure of dependence for proportional hazards models", - author = "Ronghui Xu and John O'Quigley", - year = 1999, - month = "jan", - journal = "Journal of Nonparametric Statistics", - publisher = "Informa {UK} Limited", - volume = 12, - number = 1, - pages = "83--107", - doi = "10.1080/10485259908832799" + title = "A R2 type measure of dependence for proportional hazards models", + author = "Ronghui Xu and John O'Quigley", + year = 1999, + month = "jan", + journal = "Journal of Nonparametric Statistics", + publisher = "Informa {UK} Limited", + volume = 12, + number = 1, + pages = "83--107", + doi = "10.1080/10485259908832799" ), jaeger_2019 = bibentry("article", - title = "Oblique random survival forests", - volume = "13", - url = "https://projecteuclid.org/euclid.aoas/1571277776", - doi = "10.1214/19-AOAS1261", - language = "EN", - number = "3", - urldate = "2019-10-19", - journal = "The Annals of Applied Statistics", - author = "Jaeger, Byron C. and Long, D. Leann and Long, Dustin M. and Sims, Mario and Szychowski, Jeff M. and Min, Yuan-I. and Mcclure, Leslie A. and Howard, George and Simon, Noah", - month = "sep", - year = "2019", - pages = "1847--1883" + title = "Oblique random survival forests", + volume = "13", + url = "https://projecteuclid.org/euclid.aoas/1571277776", + doi = "10.1214/19-AOAS1261", + language = "EN", + number = "3", + urldate = "2019-10-19", + journal = "The Annals of Applied Statistics", + author = "Jaeger, Byron C. and Long, D. Leann and Long, Dustin M. and Sims, Mario and Szychowski, Jeff M. and Min, Yuan-I. and Mcclure, Leslie A. and Howard, George and Simon, Noah", + month = "sep", + year = "2019", + pages = "1847--1883" ), silverman_1986 = bibentry("book", - address = "London", - author = "Silverman, B. W.", - biburl = "https://www.bibsonomy.org/bibtex/25c3b630fb0a76da55942a77551fde8a2/pierpaolo.pk81", + address = "London", + author = "Silverman, B. W.", + biburl = "https://www.bibsonomy.org/bibtex/25c3b630fb0a76da55942a77551fde8a2/pierpaolo.pk81", description = "WSD", - publisher = "Chapman & Hall", - timestamp = "2007-02-27T16:22:14.000+0100", - title = "Density Estimation for Statistics and Data Analysis", - year = 1986 + publisher = "Chapman & Hall", + timestamp = "2007-02-27T16:22:14.000+0100", + title = "Density Estimation for Statistics and Data Analysis", + year = 1986 ), chen_2016 = bibentry("article", - title = "{XGBoost}: {A} {Scalable} {Tree} {Boosting} {System}", - url = "https://arxiv.org/abs/1603.02754", - journal = "Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD '16", - author = "Chen, Tianqi and Guestrin, Carlos", - year = "2016", - note = "arXiv: 1603.02754", - pages = "785--794" + title = "{XGBoost}: {A} {Scalable} {Tree} {Boosting} {System}", + url = "https://arxiv.org/abs/1603.02754", + journal = "Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining - KDD '16", + author = "Chen, Tianqi and Guestrin, Carlos", + year = "2016", + note = "arXiv: 1603.02754", + pages = "785--794" ), binder_2009 = bibentry("article", - author = "Binder, H. and Allignol, A. and Schumacher, M. and Beyersmann, J.", - year = "2009", - title = "Boosting for high-dimensional time-to-event data with competing risks", - journal = "Bioinformatics", - pages = "890--896", - volume = "25" + author = "Binder, H. and Allignol, A. and Schumacher, M. and Beyersmann, J.", + year = "2009", + title = "Boosting for high-dimensional time-to-event data with competing risks", + journal = "Bioinformatics", + pages = "890--896", + volume = "25" ), schemper_2000 = bibentry("article", - author = "Schemper, Michael and Henderson, Robin", - doi = "10.1002/sim.1486", - journal = "Biometrics", - pages = "249--255", - pmid = "12854094", - title = "Predictive Accuracy and Explained Variation in Cox Regression", - volume = "56", - year = "2000" + author = "Schemper, Michael and Henderson, Robin", + doi = "10.1002/sim.1486", + journal = "Biometrics", + pages = "249--255", + pmid = "12854094", + title = "Predictive Accuracy and Explained Variation in Cox Regression", + volume = "56", + year = "2000" ), schmid_2011 = bibentry("article", - author = "Schmid, Matthias and Hielscher, Thomas and Augustin, Thomas and Gefeller, Olaf", - doi = "10.1111/j.1541-0420.2010.01459.x", - journal = "Biometrics", - number = "2", - pages = "524--535", - pmid = "20618308", - title = "A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error", - volume = "67", - year = "2011" + author = "Schmid, Matthias and Hielscher, Thomas and Augustin, Thomas and Gefeller, Olaf", + doi = "10.1111/j.1541-0420.2010.01459.x", + journal = "Biometrics", + number = "2", + pages = "524--535", + pmid = "20618308", + title = "A Robust Alternative to the Schemper-Henderson Estimator of Prediction Error", + volume = "67", + year = "2011" ), vanhouwelingen_2000 = bibentry("article", - author = "Van Houwelingen, Hans C.", - doi = "10.1002/1097-0258(20001230)19:24<3401::AID-SIM554>3.0.CO;2-2", - journal = "Statistics in Medicine", - number = "24", - pages = "3401--3415", - pmid = "11122504", - title = "Validation, calibration, revision and combination of prognostic survival models", - volume = "19", - year = "2000" + author = "Van Houwelingen, Hans C.", + doi = "10.1002/1097-0258(20001230)19:24<3401::AID-SIM554>3.0.CO;2-2", + journal = "Statistics in Medicine", + number = "24", + pages = "3401--3415", + pmid = "11122504", + title = "Validation, calibration, revision and combination of prognostic survival models", + volume = "19", + year = "2000" ), peto_1972 = bibentry("article", - author = "Peto, Richard and Peto, Julian", - journal = "Journal of the Royal Statistical Society: Series A (General)", - number = "2", - pages = "185--198", - publisher = "Wiley Online Library", - title = "Asymptotically efficient rank invariant test procedures", - volume = "135", - year = "1972" + author = "Peto, Richard and Peto, Julian", + journal = "Journal of the Royal Statistical Society: Series A (General)", + number = "2", + pages = "185--198", + publisher = "Wiley Online Library", + title = "Asymptotically efficient rank invariant test procedures", + volume = "135", + year = "1972" ), schemper_2009 = bibentry("article", - author = "Schemper, Michael and Wakounig, Samo and Heinze, Georg", - doi = "10.1002/sim.3623", - journal = "Statistics in Medicine", - month = "aug", - number = "19", - pages = "2473--2489", - publisher = "John Wiley & Sons, Ltd", - title = "The estimation of average hazard ratios by weighted Cox regression", - volume = "28", - year = "2009" + author = "Schemper, Michael and Wakounig, Samo and Heinze, Georg", + doi = "10.1002/sim.3623", + journal = "Statistics in Medicine", + month = "aug", + number = "19", + pages = "2473--2489", + publisher = "John Wiley & Sons, Ltd", + title = "The estimation of average hazard ratios by weighted Cox regression", + volume = "28", + year = "2009" ), vock_2016 = bibentry("article", - author = "Vock, David M and Wolfson, Julian and Bandyopadhyay, Sunayan and Adomavicius, Gediminas and Johnson, Paul E and Vazquez-Benitez, Gabriela and O'Connor, Patrick J", - doi = "https://doi.org/10.1016/j.jbi.2016.03.009", - journal = "Journal of Biomedical Informatics", - pages = "119--131", - title = "Adapting machine learning techniques to censored time-to-event health record data: A general-purpose approach using inverse probability of censoring weighting", - url = "https://www.sciencedirect.com/science/article/pii/S1532046416000496", - volume = "61", - year = "2016" + author = "Vock, David M and Wolfson, Julian and Bandyopadhyay, Sunayan and Adomavicius, Gediminas and Johnson, Paul E and Vazquez-Benitez, Gabriela and O'Connor, Patrick J", + doi = "https://doi.org/10.1016/j.jbi.2016.03.009", + journal = "Journal of Biomedical Informatics", + pages = "119--131", + title = "Adapting machine learning techniques to censored time-to-event health record data: A general-purpose approach using inverse probability of censoring weighting", + url = "https://www.sciencedirect.com/science/article/pii/S1532046416000496", + volume = "61", + year = "2016" ), jackson_2014 = bibentry("article", - author = "Jackson, Dan and White, Ian R. and Seaman, Shaun and Evans, Hannah and Baisley, Kathy and Carpenter, James", - doi = "10.1002/sim.6274", - journal = "Statistics in Medicine", - month = "nov", - number = "27", - pages = "4681--4694", - publisher = "John Wiley and Sons Ltd", - title = "Relaxing the independent censoring assumption in the Cox proportional hazards model using multiple imputation", - volume = "33", - year = "2014" + author = "Jackson, Dan and White, Ian R. and Seaman, Shaun and Evans, Hannah and Baisley, Kathy and Carpenter, James", + doi = "10.1002/sim.6274", + journal = "Statistics in Medicine", + month = "nov", + number = "27", + pages = "4681--4694", + publisher = "John Wiley and Sons Ltd", + title = "Relaxing the independent censoring assumption in the Cox proportional hazards model using multiple imputation", + volume = "33", + year = "2014" ), klein_2003 = bibentry("book", - author = "Klein, John P and Moeschberger, Melvin L", - edition = "2", - isbn = "0387216456", - publisher = "Springer Science & Business Media", - title = "Survival analysis: techniques for censored and truncated data", - year = "2003" + author = "Klein, John P and Moeschberger, Melvin L", + edition = "2", + isbn = "0387216456", + publisher = "Springer Science & Business Media", + title = "Survival analysis: techniques for censored and truncated data", + year = "2003" ), buckley_1979 = bibentry("article", - author = "Buckley, Jonathan and James, Ian", - doi = "10.2307/2335161", - journal = "Biometrika", - month = "apr", - number = "3", - pages = "429--436", - publisher = "Oxford University Press, Biometrika Trust", - title = "Linear Regression with Censored Data", - url = "https://www.jstor.org/stable/2335161", - volume = "66", - year = "1979" + author = "Buckley, Jonathan and James, Ian", + doi = "10.2307/2335161", + journal = "Biometrika", + month = "apr", + number = "3", + pages = "429--436", + publisher = "Oxford University Press, Biometrika Trust", + title = "Linear Regression with Censored Data", + url = "https://www.jstor.org/stable/2335161", + volume = "66", + year = "1979" ), haider_2020 = bibentry("article", - author = "Haider, Humza and Hoehn, Bret and Davis, Sarah and Greiner, Russell", - journal = "Journal of Machine Learning Research", - volume = "21", - number = "85", - pages = "1--63", - title = "Effective Ways to Build and Evaluate Individual Survival Distributions", - url = "https://jmlr.org/papers/v21/18-772.html", - year = "2020" + author = "Haider, Humza and Hoehn, Bret and Davis, Sarah and Greiner, Russell", + journal = "Journal of Machine Learning Research", + volume = "21", + number = "85", + pages = "1--63", + title = "Effective Ways to Build and Evaluate Individual Survival Distributions", + url = "https://jmlr.org/papers/v21/18-772.html", + year = "2020" ), dcalib = bibentry("article", - author = "Humza Haider and Bret Hoehn and Sarah Davis and Russell Greiner", - title = "Effective Ways to Build and Evaluate Individual Survival Distributions", - journal = "Journal of Machine Learning Research", - year = "2020", - volume = "21", - number = "85", - pages = "1-63", - url = "https://jmlr.org/papers/v21/18-772.html" + author = "Humza Haider and Bret Hoehn and Sarah Davis and Russell Greiner", + title = "Effective Ways to Build and Evaluate Individual Survival Distributions", + journal = "Journal of Machine Learning Research", + year = "2020", + volume = "21", + number = "85", + pages = "1-63", + url = "https://jmlr.org/papers/v21/18-772.html" ), breslow_1972 = bibentry("article", - author = "Norman Breslow", - title = "Discussion of 'Regression Models and Life-Tables' by D.R. Cox", - journal = "Journal of the Royal Statistical Society: Series B", - year = "1972", - volume = "34", - number = "2", - pages = "216-217" + author = "Norman Breslow", + title = "Discussion of 'Regression Models and Life-Tables' by D.R. Cox", + journal = "Journal of the Royal Statistical Society: Series B", + year = "1972", + volume = "34", + number = "2", + pages = "216-217" ), lin_2007 = bibentry("article", - author = "Lin, D. Y.", - title = "On the Breslow estimator", - journal = "Lifetime Data Analysis", - year = "2007", - volume = "13", - number = "4", - pages = "471-480", - publisher = "Springer", - doi = "10.1007/s10985-007-9048-y" + author = "Lin, D. Y.", + title = "On the Breslow estimator", + journal = "Lifetime Data Analysis", + year = "2007", + volume = "13", + number = "4", + pages = "471-480", + publisher = "Springer", + doi = "10.1007/s10985-007-9048-y" ), avati_2020 = bibentry("article", - author = "Avati, Anand and Duan, Tony and Zhou, Sharon and Jung, Kenneth and Shah, Nigam H and Ng, Andrew Y", - title = "Countdown Regression: Sharp and Calibrated Survival Predictions", - journal = "Proceedings of The 35th Uncertainty in Artificial Intelligence Conference", - year = "2020", - volume = "115", - number = "4", - series = "Proceedings of Machine Learning Research", - pages = "145--155", - publisher = "PMLR", - url = "https://proceedings.mlr.press/v115/avati20a.html" + author = "Avati, Anand and Duan, Tony and Zhou, Sharon and Jung, Kenneth and Shah, Nigam H and Ng, Andrew Y", + title = "Countdown Regression: Sharp and Calibrated Survival Predictions", + journal = "Proceedings of The 35th Uncertainty in Artificial Intelligence Conference", + year = "2020", + volume = "115", + number = "4", + series = "Proceedings of Machine Learning Research", + pages = "145--155", + publisher = "PMLR", + url = "https://proceedings.mlr.press/v115/avati20a.html" ), rindt_2022 = bibentry("article", - author = "Rindt, David and Hu, Robert and Steinsaltz, David and Sejdinovic, Dino", - title = "Survival regression with proper scoring rules and monotonic neural networks", - journal = "Proceedings of The 25th International Conference on Artificial Intelligence and Statistics", - year = "2022", - volume = "151", - number = "4", - series = "Proceedings of Machine Learning Research", - pages = "1190--1205", - publisher = "PMLR", - url = "https://proceedings.mlr.press/v151/rindt22a.html" + author = "Rindt, David and Hu, Robert and Steinsaltz, David and Sejdinovic, Dino", + title = "Survival regression with proper scoring rules and monotonic neural networks", + journal = "Proceedings of The 25th International Conference on Artificial Intelligence and Statistics", + year = "2022", + volume = "151", + number = "4", + series = "Proceedings of Machine Learning Research", + pages = "1190--1205", + publisher = "PMLR", + url = "https://proceedings.mlr.press/v151/rindt22a.html" ), grambsch_1994 = bibentry("article", - author = "Grambsch, Patricia and Therneau, Terry", - title = "Proportional hazards tests and diagnostics based on weighted residuals", - journal = "Biometrika", - year = "1994", - volume = "81", - number = "3", - pages = "515--526", - doi = "10.1093/biomet/81.3.515", - url = "https://doi.org/10.1093/biomet/81.3.515" + author = "Grambsch, Patricia and Therneau, Terry", + title = "Proportional hazards tests and diagnostics based on weighted residuals", + journal = "Biometrika", + year = "1994", + volume = "81", + number = "3", + pages = "515--526", + doi = "10.1093/biomet/81.3.515", + url = "https://doi.org/10.1093/biomet/81.3.515" ), tutz_2016 = bibentry("book", author = "Tutz, Gerhard and Schmid, Matthias", @@ -683,5 +683,31 @@ bibentries = c( # nolint start series = "Springer Series in Statistics", isbn = "978-3-319-28156-8 978-3-319-28158-2", url = "http://link.springer.com/10.1007/978-3-319-28158-2" + ), + sonabend_2022 = bibentry("article", + author = "Sonabend, Raphael and Bender, Andreas and Vollmer, Sebastian", + doi = "10.1093/BIOINFORMATICS/BTAC451", + editor = "Lu, Zhiyong", + isbn = "451/6640155", + issn = "1367-4803", + journal = "Bioinformatics", + month = "jul", + title = "Avoiding C-hacking when evaluating survival distribution predictions with discrimination measures", + url = "https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btac451/6640155", + year = "2022" + ), + zhao_2016 = bibentry("article", + author = "Zhao, Lihui and Claggett, Brian and Tian, Lu and Uno, Hajime and Pfeffer, Marc A. and Solomon, Scott D. and Trippa, Lorenzo and Wei, L. J.", + doi = "10.1111/BIOM.12384", + issn = "1541-0420", + journal = "Biometrics", + month = "mar", + number = "1", + pages = "215--221", + publisher = "John Wiley & Sons, Ltd", + title = "On the restricted mean survival time curve in survival analysis", + url = "https://onlinelibrary.wiley.com/doi/full/10.1111/biom.12384", + volume = "72", + year = "2016" ) ) # nolint end diff --git a/R/partition.R b/R/partition.R deleted file mode 100644 index 2ad3dff31..000000000 --- a/R/partition.R +++ /dev/null @@ -1,9 +0,0 @@ -#' @export -partition.TaskSurv = function(task, ratio = 0.67, stratify = TRUE, ...) { # nolint - if (stratify) { - task = task$clone() - task$set_col_roles(task$target_names[2L], add_to = "stratum") - } - - NextMethod("partition") -} diff --git a/R/pipelines.R b/R/pipelines.R index 638c89d97..8587f6d44 100644 --- a/R/pipelines.R +++ b/R/pipelines.R @@ -108,49 +108,46 @@ pipeline_survbagging = function(learner, iterations = 10, frac = 0.7, avg = TRUE #' @templateVar pipeop [PipeOpCrankCompositor] #' @templateVar id crankcompositor #' @template param_pipeline_learner +#' #' @param method `character(1)`\cr -#' One of `sum_haz` (default), `mean`, `mode`, or `median`; -#' abbreviations allowed. Used to determine how `crank` is estimated from -#' the predicted `distr`. -#' @param which `integer(1)`\cr -#' If `method = "mode"` then specifies which mode to use if multi-modal, default -#' is the first. -#' @param response `logical(1)`\cr -#' If `TRUE` then the `response` predict type is also estimated with the same values as `crank`. +#' Determines what method should be used to produce a continuous ranking from the distribution. +#' Currently only `mort` is supported, which is the sum of the cumulative hazard, also called *expected/ensemble mortality*, see Ishwaran et al. (2008). +#' For more details, see [get_mortality()]. #' @param overwrite `logical(1)`\cr -#' If `TRUE` then existing `response` and `crank` predict types are overwritten. +#' If `FALSE` (default) and the prediction already has a `crank` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `crank` will be overwritten. +#' #' @examples #' \dontrun{ #' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { #' library("mlr3") #' library("mlr3pipelines") #' -#' task = tsk("rats") -#' pipe = ppl( +#' task = tsk("lung") +#' part = partition(task) +#' +#' # change the crank prediction type of a Cox's model predictions +#' grlrn = ppl( #' "crankcompositor", #' learner = lrn("surv.coxph"), -#' method = "sum_haz" +#' method = "mort", +#' overwrite = TRUE, +#' graph_learner = TRUE #' ) -#' pipe$train(task) -#' pipe$predict(task) +#' grlrn$train(task, part$train) +#' grlrn$predict(task, part$test) #' } #' } -pipeline_crankcompositor = function(learner, - method = c("sum_haz", "mean", "median", "mode"), which = NULL, - response = FALSE, overwrite = FALSE, graph_learner = FALSE) { - - if (testCharacter(learner)) { - warning("Passing a learner id is now deprecated. In the future please pass a constructed - learner or graph instead.") - learner = lrn(learner) - } +pipeline_crankcompositor = function(learner, method = c("mort"), + overwrite = FALSE, graph_learner = FALSE) { + assert_learner(learner, task_type = "surv") + assert_choice(method, choices = c("mort")) + assert_logical(overwrite) + assert_logical(graph_learner) pred = mlr3pipelines::as_graph(learner) - pv = list(method = match.arg(method), response = response, overwrite = overwrite) - if (!is.null(which)) { - pv$which = which - } + pv = list(method = method, overwrite = overwrite) compositor = mlr3pipelines::po("crankcompose", param_vals = pv) gr = mlr3pipelines::`%>>%`(pred, compositor) @@ -162,6 +159,71 @@ pipeline_crankcompositor = function(learner, gr } +#' @template pipeline +#' @templateVar title Estimate Survival Time/Response Predict Type +#' @templateVar pipeop [PipeOpResponseCompositor] +#' @templateVar id responsecompositor +#' @template param_pipeline_learner +#' +#' @param method (`character(1)`)\cr +#' Determines what method should be used to produce a survival time (response) from the survival distribution. +#' Available methods are `"rmst"` and `"median"`, corresponding to the *restricted mean survival time* and the *median survival time* respectively. +#' @param cutoff_time (`numeric(1)`)\cr +#' Determines the time point up to which we calculate the restricted mean survival time (works only for the `"rmst"` method). +#' If `NULL` (default), all the available time points in the predicted survival distribution will be used. +#' @param add_crank (`logical(1)`)\cr +#' If `TRUE` then `crank` predict type will be set as `-response` (as higher survival times correspond to lower risk). +#' Works only if `overwrite` is `TRUE`. +#' @param overwrite (`logical(1)`)\cr +#' If `FALSE` (default) and the prediction already has a `response` prediction, then the compositor returns the input prediction unchanged. +#' If `TRUE`, then the `response` (and the `crank`, if `add_crank` is `TRUE`) will be overwritten. +#' +#' @examples +#' \dontrun{ +#' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { +#' library("mlr3") +#' library("mlr3pipelines") +#' +#' task = tsk("lung") +#' part = partition(task) +#' +#' # add survival time prediction type to the predictions of a Cox model +#' grlrn = ppl( +#' "responsecompositor", +#' learner = lrn("surv.coxph"), +#' method = "rmst", +#' overwrite = TRUE, +#' graph_learner = TRUE +#' ) +#' grlrn$train(task, part$train) +#' grlrn$predict(task, part$test) +#' } +#' } +pipeline_responsecompositor = function(learner, method = "rmst", cutoff_time = NULL, + add_crank = FALSE, overwrite = FALSE, + graph_learner = FALSE) { + assert_learner(learner, task_type = "surv") + assert_choice(method, choices = c("rmst", "median")) + assert_number(cutoff_time, null.ok = TRUE, lower = 0) + assert_logical(add_crank) + assert_logical(overwrite) + assert_logical(graph_learner) + + pred = mlr3pipelines::as_graph(learner) + + pv = list(method = method, cutoff_time = cutoff_time, add_crank = add_crank, + overwrite = overwrite) + compositor = mlr3pipelines::po("responsecompose", param_vals = pv) + + gr = mlr3pipelines::`%>>%`(pred, compositor) + + if (graph_learner) { + gr = mlr3pipelines::GraphLearner$new(gr) + } + + gr +} + #' @template pipeline #' @templateVar title Estimate Survival distr Predict Type #' @templateVar pipeop [PipeOpDistrCompositor] or [PipeOpBreslow] @@ -185,20 +247,21 @@ pipeline_crankcompositor = function(learner, #' to another. #' @examples #' \dontrun{ -#' if (requireNamespace("mlr3pipelines", quietly = TRUE) && -#' requireNamespace("rpart", quietly = TRUE)) { -#' library("mlr3") +#' if (requireNamespace("mlr3pipelines", quietly = TRUE)) { #' library("mlr3pipelines") #' +#' # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: #' task = tsk("rats") -#' pipe = ppl( +#' grlrn = ppl( #' "distrcompositor", -#' learner = lrn("surv.rpart"), +#' learner = lrn("surv.coxph"), #' estimator = "kaplan", -#' form = "ph" +#' form = "aft", +#' overwrite = TRUE, +#' graph_learner = TRUE #' ) -#' pipe$train(task) -#' pipe$predict(task) +#' grlrn$train(task) +#' grlrn$predict(task) #' } #' } pipeline_distrcompositor = function(learner, estimator = "kaplan", form = "aft", @@ -310,8 +373,6 @@ pipeline_probregr = function(learner, learner_se = NULL, dist = "Uniform", #' \item A [LearnerRegr] is fit and predicted on the new `TaskRegr`. #' \item [PipeOpPredRegrSurv] transforms the resulting [PredictionRegr][mlr3::PredictionRegr] #' to [PredictionSurv]. -#' \item Optionally: [PipeOpDistrCompositor] is used to compose a `distr` predict_type from the -#' predicted `response` predict_type. #' } #' \item Survival to Probabilistic Regression #' \enumerate{ @@ -357,7 +418,7 @@ pipeline_probregr = function(learner, learner_se = NULL, dist = "Uniform", #' Regression learner to fit to the transformed [TaskRegr][mlr3::TaskRegr]. If `regr_se_learner` is #' `NULL` in method `2`, then `regr_learner` must have `se` predict_type. #' @param distrcompose `logical(1)`\cr -#' For methods `1` and `3` if `TRUE` (default) then [PipeOpDistrCompositor] is utilised to +#' For method `3` if `TRUE` (default) then [PipeOpDistrCompositor] is utilised to #' transform the deterministic predictions to a survival distribution. #' @param distr_estimator [LearnerSurv]\cr #' For methods `1` and `3` if `distrcompose = TRUE` then specifies the learner to estimate the @@ -398,7 +459,6 @@ pipeline_probregr = function(learner, learner_se = NULL, dist = "Uniform", #' "survtoregr", #' method = 1, #' regr_learner = lrn("regr.featureless"), -#' distrcompose = TRUE, #' survregr_params = list(method = "delete") #' ) #' pipe$train(task) @@ -448,16 +508,6 @@ pipeline_survtoregr = function(method = 1, regr_learner = lrn("regr.featureless" add_edge("trafotask_survregr", "regr_learner")$ add_edge("regr_learner", "trafopred_regrsurv", dst_channel = "pred")$ add_edge("task_surv", "trafopred_regrsurv", dst_channel = "task") - - if (distrcompose) { - assert("distr" %in% distr_estimator$predict_types) - - gr$add_pipeop(mlr3pipelines::po("learner", distr_estimator, id = "distr_estimator"))$ - add_pipeop(mlr3pipelines::po("distrcompose", param_vals = distrcompose_params))$ - add_edge("trafopred_regrsurv", dst_id = "distrcompose", dst_channel = "pred")$ - add_edge("distr_estimator", dst_id = "distrcompose", dst_channel = "base") - } - } else if (method == 2) { gr = mlr3pipelines::Graph$new()$ @@ -485,7 +535,6 @@ pipeline_survtoregr = function(method = 1, regr_learner = lrn("regr.featureless" add_edge("regr_learner", "compose_probregr", dst_channel = "input_response") } else if (method == 3) { - assert("lp" %in% surv_learner$predict_types) gr = mlr3pipelines::Graph$new()$ @@ -614,6 +663,7 @@ register_graph("survaverager", pipeline_survaverager) register_graph("survbagging", pipeline_survbagging) register_graph("crankcompositor", pipeline_crankcompositor) register_graph("distrcompositor", pipeline_distrcompositor) +register_graph("responsecompositor", pipeline_responsecompositor) register_graph("probregr", pipeline_probregr) register_graph("survtoregr", pipeline_survtoregr) register_graph("survtoclassif_disctime", pipeline_survtoclassif_disctime) diff --git a/R/scoring_rule_erv.R b/R/scoring_rule_erv.R index b680b8b06..e956b120d 100644 --- a/R/scoring_rule_erv.R +++ b/R/scoring_rule_erv.R @@ -7,18 +7,20 @@ if (!is.null(ps$se) && ps$se) { stop("Only one of `ERV` and `se` can be TRUE") } - measure$param_set$values$ERV = FALSE + + measure$param_set$set_values(ERV = FALSE) # compute score for the learner - learner_score = measure$score(prediction, task = task, - train_set = train_set) + learner_score = measure$score(prediction, task = task, train_set = train_set) # compute score for the baseline (Kaplan-Meier) + # train KM km = lrn("surv.kaplan")$train(task = task, row_ids = train_set) - km = km$predict(as_task_surv(data.frame( - as.matrix(prediction$truth)), event = "status")) - base_score = measure$score(km, task = task, train_set = train_set) + # predict KM on the test set (= not train ids) + test_set = setdiff(task$row_ids, train_set) + km_pred = km$predict(task, row_ids = test_set) + base_score = measure$score(km_pred, task = task, train_set = train_set) - measure$param_set$values$ERV = TRUE + measure$param_set$set_values(ERV = TRUE) # return percentage decrease 1 - (learner_score / base_score) diff --git a/R/surv_return.R b/R/surv_return.R index 775734300..6f7a857ed 100644 --- a/R/surv_return.R +++ b/R/surv_return.R @@ -3,38 +3,25 @@ #' @description Internal helper function to easily return the correct survival predict types. #' #' @param times (`numeric()`) \cr Vector of survival times. -#' @param surv (`matrix()|array()`)\cr Matrix or array of predicted survival -#' probabilities, rows (1st dimension) are observations, columns (2nd dimension) -#' are times and in the case of an array there should be one more dimension. -#' Number of columns should be equal to length of `times`. In case a `numeric()` -#' vector is provided, it is converted to a single row (one observation) matrix. -#' @param crank (`numeric()`)\cr Relative risk/continuous ranking. Higher value is associated -#' with higher risk. If `NULL` then either set as `-response` if available or -#' `lp` if available (this assumes that the `lp` prediction comes from a PH type -#' model - in case of an AFT model the user should provide `-lp`). -#' In case neither `response` or `lp` are provided, then `crank` is calculated -#' as the sum of the cumulative hazard function (expected mortality) derived from -#' the predicted survival function (`surv`). In case `surv` is a 3d array, we use -#' the `which.curve` parameter to decide which survival matrix (index in the 3rd -#' dimension) will be chosen for the calculation of `crank`. +#' @param surv (`matrix()|array()`)\cr Matrix or array of predicted survival probabilities, rows (1st dimension) are observations, columns (2nd dimension) are times and in the case of an array there should be one more dimension. +#' Number of columns should be equal to length of `times`. +#' In case a `numeric()` vector is provided, it is converted to a single row (one observation) matrix. +#' @param crank (`numeric()`)\cr Relative risk/continuous ranking. +#' Higher value is associated with higher risk. +#' If `NULL` then either set as `-response` if available or `lp` if available (this assumes that the `lp` prediction comes from a PH type model - in case of an AFT model the user should provide `-lp`). +#' In case neither `response` or `lp` are provided, then `crank` is calculated as the sum of the cumulative hazard function (**expected mortality**) derived from the predicted survival function (`surv`), see [get_mortality]. +#' In case `surv` is a 3d array, we use the `which.curve` parameter to decide which survival matrix (index in the 3rd dimension) will be chosen for the calculation of `crank`. #' @param lp (`numeric()`)\cr Predicted linear predictor, used to impute `crank` if `NULL`. #' @param response (`numeric()`)\cr Predicted survival time, passed through function without #' modification. -#' @param which.curve Which curve (3rd dimension) should the `crank` be -#' calculated for, in case `surv` is an `array`? If between (0,1) it is taken as -#' the quantile of the curves otherwise if greater than 1 it is taken as the -#' curve index. It can also be 'mean' and the survival probabilities are averaged -#' across the 3rd dimension. Default value (`NULL`) is the **0.5 quantile** which -#' is the median across the 3rd dimension of the survival array. -#' -#' @details -#' Uses [survivalmodels::surv_to_risk] to reduce survival matrices to relative -#' risks / rankings if `crank` is NULL. +#' @param which.curve Which curve (3rd dimension) should the `crank` be calculated for, in case `surv` is an `array`? +#' If between (0,1) it is taken as the quantile of the curves otherwise if greater than 1 it is taken as the curve index. +#' It can also be 'mean' and the survival probabilities are averaged across the 3rd dimension. +#' Default value (`NULL`) is the **0.5 quantile** which is the median across the 3rd dimension of the survival array. #' #' @references -#' Sonabend, R., Bender, A., & Vollmer, S. (2022). Avoiding C-hacking when -#' evaluating survival distribution predictions with discrimination measures. -#' Bioinformatics. https://doi.org/10.1093/BIOINFORMATICS/BTAC451 +#' `r format_bib("sonabend_2022")` +#' #' @export .surv_return = function(times = NULL, surv = NULL, crank = NULL, lp = NULL, response = NULL, which.curve = NULL) { @@ -64,10 +51,10 @@ crank = lp } else if (!is.null(surv)) { if (inherits(surv, "matrix")) { - crank = survivalmodels::surv_to_risk(surv) + crank = get_mortality(surv) } else { # array surv_mat = .ext_surv_mat(surv, which.curve) - crank = survivalmodels::surv_to_risk(surv_mat) + crank = get_mortality(surv_mat) } } } @@ -118,3 +105,53 @@ array(arr[, , which.curve], c(nrow(arr), ncol(arr)), dimnames(arr)[1:2]) } } + +#' @title Calculate the expected mortality risks from a survival matrix +#' +#' @description Many methods can be used to reduce a discrete survival +#' distribution prediction (i.e. matrix) to a relative risk / ranking +#' prediction, see Sonabend et al. (2022). +#' +#' This function calculates a relative risk score as the sum of the +#' predicted cumulative hazard function, also called **ensemble/expected mortality**. +#' This risk score can be loosely interpreted as the expected number of deaths for +#' patients with similar characteristics, see Ishwaran et al. (2008) and has no +#' model or survival distribution assumptions. +#' +#' @param x (`matrix()`) \cr A survival matrix where rows are the +#' (predicted) observations and columns the time-points. +#' For more details, see [assert_surv_matrix]. +#' +#' @return a `numeric` vector of the mortality risk scores, one per row of the +#' input survival matrix. +#' +#' @references +#' `r format_bib("sonabend_2022", "ishwaran_2008")` +#' +#' @examples +#' n = 10 # number of observations +#' k = 50 # time points +#' +#' # Create the matrix with random values between 0 and 1 +#' mat = matrix(runif(n * k, min = 0, max = 1), nrow = n, ncol = k) +#' +#' # transform it to a survival matrix +#' surv_mat = t(apply(mat, 1L, function(row) sort(row, decreasing = TRUE))) +#' colnames(surv_mat) = 1:k # time points +#' +#' # get mortality scores (the larger, the more risk) +#' mort = get_mortality(surv_mat) +#' mort +#' +#' @export +get_mortality = function(x) { + assert_surv_matrix(x) + + # H(t) = -log(S(t)) + cumhaz = -log(x) + + # Ignore S(t) = 0 => -log(S(t)) = Inf + cumhaz[is.infinite(cumhaz)] = 0 + + rowSums(cumhaz) +} diff --git a/R/zzz.R b/R/zzz.R index 20a0b3e83..bad8118b7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -81,7 +81,7 @@ unregister_reflections = function() { # task package = NULL # silence data.table notes - x$task_types[package != "mlr3proba"] + x$task_types = x$task_types[package != "mlr3proba"] x$task_col_roles$surv = NULL x$task_col_roles$dens = NULL x$task_col_roles$classif = setdiff(x$task_col_roles$classif, "original_ids") diff --git a/man/assert_surv_matrix.Rd b/man/assert_surv_matrix.Rd new file mode 100644 index 000000000..349250639 --- /dev/null +++ b/man/assert_surv_matrix.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assertions.R +\name{assert_surv_matrix} +\alias{assert_surv_matrix} +\title{Assert survival matrix} +\usage{ +assert_surv_matrix(x) +} +\arguments{ +\item{x}{(\code{matrix()})\cr +A matrix of (predicted) survival probabilities. +Rows are observations, columns are (increasing) time points.} +} +\value{ +if the assertion fails an error occurs, otherwise \code{NULL} is returned +invisibly. +} +\description{ +Asserts if the given input matrix is a (discrete) survival +probabilities matrix using \link{Rcpp} code. +The following checks are performed: +\enumerate{ +\item All values are probabilities, i.e. \eqn{S(t) \in [0,1]} +\item Column names correspond to time-points and should therefore be coercable to +\code{numeric} and increasing +\item Per row/observation, the survival probabilities decrease non-strictly, i.e. +\eqn{S(t) \ge S(t+1)} +} +} +\examples{ +x = matrix(data = c(1,0.6,0.4,0.8,0.8,0.7), nrow = 2, ncol = 3, byrow = TRUE) +colnames(x) = c(12, 34, 42) +x + +assert_surv_matrix(x) + +} diff --git a/man/dot-surv_return.Rd b/man/dot-surv_return.Rd index f2020ee44..fafca5b72 100644 --- a/man/dot-surv_return.Rd +++ b/man/dot-surv_return.Rd @@ -16,43 +16,32 @@ \arguments{ \item{times}{(\code{numeric()}) \cr Vector of survival times.} -\item{surv}{(\code{matrix()|array()})\cr Matrix or array of predicted survival -probabilities, rows (1st dimension) are observations, columns (2nd dimension) -are times and in the case of an array there should be one more dimension. -Number of columns should be equal to length of \code{times}. In case a \code{numeric()} -vector is provided, it is converted to a single row (one observation) matrix.} +\item{surv}{(\code{matrix()|array()})\cr Matrix or array of predicted survival probabilities, rows (1st dimension) are observations, columns (2nd dimension) are times and in the case of an array there should be one more dimension. +Number of columns should be equal to length of \code{times}. +In case a \code{numeric()} vector is provided, it is converted to a single row (one observation) matrix.} -\item{crank}{(\code{numeric()})\cr Relative risk/continuous ranking. Higher value is associated -with higher risk. If \code{NULL} then either set as \code{-response} if available or -\code{lp} if available (this assumes that the \code{lp} prediction comes from a PH type -model - in case of an AFT model the user should provide \code{-lp}). -In case neither \code{response} or \code{lp} are provided, then \code{crank} is calculated -as the sum of the cumulative hazard function (expected mortality) derived from -the predicted survival function (\code{surv}). In case \code{surv} is a 3d array, we use -the \code{which.curve} parameter to decide which survival matrix (index in the 3rd -dimension) will be chosen for the calculation of \code{crank}.} +\item{crank}{(\code{numeric()})\cr Relative risk/continuous ranking. +Higher value is associated with higher risk. +If \code{NULL} then either set as \code{-response} if available or \code{lp} if available (this assumes that the \code{lp} prediction comes from a PH type model - in case of an AFT model the user should provide \code{-lp}). +In case neither \code{response} or \code{lp} are provided, then \code{crank} is calculated as the sum of the cumulative hazard function (\strong{expected mortality}) derived from the predicted survival function (\code{surv}), see \link{get_mortality}. +In case \code{surv} is a 3d array, we use the \code{which.curve} parameter to decide which survival matrix (index in the 3rd dimension) will be chosen for the calculation of \code{crank}.} \item{lp}{(\code{numeric()})\cr Predicted linear predictor, used to impute \code{crank} if \code{NULL}.} \item{response}{(\code{numeric()})\cr Predicted survival time, passed through function without modification.} -\item{which.curve}{Which curve (3rd dimension) should the \code{crank} be -calculated for, in case \code{surv} is an \code{array}? If between (0,1) it is taken as -the quantile of the curves otherwise if greater than 1 it is taken as the -curve index. It can also be 'mean' and the survival probabilities are averaged -across the 3rd dimension. Default value (\code{NULL}) is the \strong{0.5 quantile} which -is the median across the 3rd dimension of the survival array.} +\item{which.curve}{Which curve (3rd dimension) should the \code{crank} be calculated for, in case \code{surv} is an \code{array}? +If between (0,1) it is taken as the quantile of the curves otherwise if greater than 1 it is taken as the curve index. +It can also be 'mean' and the survival probabilities are averaged across the 3rd dimension. +Default value (\code{NULL}) is the \strong{0.5 quantile} which is the median across the 3rd dimension of the survival array.} } \description{ Internal helper function to easily return the correct survival predict types. } -\details{ -Uses \link[survivalmodels:surv_to_risk]{survivalmodels::surv_to_risk} to reduce survival matrices to relative -risks / rankings if \code{crank} is NULL. -} \references{ -Sonabend, R., Bender, A., & Vollmer, S. (2022). Avoiding C-hacking when -evaluating survival distribution predictions with discrimination measures. -Bioinformatics. https://doi.org/10.1093/BIOINFORMATICS/BTAC451 +Sonabend, Raphael, Bender, Andreas, Vollmer, Sebastian (2022). +\dQuote{Avoiding C-hacking when evaluating survival distribution predictions with discrimination measures.} +\emph{Bioinformatics}. +ISSN 1367-4803, \doi{10.1093/BIOINFORMATICS/BTAC451}, \url{https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btac451/6640155}. } diff --git a/man/get_mortality.Rd b/man/get_mortality.Rd new file mode 100644 index 000000000..49cf04ac9 --- /dev/null +++ b/man/get_mortality.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/surv_return.R +\name{get_mortality} +\alias{get_mortality} +\title{Calculate the expected mortality risks from a survival matrix} +\usage{ +get_mortality(x) +} +\arguments{ +\item{x}{(\code{matrix()}) \cr A survival matrix where rows are the +(predicted) observations and columns the time-points. +For more details, see \link{assert_surv_matrix}.} +} +\value{ +a \code{numeric} vector of the mortality risk scores, one per row of the +input survival matrix. +} +\description{ +Many methods can be used to reduce a discrete survival +distribution prediction (i.e. matrix) to a relative risk / ranking +prediction, see Sonabend et al. (2022). + +This function calculates a relative risk score as the sum of the +predicted cumulative hazard function, also called \strong{ensemble/expected mortality}. +This risk score can be loosely interpreted as the expected number of deaths for +patients with similar characteristics, see Ishwaran et al. (2008) and has no +model or survival distribution assumptions. +} +\examples{ +n = 10 # number of observations +k = 50 # time points + +# Create the matrix with random values between 0 and 1 +mat = matrix(runif(n * k, min = 0, max = 1), nrow = n, ncol = k) + +# transform it to a survival matrix +surv_mat = t(apply(mat, 1L, function(row) sort(row, decreasing = TRUE))) +colnames(surv_mat) = 1:k # time points + +# get mortality scores (the larger, the more risk) +mort = get_mortality(surv_mat) +mort + +} +\references{ +Sonabend, Raphael, Bender, Andreas, Vollmer, Sebastian (2022). +\dQuote{Avoiding C-hacking when evaluating survival distribution predictions with discrimination measures.} +\emph{Bioinformatics}. +ISSN 1367-4803, \doi{10.1093/BIOINFORMATICS/BTAC451}, \url{https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btac451/6640155}. + +Ishwaran, Hemant, Kogalur, B U, Blackstone, H E, Lauer, S M, others (2008). +\dQuote{Random survival forests.} +\emph{The Annals of applied statistics}, \bold{2}(3), 841--860. +} diff --git a/man/mlr_graphs_crankcompositor.Rd b/man/mlr_graphs_crankcompositor.Rd index 2aed5581c..14922ccc0 100644 --- a/man/mlr_graphs_crankcompositor.Rd +++ b/man/mlr_graphs_crankcompositor.Rd @@ -7,9 +7,7 @@ \usage{ pipeline_crankcompositor( learner, - method = c("sum_haz", "mean", "median", "mode"), - which = NULL, - response = FALSE, + method = c("mort"), overwrite = FALSE, graph_learner = FALSE ) @@ -21,19 +19,13 @@ be wrapped in \link[mlr3pipelines:Graph]{mlr3pipelines::Graph} or a \code{Graph} \link{LearnerSurv}.} \item{method}{\code{character(1)}\cr -One of \code{sum_haz} (default), \code{mean}, \code{mode}, or \code{median}; -abbreviations allowed. Used to determine how \code{crank} is estimated from -the predicted \code{distr}.} - -\item{which}{\code{integer(1)}\cr -If \code{method = "mode"} then specifies which mode to use if multi-modal, default -is the first.} - -\item{response}{\code{logical(1)}\cr -If \code{TRUE} then the \code{response} predict type is also estimated with the same values as \code{crank}.} +Determines what method should be used to produce a continuous ranking from the distribution. +Currently only \code{mort} is supported, which is the sum of the cumulative hazard, also called \emph{expected/ensemble mortality}, see Ishwaran et al. (2008). +For more details, see \code{\link[=get_mortality]{get_mortality()}}.} \item{overwrite}{\code{logical(1)}\cr -If \code{TRUE} then existing \code{response} and \code{crank} predict types are overwritten.} +If \code{FALSE} (default) and the prediction already has a \code{crank} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{crank} will be overwritten.} \item{graph_learner}{\code{logical(1)}\cr If \code{TRUE} returns wraps the \link[mlr3pipelines:Graph]{Graph} as a @@ -51,14 +43,19 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { library("mlr3") library("mlr3pipelines") - task = tsk("rats") - pipe = ppl( + task = tsk("lung") + part = partition(task) + + # change the crank prediction type of a Cox's model predictions + grlrn = ppl( "crankcompositor", learner = lrn("surv.coxph"), - method = "sum_haz" + method = "mort", + overwrite = TRUE, + graph_learner = TRUE ) - pipe$train(task) - pipe$predict(task) + grlrn$train(task, part$train) + grlrn$predict(task, part$test) } } } @@ -66,6 +63,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other pipelines: \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, diff --git a/man/mlr_graphs_distrcompositor.Rd b/man/mlr_graphs_distrcompositor.Rd index e75fee5cf..d25347f8d 100644 --- a/man/mlr_graphs_distrcompositor.Rd +++ b/man/mlr_graphs_distrcompositor.Rd @@ -46,20 +46,21 @@ Wrapper around \link{PipeOpDistrCompositor} or \link{PipeOpBreslow} to simplify } \examples{ \dontrun{ -if (requireNamespace("mlr3pipelines", quietly = TRUE) && - requireNamespace("rpart", quietly = TRUE)) { - library("mlr3") +if (requireNamespace("mlr3pipelines", quietly = TRUE)) { library("mlr3pipelines") + # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: task = tsk("rats") - pipe = ppl( + grlrn = ppl( "distrcompositor", - learner = lrn("surv.rpart"), + learner = lrn("surv.coxph"), estimator = "kaplan", - form = "ph" + form = "aft", + overwrite = TRUE, + graph_learner = TRUE ) - pipe$train(task) - pipe$predict(task) + grlrn$train(task) + grlrn$predict(task) } } } @@ -67,6 +68,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE) && Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, diff --git a/man/mlr_graphs_probregr.Rd b/man/mlr_graphs_probregr.Rd index 4a0573d2d..25db8a1e6 100644 --- a/man/mlr_graphs_probregr.Rd +++ b/man/mlr_graphs_probregr.Rd @@ -70,6 +70,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE) && Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, diff --git a/man/mlr_graphs_responsecompositor.Rd b/man/mlr_graphs_responsecompositor.Rd new file mode 100644 index 000000000..0bc4b237b --- /dev/null +++ b/man/mlr_graphs_responsecompositor.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipelines.R +\name{mlr_graphs_responsecompositor} +\alias{mlr_graphs_responsecompositor} +\alias{pipeline_responsecompositor} +\title{Estimate Survival Time/Response Predict Type Pipeline} +\usage{ +pipeline_responsecompositor( + learner, + method = "rmst", + cutoff_time = NULL, + add_crank = FALSE, + overwrite = FALSE, + graph_learner = FALSE +) +} +\arguments{ +\item{learner}{\verb{[mlr3::Learner]|[mlr3pipelines::PipeOp]|[mlr3pipelines::Graph]} \cr +Either a \code{Learner} which will be wrapped in \link[mlr3pipelines:mlr_pipeops_learner]{mlr3pipelines::PipeOpLearner}, a \code{PipeOp} which will +be wrapped in \link[mlr3pipelines:Graph]{mlr3pipelines::Graph} or a \code{Graph} itself. Underlying \code{Learner} should be +\link{LearnerSurv}.} + +\item{method}{(\code{character(1)})\cr +Determines what method should be used to produce a survival time (response) from the survival distribution. +Available methods are \code{"rmst"} and \code{"median"}, corresponding to the \emph{restricted mean survival time} and the \emph{median survival time} respectively.} + +\item{cutoff_time}{(\code{numeric(1)})\cr +Determines the time point up to which we calculate the restricted mean survival time (works only for the \code{"rmst"} method). +If \code{NULL} (default), all the available time points in the predicted survival distribution will be used.} + +\item{add_crank}{(\code{logical(1)})\cr +If \code{TRUE} then \code{crank} predict type will be set as \code{-response} (as higher survival times correspond to lower risk). +Works only if \code{overwrite} is \code{TRUE}.} + +\item{overwrite}{(\code{logical(1)})\cr +If \code{FALSE} (default) and the prediction already has a \code{response} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{response} (and the \code{crank}, if \code{add_crank} is \code{TRUE}) will be overwritten.} + +\item{graph_learner}{\code{logical(1)}\cr +If \code{TRUE} returns wraps the \link[mlr3pipelines:Graph]{Graph} as a +\link[mlr3pipelines:mlr_learners_graph]{GraphLearner} otherwise (default) returns as a \code{Graph}.} +} +\value{ +\link[mlr3pipelines:Graph]{mlr3pipelines::Graph} or \link[mlr3pipelines:mlr_learners_graph]{mlr3pipelines::GraphLearner} +} +\description{ +Wrapper around \link{PipeOpResponseCompositor} to simplify \link[mlr3pipelines:Graph]{Graph} creation. +} +\examples{ +\dontrun{ +if (requireNamespace("mlr3pipelines", quietly = TRUE)) { + library("mlr3") + library("mlr3pipelines") + + task = tsk("lung") + part = partition(task) + + # add survival time prediction type to the predictions of a Cox model + grlrn = ppl( + "responsecompositor", + learner = lrn("surv.coxph"), + method = "rmst", + overwrite = TRUE, + graph_learner = TRUE + ) + grlrn$train(task, part$train) + grlrn$predict(task, part$test) +} +} +} +\seealso{ +Other pipelines: +\code{\link{mlr_graphs_crankcompositor}}, +\code{\link{mlr_graphs_distrcompositor}}, +\code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_survaverager}}, +\code{\link{mlr_graphs_survbagging}}, +\code{\link{mlr_graphs_survtoclassif_disctime}}, +\code{\link{mlr_graphs_survtoregr}} +} +\concept{pipelines} diff --git a/man/mlr_graphs_survaverager.Rd b/man/mlr_graphs_survaverager.Rd index e85c9dae0..7384fbf13 100644 --- a/man/mlr_graphs_survaverager.Rd +++ b/man/mlr_graphs_survaverager.Rd @@ -47,6 +47,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, \code{\link{mlr_graphs_survtoregr}} diff --git a/man/mlr_graphs_survbagging.Rd b/man/mlr_graphs_survbagging.Rd index 3b9eaba82..c6ceb8f6e 100644 --- a/man/mlr_graphs_survbagging.Rd +++ b/man/mlr_graphs_survbagging.Rd @@ -78,6 +78,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survtoclassif_disctime}}, \code{\link{mlr_graphs_survtoregr}} diff --git a/man/mlr_graphs_survtoclassif_disctime.Rd b/man/mlr_graphs_survtoclassif_disctime.Rd index c3b3e64c3..d1c09d1b3 100644 --- a/man/mlr_graphs_survtoclassif_disctime.Rd +++ b/man/mlr_graphs_survtoclassif_disctime.Rd @@ -84,6 +84,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoregr}} diff --git a/man/mlr_graphs_survtoregr.Rd b/man/mlr_graphs_survtoregr.Rd index ef9d99047..746725f5d 100644 --- a/man/mlr_graphs_survtoregr.Rd +++ b/man/mlr_graphs_survtoregr.Rd @@ -28,7 +28,7 @@ Regression learner to fit to the transformed \link[mlr3:TaskRegr]{TaskRegr}. If \code{NULL} in method \code{2}, then \code{regr_learner} must have \code{se} predict_type.} \item{distrcompose}{\code{logical(1)}\cr -For methods \code{1} and \code{3} if \code{TRUE} (default) then \link{PipeOpDistrCompositor} is utilised to +For method \code{3} if \code{TRUE} (default) then \link{PipeOpDistrCompositor} is utilised to transform the deterministic predictions to a survival distribution.} \item{distr_estimator}{\link{LearnerSurv}\cr @@ -79,8 +79,6 @@ Three reduction strategies are implemented, these are: \item A \link{LearnerRegr} is fit and predicted on the new \code{TaskRegr}. \item \link{PipeOpPredRegrSurv} transforms the resulting \link[mlr3:PredictionRegr]{PredictionRegr} to \link{PredictionSurv}. -\item Optionally: \link{PipeOpDistrCompositor} is used to compose a \code{distr} predict_type from the -predicted \code{response} predict_type. } \item Survival to Probabilistic Regression \enumerate{ @@ -134,7 +132,6 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { "survtoregr", method = 1, regr_learner = lrn("regr.featureless"), - distrcompose = TRUE, survregr_params = list(method = "delete") ) pipe$train(task) @@ -169,6 +166,7 @@ Other pipelines: \code{\link{mlr_graphs_crankcompositor}}, \code{\link{mlr_graphs_distrcompositor}}, \code{\link{mlr_graphs_probregr}}, +\code{\link{mlr_graphs_responsecompositor}}, \code{\link{mlr_graphs_survaverager}}, \code{\link{mlr_graphs_survbagging}}, \code{\link{mlr_graphs_survtoclassif_disctime}} diff --git a/man/mlr_measures_surv.cindex.Rd b/man/mlr_measures_surv.cindex.Rd index f4fc2880d..5697b9764 100644 --- a/man/mlr_measures_surv.cindex.Rd +++ b/man/mlr_measures_surv.cindex.Rd @@ -92,7 +92,7 @@ Weighting applied to tied rankings, default is to give them half (0.5) weighting library(mlr3) task = tsk("rats") learner = lrn("surv.coxph") -part = partition(task) # train/test split, stratified on `status` by default +part = partition(task) # train/test split learner$train(task, part$train) p = learner$predict(task, part$test) diff --git a/man/mlr_measures_surv.rcll.Rd b/man/mlr_measures_surv.rcll.Rd index 405555ff9..1a1a4b47a 100644 --- a/man/mlr_measures_surv.rcll.Rd +++ b/man/mlr_measures_surv.rcll.Rd @@ -13,6 +13,10 @@ The RCLL, in the context of probabilistic predictions, is defined by where \eqn{\Delta} is the censoring indicator, \eqn{f} the probability density function and \eqn{S} the survival function. RCLL is proper given that censoring and survival distribution are independent, see Rindt et al. (2022). + +\strong{Note}: Even though RCLL is a proper scoring rule, the calculation of \eqn{f(t)} (which in our case is discrete, i.e. it is a \emph{probability mass function}) for time points in the test set that don't exist in the predicted survival matrix (\code{distr}), results in 0 values, which are substituted by \code{"eps"} in our implementation, therefore skewing the result towards \eqn{-log(eps)}. +This problem is also discussed in Rindt et al. (2022), where the authors perform interpolation to get non-zero values for the \eqn{f(t)}. +Until this is handled in \code{mlr3proba} some way, we advise against using this measure for model evaluation. } \section{Dictionary}{ diff --git a/man/mlr_pipeops_compose_breslow_distr.Rd b/man/mlr_pipeops_compose_breslow_distr.Rd index 211ba6adf..c8f3138fa 100644 --- a/man/mlr_pipeops_compose_breslow_distr.Rd +++ b/man/mlr_pipeops_compose_breslow_distr.Rd @@ -75,7 +75,8 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other survival compositors: \code{\link{mlr_pipeops_crankcompose}}, -\code{\link{mlr_pipeops_distrcompose}} +\code{\link{mlr_pipeops_distrcompose}}, +\code{\link{mlr_pipeops_responsecompose}} } \concept{survival compositors} \section{Super class}{ diff --git a/man/mlr_pipeops_crankcompose.Rd b/man/mlr_pipeops_crankcompose.Rd index f6c931a42..adedaf7f7 100644 --- a/man/mlr_pipeops_crankcompose.Rd +++ b/man/mlr_pipeops_crankcompose.Rd @@ -21,14 +21,11 @@ po("crankcompose") \section{Input and Output Channels}{ -\link{PipeOpCrankCompositor} has one input channel named "input", which takes -\code{NULL} during training and \link{PredictionSurv} during prediction. +\link{PipeOpCrankCompositor} has one input channel named \code{"input"}, which takes \code{NULL} during training and \link{PredictionSurv} during prediction. -\link{PipeOpCrankCompositor} has one output channel named "output", producing \code{NULL} during training -and a \link{PredictionSurv} during prediction. +\link{PipeOpCrankCompositor} has one output channel named \code{"output"}, producing \code{NULL} during training and a \link{PredictionSurv} during prediction. -The output during prediction is the \link{PredictionSurv} from the "pred" input but with the \code{crank} -predict type overwritten by the given estimation method. +The output during prediction is the \link{PredictionSurv} from the input but with the \code{crank} predict type overwritten by the given estimation method. } \section{State}{ @@ -41,44 +38,24 @@ The \verb{$state} is left empty (\code{list()}). \itemize{ \item \code{method} :: \code{character(1)} \cr Determines what method should be used to produce a continuous ranking from the distribution. -One of \code{sum_haz}, \code{median}, \code{mode}, or \code{mean} corresponding to the -respective functions in the predicted survival distribution. Note that -for models with a proportional hazards form, the ranking implied by -\code{mean} and \code{median} will be identical (but not the value of \code{crank} -itself). \code{sum_haz} (default) uses \code{\link[survivalmodels:surv_to_risk]{survivalmodels::surv_to_risk()}}. -\item \code{which} :: \code{numeric(1)}\cr -If \code{method = "mode"} then specifies which mode to use if multi-modal, default is the first. -\item \code{response} :: \code{logical(1)}\cr -If \code{TRUE} then the \code{response} predict type is estimated with the same values as \code{crank}. +Currently only \code{mort} is supported, which is the sum of the cumulative hazard, also called \emph{expected/ensemble mortality}, see Ishwaran et al. (2008). +For more details, see \code{\link[=get_mortality]{get_mortality()}}. \item \code{overwrite} :: \code{logical(1)} \cr -If \code{FALSE} (default) then if the "pred" input already has a \code{crank}, the compositor only -composes a \code{response} type if \code{response = TRUE} and does not already exist. If \code{TRUE} then -both the \code{crank} and \code{response} are overwritten. +If \code{FALSE} (default) and the prediction already has a \code{crank} prediction, then the compositor returns the input prediction unchanged. +If \code{TRUE}, then the \code{crank} will be overwritten. } } -\section{Internals}{ - -The \code{median}, \code{mode}, or \code{mean} will use analytical expressions if possible but if not they are -calculated using methods from \link{distr6}. \code{mean} requires \CRANpkg{cubature}. -} - \examples{ \dontrun{ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { - library(mlr3) library(mlr3pipelines) task = tsk("rats") - learn = lrn("surv.coxph")$train(task)$predict(task) - poc = po("crankcompose", param_vals = list(method = "sum_haz")) - poc$predict(list(learn))[[1]] - - if (requireNamespace("cubature", quietly = TRUE)) { - learn = lrn("surv.coxph")$train(task)$predict(task) - poc = po("crankcompose", param_vals = list(method = "sum_haz")) - poc$predict(list(learn))[[1]] - } + # change the crank prediction type of a Cox's model predictions + pred = lrn("surv.coxph")$train(task)$predict(task) + poc = po("crankcompose", param_vals = list(overwrite = TRUE)) + poc$predict(list(pred))[[1L]] } } } @@ -87,7 +64,8 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other survival compositors: \code{\link{mlr_pipeops_compose_breslow_distr}}, -\code{\link{mlr_pipeops_distrcompose}} +\code{\link{mlr_pipeops_distrcompose}}, +\code{\link{mlr_pipeops_responsecompose}} } \concept{survival compositors} \section{Super class}{ diff --git a/man/mlr_pipeops_distrcompose.Rd b/man/mlr_pipeops_distrcompose.Rd index b5ac7b05e..5d43c3504 100644 --- a/man/mlr_pipeops_distrcompose.Rd +++ b/man/mlr_pipeops_distrcompose.Rd @@ -5,18 +5,14 @@ \alias{PipeOpDistrCompositor} \title{PipeOpDistrCompositor} \description{ -Estimates (or 'composes') a survival distribution from a predicted baseline \code{distr} and a -\code{crank} or \code{lp} from two \link{PredictionSurv}s. +Estimates (or 'composes') a survival distribution from a predicted baseline +survival distribution (\code{distr}) and a linear predictor (\code{lp}) from two \link{PredictionSurv}s. Compositor Assumptions: \itemize{ \item The baseline \code{distr} is a discrete estimator, e.g. \link[=LearnerSurvKaplan]{surv.kaplan}. \item The composed \code{distr} is of a linear form -\item If \code{lp} is missing then \code{crank} is equivalent } - -These assumptions are strong and may not be reasonable. Future updates will upgrade this -compositor to be more flexible. } \section{Dictionary}{ @@ -32,15 +28,16 @@ po("distrcompose") \section{Input and Output Channels}{ -\link{PipeOpDistrCompositor} has two input channels, "base" and "pred". Both input channels take -\code{NULL} during training and \link{PredictionSurv} during prediction. +\link{PipeOpDistrCompositor} has two input channels, \code{"base"} and \code{"pred"}. +Both input channels take \code{NULL} during training and \link{PredictionSurv} during prediction. -\link{PipeOpDistrCompositor} has one output channel named "output", producing \code{NULL} during training -and a \link{PredictionSurv} during prediction. +\link{PipeOpDistrCompositor} has one output channel named \code{"output"}, producing +\code{NULL} during training and a \link{PredictionSurv} during prediction. -The output during prediction is the \link{PredictionSurv} from the "pred" input but with an extra -(or overwritten) column for \code{distr} predict type; which is composed from the \code{distr} of "base" -and \code{lp} or \code{crank} of "pred". +The output during prediction is the \link{PredictionSurv} from the \code{"pred"} input +but with an extra (or overwritten) column for the \code{distr} predict type; which +is composed from the \code{distr} of \code{"base"} and the \code{lp} of \code{"pred"}. +If no \code{lp} predictions have been made or exist, then the \code{"pred"} is returned unchanged. } \section{State}{ @@ -58,8 +55,8 @@ accelerated-failure time, \code{aft}, proportional hazards, \code{ph}, or propor Default \code{aft}. \item \code{overwrite} :: \code{logical(1)} \cr If \code{FALSE} (default) then if the "pred" input already has a \code{distr}, the compositor does -nothing and returns the given \link{PredictionSurv}. If \code{TRUE} then the \code{distr} is overwritten -with the \code{distr} composed from \code{lp}/\code{crank} - this is useful for changing the prediction +nothing and returns the given \link{PredictionSurv}. If \code{TRUE}, then the \code{distr} is overwritten +with the \code{distr} composed from \code{lp} - this is useful for changing the prediction \code{distr} from one model form to another. } } @@ -71,8 +68,7 @@ The respective \code{form}s above have respective survival distributions: \deqn{ph: S(t) = S_0(t)^{exp(lp)}}{ph: S(t) = S0(t)^exp(lp)} \deqn{po: S(t) = \frac{S_0(t)}{exp(-lp) + (1-exp(-lp)) S_0(t)}}{po: S(t) = S0(t) / [exp(-lp) + S0(t) (1-exp(-lp))]} # nolint where \eqn{S_0}{S0} is the estimated baseline survival distribution, and \eqn{lp} is the -predicted linear predictor. If the input model does not predict a linear predictor then \code{crank} -is assumed to be the \code{lp} - \strong{this may be a strong and unreasonable assumption.} +predicted linear predictor. } \examples{ @@ -84,6 +80,7 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { base = lrn("surv.kaplan")$train(task)$predict(task) pred = lrn("surv.coxph")$train(task)$predict(task) + # let's change the distribution prediction of Cox (Breslow-based) to an AFT form: pod = po("distrcompose", param_vals = list(form = "aft", overwrite = TRUE)) pod$predict(list(base = base, pred = pred))[[1]] } @@ -94,7 +91,8 @@ if (requireNamespace("mlr3pipelines", quietly = TRUE)) { Other survival compositors: \code{\link{mlr_pipeops_compose_breslow_distr}}, -\code{\link{mlr_pipeops_crankcompose}} +\code{\link{mlr_pipeops_crankcompose}}, +\code{\link{mlr_pipeops_responsecompose}} } \concept{survival compositors} \section{Super class}{ diff --git a/man/mlr_pipeops_responsecompose.Rd b/man/mlr_pipeops_responsecompose.Rd new file mode 100644 index 000000000..dea59706c --- /dev/null +++ b/man/mlr_pipeops_responsecompose.Rd @@ -0,0 +1,166 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpResponseCompositor.R +\name{mlr_pipeops_responsecompose} +\alias{mlr_pipeops_responsecompose} +\alias{PipeOpResponseCompositor} +\title{PipeOpResponseCompositor} +\description{ +Uses a predicted survival distribution (\code{distr}) in a \link{PredictionSurv} to estimate (or 'compose') an expected survival time (\code{response}) prediction. +Practically, this \code{PipeOp} summarizes an observation's survival curve/distribution to a single number which can be either the restricted mean survival time or the median survival time. +} +\section{Dictionary}{ + +This \link[mlr3pipelines:PipeOp]{PipeOp} can be instantiated via the +\link[mlr3misc:Dictionary]{dictionary} \link[mlr3pipelines:mlr_pipeops]{mlr3pipelines::mlr_pipeops} or with the associated sugar +function \code{\link[mlr3pipelines:po]{mlr3pipelines::po()}}: + +\if{html}{\out{