
Preprocess to remove correlated features #313

Closed
missuse opened this issue Feb 7, 2020 · 15 comments
Labels
Status: Needs Discussion · Status: Needs Docs · Type: New PipeOp

Comments


missuse commented Feb 7, 2020

Hi,

I felt compelled to continue the discussion (mlr-org/mlr3filters#61) .

I tried to write a function that removes correlated features, similar to findCorrelation(), but without looking at what that function does internally, so the implementations would not end up too similar.

I came up with this

findCor <- function(x, threshold = 0.9, method = "pearson"){
  cor_mat <- cor(x, method = method)
  cor_mat <- abs(cor_mat)
  diag(cor_mat) <- 0
  drop_features <- vector("character", 0)
  repeat{
    cor_cut <- cor_mat >= threshold
    cor_sums <- colSums(cor_cut, na.rm = TRUE)
    if(sum(cor_sums) == 0){
      break
    } else {
      cand <- which(cor_sums == max(cor_sums, na.rm = TRUE))
      max_cor <- apply(cor_mat[,cand, drop = FALSE], 2, max, na.rm = TRUE)
      mean_cor <- colMeans(cor_mat[,cand, drop = FALSE])
      max_which <- which(max_cor == max(max_cor))
      rem_i <- cand[max_which][which.max(mean_cor[max_which])] #not the best choice of variable names 
    cor_mat <- cor_mat[-rem_i, -rem_i, drop = FALSE] # keep matrix structure even at 1x1
      drop_features <- c(drop_features, names(rem_i))
    }
  }
  return(drop_features)
}

It can most likely be simplified, but that is beside the point. The results of testing were interesting to me.

library(mlbench)
data(Sonar)

Sonari <- Sonar[,1:60]

names_mine <- findCor(Sonari, threshold = 0.6)
names_caret <- caret::findCorrelation(cor(Sonari),
                                      cutoff = 0.6,
                                      names = TRUE,
                                      exact = TRUE)
names_caret_fast <- caret::findCorrelation(cor(Sonari),
                                           cutoff = 0.6,
                                           names = TRUE,
                                           exact = FALSE)


length(names_mine)
[1] 32
length(names_caret)
[1] 37
length(names_caret_fast)
[1] 38

findCor() identifies 32 features to be removed in this data set while findCorrelation() with exact = TRUE finds 37 features, and with exact = FALSE finds 38 features. This trend appears to be present at different threshold values.

The correlation of the remaining features

test_cor <- abs(cor(Sonari[,!colnames(Sonari) %in% names_mine]))
diag(test_cor) <- 0
max(test_cor)
[1] 0.5974416
caret_cor <- abs(cor(Sonari[,!colnames(Sonari) %in% names_caret]))
diag(caret_cor) <- 0
max(caret_cor)
[1] 0.5850103
caret_fast_cor <- abs(cor(Sonari[,!colnames(Sonari) %in% names_caret_fast ]))
diag(caret_fast_cor) <- 0
max(caret_fast_cor)
[1] 0.5970527

findCor() keeps more features while still maintaining the maximum allowed absolute correlation.

Comparison with step_corr()

library(recipes)
rec <- recipe(Class ~ ., data = Sonar)

corr_filter <- rec %>%
  step_corr(all_predictors(), threshold = .6)

filter_obj <- prep(corr_filter, training = Sonar)

filtered_te <- bake(filter_obj, Sonar)

ncol(filtered_te) # this includes the Class target column
[1] 23

22 out of the 60 features are kept, so this removes the same number of features (38) as findCorrelation() with exact = FALSE.

library(microbenchmark)

microbenchmark(mine = findCor(Sonari, threshold = 0.6),
               exact = caret::findCorrelation(cor(Sonari),
                                              cutoff = 0.6,
                                              names = TRUE,
                                              exact = TRUE),
               fast = caret::findCorrelation(cor(Sonari),
                                             cutoff = 0.6,
                                             names = TRUE,
                                             exact = FALSE))
Unit: microseconds
  expr    min      lq     mean  median      uq     max neval
  mine 2667.2 2748.90 3226.600 2779.35 2824.95 35978.4   100
 exact 3859.2 3909.85 4307.806 3982.75 4054.95 11987.7   100
  fast  951.5  970.45  991.171  980.70 1001.80  1344.5   100

The function as a PipeOp:

PipeOPdropCor <- R6::R6Class("PipeOPdropCor",
                         inherit = mlr3pipelines::PipeOpTaskPreproc,
                         public = list(
                           initialize = function(id = "dropCor", param_vals = list()) {
                             ps = paradox::ParamSet$new(params = list(
                               paradox::ParamFct$new("method",
                                                     default = "pearson",
                                                     levels = c("pearson", "kendall", "spearman"),
                                                     tags = c("train", "dropCor")),
                               paradox::ParamDbl$new("threshold",
                                                     default = 0.9,
                                                     lower = 0,
                                                     upper = 1,
                                                     tags = c("train", "dropCor"))
                             ))
                             super$initialize(id, param_set = ps, param_vals = param_vals)
                           },
                           
                           select_cols = function(task) {
                             task$feature_types[get("type") %in% c("numeric", "integer"), get("id")]
                           },
                           
                           train_dt = function(dt, levels, target) {
                             cor_mat = cor(dt, method = self$param_set$values$method)
                             cor_mat <- abs(cor_mat)
                             diag(cor_mat) <- 0
                             drop_features <- vector("character", 0)
                             repeat{
                               cor_cut <- cor_mat >= self$param_set$values$threshold
                               cor_sums <- colSums(cor_cut, na.rm = TRUE)
                               if(sum(cor_sums) == 0){
                                 break
                               } else {
                                 cand <- which(cor_sums == max(cor_sums, na.rm = TRUE))
                                 max_cor <- apply(cor_mat[,cand, drop = FALSE], 2, max, na.rm = TRUE)
                                 mean_cor <- colMeans(cor_mat[,cand, drop = FALSE])
                                 max_which <- which(max_cor == max(max_cor))
                                 rem_i <- cand[max_which][which.max(mean_cor[max_which])]
                                  cor_mat <- cor_mat[-rem_i, -rem_i, drop = FALSE]
                                  drop_features <- c(drop_features, names(rem_i))
                                }
                              }
                              # store under our own name: PipeOpTaskPreproc manages
                              # a dt_columns entry in $state itself
                              self$state = list(drop_features = drop_features)
                              if (length(drop_features)) dt[, -drop_features, with = FALSE] else dt
                            },
                            
                            predict_dt = function(dt, levels) {
                              drop_features <- self$state$drop_features
                              if (length(drop_features)) dt[, -drop_features, with = FALSE] else dt
                            }
                         )
)

I apologize for the rant.

If you like it I can submit a PR.

All the best,

Milan


pfistfl commented Feb 7, 2020

Hey,

have you looked into https://github.com/topepo/caret/blob/master/pkg/caret/R/findCorrelation.R?
I have not fully understood the code from quickly looking at it, but it might help.

Other than that, gladly create a PR.
I think in general it would be cool if we could recreate a known and well-tested function as exactly as possible.

Best,

Florian


missuse commented Feb 7, 2020

I did look at the code for caret:::findCorrelation_exact. I understand what is happening, but I still don't follow the logic behind it. So much attention is given to the mean of each variable's correlations with the others.

Additionally, I think the optimal solution to this problem would keep the maximal number of features while maintaining each pairwise correlation below the threshold. I mean, if this were not the case, one could just omit features at random until each pairwise correlation is below the threshold.

If the objective is to acquire close to the largest possible subset of features with the mentioned pairwise correlation constraint then findCor() does a better job compared to caret::findCorrelation().

This does not mean I think findCor() is good, this just means I think I am missing something.
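For what it's worth, that objective is a maximum-subset search, which any greedy removal scheme can only approximate. A brute-force sketch (my own illustration on a toy matrix, not code from this thread; feasible only for very small p) makes the target explicit:

```r
# Find the largest feature subset whose pairwise |correlations| all stay
# below the threshold, by enumerating subsets from largest to smallest.
best_subset <- function(cor_mat, threshold = 0.9) {
  p <- ncol(cor_mat)
  for (k in p:1) {
    for (idx in combn(p, k, simplify = FALSE)) {
      sub <- abs(cor_mat[idx, idx, drop = FALSE])
      diag(sub) <- 0
      if (max(sub) < threshold) return(colnames(cor_mat)[idx])
    }
  }
  character(0)
}

# toy matrix: a-b and b-c are highly correlated, a-c is not
cm <- matrix(c(1, .7, .1,
               .7, 1, .7,
               .1, .7, 1), 3, 3,
             dimnames = list(letters[1:3], letters[1:3]))
best_subset(cm, threshold = 0.6)  # "a" "c": dropping only "b" suffices
```

A greedy filter that happened to remove "a" or "c" first would end up keeping fewer features, which is exactly the gap between findCor()/findCorrelation() and the optimum.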


mb706 commented Feb 10, 2020

Sorry for being blunt, why isn't this a filter?


mb706 commented Feb 10, 2020

ah ok, findCorrelation gives a mapping threshold -> set of features, but what we would need is that [set of features] is monotonic in [threshold] (but that is the case, right?!), and ideally we want the points of [threshold] where [set of features] changes. I still believe this should be implemented in filters if we have to re-implement this in any case.


pfistfl commented Feb 10, 2020

I guess for reference:
mlr-org/mlr3filters#61


missuse commented Feb 11, 2020

but what we would need is that [set of features] is monotonic in [threshold] (but that is the case, right?!).

This is not the case unfortunately. See mlr-org/mlr3filters#61 (comment).

Currently I am thinking of setting this aside and using another approach altogether, one which would measure association with the target while removing redundancy, like mRMR does (mlr-org/mlr3filters#61 (comment)). Unfortunately mRMR performs worse in my current project compared to, for instance, information gain, which selects fewer features with better trained-learner performance.

So I thought of two simple-to-implement solutions in mlr3filters:

  1. Fast Correlation-Based Filter Solution (Yu and Liu 2003) which would rank the predominant features (as defined in the paper) in order of symmetrical uncertainty with the target, and below them rank the removed features also in order of symmetrical uncertainty with the target.

  2. Ranking features based on the lambda in lasso where their coefficient becomes 0. Do you think this makes sense at all?
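Regarding option 1, the symmetrical-uncertainty score FCBF builds on is easy to sketch in base R. This is my own illustration (not code from the thread), assuming already-discretized features:

```r
# Symmetrical uncertainty SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y)),
# the score FCBF uses both for relevance ranking and redundancy checks.
entropy <- function(x) {
  p <- table(x) / length(x)
  -sum(p * log2(p))
}

sym_uncertainty <- function(x, y) {
  hx <- entropy(x); hy <- entropy(y)
  hxy <- entropy(paste(x, y))   # joint entropy via pasted level combinations
  mi <- hx + hy - hxy           # mutual information
  if (hx + hy == 0) 0 else 2 * mi / (hx + hy)
}

sym_uncertainty(c(1, 1, 2, 2), c(1, 1, 2, 2))  # 1: identical vectors
sym_uncertainty(c(1, 2, 1, 2), c(1, 1, 2, 2))  # 0: independent here
```

SU is symmetric and normalized to [0, 1], which is what makes it usable both against the target (relevance) and between feature pairs (redundancy).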


mb706 commented Feb 12, 2020

Ranking features based on the lambda in lasso where their coefficient becomes 0. Do you think this makes sense at all?

I used this in a project of mine quite successfully. The nicest way to make this work would be to add an $importance slot to the glmnet Learner, then it could work with FilterImportance. I suggested this a while ago: mlr-org/mlr3learners#28
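A sketch of that ranking idea (my own illustration, assuming the glmnet package is available; the $importance wiring into the Learner is not shown): rank features by the largest lambda at which their coefficient is nonzero, i.e. how early they enter the regularization path.

```r
library(glmnet)

# For each feature, find the largest lambda on the lasso path at which
# its coefficient is nonzero; features entering earlier rank higher.
lasso_entry_rank <- function(x, y) {
  fit <- glmnet(x, y)
  beta <- as.matrix(fit$beta)            # features x lambda grid
  entry <- apply(beta != 0, 1, function(nz)
    if (any(nz)) fit$lambda[which(nz)[1]] else 0)
  sort(entry, decreasing = TRUE)         # larger = enters earlier
}

set.seed(42)
x <- matrix(rnorm(200 * 5), 200, 5)
colnames(x) <- paste0("V", 1:5)
y <- 2 * x[, 1] + 0.5 * x[, 3] + rnorm(200)
lasso_entry_rank(x, y)                   # V1 should rank first, then V3
```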


The caret findCorrelation for exact = FALSE filter should be on its way, PR is mlr-org/mlr3filters#62.

The exact = TRUE version does not fit the mlr3filters mold. However, mlr3pipelines has the PipeOpSelect pipeop that removes features depending on a function. This function can be a thin wrapper around findCorrelation:

pose = po("select")
pose$param_set$values$selector = function(task) {
  fn = task$feature_names
  data = task$data(cols = fn)
  drop = caret::findCorrelation(cor(data), cutoff = 0.6, exact = TRUE, names = TRUE)
  setdiff(fn, drop)
}
pose$train(list(tsk("sonar")))

gives

$output
<TaskClassif:sonar> (208 x 24)
* Target: Class
* Properties: twoclass
* Features (23):
  - dbl (23): V1, V10, V12, V17, V22, V24, V28, V31, V34, V37, V40,
    V44, V47, V5, V51, V53, V54, V55, V56, V57, V58, V60, V7

To tune over the cutoff value, however, one would need to use a paradox trafo.

Would it make sense to turn this into its own PipeOp? I feel like there would be many useful ways of removing features that are not filters, so this would result in a whole new zoo of operators.

(Edit: Fixed bug where findCorrelation excluding 0 features broke things)

mb706 added the Status: Needs Discussion and Type: New PipeOp labels Feb 12, 2020

missuse commented Feb 17, 2020

Sorry for the delayed reply. I was traveling with very limited connectivity.

I used this in a project of mine quite successfully. The nicest way to make this work would be to add an $importance slot to the glmnet Learner, then it could work with FilterImportance. I suggested this a while ago: mlr-org/mlr3learners#28

This sounds great. Why isn't it implemented? Is it due to not having a simple solution to handle multi class tasks? Why not just return a column per class and let the user decide on the aggregation?
How would it handle features that are factors? Each factor level would have a coefficient.

The caret findCorrelation for exact = FALSE filter should be on its way, PR is mlr-org/mlr3filters#62.

I like you.

The exact = TRUE version does not fit the mlr3filters mold. However, mlr3pipelines has the PipeOpSelect pipeop that removes features depending on a function. This function can be a thin wrapper around findCorrelation:

pose = po("select")
pose$param_set$values$selector = function(task) {
  fn = task$feature_names
  data = task$data(cols = fn)
  drop = caret::findCorrelation(cor(data), cutoff = 0.6, exact = TRUE)
  fn[-drop]
}
pose$train(list(tsk("sonar")))

gives

$output
<TaskClassif:sonar> (208 x 24)
* Target: Class
* Properties: twoclass
* Features (23):
  - dbl (23): V1, V10, V12, V17, V22, V24, V28, V31, V34, V37, V40,
    V44, V47, V5, V51, V53, V54, V55, V56, V57, V58, V60, V7

To tune over the cutoff value, however, one would need to use a paradox trafo.

I would be grateful if you could provide an example on how to create a selector with tunable parameters. This is mlr3 book material in my opinion.

Would it make sense to turn this into its own PipeOp? I feel like there would be many useful ways of removing features that are not filters, so this would result in a whole new zoo of operators.

There is no need if users can easily create their own tunable selector functions.

mb706 added the Status: Needs Docs label Feb 20, 2020

mb706 commented Feb 20, 2020

I haven't tested the following code, but tuning should work with something similar to this ParamSet:

ps = ParamSet$new(list(ParamDbl$new("cutoff", 0, 1)))
ps$trafo = function(x, param_set) {
  cutoff = x$cutoff
  x$selector = function(task) {
    fn = task$feature_names
    data = task$data(cols = fn)
    drop = caret::findCorrelation(cor(data), cutoff = cutoff, exact = TRUE, names = TRUE)
    setdiff(fn, drop)
  }
  x$cutoff = NULL
  x
}

We will think about if PipeOpSelect should be the canonical tool for this kind of problem (and would add the documentation in that case), or if we find a more elegant solution.

(Edit: bugfixes)


missuse commented Feb 21, 2020

I am not able to get it to work. Anyhow, for now I can just stick to using a custom preprocessing function when I wish to do this sort of thing.

Thanks.


mb706 commented Feb 21, 2020

A full example would be

library("mlr3")
library("mlr3pipelines")
library("paradox")
library("mlr3tuning")
ps = ParamSet$new(list(ParamDbl$new("cutoff", 0, 1)))
ps$trafo = function(x, param_set) {
  cutoff = x$cutoff
  x$select.selector = function(task) {
    fn = task$feature_names
    data = task$data(cols = fn)
    drop = caret::findCorrelation(cor(data), cutoff = cutoff, exact = TRUE, names = TRUE)
    setdiff(fn, drop)
  }
  x$cutoff = NULL
  x
}
pipeline = po("select") %>>% lrn("classif.rpart")
inst = TuningInstance$new(
  task = tsk("iris"),
  learner = pipeline,
  resampling = rsmp("cv"),
  measures = msr("classif.ce"),
  param_set = ps,
  terminator = term("none"),
  # don't need the following line for optimization, this is for
  # demonstration that different features were selected
  bm_args = list(store_models = TRUE)
)
tnr("grid_search")$tune(inst)

Just for demonstration that different cutoff values result in different features being selected, we can run the following to inspect the trained models: (Edit: note this inspects only the trained models of the first CV fold of each evaluated model. The features being excluded depends on the training data seen by the pipeline and may be different in different folds, even at the same cutoff value)

inst$archive(unnest = "tune_x")[order(cutoff),
  list(cutoff, classif.ce,
    featurenames = lapply(resample_result, function(x)
      x$learners[[1]]$model$classif.rpart$train_task$feature_names
  ))]

which gives

       cutoff classif.ce                                      featurenames
 1: 0.0000000 0.26666667                                      Sepal.Length
 2: 0.1111111 0.26000000                          Sepal.Length,Sepal.Width
 3: 0.2222222 0.25333333                          Sepal.Length,Sepal.Width
 4: 0.3333333 0.25333333                          Sepal.Length,Sepal.Width
 5: 0.4444444 0.25333333                          Sepal.Length,Sepal.Width
 6: 0.5555556 0.25333333                          Sepal.Length,Sepal.Width
 7: 0.6666667 0.25333333                          Sepal.Length,Sepal.Width
 8: 0.7777778 0.25333333                          Sepal.Length,Sepal.Width
 9: 0.8888889 0.06666667              Petal.Width,Sepal.Length,Sepal.Width
10: 1.0000000 0.08666667 Petal.Length,Petal.Width,Sepal.Length,Sepal.Width


missuse commented Feb 24, 2020

A full example would be

Thank you.


pfistfl commented Feb 24, 2020

I guess we should either add this as a PipeOp or condense the code above into an mlr3gallery example! Any favorites @mb706 ?


mb706 commented Feb 24, 2020

I don't like either solution, but I think for the time being this should be a documented example. I hope we have some nicer way of doing this at some point, however. Maybe when mlr-org/paradox#231 is solved we could have parameterized selector functions. I don't think this should be a specialized PipeOp, because I believe "do something non-filter-like to select features" is a quite general use case.


pfistfl commented Feb 25, 2020

Added to the gallery here: mlr-org/mlr3gallery#9

pfistfl closed this as completed Feb 25, 2020