Skip to content

Commit

Permalink
feat: support for BIRCH and BICO learners (#70)
Browse files Browse the repository at this point in the history
* feat: bico and birch learner from stream package

* tests: add autotests for birch and bico learners

* fix: set birch to hierarchical property instead of partitional

* docs(readme): rebuild readme

* tests: more tests and news entry and removal of logical for bico and birch learner

* docs: revert line removal in hdbscan roxygen
  • Loading branch information
m-muecke authored May 2, 2024
1 parent ae9b4c4 commit b3a6055
Show file tree
Hide file tree
Showing 33 changed files with 610 additions and 1 deletion.
3 changes: 3 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Suggests:
mclust,
mlbench,
RWeka,
stream,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Encoding: UTF-8
Expand All @@ -47,6 +48,8 @@ Collate:
'aaa.R'
'LearnerClustAffinityPropagation.R'
'LearnerClustAgnes.R'
'LearnerClustBICO.R'
'LearnerClustBIRCH.R'
'LearnerClustCMeans.R'
'LearnerClustCobweb.R'
'LearnerClustDBSCAN.R'
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ S3method(is_missing_prediction_data,PredictionDataClust)
export(LearnerClust)
export(LearnerClustAP)
export(LearnerClustAgnes)
export(LearnerClustBICO)
export(LearnerClustBIRCH)
export(LearnerClustCMeans)
export(LearnerClustCobweb)
export(LearnerClustDBSCAN)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# mlr3cluster (development version)

* Add BIRCH learner from 'stream' package
* Add BICO learner from 'stream' package

# mlr3cluster 0.1.9

* Add DBSCAN learner from 'fpc' package
Expand Down
66 changes: 66 additions & 0 deletions R/LearnerClustBICO.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#' @title BICO Clustering Learner
#'
#' @name mlr_learners_clust.bico
#'
#' @description
#' BICO (Fast computation of k-means coresets in a data stream) clustering.
#' Calls [stream::DSC_BICO()] from \CRANpkg{stream}.
#'
#' @templateVar id clust.bico
#' @template learner
#'
#' @references
#' `r format_bib("fichtenberger2013bico", "hahsler2017stream")`
#'
#' @export
#' @template seealso_learner
#' @template example
LearnerClustBICO = R6Class("LearnerClustBICO",
  inherit = LearnerClust,
  public = list(
    #' @description
    #' Creates a new instance of this [R6][R6::R6Class] class.
    initialize = function() {
      super$initialize(
        id = "clust.bico",
        feature_types = c("integer", "numeric"),
        predict_types = "partition",
        # Every hyperparameter carries the "train" tag, so all of them are
        # handed to stream::DSC_BICO() when the learner is trained.
        param_set = ps(
          k = p_int(1L, default = 5L, tags = "train"),
          space = p_int(1L, default = 10L, tags = "train"),
          p = p_int(1L, default = 10L, tags = "train"),
          iterations = p_int(1L, default = 10L, tags = "train")
        ),
        properties = c("partitional", "exclusive", "complete"),
        packages = "stream",
        man = "mlr3cluster::mlr_learners_clust.bico",
        label = "BICO Clustering"
      )
    }
  ),
  private = list(
    # Builds a BICO clusterer from the "train"-tagged hyperparameters, feeds it
    # every row of the task and returns the clusterer as the model.
    .train = function(task) {
      train_args = self$param_set$get_values(tags = "train")
      features = task$data()
      clusterer = invoke(stream::DSC_BICO, .args = train_args)
      # Wrap the task data as an in-memory stream so the clusterer can read it.
      data_stream = stream::DSD_Memory(features)
      # update() is called for its effect on `clusterer`; its return value is
      # not used.
      stats::update(clusterer, data_stream, n = nrow(features))

      if (self$save_assignments) {
        # The first element of the predict() result is taken as the
        # per-observation cluster assignment.
        fitted = invoke(predict, clusterer, newdata = features)
        self$assignments = as.integer(fitted[[1L]])
      }

      clusterer
    },

    # Assigns every observation of `task` to a cluster using the stored model
    # and wraps the integer partition in a PredictionClust.
    .predict = function(task) {
      predicted = invoke(predict, self$model, newdata = task$data())
      PredictionClust$new(task = task, partition = as.integer(predicted[[1L]]))
    }
  )
)

# Register the learner class under its id "clust.bico" in `learners`.
# `learners` is not defined in this file; presumably it comes from aaa.R, which
# the @include tag below asks roxygen to collate before this file.
#' @include aaa.R
learners[["clust.bico"]] = LearnerClustBICO
67 changes: 67 additions & 0 deletions R/LearnerClustBIRCH.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#' @title BIRCH Clustering Learner
#'
#' @name mlr_learners_clust.birch
#'
#' @description
#' BIRCH (Balanced Iterative Reducing Clustering using Hierarchies) clustering.
#' Calls [stream::DSC_BIRCH()] from \CRANpkg{stream}.
#'
#' @templateVar id clust.birch
#' @template learner
#'
#' @references
#' `r format_bib("zhang1996birch", "zhang1997birch", "hahsler2017stream")`
#'
#' @export
#' @template seealso_learner
#' @template example
LearnerClustBIRCH = R6Class("LearnerClustBIRCH",
  inherit = LearnerClust,
  public = list(
    #' @description
    #' Creates a new instance of this [R6][R6::R6Class] class.
    initialize = function() {
      super$initialize(
        id = "clust.birch",
        feature_types = c("integer", "numeric"),
        predict_types = "partition",
        # threshold, branching and maxLeaf are tagged "required" and have no
        # default, so they must be set before training. All parameters carry
        # the "train" tag and are handed to stream::DSC_BIRCH().
        param_set = ps(
          threshold = p_dbl(0L, tags = c("train", "required")),
          branching = p_int(1L, tags = c("train", "required")),
          maxLeaf = p_int(1L, tags = c("train", "required")),
          maxMem = p_int(0L, default = 0L, tags = "train"),
          outlierThreshold = p_dbl(default = 0.25, tags = "train")
        ),
        properties = c("hierarchical", "exclusive", "complete"),
        packages = "stream",
        man = "mlr3cluster::mlr_learners_clust.birch",
        label = "BIRCH Clustering"
      )
    }
  ),
  private = list(
    # Builds a BIRCH clusterer from the "train"-tagged hyperparameters, feeds
    # it every row of the task and returns the clusterer as the model.
    .train = function(task) {
      train_args = self$param_set$get_values(tags = "train")
      features = task$data()
      clusterer = invoke(stream::DSC_BIRCH, .args = train_args)
      # Wrap the task data as an in-memory stream so the clusterer can read it.
      data_stream = stream::DSD_Memory(features)
      # update() is called for its effect on `clusterer`; its return value is
      # not used.
      stats::update(clusterer, data_stream, n = nrow(features))

      if (self$save_assignments) {
        # The first element of the predict() result is taken as the
        # per-observation cluster assignment.
        fitted = invoke(predict, clusterer, newdata = features)
        self$assignments = as.integer(fitted[[1L]])
      }

      clusterer
    },

    # Assigns every observation of `task` to a cluster using the stored model
    # and wraps the integer partition in a PredictionClust.
    .predict = function(task) {
      predicted = invoke(predict, self$model, newdata = task$data())
      PredictionClust$new(task = task, partition = as.integer(predicted[[1L]]))
    }
  )
)

# Register the learner class under its id "clust.birch" in `learners`.
# `learners` is not defined in this file; presumably it comes from aaa.R, which
# the @include tag below asks roxygen to collate before this file.
#' @include aaa.R
learners[["clust.birch"]] = LearnerClustBIRCH
37 changes: 37 additions & 0 deletions R/bibentries.R
Original file line number Diff line number Diff line change
Expand Up @@ -320,5 +320,42 @@ bibentries = c( # nolint start
pages = "274--295",
year = "2014",
doi = "10.1007/s00357-014-9161-z"
),
hahsler2017stream = bibentry("article",
title = "Introduction to {stream}: An Extensible Framework for Data Stream Clustering Research with {R}",
author = "Michael Hahsler and Matthew Bola\u00f1os and John Forrest",
journal = "Journal of Statistical Software",
year = "2017",
volume = "76",
number = "14",
pages = "1--50",
doi = "10.18637/jss.v076.i14",
),
fichtenberger2013bico = bibentry("inproceedings",
title = "BICO: BIRCH Meets Coresets for k-Means Clustering",
author = "Fichtenberger, Hendrik and Gille, Marc and Schmidt, Melanie and Schwiegelshohn, Chris and Sohler, Christian",
booktitle = "Algorithms--ESA 2013: 21st Annual European Symposium, Sophia Antipolis, France, September 2-4, 2013. Proceedings 21",
pages = "481--492",
year = "2013",
organization = "Springer"
),
zhang1996birch = bibentry("article",
title = "BIRCH: An Efficient Data Clustering Method for Very Large Databases",
author = "Zhang, Tian and Ramakrishnan, Raghu and Livny, Miron",
journal = "ACM sigmod record",
volume = "25",
number = "2",
pages = "103--114",
year = "1996",
publisher = "ACM New York, NY, USA"
),
zhang1997birch = bibentry("article",
title = "BIRCH: A new data clustering algorithm and its applications",
author = "Zhang, Tian and Ramakrishnan, Raghu and Livny, Miron",
journal = "Data Mining and Knowledge Discovery",
volume = "1",
pages = "141--182",
year = "1997",
publisher = "Springer"
)
) # nolint end
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pak::pak("mlr-org/mlr3cluster")

The current version of **mlr3cluster** contains:

- A selection of 22 clustering learners that represent a wide variety of
- A selection of 24 clustering learners that represent a wide variety of
clusterers: partitional, hierarchical, fuzzy, etc.
- A selection of 4 performance measures
- Two built-in tasks to get started with clustering
Expand All @@ -60,6 +60,8 @@ create great visualizations with just one line of code!
| [clust.SimpleKMeans](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.SimpleKMeans) | K-Means (Weka) | [RWeka](https://cran.r-project.org/package=RWeka) |
| [clust.agnes](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.agnes) | Agglomerative Hierarchical Clustering | [cluster](https://cran.r-project.org/package=cluster) |
| [clust.ap](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.ap) | Affinity Propagation Clustering | [apcluster](https://cran.r-project.org/package=apcluster) |
| [clust.bico](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.bico) | BICO Clustering | [stream](https://cran.r-project.org/package=stream) |
| [clust.birch](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.birch) | BIRCH Clustering | [stream](https://cran.r-project.org/package=stream) |
| [clust.cmeans](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.cmeans) | Fuzzy C-Means Clustering Learner | [e1071](https://cran.r-project.org/package=e1071) |
| [clust.cobweb](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.cobweb) | Cobweb Clustering | [RWeka](https://cran.r-project.org/package=RWeka) |
| [clust.dbscan](https://mlr3cluster.mlr-org.com/reference/mlr_learners_clust.dbscan) | Density-Based Clustering | [dbscan](https://cran.r-project.org/package=dbscan) |
Expand Down
2 changes: 2 additions & 0 deletions man/mlr_learners_clust.MBatchKMeans.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_learners_clust.SimpleKMeans.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_learners_clust.agnes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_learners_clust.ap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b3a6055

Please sign in to comment.