diff --git a/Project.toml b/Project.toml index 617dc3d..2b23a07 100644 --- a/Project.toml +++ b/Project.toml @@ -13,12 +13,3 @@ Clustering = "0.14" Distances = "0.9, 0.10" MLJModelInterface = "1.4" julia = "1.6" - -[extras] -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[targets] -test = ["LinearAlgebra", "MLJBase", "Random", "Test"] diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl index 8bd084a..68eae62 100644 --- a/src/MLJClusteringInterface.jl +++ b/src/MLJClusteringInterface.jl @@ -16,7 +16,7 @@ using Distances # =================================================================== ## EXPORTS -export KMeans, KMedoids +export KMeans, KMedoids, DBSCAN # =================================================================== ## CONSTANTS @@ -25,19 +25,14 @@ const MMI = MLJModelInterface const Cl = Clustering const PKG = "MLJClusteringInterface" -#### -#### KMeans -#### + +# # K_MEANS @mlj_model mutable struct KMeans <: MMI.Unsupervised k::Int = 3::(_ ≥ 2) metric::SemiMetric = SqEuclidean() end -#### -#### KMeans -#### - function MMI.fit(model::KMeans, verbosity::Int, X) # NOTE: using transpose here to get a LinearAlgebra.Transpose object # which Kmeans can handle. @@ -66,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X) return MMI.table(X̃, prototype=X) end +# # K_MEDOIDS + @mlj_model mutable struct KMedoids <: MMI.Unsupervised k::Int = 3::(_ ≥ 2) metric::SemiMetric = SqEuclidean() @@ -100,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X) return MMI.table(X̃, prototype=X) end -#### -#### Predict methods -#### + +# # PREDICT FOR K_MEANS AND K_MEDOIDS function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew) locations, cluster_labels = fitresult @@ -124,12 +120,59 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew) return cluster_labels[pred] end -#### -#### METADATA -#### +# # DBSCAN + +@mlj_model mutable struct DBSCAN <: MMI.Static + radius::Real = 1.0::(_ > 0) + leafsize::Int = 20::(_ > 0) + min_neighbors::Int = 1::(_ > 0) + min_cluster_size::Int = 1::(_ > 0) +end + +# As DBSCAN is `Static`, there is no `fit` to implement. + +function MMI.predict(model::DBSCAN, ::Nothing, X) + + Xarray = MMI.matrix(X)' + + # output of core algorithm: + clusters = Cl.dbscan( + Xarray, model.radius; + leafsize=model.leafsize, + min_neighbors=model.min_neighbors, + min_cluster_size=model.min_cluster_size, + ) + nclusters = length(clusters) + + # assignments and point types + npoints = size(Xarray, 2) + assignments = zeros(Int, npoints) + raw_point_types = fill('N', npoints) + for (k, cluster) in enumerate(clusters) + for i in cluster.core_indices + assignments[i] = k + raw_point_types[i] = 'C' + end + for i in cluster.boundary_indices + assignments[i] = k + raw_point_types[i] = 'B' + end + end + point_types = MMI.categorical(raw_point_types) + cluster_labels = unique(assignments) + + yhat = MMI.categorical(assignments) + report = (; point_types, nclusters, cluster_labels, clusters) + return yhat, report +end + +MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,) + + +# # METADATA metadata_pkg.( - (KMeans, KMedoids), + (KMeans, KMedoids, DBSCAN), name="Clustering", uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5", url="https://github.com/JuliaStats/Clustering.jl", @@ -143,7 +186,6 @@ metadata_model( human_name = "K-means clusterer", input = MMI.Table(Continuous), output = MMI.Table(Continuous), - weights = false, path = "$(PKG).KMeans" ) @@ -152,9 +194,18 @@ metadata_model( human_name = "K-medoids clusterer", input = MMI.Table(Continuous), output = MMI.Table(Continuous), - weights = false, path = "$(PKG).KMedoids" ) + +metadata_model( + DBSCAN, + human_name = "DBSCAN clusterer (density-based spatial clustering of "* + "applications with noise)", + input = MMI.Table(Continuous), + path = "$(PKG).DBSCAN" +) + + """ $(MMI.doc_header(KMeans)) @@ -323,6 +374,107 @@ See also """ KMedoids +""" +$(MMI.doc_header(DBSCAN)) -end # module +[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups +together points that are closely packed together (points with many nearby neighbors), +marking as outliers points that lie alone in low-density regions (whose nearest neighbors +are too far away). More information is available at the [Clustering.jl +documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to +get cluster assignments. Point types - core, boundary or noise - are accessed from the +machine report (see below). + +This is a static implementation, i.e., it does not generalize to new data instances, and +there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or +[`KMedoids`](@ref). + +In MLJ or MLJBase, create a machine with + + mach = machine(model) + +# Hyper-parameters + +- `radius=1.0`: query radius. + +- `leafsize=20`: number of points binned in each leaf node of the nearest neighbor k-d + tree. + +- `min_neighbors=1`: minimum number of a core point neighbors. +- `min_cluster_size=1`: minimum number of points in a valid cluster. + + +# Operations + +- `predict(mach, X)`: return cluster label assignments, as an unordered + `CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose + columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that + points of type `noise` will always get a label of `0`. + + +# Report + +After calling `predict(mach)`, the fields of `report(mach)` are: + +- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one + element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'` + (noise). + +- `nclusters`: The number of clusters (excluding the noise "cluster") + +- `cluster_labels`: The unique list of cluster labels + +- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which + have these fields: + + - `size`: number of points in a cluster (core + boundary) + + - `core_indices`: indices of points in the cluster core + + - `boundary_indices`: indices of points on the cluster boundary + + +# Examples + +``` +using MLJ + +X, labels = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters; X +y = map(labels) do label + label == 0 ? "cookie" : "monster" +end; +y = coerce(y, Multiclass); + +DBSCAN = @load DBSCAN pkg=Clustering +model = DBSCAN(radius=0.13, min_cluster_size=5) +mach = machine(model) + +# compute and output cluster assignments for observations in `X`: +yhat = predict(mach, X) + +# get DBSCAN point types: +report(mach).point_types +report(mach).nclusters + +# compare cluster labels with actual labels: +compare = zip(yhat, y) |> collect; +compare[1:10] # clusters align with classes + +# visualize clusters, noise in red: +points = zip(X.x1, X.x2) |> collect +colors = map(yhat) do i + i == 0 ? :red : + i == 1 ? :blue : + i == 2 ? :green : + i == 3 ? :yellow : + :black +end +using Plots +scatter(points, color=colors) +``` + +""" +DBSCAN + +end # module diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..2ca2a1a --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,11 @@ +[deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +MLJTestIntegration = "0.2.2" \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cd2b3d0..e4d7a97 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,20 +3,18 @@ import Distances import LinearAlgebra: norm using MLJBase +using MLJTestIntegration using MLJClusteringInterface -using Random:seed! +using Random: seed! using Test -const Dist = Distances - seed!(132442) X, y = @load_crabs -#### -#### KMEANS -#### -@testset "Kmeans" begin +# # K_MEANS + +@testset "KMeans" begin barekm = KMeans() fitresult, cache, report = fit(barekm, 1, X) R = matrix(transform(barekm, fitresult, X)) @@ -28,25 +26,83 @@ X, y = @load_crabs p = predict(barekm, fitresult, X) @test argmin(R[1, :]) == p[1] @test argmin(R[10, :]) == p[10] - - end -#### -#### KMEDOIDS -#### -@testset "Kmedoids" begin +# # K_MEDOIDS + +@testset "KMedoids" begin barekm = KMedoids() fitresult, cache, report = fit(barekm, 1, X) X_array = matrix(X) R = matrix(transform(barekm, fitresult, X)) - @test R[1, 2] ≈ Dist.evaluate( + @test R[1, 2] ≈ Distances.evaluate( barekm.metric, view(X_array, 1, :), view(fitresult[1], :, 2) ) - @test R[10, 3] ≈ Dist.evaluate( + @test R[10, 3] ≈ Distances.evaluate( barekm.metric, view(X_array, 10, :), view(fitresult[1], :, 3) ) p = predict(barekm, fitresult, X) @test all(report.assignments .== p) end + + +# # DBSCAN + +@testset "DBSCAN" begin + + # five spot pattern + X = [ + 0.0 0.0 + 1.0 0.0 + 1.0 1.0 + 0.0 1.0 + 0.5 0.5 + ] |> MLJBase.table + + # radius < √2 ==> 5 clusters + dbscan = DBSCAN(radius=0.1) + yhat1, report1 = predict(dbscan, nothing, X) + @test report1.nclusters == 5 + @test report1.point_types == fill('B', 5) + @test Set(yhat1) == Set(unique(yhat1)) + @test Set(report1.cluster_labels) == Set(unique(yhat1)) + + # DbscanCluster fields: + @test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices) + + # radius > √2 ==> 1 cluster + dbscan = DBSCAN(radius=√2+eps()) + yhat, report = predict(dbscan, nothing, X) + @test report.nclusters == 1 + @test report.point_types == fill('C', 5) + @test length(unique(yhat)) == 1 + + # radius < √2 && min_cluster_size = 2 ==> all points are noise + dbscan = DBSCAN(radius=0.1, min_cluster_size=2) + yhat, report = predict(dbscan, nothing, X) + @test report.nclusters == 0 + @test report.point_types == fill('N', 5) + @test length(unique(yhat)) == 1 + + # MLJ integration: + model = DBSCAN(radius=0.1) + mach = machine(model) # no training data + yhat = predict(mach, X) + @test yhat == yhat1 + @test MLJBase.report(mach).point_types == report1.point_types + @test MLJBase.report(mach).nclusters == report1.nclusters + +end + +@testset "MLJ interface" begin + models = [KMeans, KMedoids, DBSCAN] + failures, summary = MLJTestIntegration.test( + models, + X; + mod=@__MODULE__, + verbosity=0, + throw=false, # set to true to debug + ) + @test isempty(failures) +end