JuliaAI · ablaom · Jun 6, 2024 · May 22, 2024 · May 30, 2024 · May 30, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,13 +1,14 @@
 name = "MLJ"
 uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
 authors = ["Anthony D. Blaom <[email protected]>"]
-version = "0.20.5"
+version = "0.20.6"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+FeatureSelection = "33837fe5-dbff-4c9e-8c2f-c5612fe2b8b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLJBalancing = "45f359ea-796d-4f51-95a5-deb1a414c586"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
@@ -31,12 +32,13 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 CategoricalArrays = "0.8,0.9, 0.10"
 ComputationalResources = "0.3"
 Distributions = "0.21,0.22,0.23, 0.24, 0.25"
+FeatureSelection = "0.1.1"
 MLJBalancing = "0.1"
 MLJBase = "1"
 MLJEnsembles = "0.4"
 MLJFlow = "0.5"
 MLJIteration = "0.6"
-MLJModels = "0.16"
+MLJModels = "0.17"
 MLJTestIntegration = "0.5.0"
 MLJTuning = "0.8"
 OpenML = "0.2,0.3"
@@ -89,4 +91,41 @@ SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["BetaML", "CatBoost", "EvoLinear", "EvoTrees", "Imbalance", "InteractiveUtils", "LightGBM", "MLJClusteringInterface", "MLJDecisionTreeInterface", "MLJFlux", "MLJGLMInterface", "MLJLIBSVMInterface", "MLJLinearModels", "MLJMultivariateStatsInterface", "MLJNaiveBayesInterface", "MLJScikitLearnInterface", "MLJTSVDInterface", "MLJTestInterface", "MLJTestIntegration", "MLJText", "MLJXGBoostInterface", "Markdown", "NearestNeighborModels", "OneRule", "OutlierDetectionNeighbors", "OutlierDetectionPython", "ParallelKMeans", "PartialLeastSquaresRegressor", "PartitionedLS", "SelfOrganizingMaps", "SIRUS", "SymbolicRegression", "StableRNGs", "Suppressor","Test"]
+test = [
+    "BetaML",
+    "CatBoost",
+    "EvoLinear",
+    "EvoTrees",
+    "Imbalance",
+    "InteractiveUtils",
+    "LightGBM",
+    "MLJClusteringInterface",
+    "MLJDecisionTreeInterface",
+    "MLJFlux",
+    "MLJGLMInterface",
+    "MLJLIBSVMInterface",
+    "MLJLinearModels",
+    "MLJMultivariateStatsInterface",
+    "MLJNaiveBayesInterface",
+    "MLJScikitLearnInterface",
+    "MLJTSVDInterface",
+    "MLJTestInterface",
+    "MLJTestIntegration",
+    "MLJText",
+    "MLJXGBoostInterface",
+    "Markdown",
+    "NearestNeighborModels",
+    "OneRule",
+    "OutlierDetectionNeighbors",
+    "OutlierDetectionPython",
+    "ParallelKMeans",
+    "PartialLeastSquaresRegressor",
+    "PartitionedLS",
+    "SelfOrganizingMaps",
+    "SIRUS",
+    "SymbolicRegression",
+    "StableRNGs",
+    "Suppressor",
+    "Test",
+]
+
diff --git a/docs/ModelDescriptors.toml b/docs/ModelDescriptors.toml
@@ -11,7 +11,9 @@ AutoEncoder_BetaML = ["dimension_reduction"]
 BM25Transformer_MLJText = ["encoders", "text_analysis"]
 BaggingClassifier_MLJScikitLearnInterface = ["classification", "ensemble_models"]
 BaggingRegressor_MLJScikitLearnInterface = ["regression", "ensemble_models"]
-BalancedBaggingClassifier_MLJBalancing = ["class_imbalance", "classification"]
+BalancedBaggingClassifier_MLJBalancing = ["class_imbalance", "classification", "meta_algorithms"]
+BinaryThresholdPredictor_MLJModels = ["meta_algorithms", "classification"]
+BalancedModel_MLJBalancing = ["class_imbalance", "meta_algorithms"]
 BayesianLDA_MultivariateStats = ["dimension_reduction", "classification", "Bayesian_models"]
 BayesianLDA_MLJScikitLearnInterface = ["dimension_reduction", "classification", "Bayesian_models"]
 BayesianQDA_MLJScikitLearnInterface = ["dimension_reduction", "classification", "Bayesian_models"]
@@ -52,6 +54,7 @@ ElasticNetCVRegressor_MLJScikitLearnInterface = ["regression"]
 ElasticNetRegressor_MLJLinearModels = ["regression"]
 ElasticNetRegressor_MLJScikitLearnInterface = ["regression"]
 ENNUndersampler_Imbalance = ["class_imbalance"]
+EnsembleModel_MLJEnsembles = ["ensemble_models", "meta_algorithms"]
 EpsilonSVR_LIBSVM = ["regression"]
 EvoLinearRegressor_EvoLinear = ["regression"]
 EvoTreeClassifier_EvoTrees = ["classification", "ensemble_models", "iterative_models"]
@@ -63,8 +66,8 @@ EvoSplineRegressor_EvoLinear = ["regression", "ensemble_models", "iterative_mode
 ExtraTreesClassifier_MLJScikitLearnInterface = ["classification", "iterative_models"]
 ExtraTreesRegressor_MLJScikitLearnInterface = ["regression", "iterative_models"]
 FactorAnalysis_MultivariateStats = ["dimension_reduction", ]
-FeatureAgglomeration_MLJScikitLearnInterface = ["clustering", "static_models"]
-FeatureSelector_MLJModels = ["dimension_reduction", ]
+FeatureAgglomeration_MLJScikitLearnInterface = ["clustering", "static_models", "feature_engineering"]
+FeatureSelector_FeatureSelection = ["dimension_reduction", "feature_engineering"]
 FillImputer_MLJModels = ["missing_value_imputation", ]
 GaussianMixtureClusterer_BetaML = ["clustering", "distribution_fitter"]
 GaussianMixtureImputer_BetaML = ["missing_value_imputation", "distribution_fitter"]
@@ -88,7 +91,8 @@ ICA_MultivariateStats = ["encoders"]
 IForestDetector_OutlierDetectionPython = ["outlier_detection"]
 ImageClassifier_MLJFlux = ["classification", "image_processing", "iterative_models"]
 INNEDetector_OutlierDetectionPython = ["outlier_detection"]
-InteractionTransformer_MLJModels = ["static_models"]
+InteractionTransformer_MLJModels = ["static_models", "feature_engineering"]
+IteratedModel_MLJIteration = ["iterative_models", "meta_algorithms"]
 KDEDetector_OutlierDetectionPython = ["outlier_detection"]
 KMeansClusterer_BetaML = ["clustering"]
 KMeans_Clustering = ["clustering", "dimension_reduction", ]
@@ -104,7 +108,7 @@ KNeighborsClassifier_MLJScikitLearnInterface = ["classification"]
 KNeighborsRegressor_MLJScikitLearnInterface = ["regression"]
 KPLSRegressor_PartialLeastSquaresRegressor = ["regression"]
 KernelPCA_MultivariateStats = ["dimension_reduction", ]
-KernelPerceptronClassifier_BetaML = ["classification"]
+KernelPerceptronClassifier_BetaML = ["classification", "neural networks"]
 LADRegressor_MLJLinearModels = ["regression"]
 LDA_MultivariateStats = ["classification", "dimension_reduction", ]
 LGBMClassifier_LightGBM = ["classification", "ensemble_models", "iterative_models"]
@@ -146,14 +150,14 @@ MultitargetGaussianMixtureRegressor_BetaML = ["regression", "distribution_fitter
 MultitargetKNNClassifier_NearestNeighborModels = ["classification"]
 MultitargetKNNRegressor_NearestNeighborModels = ["regression"]
 MultitargetLinearRegressor_MultivariateStats = ["regression"]
-MultitargetNeuralNetworkRegressor_BetaML = ["regression"]
-MultitargetNeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models"]
+MultitargetNeuralNetworkRegressor_BetaML = ["regression", "neural networks"]
+MultitargetNeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models", "neural networks"]
 MultitargetRidgeRegressor_MultivariateStats = ["regression"]
 MultitargetSRRegressor_SymbolicRegression = ["regression"]
-NeuralNetworkClassifier_BetaML = ["classification"]
-NeuralNetworkClassifier_MLJFlux = ["classification", "iterative_models"]
-NeuralNetworkRegressor_BetaML = ["regression"]
-NeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models"]
+NeuralNetworkClassifier_BetaML = ["classification", "neural networks"]
+NeuralNetworkClassifier_MLJFlux = ["classification", "iterative_models", "neural networks"]
+NeuralNetworkRegressor_BetaML = ["regression", "neural networks"]
+NeuralNetworkRegressor_MLJFlux = ["regression", "iterative_models", "neural networks"]
 NuSVC_LIBSVM = ["classification"]
 NuSVR_LIBSVM = ["regression"]
 OCSVMDetector_OutlierDetectionPython = ["outlier_detection"]
@@ -171,8 +175,9 @@ PartLS_PartitionedLS = ["regression"]
 PassiveAggressiveClassifier_MLJScikitLearnInterface = ["classification"]
 PassiveAggressiveRegressor_MLJScikitLearnInterface = ["regression"]
 PegasosClassifier_BetaML = ["classification"]
-PerceptronClassifier_BetaML = ["classification", "iterative_models"]
-PerceptronClassifier_MLJScikitLearnInterface = ["classification", "iterative_models"]
+PerceptronClassifier_BetaML = ["classification", "iterative_models", "neural networks"]
+PerceptronClassifier_MLJScikitLearnInterface = ["classification", "iterative_models", "neural networks"]
+Pipeline_MLJBase = ["meta_algorithms"]
 ProbabilisticNuSVC_LIBSVM = ["classification"]
 ProbabilisticSGDClassifier_MLJScikitLearnInterface = ["classification"]
 ProbabilisticSVC_LIBSVM = ["classification"]
@@ -190,6 +195,8 @@ RandomForestImputer_BetaML = ["missing_value_imputation", "ensemble_models", "it
 RandomForestRegressor_BetaML = ["regression", "ensemble_models", "iterative_models"]
 RandomForestRegressor_DecisionTree = ["regression", "ensemble_models", "iterative_models"]
 RandomForestRegressor_MLJScikitLearnInterface = ["regression", "ensemble_models", "iterative_models"]
+RecursiveFeatureElimination_FeatureSelection = ["dimension_reduction", "meta_algorithms", "feature_engineering"]
+Resampler_MLJBase = ["meta_algorithms"]
 RidgeCVClassifier_MLJScikitLearnInterface = ["classification"]
 RidgeCVRegressor_MLJScikitLearnInterface = ["regression"]
 RidgeClassifier_MLJScikitLearnInterface = ["classification"]
@@ -210,6 +217,7 @@ StableForestClassifier_SIRUS = ["classification"]
 StableForestRegressor_SIRUS = ["regression"]
 StableRulesClassifier_SIRUS = ["classification"]
 StableRulesRegressor_SIRUS = ["regression"]
+Stack_MLJBase = ["meta_algorithms", "ensemble_models"]
 SVC_LIBSVM = ["classification"]
 SVMClassifier_MLJScikitLearnInterface = ["classification"]
 SVMLinearClassifier_MLJScikitLearnInterface = ["classification"]
@@ -222,9 +230,11 @@ SpectralClustering_MLJScikitLearnInterface = ["clustering", "static_models"]
 Standardizer_MLJModels = ["encoders"]
 SubspaceLDA_MultivariateStats = ["classification", "dimension_reduction"]
 TomekUndersampler_Imbalance = ["class_imbalance"]
+TunedModel_MLJTuning = ["meta_algorithms"]
 TSVDTransformer_TSVD = ["dimension_reduction"]
 TfidfTransformer_MLJText = ["encoders", "text_analysis"]
 TheilSenRegressor_MLJScikitLearnInterface = ["regression"]
+TransformedTargetModel_MLJBase = ["meta_algorithms", "outlier_detection"]
 UnivariateBoxCoxTransformer_MLJModels = ["encoders"]
 UnivariateDiscretizer_MLJModels = ["encoders"]
 UnivariateFillImputer_MLJModels = ["missing_value_imputation"]

diff --git a/docs/make.jl b/docs/make.jl
@@ -15,6 +15,7 @@ import MLJ.MLJModels
 import MLJ.MLJEnsembles
 import MLJ.ScientificTypes
 import MLJ.MLJBalancing
+import MLJ.FeatureSelection
 import ScientificTypesBase
 import Distributions
 using CategoricalArrays
@@ -37,7 +38,7 @@ isempty(problems) || error(
 # compose the individual model docstring pages:
 @info "Getting individual model docstrings from the registry and generating "*
     "pages for them, written at /docs/src/models/ ."
-for model in models()
+for model in models(wrappers=true)
     write_page(model)
 end
 
@@ -54,45 +55,62 @@ pages = [
     "Model Browser" => "model_browser.md",
     "About MLJ" => "about_mlj.md",
     "Learning MLJ" => "learning_mlj.md",
-    "Getting Started" => "getting_started.md",
-    "Common MLJ Workflows" => "common_mlj_workflows.md",
-    "Working with Categorical Data" => "working_with_categorical_data.md",
-    "Model Search" => "model_search.md",
-    "Loading Model Code" => "loading_model_code.md",
-    "Machines" => "machines.md",
-    "Evaluating Model Performance" => "evaluating_model_performance.md",
-    "Performance Measures" => "performance_measures.md",
-    "Weights" => "weights.md",
-    "Tuning Models" => "tuning_models.md",
-    "Learning Curves" => "learning_curves.md",
-    "Preparing Data" => "preparing_data.md",
-    "Transformers and Other Unsupervised models" => "transformers.md",
-    "More on Probabilistic Predictors" => "more_on_probabilistic_predictors.md",
-    "Composing Models" => "composing_models.md",
-    "Linear Pipelines" => "linear_pipelines.md",
-    "Target Transformations" => "target_transformations.md",
-    "Homogeneous Ensembles" => "homogeneous_ensembles.md",
-    "Correcting Class Imbalance" => "correcting_class_imbalance.md",
-    "Model Stacking" => "model_stacking.md",
-    "Learning Networks" => "learning_networks.md",
-    "Controlling Iterative Models" => "controlling_iterative_models.md",
-    "Generating Synthetic Data" => "generating_synthetic_data.md",
-    "Logging Workflows" => "logging_workflows.md",
-    "OpenML Integration" => "openml_integration.md",
-    "Acceleration and Parallelism" => "acceleration_and_parallelism.md",
-    "Simple User Defined Models" => "simple_user_defined_models.md",
-    "Quick-Start Guide to Adding Models" =>
-               "quick_start_guide_to_adding_models.md",
-    "Adding Models for General Use" => "adding_models_for_general_use.md",
-    "Modifying Behavior" => "modifying_behavior.md",
-    "Internals" => "internals.md",
-    "List of Supported Models" => "list_of_supported_models.md",
-    "Third Party Packages" => "third_party_packages.md",
-    "Glossary" => "glossary.md",
-    "MLJ Cheatsheet" => "mlj_cheatsheet.md",
-    "FAQ" => "frequently_asked_questions.md",
+    "Basics" => [
+        "Getting Started" => "getting_started.md",
+        "Common MLJ Workflows" => "common_mlj_workflows.md",
+        "Machines" => "machines.md",
+        "MLJ Cheatsheet" => "mlj_cheatsheet.md",
+    ],
+    "Data" => [
+        "Working with Categorical Data" => "working_with_categorical_data.md",
+        "Preparing Data" => "preparing_data.md",
+        "Generating Synthetic Data" => "generating_synthetic_data.md",
+        "OpenML Integration" => "openml_integration.md",
+    ],
+    "Model Basics" => [
+        "Model Search" => "model_search.md",
+        "Loading Model Code" => "loading_model_code.md",
+        "Transformers and Other Unsupervised models" => "transformers.md",
+        "List of Supported Models" => "list_of_supported_models.md",
+    ],
+    "Meta-algorithms" => [
+        "Evaluating Model Performance" => "evaluating_model_performance.md",
+        "Tuning Models" => "tuning_models.md",
+        "Learning Curves" => "learning_curves.md",
+        "Controlling Iterative Models" => "controlling_iterative_models.md",
+        "Correcting Class Imbalance" => "correcting_class_imbalance.md",
+        "Thresholding Probabilistic Predictors" =>
+            "thresholding_probabilistic_predictors.md",
+        "Target Transformations" => "target_transformations.md",
+        "Homogeneous Ensembles" => "homogeneous_ensembles.md",
+    ],
+    "Model Composition" => [
+        "Composing Models" => "composing_models.md",
+        "Linear Pipelines" => "linear_pipelines.md",
+        "Model Stacking" => "model_stacking.md",
+        "Learning Networks" => "learning_networks.md",
+    ],
+    "Third Party Tools" => [
+        "Logging Workflows using MLflow" => "logging_workflows.md",
+        "Third Party Packages" => "third_party_packages.md",
+    ],
+    "Customization and Extension" => [
+        "Simple User Defined Models" => "simple_user_defined_models.md",
+        "Quick-Start Guide to Adding Models" =>
+            "quick_start_guide_to_adding_models.md",
+        "Adding Models for General Use" => "adding_models_for_general_use.md",
+        "Modifying Behavior" => "modifying_behavior.md",
+        "Internals" => "internals.md",
+    ],
+    "Miscellaneous" => [
+        "Performance Measures" => "performance_measures.md",
+        "Weights" => "weights.md",
+        "Acceleration and Parallelism" => "acceleration_and_parallelism.md",
+        "Glossary" => "glossary.md",
+        "FAQ" => "frequently_asked_questions.md",
+    ],
     "Index of Methods" => "api.md",
-    ]
+]
 
 for (k, v) in pages
     println("$k\t=>$v")
@@ -118,6 +136,7 @@ makedocs(
         IterationControl,
         CategoricalDistributions,
         StatisticalMeasures,
+        FeatureSelection,
     ],
     pages    = pages,
     warnonly = [:cross_references, :missing_docs],

diff --git a/docs/model_docstring_tools.jl b/docs/model_docstring_tools.jl
@@ -5,7 +5,7 @@ const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models")
 """
     remove_doc_refs(str::AbstractString)
 
-Removes `@ref` references from `str. For example, a substring of the form
+Removes `@ref` references from `str`. For example, a substring of the form
 "[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`".
 
 """
@@ -27,8 +27,8 @@ handle(model) = model.name*"_"*model.package_name
 **Private method.**
 
 Compose and write to file the documentation page for `model`. Here `model` is an entry in
-the MLJ Model Registry, i.e., an element of `MLJModels.models()`. The file name has the
-form `"ModelName_PackageName.md"`, for example,
+the MLJ Model Registry, i.e., an element of `MLJModels.models(; wrappers=true)`. The file
+name has the form `"ModelName_PackageName.md"`, for example,
 `"DecisionTreeClassifier_DecisionTree.md"`. Such a page can be referenced from any other
 markdown page in /docs/src/ like this: `[DecisionTreeClassifier](@ref
 DecisionTreeClassifier_DecisionTree)`.
@@ -56,6 +56,7 @@ const DESCRIPTORS_GIVEN_HANDLE =
 # determined the list of all descriptors, ranked by frequency:
 const descriptors = vcat(values(DESCRIPTORS_GIVEN_HANDLE)...)
 const ranking = MLJBase.countmap(descriptors)
+ranking["meta_algorithms"] = 1e10 # key must match the TOML descriptor spelling so it sorts first below
 const DESCRIPTORS = sort(unique(descriptors), by=d -> ranking[d], rev=true)
 const HANDLES = keys(DESCRIPTORS_GIVEN_HANDLE)
 
@@ -67,7 +68,7 @@ handle as key in /docs/src/ModelDescriptors.toml.
 
 """
 function models_missing_descriptors()
-    handles = handle.(models())
+    handles = handle.(models(wrappers=true))
     filter(handles) do h
         !(h in HANDLES)
     end
@@ -82,7 +83,7 @@ Return the list of  models with a given `descriptor`, such as "regressor", as
 these appear in /docs/ModelDescriptors.toml.
 
 """
-modelswith(descriptor) = filter(models()) do model
+modelswith(descriptor) = filter(models(wrappers=true)) do model
     descriptor in DESCRIPTORS_GIVEN_HANDLE[handle(model)]
 end
 

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -56,18 +56,19 @@ To support MLJ development, please cite these works or star the repo:
 [Model Search](@ref model_search) |
 [Loading Model Code](@ref) |
 [Transformers and Other Unsupervised Models](@ref) |
-[More on Probabilistic Predictors](@ref) |
-[Composing Models](@ref) |
 [Simple User Defined Models](@ref) |
 [List of Supported Models](@ref model_list) |
 [Third Party Packages](@ref) 
 
 ### Meta-algorithms
 [Evaluating Model Performance](@ref) |
 [Tuning Models](@ref) |
+[Composing Models](@ref) |
 [Controlling Iterative Models](@ref) |
 [Learning Curves](@ref) |
-[Correcting Class Imbalance](@ref)
+[Correcting Class Imbalance](@ref) |
+[Thresholding Probabilistic Predictors](@ref)
+
 
 ### Composition
 [Composing Models](@ref) |