Commit
Merge branch 'dev' into compathelper/new_version/2024-05-22-00-20-58-074-00566135728
ablaom authored Jun 5, 2024
2 parents a855bf7 + fa8b1e3 commit c4247a6
Showing 6 changed files with 46 additions and 28 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJ"
uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
authors = ["Anthony D. Blaom <[email protected]>"]
-version = "0.20.4"
+version = "0.20.5"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -34,7 +34,7 @@ Distributions = "0.21,0.22,0.23, 0.24, 0.25"
MLJBalancing = "0.1"
MLJBase = "1"
MLJEnsembles = "0.4"
-MLJFlow = "0.4.2, 0.5"
+MLJFlow = "0.5"
MLJIteration = "0.6"
MLJModels = "0.16"
MLJTestIntegration = "0.5.0"
2 changes: 1 addition & 1 deletion docs/make.jl
@@ -67,7 +67,7 @@ pages = [
"Learning Curves" => "learning_curves.md",
"Preparing Data" => "preparing_data.md",
"Transformers and Other Unsupervised models" => "transformers.md",
-"More on Probabilistic Predictors" => "more_on_probabilistic_predictors.md",
+"Thresholding Probabilistic Predictors" => "thresholding_probabilistic_predictors.md",
"Composing Models" => "composing_models.md",
"Linear Pipelines" => "linear_pipelines.md",
"Target Transformations" => "target_transformations.md",
7 changes: 4 additions & 3 deletions docs/src/index.md
@@ -56,18 +56,19 @@ To support MLJ development, please cite these works or star the repo:
[Model Search](@ref model_search) |
[Loading Model Code](@ref) |
[Transformers and Other Unsupervised Models](@ref) |
-[More on Probabilistic Predictors](@ref) |
[Composing Models](@ref) |
[Simple User Defined Models](@ref) |
[List of Supported Models](@ref model_list) |
[Third Party Packages](@ref)

### Meta-algorithms
[Evaluating Model Performance](@ref) |
[Tuning Models](@ref) |
[Composing Models](@ref) |
[Controlling Iterative Models](@ref) |
[Learning Curves](@ref)|
-[Correcting Class Imbalance](@ref)
+[Correcting Class Imbalance](@ref) |
+[Thresholding Probabilistic Predictors](@ref)


### Composition
[Composing Models](@ref) |
40 changes: 23 additions & 17 deletions docs/src/mlj_cheatsheet.md
@@ -119,6 +119,12 @@ Split a table or matrix `X`, instead of indices:
Xtrain, Xvalid, Xtest = partition(X, 0.5, 0.3, rng=123)
```

Simultaneous splitting (needs `multi=true`):

```julia
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8, rng=123, multi=true)
```
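As an end-to-end sketch of the `multi=true` call just shown (an illustration, assuming MLJ is installed; with 10 rows, an 80/20 split yields 8 and 2 rows):

```julia
using MLJ

X = (a = collect(1:10),)   # a single-column table
y = collect(1:10)

# split X and y simultaneously, with a reproducible shuffle:
(Xtrain, Xtest), (ytrain, ytest) = partition((X, y), 0.8; rng=123, multi=true)

length(ytrain), length(ytest)   # (8, 2)
```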

Getting data from [OpenML](https://www.openml.org):
```julia
table = OpenML.load(91)
@@ -128,7 +134,7 @@ Creating synthetic classification data:
```julia
X, y = make_blobs(100, 2)
```
-(also: `make_moons`, `make_circles`)
+(also: `make_moons`, `make_circles`, `make_regression`)

Creating synthetic regression data:

@@ -162,17 +168,13 @@ fit!(mach, rows=1:100, verbosity=1, force=false)

- Supervised case: `predict(mach, Xnew)` or `predict(mach, rows=1:100)`

-Similarly, for probabilistic models: `predict_mode`, `predict_mean` and `predict_median`.
+For probabilistic models: `predict_mode`, `predict_mean` and `predict_median`.

-- Unsupervised case: `transform(mach, rows=1:100)` or `inverse_transform(mach, rows)`, etc.
+- Unsupervised case: `W = transform(mach, Xnew)` or `inverse_transform(mach, W)`, etc.
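A minimal sketch of this unsupervised workflow, using the built-in `Standardizer` transformer (assumes MLJ is installed):

```julia
using MLJ

X = (x1 = [1.0, 2.0, 3.0, 4.0],)    # a one-column table
mach = machine(Standardizer(), X)    # bind model to data
fit!(mach, verbosity=0)              # learn column means and stds

W = transform(mach, X)               # standardized copy of X
X_back = inverse_transform(mach, W)  # recover the original scale
```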


## Inspecting objects

-`@more` gets detail on the last object in REPL
-
-`params(model)` gets a nested-tuple of all hyperparameters, even nested ones

`info(ConstantRegressor())`, `info("PCA")`, `info("RidgeRegressor",
pkg="MultivariateStats")` gets all properties (aka traits) of registered models
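For example (a sketch; assumes an installed MLJ with its model registry available):

```julia
using MLJ

i = info("PCA")      # all registered traits for the PCA model
i.is_supervised      # false: PCA is an unsupervised (transformer) model
i.package_name       # name of the providing package
```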

@@ -187,19 +189,19 @@ pkg="MultivariateStats") gets all properties (aka traits) of registered models

## Saving and retrieving machines using Julia serializer

-`MLJ.save("trained_for_five_days.jls", mach)` to save machine `mach` (without data)
+`MLJ.save("my_machine.jls", mach)` to save machine `mach` (without data)

-`predict_only_mach = machine("trained_for_five_days.jlso")` to deserialize.
+`predict_only_mach = machine("my_machine.jls")` to deserialize.


## Performance estimation

```julia
evaluate(model, X, y, resampling=CV(), measure=rms, operation=predict, weights=..., verbosity=1)
evaluate(model, X, y, resampling=CV(), measure=rms)
```

```julia
evaluate!(mach, resampling=Holdout(), measure=[rms, mav], operation=predict, weights=..., verbosity=1)
evaluate!(mach, resampling=Holdout(), measure=[rms, mav])
```

@@ -216,6 +218,8 @@
```julia
evaluate!(mach, resampling=[(fold1, fold2), (fold2, fold1)], measure=rms)
```

`TimeSeriesCV(nfolds=4)` for time-series cross-validation

+`InSample()`: test set = train set

or a list of pairs of row indices:

`[(train1, eval1), (train2, eval2), ... (traink, evalk)]`
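A sketch of that last option, using the built-in `ConstantRegressor` (the names `fold1` and `fold2` are illustrative; assumes MLJ is installed):

```julia
using MLJ

X = (x = collect(1.0:10.0),)
y = collect(1.0:10.0)
fold1, fold2 = 1:5, 6:10    # two disjoint row ranges

mach = machine(ConstantRegressor(), X, y)

# each pair is (train rows, evaluation rows):
e = evaluate!(mach,
              resampling=[(fold1, fold2), (fold2, fold1)],
              measure=rms, verbosity=0)
```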
@@ -225,7 +229,7 @@
## Tuning model wrapper

```julia
tuned_model = TunedModel(model=, tuning=RandomSearch(), resampling=Holdout(), measure=, operation=predict, range=)
tuned_model = TunedModel(model; tuning=RandomSearch(), resampling=Holdout(), measure=, range=)
```

## Ranges for tuning `(range=...)`
@@ -238,9 +242,11 @@ then `Grid()` search uses `iterator(r, 6) == [1, 2, 3, 6, 11, 20]`.

Non-numeric ranges: `r = range(model, :parameter, values=…)`

+Instead of `model`, declare type: `r = range(Char, :c; values=['a', 'b'])`

Nested ranges: Use dot syntax, as in `r = range(EnsembleModel(atom=tree), :(atom.max_depth), ...)`

-Can specify multiple ranges, as in `range=[r1, r2, r3]`. For more range options do `?Grid` or `?RandomSearch`
+Specify multiple ranges, as in `range=[r1, r2, r3]`. For more range options do `?Grid` or `?RandomSearch`
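Put together as a runnable sketch (the expected values are those quoted in the `Grid()` remark in this cheatsheet):

```julia
using MLJ

# a numeric range declared from a type rather than a model instance:
r = range(Int, :K; lower=1, upper=20, scale=:log)

iterator(r, 6)   # == [1, 2, 3, 6, 11, 20], the grid Grid() would search
```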


## Tuning strategies
@@ -257,11 +263,11 @@ Also available: `LatinHyperCube`, `Explicit` (built-in), `MLJTreeParzenTuning`,
For generating a plot of performance against parameter specified by `range`:

```julia
curve = learning_curve(mach, resolution=30, resampling=Holdout(), measure=, operation=predict, range=, n=1)
curve = learning_curve(mach, resolution=30, resampling=Holdout(), measure=, range=, n=1)
```

```julia
curve = learning_curve(model, X, y, resolution=30, resampling=Holdout(), measure=, operation=predict, range=, n=1)
curve = learning_curve(model, X, y, resolution=30, resampling=Holdout(), measure=, range=, n=1)
```

If using Plots.jl:
@@ -313,14 +319,14 @@ Externals include: `PCA` (in MultivariateStats), `KMeans`, `KMedoids` (in Cluste
## Ensemble model wrapper

```julia
EnsembleModel(atom=, weights=Float64[], bagging_fraction=0.8, rng=GLOBAL_RNG, n=100, parallel=true, out_of_bag_measure=[])
EnsembleModel(model; weights=Float64[], bagging_fraction=0.8, rng=GLOBAL_RNG, n=100, parallel=true, out_of_bag_measure=[])
```
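A sketch wrapping the built-in `ConstantClassifier` in a bagged ensemble, using the signature documented in this cheatsheet (assumes MLJ is installed):

```julia
using MLJ

# a 10-atom bagged ensemble of a built-in model:
ensemble = EnsembleModel(ConstantClassifier(); n=10, bagging_fraction=0.8)

ensemble.n   # hyperparameters are accessible as properties
```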


## Target transformation wrapper

```julia
TransformedTargetModel(model=ConstantClassifier(), target=Standardizer())
TransformedTargetModel(model; target=Standardizer())
```

## Pipelines
@@ -1,4 +1,4 @@
-# More on Probabilistic Predictors
+# Thresholding Probabilistic Predictors

Although one can call `predict_mode` on a probabilistic binary
classifier to get deterministic predictions, a more flexible strategy
19 changes: 15 additions & 4 deletions src/MLJ.jl
@@ -1,15 +1,24 @@
"""
MLJ
-[`MLJ`](https://juliaai.github.io/MLJ.jl//dev/) is a Machine Learning toolbox
-for Julia. It collects together functionality from the following packages, which can be
-loaded separately:
+[`MLJ`](https://juliaai.github.io/MLJ.jl//dev/) is a Machine Learning toolbox for
+Julia. It collects together functionality from separate components listed below, which can
+be loaded individually.
+
+Actual model code (e.g., code for instantiating a `DecisionTreeClassifier`) must be
+explicitly loaded from the model-providing package, using `@load`, for example. However
+some common transformers, listed using `localmodels()` at startup, are immediately
+available, as are the following model wrappers: `Pipeline`, `TunedModel`, `EnsembleModel`,
+`IteratedModel`, `BalancedModel`, `TransformedTargetModel`, `BinaryThresholdPredictor`,
+and `Stack`.
+
+# Components
- MLJBase.jl: The `machine` interface, tools to `partition` and `unpack` datasets,
`evaluate`/`evaluate!` for model performance, `|>` pipeline syntax,
`TransformedTargetModel` wrapper, general model composition syntax (learning networks),
synthetic data generators, `scitype` and `schema` methods (from ScientificTypes.jl) for
-checking how MLJ interprets your data
+checking how MLJ interprets your data. Generally required for any MLJ workflow.
- StatisticalMeasures.jl: MLJ-compatible measures (metrics) for machine learning,
confusion matrices, ROC curves.
@@ -30,6 +39,8 @@ loaded separately:
- OpenML.jl: Tool for grabbing datasets from OpenML.org
"""
module MLJ

