diff --git a/examples/lightning_tour/lightning_tour.ipynb b/examples/lightning_tour/lightning_tour.ipynb index bd503dfd7..230187581 100644 --- a/examples/lightning_tour/lightning_tour.ipynb +++ b/examples/lightning_tour/lightning_tour.ipynb @@ -2,31 +2,34 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# Lightning tour of MLJ" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "*For a more elementary introduction to MLJ, see [Getting\n", - "Started](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/).*" - ], - "metadata": {} + "Started](https://juliaai.github.io/MLJ.jl/dev/getting_started/).*" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Note.** Be sure this file has not been separated from the\n", "accompanying Project.toml and Manifest.toml files, which should not\n", "be altered unless you know what you are doing. Using them,\n", "the following code block instantiates a julia environment with a tested\n", "bundle of packages known to work with the rest of the script:" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -36,40 +39,40 @@ ] } ], - "cell_type": "code", "source": [ "using Pkg\n", "Pkg.activate(@__DIR__)\n", "Pkg.instantiate()" - ], - "metadata": {}, - "execution_count": 1 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Assuming Julia 1.7" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In MLJ a *model* is just a container for hyperparameters, and that's\n", "all. Here we will apply several kinds of model composition before\n", "binding the resulting \"meta-model\" to data in a *machine* for\n", "evaluation, using cross-validation." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Loading and instantiating a gradient tree-boosting model:" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -80,80 +83,141 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": "EvoTreeRegressor(\n loss = EvoTrees.Linear(),\n nrounds = 10,\n λ = 0.0,\n γ = 0.0,\n η = 0.1,\n max_depth = 2,\n min_weight = 1.0,\n rowsample = 1.0,\n colsample = 1.0,\n nbins = 64,\n α = 0.5,\n metric = :mse,\n rng = Random.MersenneTwister(123),\n device = \"cpu\")" + "text/plain": [ + "EvoTreeRegressor(\n", + " loss = EvoTrees.Linear(),\n", + " nrounds = 10,\n", + " λ = 0.0,\n", + " γ = 0.0,\n", + " η = 0.1,\n", + " max_depth = 2,\n", + " min_weight = 1.0,\n", + " rowsample = 1.0,\n", + " colsample = 1.0,\n", + " nbins = 64,\n", + " α = 0.5,\n", + " metric = :mse,\n", + " rng = Random.MersenneTwister(123),\n", + " device = \"cpu\")" + ] }, + "execution_count": 2, "metadata": {}, - "execution_count": 2 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "using MLJ\n", "MLJ.color_off()\n", "\n", "Booster = @load EvoTreeRegressor # loads code defining a model type\n", "booster = Booster(max_depth=2) # specify hyperparameter at construction" - ], - "metadata": {}, - "execution_count": 2 + ] }, { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": "EvoTreeRegressor(\n loss = EvoTrees.Linear(),\n nrounds = 50,\n λ = 0.0,\n γ = 0.0,\n η = 0.1,\n max_depth = 2,\n min_weight = 1.0,\n rowsample = 1.0,\n colsample = 1.0,\n nbins = 64,\n α = 0.5,\n metric = :mse,\n rng = Random.MersenneTwister(123),\n device = \"cpu\")" + "text/plain": [ + "EvoTreeRegressor(\n", + " loss = EvoTrees.Linear(),\n", + " nrounds = 50,\n", + " λ = 0.0,\n", + " γ = 0.0,\n", + " η = 0.1,\n", + " max_depth = 2,\n", + " min_weight = 
1.0,\n", + " rowsample = 1.0,\n", + " colsample = 1.0,\n", + " nbins = 64,\n", + " α = 0.5,\n", + " metric = :mse,\n", + " rng = Random.MersenneTwister(123),\n", + " device = \"cpu\")" + ] }, + "execution_count": 3, "metadata": {}, - "execution_count": 3 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "booster.nrounds=50 # or mutate post facto\n", "booster" - ], - "metadata": {}, - "execution_count": 3 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "This model is an example of an iterative model. As it stands, the\n", "number of iterations `nrounds` is fixed." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Composition 1: Wrapping the model to make it \"self-iterating\"" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Let's create a new model that automatically learns the number of iterations,\n", "using the `NumberSinceBest(3)` criterion, as applied to an\n", "out-of-sample `l1` loss:" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": "DeterministicIteratedModel(\n model = EvoTreeRegressor(\n loss = EvoTrees.Linear(),\n nrounds = 50,\n λ = 0.0,\n γ = 0.0,\n η = 0.1,\n max_depth = 2,\n min_weight = 1.0,\n rowsample = 1.0,\n colsample = 1.0,\n nbins = 64,\n α = 0.5,\n metric = :mse,\n rng = Random.MersenneTwister(123),\n device = \"cpu\"),\n controls = Any[Step(2), NumberSinceBest(3), NumberLimit(300)],\n resampling = Holdout(\n fraction_train = 0.8,\n shuffle = false,\n rng = Random._GLOBAL_RNG()),\n measure = LPLoss(p = 1),\n weights = nothing,\n class_weights = nothing,\n operation = MLJModelInterface.predict,\n retrain = true,\n check_measure = true,\n iteration_parameter = nothing,\n cache = true)" + "text/plain": [ + "DeterministicIteratedModel(\n", + " model = EvoTreeRegressor(\n", + " loss = EvoTrees.Linear(),\n", + 
" nrounds = 50,\n", + " λ = 0.0,\n", + " γ = 0.0,\n", + " η = 0.1,\n", + " max_depth = 2,\n", + " min_weight = 1.0,\n", + " rowsample = 1.0,\n", + " colsample = 1.0,\n", + " nbins = 64,\n", + " α = 0.5,\n", + " metric = :mse,\n", + " rng = Random.MersenneTwister(123),\n", + " device = \"cpu\"),\n", + " controls = Any[Step(2), NumberSinceBest(3), NumberLimit(300)],\n", + " resampling = Holdout(\n", + " fraction_train = 0.8,\n", + " shuffle = false,\n", + " rng = Random._GLOBAL_RNG()),\n", + " measure = LPLoss(p = 1),\n", + " weights = nothing,\n", + " class_weights = nothing,\n", + " operation = MLJModelInterface.predict,\n", + " retrain = true,\n", + " check_measure = true,\n", + " iteration_parameter = nothing,\n", + " cache = true)" + ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "using MLJIteration\n", "iterated_booster = IteratedModel(model=booster,\n", @@ -161,98 +225,145 @@ " controls=[Step(2), NumberSinceBest(3), NumberLimit(300)],\n", " measure=l1,\n", " retrain=true)" - ], - "metadata": {}, - "execution_count": 4 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Composition 2: Preprocess the input features" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Combining the model with categorical feature encoding:" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": "DeterministicPipeline(\n continuous_encoder = ContinuousEncoder(\n drop_last = false,\n one_hot_ordered_factors = false),\n deterministic_iterated_model = DeterministicIteratedModel(\n model = EvoTreeRegressor{Float64,…},\n controls = Any[Step(2), NumberSinceBest(3), NumberLimit(300)],\n resampling = Holdout,\n measure = LPLoss(p = 1),\n weights = nothing,\n class_weights = nothing,\n operation = MLJModelInterface.predict,\n 
retrain = true,\n check_measure = true,\n iteration_parameter = nothing,\n cache = true),\n cache = true)" + "text/plain": [ + "DeterministicPipeline(\n", + " continuous_encoder = ContinuousEncoder(\n", + " drop_last = false,\n", + " one_hot_ordered_factors = false),\n", + " deterministic_iterated_model = DeterministicIteratedModel(\n", + " model = EvoTreeRegressor{Float64,…},\n", + " controls = Any[Step(2), NumberSinceBest(3), NumberLimit(300)],\n", + " resampling = Holdout,\n", + " measure = LPLoss(p = 1),\n", + " weights = nothing,\n", + " class_weights = nothing,\n", + " operation = MLJModelInterface.predict,\n", + " retrain = true,\n", + " check_measure = true,\n", + " iteration_parameter = nothing,\n", + " cache = true),\n", + " cache = true)" + ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "pipe = ContinuousEncoder |> iterated_booster" - ], - "metadata": {}, - "execution_count": 5 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Composition 3: Wrapping the model to make it \"self-tuning\"" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "First, we define a hyperparameter range for optimization of a\n", "(nested) hyperparameter:" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": "NumericRange(1 ≤ deterministic_iterated_model.model.max_depth ≤ 10; origin=5.5, unit=4.5)" + "text/plain": [ + "NumericRange(1 ≤ deterministic_iterated_model.model.max_depth ≤ 10; origin=5.5, unit=4.5)" + ] }, + "execution_count": 6, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "max_depth_range = range(pipe,\n", " :(deterministic_iterated_model.model.max_depth),\n", " lower = 1,\n", " upper = 10)" - ], - "metadata": {}, - "execution_count": 6 + ] 
}, { "cell_type": "markdown", + "metadata": {}, "source": [ "Now we can wrap the pipeline model in an optimization strategy to make\n", "it \"self-tuning\":" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": "DeterministicTunedModel(\n model = DeterministicPipeline(\n continuous_encoder = ContinuousEncoder,\n deterministic_iterated_model = DeterministicIteratedModel{EvoTreeRegressor{Float64,…}},\n cache = true),\n tuning = RandomSearch(\n bounded = Distributions.Uniform,\n positive_unbounded = Distributions.Gamma,\n other = Distributions.Normal,\n rng = Random._GLOBAL_RNG()),\n resampling = CV(\n nfolds = 3,\n shuffle = true,\n rng = Random.MersenneTwister(456)),\n measure = LPLoss(p = 1),\n weights = nothing,\n operation = nothing,\n range = NumericRange(1 ≤ deterministic_iterated_model.model.max_depth ≤ 10; origin=5.5, unit=4.5),\n selection_heuristic = MLJTuning.NaiveSelection(nothing),\n train_best = true,\n repeats = 1,\n n = 50,\n acceleration = CPUThreads{Int64}(5),\n acceleration_resampling = CPU1{Nothing}(nothing),\n check_measure = true,\n cache = true)" + "text/plain": [ + "DeterministicTunedModel(\n", + " model = DeterministicPipeline(\n", + " continuous_encoder = ContinuousEncoder,\n", + " deterministic_iterated_model = DeterministicIteratedModel{EvoTreeRegressor{Float64,…}},\n", + " cache = true),\n", + " tuning = RandomSearch(\n", + " bounded = Distributions.Uniform,\n", + " positive_unbounded = Distributions.Gamma,\n", + " other = Distributions.Normal,\n", + " rng = Random._GLOBAL_RNG()),\n", + " resampling = CV(\n", + " nfolds = 3,\n", + " shuffle = true,\n", + " rng = Random.MersenneTwister(456)),\n", + " measure = LPLoss(p = 1),\n", + " weights = nothing,\n", + " operation = nothing,\n", + " range = NumericRange(1 ≤ deterministic_iterated_model.model.max_depth ≤ 10; origin=5.5, unit=4.5),\n", + " selection_heuristic = 
MLJTuning.NaiveSelection(nothing),\n", + " train_best = true,\n", + " repeats = 1,\n", + " n = 50,\n", + " acceleration = CPUThreads{Int64}(5),\n", + " acceleration_resampling = CPU1{Nothing}(nothing),\n", + " check_measure = true,\n", + " cache = true)" + ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 7 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "self_tuning_pipe = TunedModel(model=pipe,\n", " tuning=RandomSearch(),\n", @@ -261,69 +372,76 @@ " measure=l1,\n", " acceleration=CPUThreads(),\n", " n=50)" - ], - "metadata": {}, - "execution_count": 7 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Binding to data and evaluating performance" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Loading a selection of features and labels from the Ames\n", "House Price dataset:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], "source": [ "X, y = @load_reduced_ames;" - ], - "metadata": {}, - "execution_count": 8 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Binding the \"self-tuning\" pipeline model to data in a *machine* (which\n", "will additionally store *learned* parameters):" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": "Machine{DeterministicTunedModel{RandomSearch,…},…} trained 0 times; caches data\n model: MLJTuning.DeterministicTunedModel{RandomSearch, MLJBase.DeterministicPipeline{NamedTuple{(:continuous_encoder, :deterministic_iterated_model), Tuple{Unsupervised, Deterministic}}, MLJModelInterface.predict}}\n args: \n 1:\tSource @512 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}, AbstractVector{Multiclass{15}}, AbstractVector{Multiclass{25}}, AbstractVector{OrderedFactor{10}}}}`\n 2:\tSource @129 ⏎ 
`AbstractVector{Continuous}`\n" + "text/plain": [ + "Machine{DeterministicTunedModel{RandomSearch,…},…} trained 0 times; caches data\n", + " model: MLJTuning.DeterministicTunedModel{RandomSearch, MLJBase.DeterministicPipeline{NamedTuple{(:continuous_encoder, :deterministic_iterated_model), Tuple{Unsupervised, Deterministic}}, MLJModelInterface.predict}}\n", + " args: \n", + " 1:\tSource @512 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}, AbstractVector{Multiclass{15}}, AbstractVector{Multiclass{25}}, AbstractVector{OrderedFactor{10}}}}`\n", + " 2:\tSource @129 ⏎ `AbstractVector{Continuous}`\n" + ] }, + "execution_count": 9, "metadata": {}, - "execution_count": 9 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "mach = machine(self_tuning_pipe, X, y)" - ], - "metadata": {}, - "execution_count": 9 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Evaluating the \"self-tuning\" pipeline model's performance using 5-fold\n", "cross-validation (implies multiple layers of nested resampling):" - ], - "metadata": {} + ] }, { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -334,47 +452,56 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": "PerformanceEvaluation object with these fields:\n measure, measurement, operation, per_fold,\n per_observation, fitted_params_per_fold,\n report_per_fold, train_test_pairs\nExtract:\n┌───────────────┬─────────────┬───────────┬───────────────────────────────────────────────┐\n│\u001b[22m measure \u001b[0m│\u001b[22m measurement \u001b[0m│\u001b[22m operation \u001b[0m│\u001b[22m per_fold \u001b[0m│\n├───────────────┼─────────────┼───────────┼───────────────────────────────────────────────┤\n│ LPLoss(p = 1) │ 16800.0 │ predict │ [16500.0, 16300.0, 16300.0, 16600.0, 18600.0] │\n│ LPLoss(p = 2) │ 6.65e8 │ predict │ [6.14e8, 6.3e8, 5.98e8, 6.17e8, 8.68e8] 
│\n└───────────────┴─────────────┴───────────┴───────────────────────────────────────────────┘\n" + "text/plain": [ + "PerformanceEvaluation object with these fields:\n", + " measure, measurement, operation, per_fold,\n", + " per_observation, fitted_params_per_fold,\n", + " report_per_fold, train_test_pairs\n", + "Extract:\n", + "┌───────────────┬─────────────┬───────────┬───────────────────────────────────────────────┐\n", + "│\u001b[22m measure \u001b[0m│\u001b[22m measurement \u001b[0m│\u001b[22m operation \u001b[0m│\u001b[22m per_fold \u001b[0m│\n", + "├───────────────┼─────────────┼───────────┼───────────────────────────────────────────────┤\n", + "│ LPLoss(p = 1) │ 16800.0 │ predict │ [16500.0, 16300.0, 16300.0, 16600.0, 18600.0] │\n", + "│ LPLoss(p = 2) │ 6.65e8 │ predict │ [6.14e8, 6.3e8, 5.98e8, 6.17e8, 8.68e8] │\n", + "└───────────────┴─────────────┴───────────┴───────────────────────────────────────────────┘\n" + ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 10 + "output_type": "execute_result" } ], - "cell_type": "code", "source": [ "evaluate!(mach,\n", " measures=[l1, l2],\n", " resampling=CV(nfolds=5, rng=123),\n", " acceleration=CPUThreads())" - ], - "metadata": {}, - "execution_count": 10 + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "---\n", "\n", "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" - ], - "metadata": {} + ] } ], - "nbformat_minor": 3, "metadata": { + "kernelspec": { + "display_name": "Julia 1.7.1", + "language": "julia", + "name": "julia-1.7" + }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.7.1" - }, - "kernelspec": { - "name": "julia-1.7", - "display_name": "Julia 1.7.1", - "language": "julia" } }, - "nbformat": 4 + "nbformat": 4, + "nbformat_minor": 3 } diff --git a/examples/lightning_tour/lightning_tour.jl b/examples/lightning_tour/lightning_tour.jl index ff1c47415..c0c8eb425 100644 
--- a/examples/lightning_tour/lightning_tour.jl +++ b/examples/lightning_tour/lightning_tour.jl @@ -1,7 +1,7 @@ # # Lightning tour of MLJ # *For a more elementary introduction to MLJ, see [Getting -# Started](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/).* +# Started](https://juliaai.github.io/MLJ.jl/dev/getting_started/).* # **Note.** Be sure this file has not been separated from the # accompanying Project.toml and Manifest.toml files, which should not diff --git a/examples/telco/notebook.ipynb b/examples/telco/notebook.ipynb index 7019a6b63..77fc58651 100644 --- a/examples/telco/notebook.ipynb +++ b/examples/telco/notebook.ipynb @@ -12,7 +12,7 @@ "metadata": {}, "source": [ "An application of the [MLJ\n", - "toolbox](https://alan-turing-institute.github.io/MLJ.jl/dev/) to the\n", + "toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the\n", "Telco Customer Churn dataset, aimed at practicing data scientists\n", "new to MLJ (Machine Learning in Julia). This tutorial does not\n", "cover exploratory data analysis." @@ -31,9 +31,9 @@ "metadata": {}, "source": [ "For other MLJ learning resources see the [Learning\n", - "MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_mlj/)\n", + "MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/)\n", "section of the\n", - "[manual](https://alan-turing-institute.github.io/MLJ.jl/dev/)." + "[manual](https://juliaai.github.io/MLJ.jl/dev/)." ] }, { @@ -132,7 +132,7 @@ "the notebook, package instantiation and pre-compilation may take a\n", "minute or so to complete. **This step will fail** if the [correct\n", "Manifest.toml and Project.toml\n", - "files](https://github.com/alan-turing-institute/MLJ.jl/tree/dev/examples/telco)\n", + "files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco)\n", "are not in the same directory as this notebook." 
] }, @@ -203,7 +203,7 @@ "metadata": {}, "source": [ "This section is a condensed adaption of the [Getting Started\n", - "example](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict)\n", + "example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict)\n", "in the MLJ documentation." ] }, @@ -448,7 +448,7 @@ "metadata": {}, "source": [ "A machine stores some other information enabling [warm\n", - "restart](https://alan-turing-institute.github.io/MLJ.jl/dev/machines/#Warm-restarts)\n", + "restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts)\n", "for some models, but we won't go into that here. You are allowed to\n", "access and mutate the `model` parameter:" ] @@ -1140,7 +1140,7 @@ "metadata": {}, "source": [ "For tools helping us to identify suitable models, see the [Model\n", - "Search](https://alan-turing-institute.github.io/MLJ.jl/dev/model_search/#model_search)\n", + "Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search)\n", "section of the manual. We will build a gradient tree-boosting model,\n", "a popular first choice for structured data like we have here. Model\n", "code is contained in a third-party package called\n", @@ -1379,7 +1379,7 @@ "source": [ "Note that the component models appear as hyper-parameters of\n", "`pipe`. Pipelines are an implementation of a more general [model\n", - "composition](https://alan-turing-institute.github.io/MLJ.jl/dev/composing_models/#Composing-Models)\n", + "composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models)\n", "interface provided by MLJ that advanced users may want to learn about." ] }, @@ -2152,7 +2152,7 @@ "metadata": {}, "source": [ "We choose a `StratifiedCV` resampling strategy; the complete list of options is\n", - "[here](https://alan-turing-institute.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies)." 
+ "[here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies)." ] }, { @@ -2393,7 +2393,7 @@ "metadata": {}, "source": [ "First, we select appropriate controls from [this\n", - "list](https://alan-turing-institute.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided):" + "list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided):" ] }, { @@ -2559,7 +2559,7 @@ "wanting to visualize the effect of changes to a *single*\n", "hyper-parameter (which could be an iteration parameter). See, for\n", "example, [this section of the\n", - "manual](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_curves/)\n", + "manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/)\n", "or [this\n", "tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb)." ] @@ -2689,7 +2689,7 @@ "metadata": {}, "source": [ "Next, we choose an optimization strategy from [this\n", - "list](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models):" + "list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models):" ] }, { diff --git a/examples/telco/notebook.jl b/examples/telco/notebook.jl index 9ae13e3fc..d42a46923 100644 --- a/examples/telco/notebook.jl +++ b/examples/telco/notebook.jl @@ -1,7 +1,7 @@ # # MLJ for Data Scientists in Two Hours # An application of the [MLJ -# toolbox](https://alan-turing-institute.github.io/MLJ.jl/dev/) to the +# toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the # Telco Customer Churn dataset, aimed at practicing data scientists # new to MLJ (Machine Learning in Julia). This tutorial does not # cover exploratory data analysis. @@ -10,9 +10,9 @@ # deep-learning). 
# For other MLJ learning resources see the [Learning -# MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_mlj/) +# MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/) # section of the -# [manual](https://alan-turing-institute.github.io/MLJ.jl/dev/). +# [manual](https://juliaai.github.io/MLJ.jl/dev/). # **Topics covered**: Grabbing and preparing a dataset, basic # fit/predict workflow, constructing a pipeline to include data @@ -78,7 +78,7 @@ # the notebook, package instantiation and pre-compilation may take a # minute or so to complete. **This step will fail** if the [correct # Manifest.toml and Project.toml -# files](https://github.com/alan-turing-institute/MLJ.jl/tree/dev/examples/telco) +# files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco) # are not in the same directory as this notebook. using Pkg @@ -94,7 +94,7 @@ Pkg.instantiate() # don't fully grasp should become clearer in the Telco study. # This section is a condensed adaption of the [Getting Started -# example](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict) +# example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict) # in the MLJ documentation. # First, using the built-in iris dataset, we load and inspect the features @@ -137,7 +137,7 @@ fit!(mach, rows=train_rows) fitted_params(mach) # A machine stores some other information enabling [warm -# restart](https://alan-turing-institute.github.io/MLJ.jl/dev/machines/#Warm-restarts) +# restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts) # for some models, but we won't go into that here. 
You are allowed to # access and mutate the `model` parameter: @@ -292,7 +292,7 @@ const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID)); # > Introduces: `@load`, `input_scitype`, `target_scitype` # For tools helping us to identify suitable models, see the [Model -# Search](https://alan-turing-institute.github.io/MLJ.jl/dev/model_search/#model_search) +# Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search) # section of the manual. We will build a gradient tree-boosting model, # a popular first choice for structured data like we have here. Model # code is contained in a third-party package called @@ -340,7 +340,7 @@ pipe = ContinuousEncoder() |> booster # Note that the component models appear as hyperparameters of # `pipe`. Pipelines are an implementation of a more general [model -# composition](https://alan-turing-institute.github.io/MLJ.jl/dev/composing_models/#Composing-Models) +# composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models) # interface provided by MLJ that advanced users may want to learn about. # From the above display, we see that component model hyperparameters @@ -464,7 +464,7 @@ plot!([0, 1], [0, 1], linewidth=2, linestyle=:dash, color=:black) # `acceleration=CPUThreads()` to parallelize the computation. # We choose a `StratifiedCV` resampling strategy; the complete list of options is -# [here](https://alan-turing-institute.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies). +# [here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies). e_pipe = evaluate(pipe, X, y, resampling=StratifiedCV(nfolds=6, rng=123), @@ -535,7 +535,7 @@ pipe2 = ContinuousEncoder() |> # [MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl). 
# First, we select appropriate controls from [this -# list](https://alan-turing-institute.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided): +# list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided): controls = [ Step(1), # to increment iteration parameter (`pipe.nrounds`) @@ -580,7 +580,7 @@ fit!(mach_iterated_pipe); # wanting to visualize the effect of changes to a *single* # hyperparameter (which could be an iteration parameter). See, for # example, [this section of the -# manual](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_curves/) +# manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/) # or [this # tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb). @@ -618,7 +618,7 @@ r2 = range(iterated_pipe, p2, lower=2, upper=6) # and `upper`. # Next, we choose an optimization strategy from [this -# list](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models): +# list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models): tuning = RandomSearch(rng=123) @@ -755,4 +755,3 @@ ŷ_basic = predict(mach_basic, Xtest); auc(ŷ_basic, ytest), accuracy(mode.(ŷ_basic), ytest) ) - diff --git a/examples/telco/notebook.pluto.jl b/examples/telco/notebook.pluto.jl index 06e0142e4..345837476 100644 --- a/examples/telco/notebook.pluto.jl +++ b/examples/telco/notebook.pluto.jl @@ -10,7 +10,7 @@ md"# MLJ for Data Scientists in Two Hours" # ╔═╡ 8a6670b8-96a8-4a5d-b795-033f6f2a0674 md""" An application of the [MLJ -toolbox](https://alan-turing-institute.github.io/MLJ.jl/dev/) to the +toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the Telco Customer Churn dataset, aimed at practicing data scientists new to MLJ (Machine Learning in Julia). This tutorial does not cover exploratory data analysis. @@ -25,9 +25,9 @@ deep-learning). 
# ╔═╡ b04c4790-59e0-42a3-af2a-25235e544a31 md""" For other MLJ learning resources see the [Learning -MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_mlj/) +MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/) section of the -[manual](https://alan-turing-institute.github.io/MLJ.jl/dev/). +[manual](https://juliaai.github.io/MLJ.jl/dev/). """ # ╔═╡ 4eb8dff4-c23a-4b41-8af5-148d95ea2900 @@ -106,7 +106,7 @@ used to develop this tutorial. If this is your first time running the notebook, package instantiation and pre-compilation may take a minute or so to complete. **This step will fail** if the [correct Manifest.toml and Project.toml -files](https://github.com/alan-turing-institute/MLJ.jl/tree/dev/examples/telco) +files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco) are not in the same directory as this notebook. """ @@ -131,7 +131,7 @@ don't fully grasp should become clearer in the Telco study. # ╔═╡ 33ca287e-8cba-47d1-a0de-1721c1bc2df2 md""" This section is a condensed adaption of the [Getting Started -example](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict) +example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict) in the MLJ documentation. """ @@ -197,7 +197,7 @@ end # ╔═╡ 0f978839-cc95-4c3a-8a29-32f11452654a md""" A machine stores some other information enabling [warm -restart](https://alan-turing-institute.github.io/MLJ.jl/dev/machines/#Warm-restarts) +restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts) for some models, but we won't go into that here. 
You are allowed to access and mutate the `model` parameter: """ @@ -324,7 +324,7 @@ begin return x end end - + df0.TotalCharges = fix_blanks(df0.TotalCharges); end @@ -424,7 +424,7 @@ md"> Introduces: `@load`, `input_scitype`, `target_scitype`" # ╔═╡ f97969e2-c15c-42cf-a6fa-eaf14df5d44b md""" For tools helping us to identify suitable models, see the [Model -Search](https://alan-turing-institute.github.io/MLJ.jl/dev/model_search/#model_search) +Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search) section of the manual. We will build a gradient tree-boosting model, a popular first choice for structured data like we have here. Model code is contained in a third-party package called @@ -497,7 +497,7 @@ pipe = ContinuousEncoder() |> booster md""" Note that the component models appear as hyperparameters of `pipe`. Pipelines are an implementation of a more general [model -composition](https://alan-turing-institute.github.io/MLJ.jl/dev/composing_models/#Composing-Models) +composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models) interface provided by MLJ that advanced users may want to learn about. """ @@ -693,7 +693,7 @@ observation space, for a total of 18 folds) and set # ╔═╡ 562887bb-b7fb-430f-b61c-748aec38e674 md""" We choose a `StratifiedCV` resampling strategy; the complete list of options is -[here](https://alan-turing-institute.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies). +[here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies). 
""" # ╔═╡ f9be989e-2604-44c2-9727-ed822e4fd85d @@ -734,7 +734,7 @@ begin table = (measure=measure, measurement=measurement) return DataFrames.DataFrame(table) end - + const confidence_intervals_basic_model = confidence_intervals(e_pipe) end @@ -753,7 +753,7 @@ with low feature importance, to speed up later optimization: # ╔═╡ cdfe840d-4e87-467f-b582-dfcbeb05bcc5 begin unimportant_features = filter(:importance => <(0.005), feature_importance_table).feature - + pipe2 = ContinuousEncoder() |> FeatureSelector(features=unimportant_features, ignore=true) |> booster end @@ -790,7 +790,7 @@ eg, the neural network models provided by # ╔═╡ 8fc99d35-d8cc-455f-806e-1bc580dc349d md""" First, we select appropriate controls from [this -list](https://alan-turing-institute.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided): +list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided): """ # ╔═╡ 29f33708-4a82-4acc-9703-288eae064e2a @@ -857,7 +857,7 @@ here is the `learning_curve` function, which can be useful when wanting to visualize the effect of changes to a *single* hyperparameter (which could be an iteration parameter). See, for example, [this section of the -manual](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_curves/) +manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/) or [this tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb). """ @@ -898,7 +898,7 @@ show(iterated_pipe, 2) begin p1 = :(model.evo_tree_classifier.η) p2 = :(model.evo_tree_classifier.max_depth) - + r1 = range(iterated_pipe, p1, lower=-2, upper=-0.5, scale=x->10^x) r2 = range(iterated_pipe, p2, lower=2, upper=6) end @@ -912,7 +912,7 @@ and `upper`. 
# ╔═╡ af3023e6-920f-478d-af76-60dddeecbe6c md""" Next, we choose an optimization strategy from [this -list](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models): +list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models): """ # ╔═╡ 93c17a9b-b49c-4780-9074-c069a0e97d7e @@ -1105,9 +1105,9 @@ md"For comparison, here's the performance for the basic pipeline model" begin mach_basic = machine(pipe, X, y) fit!(mach_basic, verbosity=0) - + ŷ_basic = predict(mach_basic, Xtest); - + @info("Basic model measurements on test set:", brier_loss(ŷ_basic, ytest) |> mean, auc(ŷ_basic, ytest), diff --git a/examples/telco/notebook.unexecuted.ipynb b/examples/telco/notebook.unexecuted.ipynb index 2b7c88755..71d109350 100644 --- a/examples/telco/notebook.unexecuted.ipynb +++ b/examples/telco/notebook.unexecuted.ipynb @@ -2,53 +2,54 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# MLJ for Data Scientists in Two Hours" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "An application of the [MLJ\n", - "toolbox](https://alan-turing-institute.github.io/MLJ.jl/dev/) to the\n", + "toolbox](https://juliaai.github.io/MLJ.jl/dev/) to the\n", "Telco Customer Churn dataset, aimed at practicing data scientists\n", "new to MLJ (Machine Learning in Julia). This tutorial does not\n", "cover exploratory data analysis." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "MLJ is a *multi-paradigm* machine learning toolbox (i.e., not just\n", "deep-learning)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For other MLJ learning resources see the [Learning\n", - "MLJ](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_mlj/)\n", + "MLJ](https://juliaai.github.io/MLJ.jl/dev/learning_mlj/)\n", "section of the\n", - "[manual](https://alan-turing-institute.github.io/MLJ.jl/dev/)." 
- ], - "metadata": {} + "[manual](https://juliaai.github.io/MLJ.jl/dev/)." + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Topics covered**: Grabbing and preparing a dataset, basic\n", "fit/predict workflow, constructing a pipeline to include data\n", "pre-processing, estimating performance metrics, ROC curves, confusion\n", "matrices, feature importance, basic feature selection, controlling iterative\n", "models, hyper-parameter optimization (tuning)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Prerequisites for this tutorial.** Previous experience building,\n", "evaluating, and optimizing machine learning models using\n", @@ -59,25 +60,25 @@ "minimal way ([this\n", "cheatsheet](https://ahsmart.com/pub/data-wrangling-with-data-frames-jl-cheat-sheet/index.html)\n", "may help)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Time.** Between two and three hours, first time through." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Summary of methods and types introduced" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "|code | purpose|\n", "|:-------|:-------------------------------------------------------|\n", @@ -113,316 +114,316 @@ "| `range(model, :some_hyperparam, lower=..., upper=...)` | define a numeric range|\n", "| `RandomSearch()` | random search tuning strategy|\n", "| `TunedModel(model=..., tuning=..., options...)` | wrap the supervised `model` in specified `tuning` strategy|" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Instantiate a Julia environment" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The following code replicates precisely the set of Julia packages\n", "used to develop this tutorial. 
If this is your first time running\n", "the notebook, package instantiation and pre-compilation may take a\n", "minute or so to complete. **This step will fail** if the [correct\n", "Manifest.toml and Project.toml\n", - "files](https://github.com/alan-turing-institute/MLJ.jl/tree/dev/examples/telco)\n", + "files](https://github.com/JuliaAI/MLJ.jl/tree/dev/examples/telco)\n", "are not in the same directory as this notebook." - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "using Pkg\n", "Pkg.activate(@__DIR__) # get env from TOML files in same directory as this notebook\n", "Pkg.instantiate()" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Warm up: Building a model for the iris dataset" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Before turning to the Telco Customer Churn dataset, we very quickly\n", "build a predictive model for Fisher's well-known iris data set, as a way of\n", "introducing the main actors in any MLJ workflow. Details that you\n", "don't fully grasp should become clearer in the Telco study." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "This section is a condensed adaption of the [Getting Started\n", - "example](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict)\n", + "example](https://juliaai.github.io/MLJ.jl/dev/getting_started/#Fit-and-predict)\n", "in the MLJ documentation."
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "First, using the built-in iris dataset, we load and inspect the features\n", "`X_iris` (a table) and target variable `y_iris` (a vector):" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "using MLJ" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "const X_iris, y_iris = @load_iris;\n", "schema(X_iris)" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "y_iris[1:4]" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "levels(y_iris)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We load a decision tree model, from the package DecisionTree.jl:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "DecisionTree = @load DecisionTreeClassifier pkg=DecisionTree # model type\n", "model = DecisionTree(min_samples_split=5) # model instance" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In MLJ, a *model* is just a container for hyper-parameters of\n", "some learning algorithm. It does not store learned parameters." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Next, we bind the model together with the available data in what's\n", "called a *machine*:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach = machine(model, X_iris, y_iris)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "A machine is essentially just a model (ie, hyper-parameters) plus data, but\n", "it additionally stores *learned parameters* (the tree) once it is\n", "trained on some view of the data:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "train_rows = vcat(1:60, 91:150); # some row indices (observations are rows not columns)\n", "fit!(mach, rows=train_rows)\n", "fitted_params(mach)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "A machine stores some other information enabling [warm\n", - "restart](https://alan-turing-institute.github.io/MLJ.jl/dev/machines/#Warm-restarts)\n", + "restart](https://juliaai.github.io/MLJ.jl/dev/machines/#Warm-restarts)\n", "for some models, but we won't go into that here. 
You are allowed to\n", "access and mutate the `model` parameter:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach.model.min_samples_split = 10\n", "fit!(mach, rows=train_rows) # re-train with new hyper-parameter" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Now we can make predictions on some other view of the data, as in" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "predict(mach, rows=71:73)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "or on completely new data, as in" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "Xnew = (sepal_length = [5.1, 6.3],\n", " sepal_width = [3.0, 2.5],\n", " petal_length = [1.4, 4.9],\n", " petal_width = [0.3, 1.5])\n", "yhat = predict(mach, Xnew)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "These are probabilistic predictions which can be manipulated using a\n", "widely adopted interface defined in the Distributions.jl\n", "package. For example, we can get raw probabilities like this:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "pdf.(yhat, \"virginica\")" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We now turn to the Telco dataset." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Getting the Telco data" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import DataFrames" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "data = OpenML.load(42178) # data set from OpenML.org\n", "df0 = DataFrames.DataFrame(data)\n", "first(df0, 4)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The object of this tutorial is to build and evaluate supervised\n", "learning models to predict the `:Churn` variable, a binary variable\n", "measuring customer retention, based on other variables that are\n", "relevant." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In the table, observations correspond to rows, and features to\n", "columns, which is the convention for representing all\n", "two-dimensional data in MLJ." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Type coercion" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `scitype`, `schema`, `coerce`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "A [\"scientific\n", "type\"](https://juliaai.github.io/ScientificTypes.jl/dev/) or\n", @@ -430,59 +431,59 @@ "`typeof(3.14) == Float64`, while `scitype(3.14) == Continuous` and\n", "also `scitype(3.14f0) == Continuous`. In MLJ, model data\n", "requirements are articulated using scitypes." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Here are common \"scalar\" scitypes:" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "![](assets/scitypes.png)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "There are also container scitypes. For example, the scitype of any\n", "`N`-dimensional array is `AbstractArray{S, N}`, where `S` is the scitype of the\n", "elements:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "scitype([\"cat\", \"mouse\", \"dog\"])" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The `schema` operator summarizes the column scitypes of a table:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "schema(df0) |> DataFrames.DataFrame # converted to DataFrame for better display" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "All of the fields being interpreted as `Textual` are really\n", "something else, either `Multiclass` or, in the case of\n", @@ -490,12 +491,13 @@ "mostly floats wrapped as strings. However, it needs special\n", "treatment because some elements consist of a single space, \" \",\n", "which we'll treat as \"0.0\"." 
- ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "fix_blanks(v) = map(v) do x\n", " if x == \" \"\n", @@ -506,550 +508,550 @@ "end\n", "\n", "df0.TotalCharges = fix_blanks(df0.TotalCharges);" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Coercing the `:TotalCharges` type to ensure a `Continuous` scitype:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "coerce!(df0, :TotalCharges => Continuous);" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Coercing all remaining `Textual` data to `Multiclass`:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "coerce!(df0, Textual => Multiclass);" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Finally, we'll coerce our target variable `:Churn` to be\n", "`OrderedFactor`, rather than `Multiclass`, to enable a reliable\n", "interpretation of metrics like \"true positive rate\". 
By convention,\n", "the first class is the negative one:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "coerce!(df0, :Churn => OrderedFactor)\n", "levels(df0.Churn) # to check order" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Re-inspecting the scitypes:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "schema(df0) |> DataFrames.DataFrame" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Preparing a holdout set for final testing" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `partition`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "To reduce training times for the purposes of this tutorial, we're\n", "going to dump 90% of observations (after shuffling) and split off\n", "30% of the remainder for use as a lock-and-throw-away-the-key\n", "holdout set:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df, df_test, df_dumped = partition(df0, 0.07, 0.03, # in ratios 7:3:90\n", " stratify=df0.Churn,\n", " rng=123);" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The reader interested in including all data can instead do\n", "`df, df_test = partition(df0, 0.7, stratify=df0.Churn, rng=123)`." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Splitting data into target and features" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `unpack`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In the following call, the column with name `:Churn` is copied over\n", "to a vector `y`, and every remaining column, except `:customerID`\n", "(which contains no useful information) goes into a table `X`. Here\n", "`:Churn` is the target variable for which we seek predictions, given\n", "new versions of the features `X`." - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "const y, X = unpack(df, ==(:Churn), !=(:customerID));\n", "schema(X).names" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "intersect([:Churn, :customerID], schema(X).names)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We'll do the same for the holdout data:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID));" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Loading a model and checking type requirements" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `@load`, `input_scitype`, `target_scitype`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For tools helping us to identify suitable models, see the [Model\n", - 
"Search](https://alan-turing-institute.github.io/MLJ.jl/dev/model_search/#model_search)\n", + "Search](https://juliaai.github.io/MLJ.jl/dev/model_search/#model_search)\n", "section of the manual. We will build a gradient tree-boosting model,\n", "a popular first choice for structured data like we have here. Model\n", "code is contained in a third-party package called\n", "[EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) which is\n", "loaded as follows:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "Booster = @load EvoTreeClassifier pkg=EvoTrees" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Recall that a *model* is just a container for some algorithm's\n", "hyper-parameters. Let's create a `Booster` with default values for\n", "the hyper-parameters:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "booster = Booster()" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "This model is appropriate for the kind of target variable we have because of\n", "the following passing test:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "scitype(y) <: target_scitype(booster)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "However, our features `X` cannot be directly used with `booster`:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "scitype(X) <: input_scitype(booster)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "As it turns out, 
this is because `booster`, like the majority of MLJ\n", "supervised models, expects the features to be `Continuous`. (With\n", "some experience, this can be gleaned from `input_scitype(booster)`.)\n", "So we need categorical feature encoding, discussed next." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Building a model pipeline to incorporate feature encoding" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `ContinuousEncoder`, pipeline operator `|>`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The built-in `ContinuousEncoder` model transforms an arbitrary table\n", "to a table whose features are all `Continuous` (dropping any fields\n", "it does not know how to encode). In particular, all `Multiclass`\n", "features are one-hot encoded." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "A *pipeline* is a stand-alone model that internally combines one or\n", "more models in a linear (non-branching) pipeline. Here's a pipeline\n", "that adds the `ContinuousEncoder` as a pre-processor to the\n", "gradient tree-boosting model above:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "pipe = ContinuousEncoder() |> booster" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Note that the component models appear as hyper-parameters of\n", "`pipe`. Pipelines are an implementation of a more general [model\n", - "composition](https://alan-turing-institute.github.io/MLJ.jl/dev/composing_models/#Composing-Models)\n", + "composition](https://juliaai.github.io/MLJ.jl/dev/composing_models/#Composing-Models)\n", "interface provided by MLJ that advanced users may want to learn about." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "From the above display, we see that component model hyper-parameters\n", "are now *nested*, but they are still accessible (important in hyper-parameter\n", "optimization):" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "pipe.evo_tree_classifier.max_depth" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Evaluating the pipeline model's performance" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `measures` (function), **measures:** `brier_loss`, `auc`, `accuracy`;\n", "> `machine`, `fit!`, `predict`, `fitted_params`, `report`, `roc`, **resampling strategy** `StratifiedCV`, `evaluate`, `FeatureSelector`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Without touching our test set `Xtest`, `ytest`, we will estimate the\n", "performance of our pipeline model, with default hyper-parameters, in\n", "two different ways:" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Evaluating by hand.** First, we'll do this \"by hand\" using the `fit!` and `predict`\n", "workflow illustrated for the iris data set above, using a\n", "holdout resampling strategy. At the same time we'll see how to\n", "generate a **confusion matrix**, **ROC curve**, and inspect\n", "**feature importances**." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Automated performance evaluation.** Next we'll apply the more\n", "typical and convenient `evaluate` workflow, but using `StratifiedCV`\n", "(stratified cross-validation) which is more informative." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "In any case, we need to choose some measures (metrics) to quantify\n", "the performance of our model. For a complete list of measures, one\n", "does `measures()`. Or we also can do:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "measures(\"Brier\")" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We will be primarily using `brier_loss`, but also `auc` (area under\n", "the ROC curve) and `accuracy`." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Evaluating by hand (with a holdout set)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Our pipeline model can be trained just like the decision tree model\n", "we built for the iris data set. Binding all non-test data to the\n", "pipeline model:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach_pipe = machine(pipe, X, y)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We already encountered the `partition` method above. Here we apply\n", "it to row indices, instead of data containers, as `fit!` and\n", "`predict` only need a *view* of the data to work." 
- ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "train, validation = partition(1:length(y), 0.7)\n", "fit!(mach_pipe, rows=train)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We note in passing that we can access two kinds of information from a trained machine:" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- The **learned parameters** (eg, coefficients of a linear model): We use `fitted_params(mach_pipe)`\n", "- Other **by-products of training** (eg, feature importances): We use `report(mach_pipe)`" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "fp = fitted_params(mach_pipe);\n", "keys(fp)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For example, we can check that the encoder did not actually drop any features:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "Set(fp.continuous_encoder.features_to_keep) == Set(schema(X).names)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "And, from the report, extract feature importances:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "rpt = report(mach_pipe)\n", "keys(rpt.evo_tree_classifier)" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "fi = rpt.evo_tree_classifier.feature_importances\n", "feature_importance_table =\n", " (feature=Symbol.(first.(fi)), importance=last.(fi)) |> 
DataFrames.DataFrame" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For models not reporting feature importances, we recommend the\n", "[Shapley.jl](https://expandingman.gitlab.io/Shapley.jl/) package." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Returning to predictions and evaluations of our measures:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ŷ = predict(mach_pipe, rows=validation);\n", "@info(\"Measurements\",\n", @@ -1057,76 +1059,75 @@ " auc(ŷ, y[validation]),\n", " accuracy(mode.(ŷ), y[validation])\n", " )" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Note that we need `mode` in the last case because `accuracy` expects\n", "point predictions, not probabilistic ones. (One can alternatively\n", "use `predict_mode` to generate the predictions.)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "While we're here, let's also generate a **confusion matrix** and\n", "[receiver-operator\n", "characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)\n", "(ROC):" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "confmat(mode.(ŷ), y[validation])" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Note: Importing the plotting package and calling the plotting\n", "functions for the first time can take a minute or so."
- ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "using Plots" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "roc_curve = roc(ŷ, y[validation])\n", "plt = scatter(roc_curve, legend=false)\n", "plot!(plt, xlab=\"false positive rate\", ylab=\"true positive rate\")\n", "plot!([0, 1], [0, 1], linewidth=2, linestyle=:dash, color=:black)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Automated performance evaluation (more typical workflow)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We can also get performance estimates with a single call to the\n", "`evaluate` function, which also allows for more complicated\n", @@ -1135,41 +1136,41 @@ "cross-validation \"Monte Carlo\" (3 random size-6 partitions of the\n", "observation space, for a total of 18 folds) and set\n", "`acceleration=CPUThreads()` to parallelize the computation." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We choose a `StratifiedCV` resampling strategy; the complete list of options is\n", - "[here](https://alan-turing-institute.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies)." - ], - "metadata": {} + "[here](https://juliaai.github.io/MLJ.jl/dev/evaluating_model_performance/#Built-in-resampling-strategies)." 
+ ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "e_pipe = evaluate(pipe, X, y,\n", " resampling=StratifiedCV(nfolds=6, rng=123),\n", " measures=[brier_loss, auc, accuracy],\n", " repeats=3,\n", " acceleration=CPUThreads())" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "(There is also a version of `evaluate` for machines. Query the\n", "`evaluate` and `evaluate!` doc-strings to learn more about these\n", "functions and what the `PerformanceEvaluation` object `e_pipe` records.)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "While [less than ideal](https://arxiv.org/abs/2104.00673), let's\n", "adopt the common practice of using the standard error of a\n", @@ -1177,21 +1178,22 @@ "performance measure's expected value. Here's a utility function to\n", "calculate 95% confidence intervals for our performance estimates based\n", "on this practice, and its application to the current evaluation:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "using Measurements" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "function confidence_intervals(e)\n", " factor = 2.0 # to get level of 95%\n", @@ -1204,60 +1206,59 @@ "end\n", "\n", "const confidence_intervals_basic_model = confidence_intervals(e_pipe)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Filtering out unimportant features" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `FeatureSelector`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Before continuing, we'll modify
our pipeline to drop those features\n", "with low feature importance, to speed up later optimization:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "unimportant_features = filter(:importance => <(0.005), feature_importance_table).feature\n", "\n", "pipe2 = ContinuousEncoder() |>\n", " FeatureSelector(features=unimportant_features, ignore=true) |> booster" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Wrapping our iterative model in control strategies" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: **control strategies:** `Step`, `NumberSinceBest`, `TimeLimit`, `InvalidValue`, **model wrapper** `IteratedModel`, **resampling strategy:** `Holdout`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We want to optimize the hyper-parameters of our model. Since our\n", "model is iterative, these parameters include the (nested) iteration\n", @@ -1271,29 +1272,30 @@ "some data hygiene issues, and, when we subsequently optimize other\n", "parameters, we will always being using an optimal number of\n", "iterations." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Note that this approach can be applied to any iterative MLJ model,\n", "eg, the neural network models provided by\n", "[MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl)." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "First, we select appropriate controls from [this\n", - "list](https://alan-turing-institute.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided):" - ], - "metadata": {} + "list](https://juliaai.github.io/MLJ.jl/dev/controlling_iterative_models/#Controls-provided):" + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "controls = [\n", " Step(1), # to increment iteration parameter (`pipe.nrounds`)\n", @@ -1301,114 +1303,113 @@ " TimeLimit(2/3600), # never train more than 2 sec\n", " InvalidValue() # stop if NaN or ±Inf encountered\n", "]" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Now we wrap our pipeline model using the `IteratedModel` wrapper,\n", "being sure to specify the `measure` on which internal estimates of\n", "the out-of-sample performance will be based:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "iterated_pipe = IteratedModel(model=pipe2,\n", " controls=controls,\n", " measure=brier_loss,\n", " resampling=Holdout(fraction_train=0.7))" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We've set `resampling=Holdout(fraction_train=0.7)` to arrange that\n", "data attached to our model should be internally split into a train\n", "set (70%) and a holdout set (30%) for determining the out-of-sample\n", "estimate of the Brier loss." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For demonstration purposes, let's bind `iterated_model` to all data\n", "not in our don't-touch holdout set, and train on all of that data:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach_iterated_pipe = machine(iterated_pipe, X, y)\n", "fit!(mach_iterated_pipe);" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "To recap, internally this training is split into two separate steps:" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- A controlled iteration step, training on the holdout set, with the total number of iterations determined by the specified stopping criteria (based on the out-of-sample performance estimates)\n", "- A final step that trains the atomic model on *all* available\n", " data using the number of iterations determined in the first step. Calling `predict` on `mach_iterated_pipe` means using the learned parameters of the second step." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Hyper-parameter optimization (model tuning)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `range`, **model wrapper** `TunedModel`, `RandomSearch`" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We now turn to hyper-parameter optimization. A tool not discussed\n", "here is the `learning_curve` function, which can be useful when\n", "wanting to visualize the effect of changes to a *single*\n", "hyper-parameter (which could be an iteration parameter). 
See, for\n", "example, [this section of the\n", - "manual](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_curves/)\n", + "manual](https://juliaai.github.io/MLJ.jl/dev/learning_curves/)\n", "or [this\n", "tutorial](https://github.com/ablaom/MLJTutorial.jl/blob/dev/notebooks/04_tuning/notebook.ipynb)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Fine tuning the hyper-parameters of a gradient booster can be\n", "somewhat involved. Here we settle for simultaneously optimizing two\n", "key parameters: `max_depth` and `η` (learning_rate)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Like iteration control, **model optimization in MLJ is implemented as\n", "a model wrapper**, called `TunedModel`. After wrapping a model in a\n", @@ -1421,87 +1422,88 @@ "model. That is, wrapping the model simply transforms certain\n", "hyper-parameters into learned parameters (just as `IteratedModel`\n", "does for an iteration parameter)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "To start with, we define ranges for the parameters of\n", "interest. 
Since these parameters are nested, let's force a\n", "display of our model to a larger depth:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "show(iterated_pipe, 2)" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "p1 = :(model.evo_tree_classifier.η)\n", "p2 = :(model.evo_tree_classifier.max_depth)\n", "\n", "r1 = range(iterated_pipe, p1, lower=-2, upper=-0.5, scale=x->10^x)\n", "r2 = range(iterated_pipe, p2, lower=2, upper=6)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Nominal ranges are defined by specifying `values` instead of `lower`\n", "and `upper`." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Next, we choose an optimization strategy from [this\n", - "list](https://alan-turing-institute.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models):" - ], - "metadata": {} + "list](https://juliaai.github.io/MLJ.jl/dev/tuning_models/#Tuning-Models):" + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "tuning = RandomSearch(rng=123)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Then we wrap the model, specifying a `resampling` strategy and a\n", "`measure`, as we did for `IteratedModel`. In fact, we can include a\n", "battery of `measures`; by default, optimization is with respect to\n", "performance estimates based on the first measure, but estimates for\n", "all measures can be accessed from the model's `report`." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The keyword `n` specifies the total number of models (sets of\n", "hyper-parameters) to evaluate." 
- ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "tuned_iterated_pipe = TunedModel(model=iterated_pipe,\n", " range=[r1, r2],\n", @@ -1510,175 +1512,174 @@ " resampling=StratifiedCV(nfolds=6, rng=123),\n", " acceleration=CPUThreads(),\n", " n=40)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "To save time, we skip the `repeats` here." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Binding our final model to data and training:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach_tuned_iterated_pipe = machine(tuned_iterated_pipe, X, y)\n", "fit!(mach_tuned_iterated_pipe)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "As explained above, the training we have just performed was split\n", "internally into two separate steps:" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- A step to determine the parameter values that optimize the aggregated cross-validation scores\n", "- A final step that trains the optimal model on *all* available data. Future predictions `predict(mach_tuned_iterated_pipe, ...)` are based on this final training step." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "From `report(mach_tuned_iterated_pipe)` we can extract details about\n", "the optimization procedure. 
For example:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "rpt2 = report(mach_tuned_iterated_pipe);\n", "best_booster = rpt2.best_model.model.evo_tree_classifier" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "@info \"Optimal hyper-parameters:\" best_booster.max_depth best_booster.η;" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Using the `confidence_intervals` function we defined earlier:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "e_best = rpt2.best_history_entry\n", "confidence_intervals(e_best)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Digging a little deeper, we can learn what stopping criterion was\n", "applied in the case of the optimal model, and how many iterations\n", "were required:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "rpt2.best_report.controls |> collect" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Finally, we can visualize the optimization results:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "plot(mach_tuned_iterated_pipe, size=(600,450))" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Saving our model" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> Introduces: `MLJ.save`" - ], - "metadata": {} + ] }, { 
"cell_type": "markdown", + "metadata": {}, "source": [ "Here's how to serialize our final, trained self-iterating,\n", "self-tuning pipeline machine:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "MLJ.save(\"tuned_iterated_pipe.jlso\", mach_tuned_iterated_pipe)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We'll deserialize this in \"Testing the final model\" below." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Final performance estimate" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Finally, to get an even more accurate estimate of performance, we\n", "can evaluate our model using stratified cross-validation and all the\n", @@ -1688,64 +1689,64 @@ "this computation takes quite a bit longer than the previous one\n", "(which is being repeated six times, using 5/6th of the data each\n", "time):" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "e_tuned_iterated_pipe = evaluate(tuned_iterated_pipe, X, y,\n", " resampling=StratifiedCV(nfolds=6, rng=123),\n", " measures=[brier_loss, auc, accuracy])" - ], - "metadata": {}, - "execution_count": null + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "confidence_intervals(e_tuned_iterated_pipe)" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For comparison, here are the confidence intervals for the basic\n", "pipeline model (no feature selection and default hyperparameters):" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ 
"confidence_intervals_basic_model" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "As each pair of intervals overlap, it's doubtful the small changes\n", "here can be assigned statistical significance. Default `booster`\n", "hyper-parameters do a pretty good job." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Testing the final model" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We now determine the performance of our model on our\n", "lock-and-throw-away-the-key holdout set. To demonstrate\n", @@ -1753,65 +1754,66 @@ "have called `import`/`using` on the same packages). Then the\n", "following should suffice to recover our model trained under\n", "\"Hyper-parameter optimization\" above:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach_restored = machine(\"tuned_iterated_pipe.jlso\")" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We compute predictions on the holdout set:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "ŷ_tuned = predict(mach_restored, Xtest);\n", "ŷ_tuned[1]" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "And can compute the final performance measures:" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "@info(\"Tuned model measurements on test:\",\n", " brier_loss(ŷ_tuned, ytest) |> mean,\n", " auc(ŷ_tuned, ytest),\n", " accuracy(mode.(ŷ_tuned), ytest)\n", " )" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For 
comparison, here's the performance for the basic pipeline model" - ], - "metadata": {} + ] }, { - "outputs": [], "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mach_basic = machine(pipe, X, y)\n", "fit!(mach_basic, verbosity=0)\n", @@ -1823,33 +1825,31 @@ " auc(ŷ_basic, ytest),\n", " accuracy(mode.(ŷ_basic), ytest)\n", " )" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "---\n", "\n", "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" - ], - "metadata": {} + ] } ], - "nbformat_minor": 3, "metadata": { + "kernelspec": { + "display_name": "Julia 1.6.5", + "language": "julia", + "name": "julia-1.6" + }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.6.5" - }, - "kernelspec": { - "name": "julia-1.6", - "display_name": "Julia 1.6.5", - "language": "julia" } }, - "nbformat": 4 + "nbformat": 4, + "nbformat_minor": 3 } diff --git a/paper/paper.bib b/paper/paper.bib index 4a80f9ad5..7faab78a2 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -156,7 +156,7 @@ @misc{MLJ year = {2019}, publisher = {GitHub}, journal = {GitHub repository}, - url = {https://github.com/alan-turing-institute/MLJ.jl} + url = {https://github.com/JuliaAI/MLJ.jl} } @misc{MLJdocs, @@ -165,7 +165,7 @@ @misc{MLJdocs year = {2020}, publisher = {GitHub}, journal = {GitHub pages}, - url = {https://alan-turing-institute.github.io/MLJ.jl/dev/} + url = {https://juliaai.github.io/MLJ.jl/dev/} } @misc{MLJTuning, @@ -174,7 +174,7 @@ @misc{MLJTuning year = {2020}, publisher = {GitHub}, journal = {GitHub repository}, - url = {https://github.com/alan-turing-institute/MLJTuning.jl} + url = {https://github.com/JuliaAI/MLJTuning.jl} } @misc{MLJtutorials, @@ -183,7 +183,7 @@ @misc{MLJtutorials year = {2020}, publisher = {GitHub}, journal = {GitHub pages}, - url = 
{https://alan-turing-institute.github.io/DataScienceTutorials.jl/} + url = {https://juliaai.github.io/DataScienceTutorials.jl/} } @article{Rackauckas2017, @@ -214,7 +214,7 @@ @misc{ScientificTypes year = {2019}, publisher = {GitHub}, journal = {GitHub repository}, - url = {https://github.com/alan-turing-institute/ScientificTypes.jl} + url = {https://github.com/JuliaAI/ScientificTypes.jl} } @misc{Quinn, diff --git a/src/MLJ.jl b/src/MLJ.jl index f7afe2d09..51b4c4230 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -1,7 +1,7 @@ """ MLJ -[`MLJ`](https://alan-turing-institute.github.io/MLJ.jl/dev/) is a Machine Learning toolbox +[`MLJ`](https://juliaai.github.io/MLJ.jl/dev/) is a Machine Learning toolbox for Julia. It collects together functionality from the following packages, which can be loaded separately: diff --git a/test/integration.jl b/test/integration.jl index 7746dfd6d..b3f27d75f 100644 --- a/test/integration.jl +++ b/test/integration.jl @@ -9,7 +9,7 @@ const OTHER_TEST_LEVEL = 3 # # RECORD OF OUTSTANDING ISSUES FILTER_GIVEN_ISSUE = Dict( - "https://github.com/alan-turing-institute/MLJ.jl/issues/1085" => + "https://github.com/JuliaAI/MLJ.jl/issues/1085" => model -> (model.name == "AdaBoostStumpClassifier" && model.package_name == "DecisionTree") || @@ -32,7 +32,7 @@ FILTER_GIVEN_ISSUE = Dict( model.package_name == "BetaML", "https://github.com/JuliaAI/MLJTSVDInterface.jl/pull/17" => model -> model.name == "TSVDTransformer", - "https://github.com/alan-turing-institute/MLJ.jl/issues/1074" => + "https://github.com/JuliaAI/MLJ.jl/issues/1074" => model -> model.name == "AutoEncoderMLJ", "https://github.com/sylvaticus/BetaML.jl/issues/64" => model -> model.name =="GaussianMixtureClusterer" && model.package_name=="BetaML",