[ENH] Bagging ensemble of probabilistic regressors #32

Merged: 26 commits, Sep 13, 2023
295 changes: 228 additions & 67 deletions skpro/regression/ensemble.py
@@ -1,86 +1,247 @@
"""Bagging probabilistic regressors."""

__author__ = ["fkiraly"]
__all__ = ["BaggingRegressor"]

from math import ceil

import numpy as np
import pandas as pd

from skpro.distributions.mixture import Mixture
from skpro.regression.base import BaseProbaRegressor


class BaggingRegressor(BaseProbaRegressor):
"""Bagging ensemble of probabilistic regresesors.

Fits ``n_estimators`` clones of an skpro regressor on
datasets which are instance sub-samples and/or variable sub-samples.

On ``predict_proba``, the mixture of probabilistic predictions is returned.

The estimator allows choosing sample sizes for instances and variables,
and whether sampling is with or without replacement.

Direct generalization of ``sklearn``'s ``BaggingRegressor``
to the probabilistic regression task.

Parameters
----------
estimator : skpro regressor, descendant of BaseProbaRegressor
regressor to use in the bagging estimator
n_estimators : int, default=10
number of estimators in the bagging ensemble
n_samples : int or float, default=1.0
The number of instances drawn from ``X`` in ``fit`` to train each clone
If int, then indicates number of instances precisely
If float, interpreted as a fraction, and rounded by ``ceil``
n_features : int or float, default=1.0
The number of features/variables drawn from ``X`` in ``fit`` to train each clone
If int, then indicates number of features precisely
If float, interpreted as a fraction, and rounded by ``ceil``
bootstrap : boolean, default=True
whether samples/instances are drawn with replacement (True) or not (False)
bootstrap_features : boolean, default=False
whether features/variables are drawn with replacement (True) or not (False)
random_state : int, RandomState instance or None, optional (default=None)
If int, ``random_state`` is the seed used by the random number generator;
If ``RandomState`` instance, ``random_state`` is the random number generator;
If None, the random number generator is the ``RandomState`` instance used
by ``np.random``.

Attributes
----------
estimators_ : list of skpro regressors
clones of the regressor in ``estimator``, fitted in the ensemble

Examples
--------
>>> from skpro.regression.ensemble import BaggingRegressor
>>> from skpro.regression.residual import ResidualDouble
>>> from sklearn.linear_model import LinearRegression
>>> from sklearn.datasets import load_diabetes
>>> from sklearn.model_selection import train_test_split
>>>
>>> X, y = load_diabetes(return_X_y=True, as_frame=True)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y)
>>>
>>> reg_mean = LinearRegression()
>>> reg_proba = ResidualDouble(reg_mean)
>>>
>>> ens = BaggingRegressor(reg_proba, n_estimators=10)
>>> ens.fit(X_train, y_train)
BaggingRegressor(...)
>>> y_pred = ens.predict_proba(X_test)
"""

_tags = {"capability:missing": True}

def __init__(
self,
estimator,
n_estimators=10,
n_samples=1.0,
n_features=1.0,
bootstrap=True,
bootstrap_features=False,
random_state=None,
):
self.estimator = estimator
self.n_estimators = n_estimators
self.n_samples = n_samples
self.n_features = n_features
self.bootstrap = bootstrap
self.bootstrap_features = bootstrap_features
self.random_state = random_state

super().__init__()

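# inherit the missing-data capability tag from the wrapped estimator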
tags_to_clone = ["capability:missing"]
self.clone_tags(estimator, tags_to_clone)

def _fit(self, X, y):
"""Fit regressor to training data.

Writes to self:
Sets fitted model attributes ending in "_".

Parameters
----------
X : pandas DataFrame
feature instances to fit regressor to
y : pandas DataFrame, must be same length as X
labels to fit regressor to

Returns
-------
self : reference to self
"""
estimator = self.estimator
n_estimators = self.n_estimators
n_samples = self.n_samples
n_features = self.n_features
bootstrap = self.bootstrap
bootstrap_ft = self.bootstrap_features
random_state = self.random_state
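# NOTE: seeds the global numpy RNG; an int or None seed is assumed here,
# a RandomState instance would not be accepted by np.random.seed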
np.random.seed(random_state)

inst_ix = X.index
col_ix = X.columns
n = len(inst_ix)
m = len(col_ix)

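# resolve float sample sizes: fractions of rows/columns, rounded up via ceil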
if isinstance(n_samples, float):
n_samples_ = ceil(n_samples * n)
else:
n_samples_ = n_samples

if isinstance(n_features, float):
n_features_ = ceil(n_features * m)
else:
n_features_ = n_features

self.estimators_ = []
self.cols_ = []

for _i in range(n_estimators):
esti = estimator.clone()
row_iloc = pd.RangeIndex(n)
row_ss = _random_ss_ix(row_iloc, size=n_samples_, replace=bootstrap)
inst_ix_i = inst_ix[row_ss]
col_ix_i = _random_ss_ix(col_ix, size=n_features_, replace=bootstrap_ft)

# store column subset for use in predict
self.cols_ += [col_ix_i]

Xi = _subs_cols(X.loc[inst_ix_i], col_ix_i, reset_cols=bootstrap_ft)
Xi = Xi.reset_index(drop=True)

yi = y.loc[inst_ix_i].reset_index(drop=True)

self.estimators_ += [esti.fit(Xi, yi)]

return self

def _predict_proba(self, X):
"""Predict distribution over labels for data from features.

State required:
Requires state to be "fitted".

Accesses in self:
Fitted model attributes ending in "_"

Parameters
----------
X : pandas DataFrame, must have same columns as X in `fit`
data to predict labels for

Returns
-------
y : skpro BaseDistribution, same length as `X`
labels predicted for `X`
"""
reset_cols = self.bootstrap_features
Xis = [_subs_cols(X, col_ix_i, reset_cols) for col_ix_i in self.cols_]

y_probas = [est.predict_proba(Xi) for est, Xi in zip(self.estimators_, Xis)]

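# the bagged prediction is the mixture of the per-clone predictive distributions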
y_proba = Mixture(y_probas)

return y_proba

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.

Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict contains parameters to construct an "interesting" test instance, i.e.,
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
from sklearn.linear_model import LinearRegression

from skpro.regression.residual import ResidualDouble

regressor = ResidualDouble(LinearRegression())

params1 = {"estimator": regressor}
params2 = {
"estimator": regressor,
"n_samples": 0.5,
"n_features": 0.5,
}
params3 = {
"estimator": regressor,
"n_samples": 7,
"n_features": 2,
"bootstrap": False,
"bootstrap_features": True,
}

return [params1, params2, params3]


def _random_ss_ix(ix, size, replace=True):
"""Randomly uniformly sample indices from a list of indices."""
a = range(len(ix))
ixs = ix[np.random.choice(a, size=size, replace=replace)]
return ixs


def _subs_cols(df, col_ix, reset_cols=False):
"""Subset columns of a DataFrame, with potential resetting of column index."""
df_subset = df.loc[:, col_ix]
if reset_cols:
df_subset.columns = pd.RangeIndex(len(df_subset.columns))
return df_subset
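A minimal end-to-end usage sketch of the new estimator, assuming a local skpro install; the subsample fractions and the seed are illustrative only:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from skpro.regression.ensemble import BaggingRegressor
from skpro.regression.residual import ResidualDouble

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# bag 10 clones; each sees ceil(0.5 * n) rows (with replacement) and half the columns
ens = BaggingRegressor(
    ResidualDouble(LinearRegression()),
    n_estimators=10,
    n_samples=0.5,
    n_features=0.5,
)
ens.fit(X_train, y_train)

y_pred = ens.predict_proba(X_test)  # Mixture distribution, one row per test instance
print(y_pred.mean().head())         # point predictions derived from the mixture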
6 changes: 6 additions & 0 deletions skpro/regression/tests/test_all_regressors.py
@@ -27,7 +27,10 @@ def test_input_output_contract(self, object_instance):
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True, as_frame=True)
X = X.iloc[:50]
y = y.iloc[:50]
y = pd.DataFrame(y)

X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit - just once for all predict output methods
@@ -145,7 +148,10 @@ def test_pred_quantiles_interval(self, object_instance, alpha):
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True, as_frame=True)
X = X.iloc[:50]
y = y.iloc[:50]
y = pd.DataFrame(y)

X_train, X_test, y_train, _ = train_test_split(X, y)

regressor = object_instance