"""Bagging meta-estimator."""
# Author: Gilles Louppe <g.louppe@gmail.com>
# License: BSD 3 clause
import itertools
import numbers
import numpy as np
from abc import ABCMeta, abstractmethod
from warnings import warn
from joblib import Parallel, delayed
from ._base import BaseEnsemble, _partition_estimators
from ..base import ClassifierMixin, RegressorMixin
from ..metrics import r2_score, accuracy_score
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..utils import check_random_state, check_array, column_or_1d
from ..utils import indices_to_mask
from ..utils.metaestimators import if_delegate_has_method
from ..utils.multiclass import check_classification_targets
from ..utils.random import sample_without_replacement
from ..utils.validation import has_fit_parameter, check_is_fitted, \
_check_sample_weight, _deprecate_positional_args
# Public names exported by this module.
__all__ = ["BaggingClassifier",
           "BaggingRegressor"]
# Largest value representable by a signed 32-bit integer (2**31 - 1).
# NOTE(review): presumably the upper bound used when drawing per-estimator
# seeds -- the use site is not visible in this chunk; confirm.
MAX_INT = np.iinfo(np.int32).max
def _generate_indices(random_state, bootstrap, n_population, n_samples):
    """Pick ``n_samples`` random indices out of ``range(n_population)``.

    Parameters
    ----------
    random_state : object exposing ``randint``
        Source of randomness; it is used as-is (no validation here).
    bootstrap : bool
        When truthy the indices are drawn with replacement through
        ``random_state.randint``; otherwise they are drawn without
        replacement through ``sample_without_replacement``.
    n_population : int
        Exclusive upper bound of the drawn indices.
    n_samples : int
        Number of indices to draw.

    Returns
    -------
    indices : array of drawn indices
    """
    if not bootstrap:
        # Without replacement: every index appears at most once.
        return sample_without_replacement(n_population, n_samples,
                                          random_state=random_state)
    # With replacement: plain uniform integers in [0, n_population).
    return random_state.randint(0, n_population, n_samples)
def _generate_bagging_indices(random_state, bootstrap_features,
                              bootstrap_samples, n_features, n_samples,
                              max_features, max_samples):
    """Draw the feature subset and the sample subset for one estimator.

    Parameters
    ----------
    random_state : seed or random state
        Normalised through ``check_random_state`` before use.
    bootstrap_features : bool
        Whether feature indices are drawn with replacement.
    bootstrap_samples : bool
        Whether sample indices are drawn with replacement.
    n_features, n_samples : int
        Sizes of the populations the indices are drawn from.
    max_features, max_samples : int
        Number of feature / sample indices to draw.

    Returns
    -------
    feature_indices, sample_indices : tuple of index arrays
    """
    rng = check_random_state(random_state)

    # Both draws share one generator; features are drawn first, then
    # samples, so the order of these two calls determines the result.
    picked_features = _generate_indices(rng, bootstrap_features,
                                        n_features, max_features)
    picked_samples = _generate_indices(rng, bootstrap_samples,
                                       n_samples, max_samples)

    return picked_features, picked_samples
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, total_n_estimators, verbose):
    """Fit the share of base estimators assigned to one parallel job.

    Parameters
    ----------
    n_estimators : int
        Number of estimators to build in this call.
    ensemble : bagging ensemble
        Provides ``_max_features``, ``_max_samples``, ``bootstrap``,
        ``bootstrap_features``, ``base_estimator_`` and ``_make_estimator``.
    X : 2-D indexable with a ``shape``
        Training data.
    y : indexable
        Training targets.
    sample_weight : array or None
        Optional per-sample weights; only allowed when the base estimator's
        ``fit`` accepts ``sample_weight``.
    seeds : indexable
        ``seeds[i]`` seeds both the i-th estimator and its index draw.
    total_n_estimators : int
        Only used in the progress message.
    verbose : int
        Progress is printed when greater than 1.

    Returns
    -------
    estimators : list of fitted estimators
    estimators_features : list of the feature indices used by each estimator

    Raises
    ------
    ValueError
        If ``sample_weight`` is given but the base estimator cannot use it.
    """
    n_samples, n_features = X.shape
    n_features_drawn = ensemble._max_features
    n_samples_drawn = ensemble._max_samples
    samples_with_replacement = ensemble.bootstrap
    features_with_replacement = ensemble.bootstrap_features
    accepts_weights = has_fit_parameter(ensemble.base_estimator_,
                                        "sample_weight")

    if sample_weight is not None and not accepts_weights:
        raise ValueError("The base estimator doesn't support sample weight")

    fitted = []
    fitted_features = []

    for position in range(n_estimators):
        if verbose > 1:
            print("Building estimator %d of %d for this parallel run "
                  "(total %d)..." % (position + 1, n_estimators,
                                     total_n_estimators))

        seed = seeds[position]
        model = ensemble._make_estimator(append=False, random_state=seed)

        # Same seed as the estimator: pick which columns and rows it sees.
        columns, rows = _generate_bagging_indices(seed,
                                                  features_with_replacement,
                                                  samples_with_replacement,
                                                  n_features, n_samples,
                                                  n_features_drawn,
                                                  n_samples_drawn)

        if not accepts_weights:
            # No weighting available: physically subsample the rows.
            model.fit((X[rows])[:, columns], y[rows])
        else:
            # Express the row subsample through weights instead of copying X.
            weights = (np.ones((n_samples,)) if sample_weight is None
                       else sample_weight.copy())
            if samples_with_replacement:
                # A row drawn k times counts k times; undrawn rows get 0.
                weights *= np.bincount(rows, minlength=n_samples)
            else:
                # Rows that were not drawn are switched off.
                weights[~indices_to_mask(rows, n_samples)] = 0
            model.fit(X[:, columns], y, sample_weight=weights)

        fitted.append(model)
        fitted_features.append(columns)

    return fitted, fitted_features
def _parallel_predict_proba(estimators, estimators_features, X, n_classes):
    """Accumulate the (proba-)predictions of a batch of estimators.

    Parameters
    ----------
    estimators : iterable of fitted estimators
        Estimators exposing ``predict_proba`` (together with ``classes_``)
        contribute their probabilities; the others contribute one vote per
        sample through ``predict``.
    estimators_features : iterable of index arrays
        Columns of ``X`` each estimator is evaluated on, aligned with
        ``estimators``.
    X : 2-D indexable with a ``shape``
        Samples to predict.
    n_classes : int
        Number of columns of the result.

    Returns
    -------
    proba : ndarray of shape (n_samples, n_classes)
        Un-normalised sum of probabilities / votes over the estimators.
    """
    n_samples = X.shape[0]
    proba = np.zeros((n_samples, n_classes))
    sample_rows = np.arange(n_samples)

    for estimator, features in zip(estimators, estimators_features):
        if hasattr(estimator, "predict_proba"):
            proba_estimator = estimator.predict_proba(X[:, features])
            classes = estimator.classes_

            if n_classes == len(classes):
                proba += proba_estimator
            else:
                # The estimator saw only some of the classes: scatter its
                # columns into the columns named by ``classes_``.
                proba[:, classes] += proba_estimator[:, :len(classes)]
        else:
            # Resort to voting: one vote per sample for the predicted class.
            # Each row index occurs exactly once, so the indexed in-place
            # add is equivalent to incrementing sample by sample.
            # Assumes ``predict`` returns a 1-D array of integer class
            # indices -- TODO confirm for custom base estimators.
            predictions = estimator.predict(X[:, features])
            proba[sample_rows, predictions] += 1

    return proba
def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes):
    """Accumulate the log-probabilities of a batch of estimators.

    The per-estimator probabilities are summed in log space with
    ``np.logaddexp``, starting from log(0) = -inf.

    Parameters
    ----------
    estimators : iterable of fitted estimators
        Each must expose ``predict_log_proba`` and ``classes_``.
    estimators_features : iterable of index arrays
        Columns of ``X`` each estimator is evaluated on, aligned with
        ``estimators``.
    X : 2-D indexable with a ``shape``
        Samples to predict.
    n_classes : int
        Number of columns of the result.

    Returns
    -------
    log_proba : ndarray of shape (n_samples, n_classes)
        Log of the un-normalised sum of the estimators' probabilities;
        ``-inf`` where no estimator contributed.
    """
    n_samples = X.shape[0]
    log_proba = np.full((n_samples, n_classes), -np.inf)
    # Builtin ``int`` instead of the ``np.int`` alias, which was deprecated
    # in NumPy 1.20 and removed in 1.24.
    all_classes = np.arange(n_classes, dtype=int)

    for estimator, features in zip(estimators, estimators_features):
        log_proba_estimator = estimator.predict_log_proba(X[:, features])
        classes = estimator.classes_

        if n_classes == len(classes):
            log_proba = np.logaddexp(log_proba, log_proba_estimator)
        else:
            # The estimator saw only some of the classes: combine its
            # columns with the columns named by ``classes_``.
            log_proba[:, classes] = np.logaddexp(
                log_proba[:, classes],
                log_proba_estimator[:, range(len(classes))])

            # Classes this estimator never saw get a zero probability added
            # (log(0) = -inf), which leaves their entries numerically
            # unchanged.
            missing = np.setdiff1d(all_classes, classes)
            log_proba[:, missing] = np.logaddexp(log_proba[:, missing],
                                                 -np.inf)

    return log_proba
def _parallel_decision_function(estimators, estimators_features, X):
    """Add up the decision-function outputs of a batch of estimators.

    Each estimator is evaluated on its own column subset ``X[:, features]``.
    The running total starts at the integer 0, so an empty batch yields 0.
    """
    total = 0
    for estimator, features in zip(estimators, estimators_features):
        total = total + estimator.decision_function(X[:, features])
    return total
def _parallel_predict_regression(estimators, estimators_features, X):
    """Add up the regression predictions of a batch of estimators.

    Each estimator predicts on its own column subset ``X[:, features]``.
    The running total starts at the integer 0, so an empty batch yields 0.
    """
    total = 0
    for estimator, features in zip(estimators, estimators_features):
        total = total + estimator.predict(X[:, features])
    return total
class BaseBagging(BaseEnsemble, metaclass=ABCMeta):
"""Base class for Bagging meta-estimator.
Warning: This class should not be used directly. Use derived classes
instead.
"""
@abstractmethod
def __init__(self,
             base_estimator=None,
             n_estimators=10, *,
             max_samples=1.0,
             max_features=1.0,
             bootstrap=True,
             bootstrap_features=False,
             oob_score=False,
             warm_start=False,
             n_jobs=None,
             random_state=None,
             verbose=0):
    """Store the constructor arguments on the instance.

    ``base_estimator`` and ``n_estimators`` are forwarded to the parent
    constructor; every other argument is saved unchanged as the attribute
    of the same name. No validation happens here.

    The method is abstract, so concrete subclasses must provide their own
    ``__init__``. All arguments after ``n_estimators`` are keyword-only.
    """
    super().__init__(
        base_estimator=base_estimator,
        n_estimators=n_estimators)
    # Plain attribute storage, one attribute per keyword argument.
    self.max_samples = max_samples
    self.max_features = max_features
    self.bootstrap = bootstrap
    self.bootstrap_features = bootstrap_features
    self.oob_score = oob_score
    self.warm_start = warm_start
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.verbose = verbose
def fit(self, X, y, sample_weight=None):
    """Fit the bagging ensemble on the training set (X, y).

    Thin public wrapper: the work is delegated to ``_fit`` with the
    instance's ``max_samples`` setting.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training input samples. Sparse input is only accepted when the
        base estimator supports it.

    y : array-like of shape (n_samples,)
        Targets: class labels for classification, real numbers for
        regression.

    sample_weight : array-like of shape (n_samples,), default=None
        Per-sample weights; None means all samples weigh the same. Only
        supported when the base estimator supports sample weighting.

    Returns
    -------
    self : object
    """
    max_samples = self.max_samples
    return self._fit(X, y, max_samples, sample_weight=sample_weight)
def _parallel_args(self):
    """Return a new, empty dict.

    NOTE(review): presumably a hook that lets subclasses supply extra
    keyword arguments for the parallel backend -- the call site is not
    visible in this chunk; confirm.
    """
    return dict()
def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
"""Build a Bagging ensemble of estimators from the training
set (X, y).
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The training input samples. Sparse matrices are accepted only if
they are supported by the base estimator.
y : array-like of shape (n_samples,)
The target values (class labels in classification, real numbers in
regression).
max_samples : int or float, default=None
Argument to use instead of self.max_samples.
max_depth : int, default=None
Override value used when constructing base estimator. Only
supported if the base estimator has a max_depth parameter.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted.
Note that this is supported only if the base estimator supports
sample weighting.
Returns
-------
self : object
"""
random_state = check_random_state(self.random_state)
# Convert data (X is required to be 2d and indexable)
X, y = self._validate_data(
X, y, accept_sparse=['csr', 'csc'], dtype=None,
force_all_finite=False, multi_output=True
)
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
# Remap output
n_samples, self.n_features_ = X.shape
self._n_samples = n_samples
y = self._validate_y(y)
# Check parameters
self._validate_estimator()
if max_depth is not None:
self.base_estimator_.max_depth = max_depth
# Validate max_samples
if max_samples is None:
max_samples = self.max_samples
elif not isinstance(max_samples, numbers.Integral):
max_samples = int(max_samples * X.shape[0])
if not (0 < max_samples <= X.shape[0]):
raise ValueError("max_samples must be in (0, n_samples]")
# Store validated integer row sampling value
self._max_samples = max_samples
# Validate max_features
if isinstance(self.max_features, numbers.Integral):
max_features = self.max_features
elif isinstance(self.max_features, np.float):
max_features = self.max_features * self.n_features_
else:
raise ValueError("max_features must be int or float")
if not (0 < max_features <= self.n_features_):
raise ValueError("max_features must be in (0, n_features]")
max_features = max(1, int(max_features))
# Store validated integer feature sampling value
self._max_features = max_features
# Other checks
if not self.bootstrap and self.oob_score:
raise ValueError("Out of bag estimation only available"
" if bootstrap=True")
if self.warm_start and self.oob_score:
raise ValueError("Out of bag estimate only available"
" if warm_start=False")
if hasattr(self, "oob_score_") and self.warm_start:
del self.oob_score_
if not self.warm_start or not hasattr(self, 'estimators_'):
# Free allocated memory, if any
self.estimators_ = []
self.estimators_features_ = []
n_more_estimators = self.n_estimators - len(self.estimators_)
Loading ...