Repository URL to install this package:
|
Version:
0.15.2 ▾
|
"""
This module gathers tree-based methods, including decision, regression and
randomized trees. Single and multi-output problems are both handled.
"""
# Authors: Gilles Louppe <g.louppe@gmail.com>
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Brian Holt <bdholt1@gmail.com>
# Noel Dawe <noel@dawe.me>
# Satrajit Gosh <satrajit.ghosh@gmail.com>
# Joly Arnaud <arnaud.v.joly@gmail.com>
# Licence: BSD 3 clause
from __future__ import division
import numbers
import numpy as np
from abc import ABCMeta, abstractmethod
from warnings import warn
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
from ..externals import six
from ..externals.six.moves import xrange
from ..feature_selection.from_model import _LearntSelectorMixin
from ..utils import array2d, check_random_state
from ..utils.validation import check_arrays
from ._tree import Criterion
from ._tree import Splitter
from ._tree import DepthFirstTreeBuilder, BestFirstTreeBuilder
from ._tree import Tree
from . import _tree
__all__ = ["DecisionTreeClassifier",
"DecisionTreeRegressor",
"ExtraTreeClassifier",
"ExtraTreeRegressor"]
# =============================================================================
# Types and constants
# =============================================================================
DTYPE = _tree.DTYPE
DOUBLE = _tree.DOUBLE
CRITERIA_CLF = {"gini": _tree.Gini, "entropy": _tree.Entropy}
CRITERIA_REG = {"mse": _tree.MSE, "friedman_mse": _tree.FriedmanMSE}
SPLITTERS = {"best": _tree.BestSplitter,
"presort-best": _tree.PresortBestSplitter,
"random": _tree.RandomSplitter}
# =============================================================================
# Base decision tree
# =============================================================================
class BaseDecisionTree(six.with_metaclass(ABCMeta, BaseEstimator,
_LearntSelectorMixin)):
"""Base class for decision trees.
Warning: This class should not be used directly.
Use derived classes instead.
"""
@abstractmethod
def __init__(self,
criterion,
splitter,
max_depth,
min_samples_split,
min_samples_leaf,
max_features,
max_leaf_nodes,
random_state):
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.max_features = max_features
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.n_features_ = None
self.n_outputs_ = None
self.classes_ = None
self.n_classes_ = None
self.tree_ = None
self.max_features_ = None
def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True,
sample_weight=None):
"""Build a decision tree from the training set (X, y).
Parameters
----------
X : array-like, shape = [n_samples, n_features]
The training input samples. Use ``dtype=np.float32`` for maximum
efficiency.
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
The target values (class labels in classification, real numbers in
regression). In the regression case, use ``dtype=np.float64`` and
``order='C'`` for maximum efficiency.
sample_weight : array-like, shape = [n_samples] or None
Sample weights. If None, then samples are equally weighted. Splits
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node. In the case of
classification, splits are also ignored if they would result in any
single class carrying a negative weight in either child node.
check_input : boolean, (default=True)
Allow to bypass several input checking.
Don't use this parameter unless you know what you do.
Returns
-------
self : object
Returns self.
"""
random_state = check_random_state(self.random_state)
# Deprecations
if sample_mask is not None:
warn("The sample_mask parameter is deprecated as of version 0.14 "
"and will be removed in 0.16.", DeprecationWarning)
if X_argsorted is not None:
warn("The X_argsorted parameter is deprecated as of version 0.14 "
"and will be removed in 0.16.", DeprecationWarning)
# Convert data
if check_input:
X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
# Determine output settings
n_samples, self.n_features_ = X.shape
is_classification = isinstance(self, ClassifierMixin)
y = np.atleast_1d(y)
if y.ndim == 1:
# reshape is necessary to preserve the data contiguity against vs
# [:, np.newaxis] that does not.
y = np.reshape(y, (-1, 1))
self.n_outputs_ = y.shape[1]
if is_classification:
y = np.copy(y)
self.classes_ = []
self.n_classes_ = []
for k in xrange(self.n_outputs_):
classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
self.classes_.append(classes_k)
self.n_classes_.append(classes_k.shape[0])
else:
self.classes_ = [None] * self.n_outputs_
self.n_classes_ = [1] * self.n_outputs_
self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)
# Check parameters
max_depth = ((2 ** 31) - 1 if self.max_depth is None
else self.max_depth)
max_leaf_nodes = (-1 if self.max_leaf_nodes is None
else self.max_leaf_nodes)
if isinstance(self.max_features, six.string_types):
if self.max_features == "auto":
if is_classification:
max_features = max(1, int(np.sqrt(self.n_features_)))
else:
max_features = self.n_features_
elif self.max_features == "sqrt":
max_features = max(1, int(np.sqrt(self.n_features_)))
elif self.max_features == "log2":
max_features = max(1, int(np.log2(self.n_features_)))
else:
raise ValueError(
'Invalid value for max_features. Allowed string '
'values are "auto", "sqrt" or "log2".')
elif self.max_features is None:
max_features = self.n_features_
elif isinstance(self.max_features, (numbers.Integral, np.integer)):
max_features = self.max_features
else: # float
if self.max_features > 0.0:
max_features = max(1, int(self.max_features * self.n_features_))
else:
max_features = 0
self.max_features_ = max_features
if len(y) != n_samples:
raise ValueError("Number of labels=%d does not match "
"number of samples=%d" % (len(y), n_samples))
if self.min_samples_split <= 0:
raise ValueError("min_samples_split must be greater than zero.")
if self.min_samples_leaf <= 0:
raise ValueError("min_samples_leaf must be greater than zero.")
if max_depth <= 0:
raise ValueError("max_depth must be greater than zero. ")
if not (0 < max_features <= self.n_features_):
raise ValueError("max_features must be in (0, n_features]")
if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
raise ValueError("max_leaf_nodes must be integral number but was "
"%r" % max_leaf_nodes)
if -1 < max_leaf_nodes < 2:
raise ValueError(("max_leaf_nodes {0} must be either smaller than "
"0 or larger than 1").format(max_leaf_nodes))
if sample_weight is not None:
if (getattr(sample_weight, "dtype", None) != DOUBLE or
not sample_weight.flags.contiguous):
sample_weight = np.ascontiguousarray(
sample_weight, dtype=DOUBLE)
if len(sample_weight.shape) > 1:
raise ValueError("Sample weights array has more "
"than one dimension: %d" %
len(sample_weight.shape))
if len(sample_weight) != n_samples:
raise ValueError("Number of weights=%d does not match "
"number of samples=%d" %
(len(sample_weight), n_samples))
# Set min_samples_split sensibly
min_samples_split = max(self.min_samples_split,
2 * self.min_samples_leaf)
# Build tree
criterion = self.criterion
if not isinstance(criterion, Criterion):
if is_classification:
criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
self.n_classes_)
else:
criterion = CRITERIA_REG[self.criterion](self.n_outputs_)
splitter = self.splitter
if not isinstance(self.splitter, Splitter):
splitter = SPLITTERS[self.splitter](criterion,
self.max_features_,
self.min_samples_leaf,
random_state)
self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)
# Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
if max_leaf_nodes < 0:
builder = DepthFirstTreeBuilder(splitter, min_samples_split,
self.min_samples_leaf, max_depth)
else:
builder = BestFirstTreeBuilder(splitter, min_samples_split,
self.min_samples_leaf, max_depth,
max_leaf_nodes)
builder.build(self.tree_, X, y, sample_weight)
if self.n_outputs_ == 1:
self.n_classes_ = self.n_classes_[0]
self.classes_ = self.classes_[0]
return self
def predict(self, X):
"""Predict class or regression value for X.
For a classification model, the predicted class for each sample in X is
returned. For a regression model, the predicted value based on X is
returned.
Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.
Returns
-------
y : array of shape = [n_samples] or [n_samples, n_outputs]
The predicted classes, or the predict values.
"""
if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
X = array2d(X, dtype=DTYPE)
n_samples, n_features = X.shape
if self.tree_ is None:
raise Exception("Tree not initialized. Perform a fit first")
if self.n_features_ != n_features:
raise ValueError("Number of features of the model must "
" match the input. Model n_features is %s and "
" input n_features is %s "
% (self.n_features_, n_features))
proba = self.tree_.predict(X)
# Classification
if isinstance(self, ClassifierMixin):
if self.n_outputs_ == 1:
return self.classes_.take(np.argmax(proba, axis=1), axis=0)
else:
predictions = np.zeros((n_samples, self.n_outputs_))
for k in xrange(self.n_outputs_):
predictions[:, k] = self.classes_[k].take(
np.argmax(proba[:, k], axis=1),
axis=0)
return predictions
# Regression
else:
if self.n_outputs_ == 1:
return proba[:, 0]
else:
return proba[:, :, 0]
@property
def feature_importances_(self):
"""Return the feature importances.
The importance of a feature is computed as the (normalized) total
reduction of the criterion brought by that feature.
It is also known as the Gini importance.
Returns
-------
feature_importances_ : array, shape = [n_features]
"""
if self.tree_ is None:
raise ValueError("Estimator not fitted, "
"call `fit` before `feature_importances_`.")
return self.tree_.compute_feature_importances()
# =============================================================================
# Public estimators
# =============================================================================
class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
"""A decision tree classifier.
Parameters
----------
criterion : string, optional (default="gini")
The function to measure the quality of a split. Supported criteria are
"gini" for the Gini impurity and "entropy" for the information gain.
splitter : string, optional (default="best")
The strategy used to choose the split at each node. Supported
strategies are "best" to choose the best split and "random" to choose
the best random split.
max_features : int, float, string or None, optional (default=None)
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_depth : int or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
Ignored if ``max_samples_leaf`` is not None.
min_samples_split : int, optional (default=2)
The minimum number of samples required to split an internal node.
min_samples_leaf : int, optional (default=1)
The minimum number of samples required to be at a leaf node.
max_leaf_nodes : int or None, optional (default=None)
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
If not None then ``max_depth`` will be ignored.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
`tree_` : Tree object
The underlying Tree object.
`max_features_` : int,
The infered value of max_features.
`classes_` : array of shape = [n_classes] or a list of such arrays
The classes labels (single output problem),
or a list of arrays of class labels (multi-output problem).
`n_classes_` : int or list
The number of classes (for single output problems),
or a list containing the number of classes for each
output (for multi-output problems).
`feature_importances_` : array of shape = [n_features]
The feature importances. The higher, the more important the
feature. The importance of a feature is computed as the (normalized)
total reduction of the criterion brought by that feature. It is also
known as the Gini importance [4]_.
See also
--------
DecisionTreeRegressor
References
----------
.. [1] http://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
and Regression Trees", Wadsworth, Belmont, CA, 1984.
.. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
Learning", Springer, 2009.
.. [4] L. Breiman, and A. Cutler, "Random Forests",
http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
Examples
--------
>>> from sklearn.datasets import load_iris
>>> from sklearn.cross_validation import cross_val_score
>>> from sklearn.tree import DecisionTreeClassifier
>>> clf = DecisionTreeClassifier(random_state=0)
>>> iris = load_iris()
>>> cross_val_score(clf, iris.data, iris.target, cv=10)
... # doctest: +SKIP
...
array([ 1. , 0.93..., 0.86..., 0.93..., 0.93...,
0.93..., 0.93..., 1. , 0.93..., 1. ])
"""
def __init__(self,
criterion="gini",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_features=None,
random_state=None,
min_density=None,
compute_importances=None,
max_leaf_nodes=None):
super(DecisionTreeClassifier, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
random_state=random_state)
if min_density is not None:
warn("The min_density parameter is deprecated as of version 0.14 "
"and will be removed in 0.16.", DeprecationWarning)
if compute_importances is not None:
warn("Setting compute_importances is no longer required as "
"version 0.14. Variable importances are now computed on the "
"fly when accessing the feature_importances_ attribute. "
"This parameter will be removed in 0.16.",
DeprecationWarning)
def predict_proba(self, X):
"""Predict class probabilities of the input samples X.
Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.
Returns
-------
p : array of shape = [n_samples, n_classes], or a list of n_outputs
such arrays if n_outputs > 1.
The class probabilities of the input samples. The order of the
classes corresponds to that in the attribute `classes_`.
"""
if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
X = array2d(X, dtype=DTYPE)
n_samples, n_features = X.shape
if self.tree_ is None:
raise Exception("Tree not initialized. Perform a fit first.")
if self.n_features_ != n_features:
raise ValueError("Number of features of the model must "
" match the input. Model n_features is %s and "
" input n_features is %s "
% (self.n_features_, n_features))
proba = self.tree_.predict(X)
if self.n_outputs_ == 1:
proba = proba[:, :self.n_classes_]
normalizer = proba.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba /= normalizer
return proba
else:
all_proba = []
for k in xrange(self.n_outputs_):
proba_k = proba[:, k, :self.n_classes_[k]]
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
normalizer[normalizer == 0.0] = 1.0
proba_k /= normalizer
all_proba.append(proba_k)
return all_proba
def predict_log_proba(self, X):
"""Predict class log-probabilities of the input samples X.
Parameters
----------
X : array-like of shape = [n_samples, n_features]
The input samples.
Returns
-------
p : array of shape = [n_samples, n_classes], or a list of n_outputs
such arrays if n_outputs > 1.
The class log-probabilities of the input samples. The order of the
classes corresponds to that in the attribute `classes_`.
"""
proba = self.predict_proba(X)
if self.n_outputs_ == 1:
return np.log(proba)
else:
for k in xrange(self.n_outputs_):
proba[k] = np.log(proba[k])
return proba
class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
"""A decision tree regressor.
Parameters
----------
criterion : string, optional (default="mse")
The function to measure the quality of a split. The only supported
criterion is "mse" for the mean squared error.
splitter : string, optional (default="best")
The strategy used to choose the split at each node. Supported
strategies are "best" to choose the best split and "random" to choose
the best random split.
max_features : int, float, string or None, optional (default=None)
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_depth : int or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
Ignored if ``max_samples_leaf`` is not None.
min_samples_split : int, optional (default=2)
The minimum number of samples required to split an internal node.
min_samples_leaf : int, optional (default=1)
The minimum number of samples required to be at a leaf node.
max_leaf_nodes : int or None, optional (default=None)
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
If not None then ``max_depth`` will be ignored.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
Attributes
----------
`tree_` : Tree object
The underlying Tree object.
`max_features_` : int,
The infered value of max_features.
`feature_importances_` : array of shape = [n_features]
The feature importances.
The higher, the more important the feature.
The importance of a feature is computed as the
(normalized) total reduction of the criterion brought
by that feature. It is also known as the Gini importance [4]_.
See also
--------
DecisionTreeClassifier
References
----------
.. [1] http://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
and Regression Trees", Wadsworth, Belmont, CA, 1984.
.. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
Learning", Springer, 2009.
.. [4] L. Breiman, and A. Cutler, "Random Forests",
http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
Examples
--------
>>> from sklearn.datasets import load_boston
>>> from sklearn.cross_validation import cross_val_score
>>> from sklearn.tree import DecisionTreeRegressor
>>> boston = load_boston()
>>> regressor = DecisionTreeRegressor(random_state=0)
>>> cross_val_score(regressor, boston.data, boston.target, cv=10)
... # doctest: +SKIP
...
array([ 0.61..., 0.57..., -0.34..., 0.41..., 0.75...,
0.07..., 0.29..., 0.33..., -1.42..., -1.77...])
"""
def __init__(self,
criterion="mse",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_features=None,
random_state=None,
min_density=None,
compute_importances=None,
max_leaf_nodes=None):
super(DecisionTreeRegressor, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
random_state=random_state)
if min_density is not None:
warn("The min_density parameter is deprecated as of version 0.14 "
"and will be removed in 0.16.", DeprecationWarning)
if compute_importances is not None:
warn("Setting compute_importances is no longer required as "
"version 0.14. Variable importances are now computed on the "
"fly when accessing the feature_importances_ attribute. "
"This parameter will be removed in 0.16.",
DeprecationWarning)
class ExtraTreeClassifier(DecisionTreeClassifier):
"""An extremely randomized tree classifier.
Extra-trees differ from classic decision trees in the way they are built.
When looking for the best split to separate the samples of a node into two
groups, random splits are drawn for each of the `max_features` randomly
selected features and the best split among those is chosen. When
`max_features` is set 1, this amounts to building a totally random
decision tree.
Warning: Extra-trees should only be used within ensemble methods.
See also
--------
ExtraTreeRegressor, ExtraTreesClassifier, ExtraTreesRegressor
References
----------
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
Machine Learning, 63(1), 3-42, 2006.
"""
def __init__(self,
criterion="gini",
splitter="random",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_features="auto",
random_state=None,
min_density=None,
compute_importances=None,
max_leaf_nodes=None):
super(ExtraTreeClassifier, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
random_state=random_state)
if min_density is not None:
warn("The min_density parameter is deprecated as of version 0.14 "
"and will be removed in 0.16.", DeprecationWarning)
if compute_importances is not None:
warn("Setting compute_importances is no longer required as "
"version 0.14. Variable importances are now computed on the "
"fly when accessing the feature_importances_ attribute. "
"This parameter will be removed in 0.16.",
DeprecationWarning)
class ExtraTreeRegressor(DecisionTreeRegressor):
"""An extremely randomized tree regressor.
Extra-trees differ from classic decision trees in the way they are built.
When looking for the best split to separate the samples of a node into two
groups, random splits are drawn for each of the `max_features` randomly
selected features and the best split among those is chosen. When
`max_features` is set 1, this amounts to building a totally random
decision tree.
Warning: Extra-trees should only be used within ensemble methods.
See also
--------
ExtraTreeClassifier, ExtraTreesClassifier, ExtraTreesRegressor
References
----------
.. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
Machine Learning, 63(1), 3-42, 2006.
"""
def __init__(self,
criterion="mse",
splitter="random",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
max_features="auto",
random_state=None,
min_density=None,
compute_importances=None,
max_leaf_nodes=None):
super(ExtraTreeRegressor, self).__init__(
criterion=criterion,
splitter=splitter,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
random_state=random_state)
if min_density is not None:
warn("The min_density parameter is deprecated as of version 0.14 "
"and will be removed in 0.16.", DeprecationWarning)
if compute_importances is not None:
warn("Setting compute_importances is no longer required as "
"version 0.14. Variable importances are now computed on the "
"fly when accessing the feature_importances_ attribute. "
"This parameter will be removed in 0.16.",
DeprecationWarning)