"""
This module contains the TreeGrower class.
TreeGrower builds a regression tree fitting a Newton-Raphson step, based on
the gradients and hessians of the training data.
"""
# Author: Nicolas Hug
import numbers
from heapq import heappush, heappop
from timeit import default_timer as time

import numpy as np

from .splitting import Splitter
from .histogram import HistogramBuilder
from .predictor import TreePredictor
from .utils import sum_parallel
from .common import PREDICTOR_RECORD_DTYPE, Y_DTYPE, MonotonicConstraint
EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors
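# A hedged sketch of the quantity this module fits (not the actual splitter
# code): the unconstrained Newton-Raphson value of a leaf is, up to
# shrinkage,
#
#     value = -sum_gradients / (sum_hessians + l2_regularization + EPS)
#
# with EPS guarding against a zero denominator.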
class TreeNode:
"""Tree Node class used in TreeGrower.
This isn't used for prediction purposes, only for training (see
TreePredictor).
Parameters
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None, optional (default=None)
The parent of the node. None for root.
Attributes
----------
depth : int
The depth of the node, i.e. its distance from the root.
sample_indices : ndarray of unsigned int, shape (n_samples_at_node,)
The indices of the samples at the node.
sum_gradients : float
The sum of the gradients of the samples at the node.
sum_hessians : float
The sum of the hessians of the samples at the node.
parent : TreeNode or None
The parent of the node. None for root.
split_info : SplitInfo or None
The result of the split evaluation.
left_child : TreeNode or None
The left child of the node. None for leaves.
right_child : TreeNode or None
The right child of the node. None for leaves.
value : float or None
The value of the leaf, as computed in _finalize_leaf(). None for
non-leaf nodes.
partition_start : int
start position of the node's sample_indices in splitter.partition.
partition_stop : int
stop position of the node's sample_indices in splitter.partition.
"""
split_info = None
left_child = None
right_child = None
histograms = None
sibling = None
parent = None
# start and stop indices of the node in the splitter.partition
# array. Concretely,
# self.sample_indices = view(self.splitter.partition[start:stop])
# Please see the comments about splitter.partition and
# splitter.split_indices for more info about this design.
# These 2 attributes are only used in _update_raw_prediction, because we
# need to iterate over the leaves, and the sample_indices views cannot be
# stored efficiently since they all have different sizes.
partition_start = 0
partition_stop = 0
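# A hedged illustration of the partition layout (hypothetical values): a
# root with partition_start=0 and partition_stop=10 that is split 6/4
# would give a left child viewing splitter.partition[0:6] and a right
# child viewing splitter.partition[6:10]; no copy of the indices is made.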
def __init__(self, depth, sample_indices, sum_gradients,
sum_hessians, parent=None, value=None):
self.depth = depth
self.sample_indices = sample_indices
self.n_samples = sample_indices.shape[0]
self.sum_gradients = sum_gradients
self.sum_hessians = sum_hessians
self.parent = parent
self.value = value
self.is_leaf = False
self.set_children_bounds(float('-inf'), float('+inf'))
def set_children_bounds(self, lower, upper):
"""Set children values bounds to respect monotonic constraints."""
# These are bounds for the node's *children* values, not the node's
# value. The bounds are used in the splitter when considering potential
# left and right child.
self.children_lower_bound = lower
self.children_upper_bound = upper
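# A hedged illustration (hypothetical values): under a positive monotonic
# constraint, a split may set the left child's children_upper_bound and
# the right child's children_lower_bound to the same 'middle' value, so
# that left-subtree values never exceed right-subtree values.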
def __lt__(self, other_node):
"""Comparison for priority queue.
Nodes with high gain are higher priority than nodes with low gain.
heapq.heappush only needs the '<' operator.
heapq.heappop takes the smallest item first (smaller is higher
priority).
Parameters
----------
other_node : TreeNode
The node to compare with.
"""
return self.split_info.gain > other_node.split_info.gain
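# A minimal sketch of the resulting heap behavior (hypothetical nodes and
# gains), showing that inverting '<' turns heapq's min-heap into a
# max-heap on split gains:
#
#     heap = []
#     for node in nodes:           # suppose gains are 1.0, 3.0, 2.0
#         heappush(heap, node)
#     best = heappop(heap)         # the node with gain 3.0 pops first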
class TreeGrower:
"""Tree grower class used to build a tree.
The tree is fitted to predict the values of a Newton-Raphson step. The
splits are considered in a best-first fashion, and the quality of a
split is defined in splitting._split_gain.
Parameters
----------
X_binned : ndarray of int, shape (n_samples, n_features)
The binned input samples. Must be Fortran-contiguous.
gradients : ndarray, shape (n_samples,)
The gradients of each training sample. Those are the gradients of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
hessians : ndarray, shape (n_samples,)
The hessians of each training sample. Those are the hessians of the
loss w.r.t the predictions, evaluated at iteration ``i - 1``.
max_leaf_nodes : int or None, optional (default=None)
The maximum number of leaves for each tree. If None, there is no
maximum limit.
max_depth : int or None, optional (default=None)
The maximum depth of each tree. The depth of a tree is the number of
edges to go from the root to the deepest leaf.
Depth isn't constrained by default.
min_samples_leaf : int, optional (default=20)
The minimum number of samples per leaf.
min_gain_to_split : float, optional (default=0.)
The minimum gain needed to split a node. Splits with lower gain will
be ignored.
n_bins : int, optional (default=256)
The total number of bins, including the bin for missing values. Used
to define the shape of the histograms.
n_bins_non_missing : ndarray of uint32, int or None, optional (default=None)
For each feature, gives the number of bins actually used for
non-missing values. For features with a lot of unique values, this
is equal to ``n_bins - 1``. If it's an int, all features are
considered to have the same number of bins. If None, all features
are considered to have ``n_bins - 1`` bins.
has_missing_values : ndarray of bool or bool, optional (default=False)
Whether each feature contains missing values (in the training data).
If it's a bool, the same value is used for all features.
monotonic_cst : array-like of int, shape (n_features,), optional (default=None)
Indicates the monotonic constraint to enforce on each feature. -1, 1
and 0 respectively correspond to a negative constraint, a positive
constraint and no constraint.
l2_regularization : float, optional (default=0)
The L2 regularization parameter.
min_hessian_to_split : float, optional (default=1e-3)
The minimum sum of hessians needed in each node. Splits that result in
at least one child having a sum of hessians less than
``min_hessian_to_split`` are discarded.
shrinkage : float, optional (default=1)
The shrinkage parameter to apply to the leaf values, also known as
the learning rate.
"""
def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None,
max_depth=None, min_samples_leaf=20, min_gain_to_split=0.,
n_bins=256, n_bins_non_missing=None, has_missing_values=False,
monotonic_cst=None, l2_regularization=0.,
min_hessian_to_split=1e-3, shrinkage=1.):
self._validate_parameters(X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split)
if n_bins_non_missing is None:
n_bins_non_missing = n_bins - 1
if isinstance(n_bins_non_missing, numbers.Integral):
n_bins_non_missing = np.array(
[n_bins_non_missing] * X_binned.shape[1],
dtype=np.uint32)
else:
n_bins_non_missing = np.asarray(n_bins_non_missing,
dtype=np.uint32)
if isinstance(has_missing_values, bool):
has_missing_values = [has_missing_values] * X_binned.shape[1]
has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)
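# For instance (hypothetical values), has_missing_values=False with 3
# features becomes np.array([0, 0, 0], dtype=np.uint8) at this point.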
if monotonic_cst is None:
self.with_monotonic_cst = False
monotonic_cst = np.full(shape=X_binned.shape[1],
fill_value=MonotonicConstraint.NO_CST,
dtype=np.int8)
else:
self.with_monotonic_cst = True
monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
if monotonic_cst.shape[0] != X_binned.shape[1]:
raise ValueError(
"monotonic_cst has shape {} but the input data "
"X has {} features.".format(
monotonic_cst.shape[0], X_binned.shape[1]
)
)
if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):
raise ValueError(
"monotonic_cst must be None or an array-like of "
"-1, 0 or 1."
)
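# A hessians array with a single element signals constant hessians (e.g.
# a loss whose second derivative is the same for every sample), which
# lets downstream code avoid per-sample hessian accumulation.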
hessians_are_constant = hessians.shape[0] == 1
self.histogram_builder = HistogramBuilder(
X_binned, n_bins, gradients, hessians, hessians_are_constant)
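# The last bin is reserved for missing values, hence its index n_bins - 1.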
missing_values_bin_idx = n_bins - 1
self.splitter = Splitter(
X_binned, n_bins_non_missing, missing_values_bin_idx,
has_missing_values, monotonic_cst,
l2_regularization, min_hessian_to_split,
min_samples_leaf, min_gain_to_split, hessians_are_constant)
self.n_bins_non_missing = n_bins_non_missing
self.max_leaf_nodes = max_leaf_nodes
self.has_missing_values = has_missing_values
self.monotonic_cst = monotonic_cst
self.l2_regularization = l2_regularization
self.n_features = X_binned.shape[1]
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf
self.X_binned = X_binned
self.min_gain_to_split = min_gain_to_split
self.shrinkage = shrinkage
self.splittable_nodes = []
self.finalized_leaves = []
self.total_find_split_time = 0. # time spent finding the best splits
self.total_compute_hist_time = 0. # time spent computing histograms
self.total_apply_split_time = 0. # time spent splitting nodes
self._initialize_root(gradients, hessians, hessians_are_constant)
self.n_nodes = 1
def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth,
min_samples_leaf, min_gain_to_split,
l2_regularization, min_hessian_to_split):
"""Validate parameters passed to __init__.
Also validate parameters passed to splitter.
"""
if X_binned.dtype != np.uint8:
raise NotImplementedError(
"X_binned must be of type uint8.")
if not X_binned.flags.f_contiguous:
raise ValueError(
"X_binned should be passed as Fortran contiguous "
"array for maximum efficiency.")
if max_leaf_nodes is not None and max_leaf_nodes <= 1:
raise ValueError('max_leaf_nodes={} should not be'
' smaller than 2'.format(max_leaf_nodes))
if max_depth is not None and max_depth < 1:
raise ValueError('max_depth={} should not be'
' smaller than 1'.format(max_depth))
if min_samples_leaf < 1:
raise ValueError('min_samples_leaf={} should '
'not be smaller than 1'.format(min_samples_leaf))
if min_gain_to_split < 0:
raise ValueError('min_gain_to_split={} must be '
'non-negative.'.format(min_gain_to_split))
if l2_regularization < 0:
raise ValueError('l2_regularization={} must be '
'non-negative.'.format(l2_regularization))
if min_hessian_to_split < 0:
raise ValueError('min_hessian_to_split={} must be '
'non-negative.'.format(min_hessian_to_split))
def grow(self):
"""Grow the tree, from root to leaves."""
while self.splittable_nodes:
self.split_next()
self._apply_shrinkage()
def _apply_shrinkage(self):
"""Multiply leaves values by shrinkage parameter.
This must be done at the very end of the growing process. If this were
done during the growing process e.g. in finalize_leaf(), then a leaf
would be shrunk but its sibling would potentially not be (if it's a
non-leaf), which would lead to a wrong computation of the 'middle'
value needed to enforce the monotonic constraints.
"""
for leaf in self.finalized_leaves:
leaf.value *= self.shrinkage
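# For instance (hypothetical values), with shrinkage=0.1 a leaf value of
# 2.0 becomes 0.2; siblings are scaled consistently because all leaves
# are shrunk here at once.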
def _initialize_root(self, gradients, hessians, hessians_are_constant):
"""Initialize root node and finalize it if needed."""
n_samples = self.X_binned.shape[0]
depth = 0
sum_gradients = sum_parallel(gradients)
if self.histogram_builder.hessians_are_constant:
sum_hessians = hessians[0] * n_samples
else:
sum_hessians = sum_parallel(hessians)
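# E.g. (hypothetical values) constant hessians [1.0] with n_samples=100
# give sum_hessians == 100.0 without summing over each sample.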
self.root = TreeNode(
depth=depth,
sample_indices=self.splitter.partition,
sum_gradients=sum_gradients,
sum_hessians=sum_hessians,
value=0
)
self.root.partition_start = 0
self.root.partition_stop = n_samples
if self.root.n_samples < 2 * self.min_samples_leaf:
# Do not even bother computing any splitting statistics.
self._finalize_leaf(self.root)
return
if sum_hessians < self.splitter.min_hessian_to_split:
self._finalize_leaf(self.root)
return
self.root.histograms = self.histogram_builder.compute_histograms_brute(
self.root.sample_indices)
self._compute_best_split_and_push(self.root)
def _compute_best_split_and_push(self, node):
"""Compute the best possible split (SplitInfo) of a given node.
Also push it onto the heap of splittable nodes if gain isn't zero.
The gain of a node is 0 if either all the leaves are pure
(best gain = 0), or if no split would satisfy the constraints
(min_hessian_to_split, min_gain_to_split, min_samples_leaf).
"""