import numpy as np
from scipy.sparse import issparse, csc_matrix
from scipy import stats
from sklearn.base import BaseEstimator
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, GroupKFold
from sklearn.utils import check_array, check_X_y
from .errors import _check_error_flag
from _glmnet import elnet, spelnet, solns
from glmnet.util import (_fix_lambda_path,
_check_user_lambda,
_interpolate_model,
_score_lambda_path)
class ElasticNet(BaseEstimator):
"""Elastic Net with squared error loss.
This is a wrapper for the glmnet function elnet.
Parameters
----------
    alpha : float, default 1
        The mixing parameter, with 0 <= alpha <= 1: alpha=0 corresponds to
        ridge (pure L2 penalty) and alpha=1 to lasso (pure L1 penalty).
n_lambda : int, default 100
Maximum number of lambda values to compute
min_lambda_ratio : float, default 1e-4
In combination with n_lambda, the ratio of the smallest and largest
values of lambda computed.
lambda_path : array, default None
In place of supplying n_lambda, provide an array of specific values
to compute. The specified values must be in decreasing order. When
None, the path of lambda values will be determined automatically. A
maximum of `n_lambda` values will be computed.
standardize : bool, default True
Standardize input features prior to fitting. The final coefficients
will be on the scale of the original data regardless of the value
of standardize.
lower_limits : array, (shape n_features,) default -infinity
Array of lower limits for each coefficient, must be non-positive.
Can be a single value (which is then replicated), else an array
corresponding to the number of features.
upper_limits : array, (shape n_features,) default +infinity
Array of upper limits for each coefficient, must be positive.
See lower_limits.
fit_intercept : bool, default True
Include an intercept term in the model.
    cut_point : float, default 1
        The cut point to use for selecting lambda_best: the largest lambda
        such that
        cv_score(lambda) >= cv_score(lambda_max) - cut_point * standard_error(lambda_max)
n_splits : int, default 3
Number of cross validation folds for computing performance metrics
(including determination of `lambda_best_` and `lambda_max_`). If
non-zero, must be at least 3.
scoring : string, callable, or None, default None
Scoring method for model selection during cross validation. When None,
defaults to r^2 score. Valid options are `r2`, `mean_squared_error`,
`mean_absolute_error`, `median_absolute_error`. Alternatively, supply
a function or callable object with the following signature
        ``scorer(estimator, X, y)``. Note that the scoring function affects
        the selection of `lambda_best_` and `lambda_max_`; fitting the same
        data with different scoring methods will result in the selection of
        different models.
n_jobs : int, default 1
Maximum number of threads for computing cross validation metrics.
tol : float, default 1e-7
Convergence tolerance.
max_iter : int, default 100000
Maximum passes over the data for all values of lambda.
    random_state : number, default None
        Seed for the random number generator. The glmnet solver is not
        deterministic; this seed is used for determining the cv folds.
    max_features : int, optional
        Maximum number of features with nonzero coefficients after
        regularization. If not set, defaults to X.shape[1] during fit.
        Note that this will be ignored if the user specifies lambda_path.
    verbose : bool, default False
        When False, some warnings and log messages are suppressed.
Attributes
----------
    n_lambda_ : int
        The number of lambda values found by glmnet. Note that this may be
        less than the number specified via n_lambda.
lambda_path_ : array, shape (n_lambda_,)
The values of lambda found by glmnet, in decreasing order.
coef_path_ : array, shape (n_features, n_lambda_)
The set of coefficients for each value of lambda in lambda_path_.
coef_ : array, shape (n_features,)
The coefficients corresponding to lambda_best_.
intercept_ : float
The intercept corresponding to lambda_best_.
intercept_path_ : array, shape (n_lambda_,)
The intercept for each value of lambda in lambda_path_.
    cv_mean_score_ : array, shape (n_lambda_,)
        The mean cv score for each value of lambda. This is set during fit
        when n_splits >= 3.
    cv_standard_error_ : array, shape (n_lambda_,)
        The standard error of the mean cv score for each value of lambda.
        This is set during fit when n_splits >= 3.
lambda_max_ : float
The value of lambda that gives the best performance in cross
validation.
    lambda_best_ : float
        The largest value of lambda which is greater than or equal to
        lambda_max_ and whose cv score is within cut_point standard errors
        of the score at lambda_max_.
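
    Examples
    --------
    A minimal usage sketch on synthetic data (illustrative only; the
    top-level import follows this package's README):

    >>> import numpy as np
    >>> from glmnet import ElasticNet
    >>> X = np.random.rand(100, 10)
    >>> y = X.dot(np.arange(10)) + np.random.rand(100)
    >>> m = ElasticNet(alpha=0.5, n_splits=5).fit(X, y)
    >>> preds = m.predict(X)  # predictions at lambda_best_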
"""
def __init__(self, alpha=1, n_lambda=100, min_lambda_ratio=1e-4,
lambda_path=None, standardize=True, fit_intercept=True,
lower_limits=-np.inf, upper_limits=np.inf,
cut_point=1.0, n_splits=3, scoring=None, n_jobs=1, tol=1e-7,
max_iter=100000, random_state=None, max_features=None, verbose=False):
self.alpha = alpha
self.n_lambda = n_lambda
self.min_lambda_ratio = min_lambda_ratio
self.lambda_path = lambda_path
self.standardize = standardize
self.lower_limits = lower_limits
self.upper_limits = upper_limits
self.fit_intercept = fit_intercept
self.cut_point = cut_point
self.n_splits = n_splits
self.scoring = scoring
self.n_jobs = n_jobs
self.tol = tol
self.max_iter = max_iter
self.random_state = random_state
self.max_features = max_features
self.verbose = verbose
def fit(self, X, y, sample_weight=None, relative_penalties=None, groups=None):
"""Fit the model to training data. If n_splits > 1 also run n-fold cross
validation on all values in lambda_path.
The model will be fit n+1 times. On the first pass, the lambda_path
will be determined, on the remaining passes, the model performance for
each value of lambda. After cross validation, the attribute
`cv_mean_score_` will contain the mean score over all folds for each
value of lambda, and `cv_standard_error_` will contain the standard
error of `cv_mean_score_` for each value of lambda. The value of lambda
which achieves the best performance in cross validation will be saved
to `lambda_max_` additionally, the largest value of lambda s.t.:
cv_score(l) >= cv_score(lambda_max_) -\
cut_point * standard_error(lambda_max_)
will be saved to `lambda_best_`.
Parameters
----------
X : array, shape (n_samples, n_features)
Input features
y : array, shape (n_samples,)
Target values
sample_weight : array, shape (n_samples,)
Optional weight vector for observations
        relative_penalties : array, shape (n_features,)
            Optional relative weight vector for penalty.
            0 entries remove penalty.
        groups : array, shape (n_samples,)
            Group labels for the samples used while splitting the dataset
            into train/test set. If groups are specified, they are passed
            to sklearn.model_selection.GroupKFold. If None, the data is
            split randomly for K-fold cross validation via
            sklearn.model_selection.KFold.
Returns
-------
self : object
Returns self.
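
        Examples
        --------
        A sketch of group-aware cross validation (the ``groups`` array below
        is hypothetical; X and y are assumed to be fit-ready arrays):

        >>> groups = np.repeat(np.arange(20), 5)  # 100 samples, 20 groups
        >>> m = ElasticNet(n_splits=3).fit(X, y, groups=groups)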
"""
X, y = check_X_y(X, y, accept_sparse='csr', ensure_min_samples=2)
if sample_weight is None:
sample_weight = np.ones(X.shape[0])
else:
sample_weight = np.asarray(sample_weight)
if not np.isscalar(self.lower_limits):
self.lower_limits = np.asarray(self.lower_limits)
if len(self.lower_limits) != X.shape[1]:
raise ValueError("lower_limits must equal number of features")
if not np.isscalar(self.upper_limits):
self.upper_limits = np.asarray(self.upper_limits)
if len(self.upper_limits) != X.shape[1]:
raise ValueError("upper_limits must equal number of features")
        # np.any handles both the scalar and the array case
        if np.any(self.lower_limits > 0):
            raise ValueError("lower_limits must be non-positive")
        if np.any(self.upper_limits < 0):
            raise ValueError("upper_limits must be positive")
if self.alpha > 1 or self.alpha < 0:
raise ValueError("alpha must be between 0 and 1")
if self.n_splits > 0 and self.n_splits < 3:
raise ValueError("n_splits must be at least 3")
self._fit(X, y, sample_weight, relative_penalties)
if self.n_splits >= 3:
if groups is None:
self._cv = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
else:
self._cv = GroupKFold(n_splits=self.n_splits)
cv_scores = _score_lambda_path(self, X, y, groups,
sample_weight,
relative_penalties,
self.scoring,
n_jobs=self.n_jobs,
verbose=self.verbose)
self.cv_mean_score_ = np.atleast_1d(np.mean(cv_scores, axis=0))
self.cv_standard_error_ = np.atleast_1d(stats.sem(cv_scores))
self.lambda_max_inx_ = np.argmax(self.cv_mean_score_)
self.lambda_max_ = self.lambda_path_[self.lambda_max_inx_]
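            # "one standard error"-style rule: lambda_best_ is the largest
            # lambda whose mean CV score is within cut_point standard errors
            # of the best mean score; lambda_path_ is in decreasing order,
            # so the first index that clears the threshold corresponds to
            # the largest such lambda.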
target_score = self.cv_mean_score_[self.lambda_max_inx_] -\
self.cut_point * self.cv_standard_error_[self.lambda_max_inx_]
self.lambda_best_inx_ = np.argwhere(self.cv_mean_score_ >= target_score)[0]
self.lambda_best_ = self.lambda_path_[self.lambda_best_inx_]
self.coef_ = self.coef_path_[..., self.lambda_best_inx_]
self.coef_ = self.coef_.squeeze(axis=self.coef_.ndim-1)
self.intercept_ = self.intercept_path_[..., self.lambda_best_inx_].squeeze()
if self.intercept_.shape == (): # convert 0d array to scalar
self.intercept_ = float(self.intercept_)
return self
    def _fit(self, X, y, sample_weight, relative_penalties):
        """Solve the full lambda path, dispatching to the Fortran routine
        spelnet for sparse X and elnet for dense X.
        """
if self.lambda_path is not None:
n_lambda = len(self.lambda_path)
min_lambda_ratio = 1.0
else:
n_lambda = self.n_lambda
min_lambda_ratio = self.min_lambda_ratio
_y = y.astype(dtype=np.float64, order='F', copy=True)
_sample_weight = sample_weight.astype(dtype=np.float64, order='F',
copy=True)
exclude_vars = 0
if relative_penalties is None:
relative_penalties = np.ones(X.shape[1], dtype=np.float64,
order='F')
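        # glmnet expects the box constraints stacked into a single
        # (2, n_features) Fortran-ordered array: row 0 holds the lower
        # limits, row 1 the upper limits; scalar limits broadcast across
        # all features.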
coef_bounds = np.empty((2, X.shape[1]), dtype=np.float64, order='F')
coef_bounds[0, :] = self.lower_limits
coef_bounds[1, :] = self.upper_limits
if X.shape[1] > X.shape[0]:
# the glmnet docs suggest using a different algorithm for the case
# of p >> n
algo_flag = 2
else:
algo_flag = 1
# This is a stopping criterion (nx)
# R defaults to nx = num_features, and ne = num_features + 1
if self.max_features is None:
max_features = X.shape[1]
else:
max_features = self.max_features
if issparse(X):
_x = csc_matrix(X, dtype=np.float64, copy=True)
(self.n_lambda_,
self.intercept_path_,
ca,
ia,
nin,
_, # rsq
self.lambda_path_,
_, # nlp
jerr) = spelnet(algo_flag,
self.alpha,
_x.shape[0],
_x.shape[1],
_x.data,
_x.indptr + 1, # Fortran uses 1-based indexing
_x.indices + 1,
_y,
_sample_weight,
exclude_vars,
relative_penalties,
coef_bounds,
max_features,
X.shape[1] + 1,
min_lambda_ratio,
self.lambda_path,
self.tol,
n_lambda,
self.standardize,
self.fit_intercept,
self.max_iter)
else:
_x = X.astype(dtype=np.float64, order='F', copy=True)
(self.n_lambda_,
self.intercept_path_,
ca,
ia,
nin,
_, # rsq
self.lambda_path_,
_, # nlp
jerr) = elnet(algo_flag,
self.alpha,
_x,
_y,
_sample_weight,
exclude_vars,
relative_penalties,
coef_bounds,
X.shape[1] + 1,
min_lambda_ratio,
self.lambda_path,
self.tol,
max_features,
n_lambda,
self.standardize,
self.fit_intercept,
self.max_iter)
        # _check_error_flag raises a RuntimeError if jerr reports a fatal
        # error in the Fortran solver
        self.jerr_ = jerr
        _check_error_flag(self.jerr_)
        # trim the pre-allocated arrays returned by glmnet to match the
        # actual number of values found for lambda
        self.lambda_path_ = self.lambda_path_[:self.n_lambda_]
        self.lambda_path_ = _fix_lambda_path(self.lambda_path_)
        self.intercept_path_ = self.intercept_path_[:self.n_lambda_]
        ca = ca[:, :self.n_lambda_]
        nin = nin[:self.n_lambda_]
self.coef_path_ = solns(_x.shape[1], ca, ia, nin)
return self
    def decision_function(self, X, lamb=None):
        """Compute the linear predictor, X.dot(coef) + intercept, for each
        value of lambda in lamb. When lamb is None, lambda_best_ is used if
        cross validation was run during fit.
        """
        lambda_best = None
if hasattr(self, 'lambda_best_'):
lambda_best = self.lambda_best_
lamb = _check_user_lambda(self.lambda_path_, lambda_best, lamb)
coef, intercept = _interpolate_model(self.lambda_path_,
self.coef_path_,
self.intercept_path_, lamb)
X = check_array(X, accept_sparse='csr')
z = X.dot(coef) + intercept
# drop last dimension (lambda path) when we are predicting for a
# single value of lambda
if lamb.shape[0] == 1:
z = z.squeeze(axis=-1)
return z
def predict(self, X, lamb=None):
"""Predict the response Y for each sample in X
Parameters
----------
X : array, shape (n_samples, n_features)
lamb : array, shape (n_lambda,)
Values of lambda from lambda_path_ from which to make predictions.
If no values are provided, the returned predictions will be those
            corresponding to lambda_best_. The values of lamb must also be in
            the range of lambda_path_; values greater than max(lambda_path_)
            or less than min(lambda_path_) will be clipped.
Returns
-------
C : array, shape (n_samples,) or (n_samples, n_lambda)
Predicted response value for each sample given each value of lambda
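
        Examples
        --------
        A sketch of predicting along part of the fitted path (assumes a
        fitted model ``m`` as in the class docstring):

        >>> preds = m.predict(X, lamb=m.lambda_path_[:5])  # shape (n_samples, 5)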
"""
return self.decision_function(X, lamb)
def score(self, X, y, lamb=None):
"""Returns the coefficient of determination R^2 for each value of lambda.
Parameters
----------
X : array, shape (n_samples, n_features)
Test samples
y : array, shape (n_samples,)
True values for X
lamb : array, shape (n_lambda,)
Values from lambda_path_ for which to score predictions.
Returns
-------
scores : array, shape (n_lambda,)
R^2 of predictions for each lambda.
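
        Examples
        --------
        A brief sketch (assumes a fitted model ``m``; values are
        illustrative):

        >>> s_best = m.score(X, y)  # R^2 at lambda_best_
        >>> s_path = m.score(X, y, lamb=m.lambda_path_[:3])  # three lambdas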
"""
# pred will have shape (n_samples, n_lambda)
pred = self.predict(X, lamb=lamb)
# Reverse the args of the r2_score function from scikit-learn. The
# function np.apply_along_axis passes an array as the first argument to
# the provided function and any extra args as the subsequent arguments.
def r2_reverse(y_pred, y_true):
return r2_score(y_true, y_pred)
# compute the score for each value of lambda
return np.apply_along_axis(r2_reverse, 0, pred, y)