# -*- coding: utf-8 -*-
"""
Created on Sat May 19 15:53:21 2018
Author: Josef Perktold
License: BSD-3
"""
from collections import defaultdict
import numpy as np
from statsmodels.base._penalties import SCADSmoothed
class ScreeningResults(object):
"""Results for Variable Screening
    Note: Indices are based on the combined [exog_keep, exog] array,
    except for idx_exog and, in the iterated case, idx_nonzero_batches.
Attributes
----------
results_final : instance
Results instance returned by the final fit of the penalized model, i.e.
after trimming exog with params below trimming threshold.
    results_pen : results instance
        Results instance of the penalized model before trimming. This
        includes variables from the last forward selection.
    idx_nonzero : ndarray
        index of exog columns in the final selection including exog_keep
    idx_exog : ndarray
        index of exog columns in the final selection for exog candidates,
        i.e. without exog_keep
    idx_excl : ndarray
        index of excluded exog based on the combined [exog_keep, exog]
        array. This is the complement of idx_nonzero.
converged : bool
True if the iteration has converged and stopped before maxiter has been
reached. False if maxiter has been reached.
iterations : int
number of iterations in the screening process. Each iteration consists
of a forward selection step and a trimming step.
    history : dict of lists
        results collected for each iteration during the screening
        process. The keys 'idx_nonzero', 'keep', 'params_keep' and
        'idx_added' are appended to in each iteration.
The ScreeningResults returned by `screen_exog_iterator` has additional
attributes:
    idx_nonzero_batches : ndarray, 2-D
        Two-dimensional array with the batch index in the first column
        and the variable index within the batch in the second column.
        They can be used jointly as index for the data in the
        exog_iterator.
exog_final_names : list[str]
        'var<bidx>_<idx>' where `bidx` is the batch index and `idx` is
        the index of the selected column within batch `bidx`.
history_batches : dict of lists
        This provides information about the selected variables within
        each batch during the first-round screening.
        'idx_nonzero' is based on the array that includes exog_keep,
        while 'idx_exog' is the index based on the exog of the batch.
"""
def __init__(self, screener, **kwds):
self.screener = screener
self.__dict__.update(**kwds)
class VariableScreening(object):
"""Ultra-high, conditional sure independence screening
This is an adjusted version of Fan's sure independence screening.
Parameters
----------
model : instance of penalizing model
examples: GLMPenalized, PoissonPenalized and LogitPenalized.
The attributes of the model instance `pen_weight` and `penal` will be
ignored.
    pen_weight : None or float
        penalization weight used in SCAD penalized MLE.
        If None, then pen_weight is set to nobs * 10.
    use_weights : bool
        If true (default), then the SCAD penalty uses zero weights for
        the always included exog_keep columns, so that only the candidate
        variables are penalized.
k_add : int
number of exog to add during expansion or forward selection
see Notes section for tie handling
k_max_add : int
maximum number of variables to include during variable addition, i.e.
forward selection. default is 30
threshold_trim : float
threshold for trimming parameters to zero, default is 1e-4
k_max_included : int
maximum total number of variables to include in model.
ranking_attr : str
This determines the result attribute or model method that is used for
the ranking of exog to include. The availability of attributes depends
on the model.
Default is 'resid_pearson', 'model.score_factor' can be used in GLM.
ranking_project : bool
If ranking_project is True, then the exog candidates for inclusion are
first projected on the already included exog before the computation
of the ranking measure. This brings the ranking measure closer to
the statistic of a score test for variable addition.
Notes
-----
Status: experimental, tested only on a limited set of models and
with a limited set of model options.
Tie handling: If there are ties at the decision threshold, then all those
tied exog columns are treated in the same way. During forward selection
all exog columns with the same boundary value are included. During
elimination, the tied columns are not dropped. Consequently, if ties are
present, then the number of included exog can be larger than specified
by k_add, k_max_add and k_max_included.
    The screening algorithm works similarly to stepwise regression. Each
    iteration of the screening algorithm includes a forward selection
    step where variables are added to the model, and a backwards
    selection step where variables are removed. In contrast to stepwise
    regression, a fixed number of variables is added in each forward
    selection step. The backwards selection step is based on SCAD
    penalized estimation and on trimming variables with estimated
    coefficients below a threshold.
The tuning parameters can be used to adjust the number of variables to add
and to include depending on the size of the dataset.
There is currently no automatic tuning parameter selection. Candidate
explanatory variables should be standardized or should be on a similar
scale because penalization and trimming are based on the absolute values
of the parameters.
TODOs and current limitations:
    freq_weights are not supported. Candidate ranking uses moment
    conditions with resid_pearson or similar, which do not include
    freq_weights; GLM resid_pearson does not include freq_weights.
    variable names: do we keep track of those? currently made-up names.
    currently only supports numpy arrays, no exog type check or conversion.
    currently only single columns are selected, no terms (multi-column exog).
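
    Examples
    --------
    A minimal usage sketch with simulated data; this is an illustrative
    assumption, not a tested recipe. A penalized model class such as
    ``GLMPenalized`` is assumed to be built ad hoc by combining
    ``PenalizedMixin`` with ``GLM``.

    >>> import numpy as np
    >>> from statsmodels.genmod.generalized_linear_model import GLM
    >>> from statsmodels.base._penalized import PenalizedMixin
    >>> class GLMPenalized(PenalizedMixin, GLM):
    ...     pass
    >>> np.random.seed(987125)
    >>> nobs, k_candidates = 500, 100
    >>> x_candidates = np.random.randn(nobs, k_candidates)
    >>> y = x_candidates[:, :3].sum(1) + np.random.randn(nobs)
    >>> # exog of the base model holds the always kept columns (constant)
    >>> mod = GLMPenalized(y, np.ones((nobs, 1)))
    >>> screener = VariableScreening(mod)
    >>> res = screener.screen_exog(x_candidates)  # doctest: +SKIP
    >>> res.idx_nonzero  # doctest: +SKIP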
"""
def __init__(self, model, pen_weight=None, use_weights=True, k_add=30,
k_max_add=30, threshold_trim=1e-4, k_max_included=20,
ranking_attr='resid_pearson', ranking_project=True):
self.model = model
self.model_class = model.__class__
self.init_kwds = model._get_init_kwds()
        # pen_weight and penal are passed explicitly when the models are
        # created below, so remove them from the stored init_kwds
        # TODO: check what we want to do here
self.init_kwds.pop('pen_weight', None)
self.init_kwds.pop('penal', None)
self.endog = model.endog
self.exog_keep = model.exog
self.k_keep = model.exog.shape[1]
self.nobs = len(self.endog)
self.penal = self._get_penal()
if pen_weight is not None:
self.pen_weight = pen_weight
else:
self.pen_weight = self.nobs * 10
# option for screening algorithm
self.use_weights = use_weights
self.k_add = k_add
self.k_max_add = k_max_add
self.threshold_trim = threshold_trim
self.k_max_included = k_max_included
self.ranking_attr = ranking_attr
self.ranking_project = ranking_project
def _get_penal(self, weights=None):
"""create new Penalty instance
"""
return SCADSmoothed(0.1, c0=0.0001, weights=weights)
def ranking_measure(self, res_pen, exog, keep=None):
"""compute measure for ranking exog candidates for inclusion
"""
endog = self.endog
if self.ranking_project:
assert res_pen.model.exog.shape[1] == len(keep)
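            # partial out the currently included exog: replace each
            # candidate column by its residual from a least squares
            # projection on ex_incl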
ex_incl = res_pen.model.exog[:, keep]
exog = exog - ex_incl.dot(np.linalg.pinv(ex_incl).dot(exog))
if self.ranking_attr == 'predicted_poisson':
# I keep this for more experiments
# TODO: does it really help to change/trim params
# we are not reestimating with trimmed model
p = res_pen.params.copy()
if keep is not None:
p[~keep] = 0
predicted = res_pen.model.predict(p)
# this is currently hardcoded for Poisson
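            # Pearson residual for Poisson, where the variance equals
            # the mean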
resid_factor = (endog - predicted) / np.sqrt(predicted)
elif self.ranking_attr[:6] == 'model.':
# use model method, this is intended for score_factor
attr = self.ranking_attr.split('.')[1]
resid_factor = getattr(res_pen.model, attr)(res_pen.params)
if resid_factor.ndim == 2:
# for score_factor when extra params are in model
resid_factor = resid_factor[:, 0]
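            # ranking measure: squared moment condition of the residual
            # factor with each candidate column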
mom_cond = np.abs(resid_factor.dot(exog))**2
else:
# use results attribute
resid_factor = getattr(res_pen, self.ranking_attr)
mom_cond = np.abs(resid_factor.dot(exog))**2
return mom_cond
def screen_exog(self, exog, endog=None, maxiter=100, method='bfgs',
disp=False, fit_kwds=None):
"""screen and select variables (columns) in exog
Parameters
----------
exog : ndarray
candidate explanatory variables that are screened for inclusion in
the model
        endog : None or ndarray, optional
            use a different endog in the screening model.
            This is not tested yet and might not work correctly.
        maxiter : int
            maximum number of screening iterations
        method : str
            optimization method to use in fit, needs to be one of the
            gradient optimizers
        disp : bool
            display option for fit during optimization
        fit_kwds : None or dict
            additional keyword arguments for the fit of the penalized
            model; these update the defaults maxiter=200, disp=False
Returns
-------
        res_screen : instance of ScreeningResults
            The attribute `results_final` is the results instance of the
            final model selection.
            `idx_nonzero` contains the index of the selected exog in the
            full exog, i.e. the combined exog that are always kept plus
            the exog candidates.
            See ScreeningResults for a full description.
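
        Examples
        --------
        A hypothetical call, reusing `screener` and `x_candidates` from
        the sketch in the class docstring:

        >>> res = screener.screen_exog(x_candidates, maxiter=20)  # doctest: +SKIP
        >>> res.idx_nonzero  # doctest: +SKIP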
"""
model_class = self.model_class
if endog is None:
# allow a different endog than used in model
endog = self.endog
x0 = self.exog_keep
k_keep = self.k_keep
x1 = exog
k_current = x0.shape[1]
# TODO: remove the need for x, use x1 separately from x0
# needs change to idx to be based on x1 (candidate variables)
x = np.column_stack((x0, x1))
nobs, k_vars = x.shape
fkwds = fit_kwds if fit_kwds is not None else {}
fit_kwds = {'maxiter': 200, 'disp': False}
fit_kwds.update(fkwds)
history = defaultdict(list)
        idx_nonzero = np.arange(k_keep, dtype=int)
keep = np.ones(k_keep, np.bool_)
idx_excl = np.arange(k_keep, k_vars)
mod_pen = model_class(endog, x0, **self.init_kwds)
# do not penalize initial estimate
mod_pen.pen_weight = 0
res_pen = mod_pen.fit(**fit_kwds)
start_params = res_pen.params
converged = False
idx_old = []
for it in range(maxiter):
# candidates for inclusion in next iteration
x1 = x[:, idx_excl]
mom_cond = self.ranking_measure(res_pen, x1, keep=keep)
assert len(mom_cond) == len(idx_excl)
            mcs = np.sort(mom_cond)[::-1]
            # threshold on the sorted ranking measure; the strict
            # comparison below adds at most idx_thr columns (more with
            # ties); cap at len(mcs) - 1 to avoid indexing past the end
            idx_thr = min((self.k_max_add, k_current + self.k_add,
                           len(mcs) - 1))
            threshold = mcs[idx_thr]
# indices of exog in current expansion model
idx = np.concatenate((idx_nonzero, idx_excl[mom_cond > threshold]))
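            # warm start: reuse estimates for previously selected
            # columns, newly added candidates start at zero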
start_params2 = np.zeros(len(idx))
start_params2[:len(start_params)] = start_params
if self.use_weights:
weights = np.ones(len(idx))
weights[:k_keep] = 0
# modify Penalty instance attached to self
                # dangerous if res_pen is reused
self.penal.weights = weights
mod_pen = model_class(endog, x[:, idx], penal=self.penal,
pen_weight=self.pen_weight,
**self.init_kwds)
res_pen = mod_pen.fit(method=method,
start_params=start_params2,
warn_convergence=False, skip_hessian=True,
**fit_kwds)
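            # trimming step: keep only variables whose penalized
            # estimate is above the threshold in absolute value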
keep = np.abs(res_pen.params) > self.threshold_trim
# use largest params to keep
if keep.sum() > self.k_max_included:
# TODO we can use now np.partition with partial sort
thresh_params = np.sort(np.abs(res_pen.params))[
-self.k_max_included]
keep2 = np.abs(res_pen.params) > thresh_params
keep = np.logical_and(keep, keep2)
# Note: idx and keep are for current expansion model
# idx_nonzero has indices of selected variables in full exog
keep[:k_keep] = True # always keep exog_keep
idx_nonzero = idx[keep]
if disp:
print(keep)
print(idx_nonzero)
# x0 is exog of currently selected model, not used in iteration
# x0 = x[:, idx_nonzero]
k_current = len(idx_nonzero)
start_params = res_pen.params[keep]
# use mask to get excluded indices
mask_excl = np.ones(k_vars, dtype=bool)
mask_excl[idx_nonzero] = False
idx_excl = np.nonzero(mask_excl)[0]
history['idx_nonzero'].append(idx_nonzero)
history['keep'].append(keep)
history['params_keep'].append(start_params)
history['idx_added'].append(idx)
if (len(idx_nonzero) == len(idx_old) and
(idx_nonzero == idx_old).all()):
converged = True
break
idx_old = idx_nonzero
        # final estimate
# check that we still have exog_keep
assert np.all(idx_nonzero[:k_keep] == np.arange(k_keep))
if self.use_weights:
weights = np.ones(len(idx_nonzero))
weights[:k_keep] = 0
            # create new Penalty instance to avoid sharing attached penal
penal = self._get_penal(weights=weights)
else:
            penal = self.penal