# -*- coding: utf-8 -*-
"""Influence and Outlier Measures
Created on Sun Jan 29 11:16:09 2012
Author: Josef Perktold
License: BSD-3
"""
from collections import defaultdict
import numpy as np
from statsmodels.compat.python import lzip
from statsmodels.compat.pandas import Appender
from statsmodels.graphics._regressionplots_doc import _plot_influence_doc
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.multitest import multipletests
from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.tools import maybe_unwrap_results
# outliers test convenience wrapper
def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
order=False, cutoff=None):
"""
Outlier Tests for RegressionResults instances.
Parameters
----------
model_results : RegressionResults
Linear model results
method : str
- `bonferroni` : one-step correction
- `sidak` : one-step correction
- `holm-sidak` :
- `holm` :
- `simes-hochberg` :
- `hommel` :
- `fdr_bh` : Benjamini/Hochberg
- `fdr_by` : Benjamini/Yekutieli
See `statsmodels.stats.multitest.multipletests` for details.
alpha : float
familywise error rate
labels : None or array_like
If `labels` is not None, then it will be used as index to the
returned pandas DataFrame. See also Returns below
order : bool
Whether or not to order the results by the absolute value of the
studentized residuals. If labels are provided they will also be sorted.
cutoff : None or float in [0, 1]
If cutoff is not None, then the return only includes observations with
multiple testing corrected p-values strictly below the cutoff. The
returned array or dataframe can be empty if there are no outlier
candidates at the specified cutoff.
Returns
-------
table : ndarray or DataFrame
Returns either an ndarray or a DataFrame if labels is not None.
Will attempt to get labels from model_results if available. The
columns are the Studentized residuals, the unadjusted p-value,
and the corrected p-value according to method.
Notes
-----
The unadjusted p-value is stats.t.sf(abs(resid), df) where
df = df_resid - 1.
"""
from scipy import stats # lazy import
if labels is None:
labels = getattr(model_results.model.data, 'row_labels', None)
infl = getattr(model_results, 'get_influence', None)
if infl is None:
results = maybe_unwrap_results(model_results)
raise AttributeError("model_results object %s does not have a "
"get_influence "
"method." % results.__class__.__name__)
resid = infl().resid_studentized_external
if order:
idx = np.abs(resid).argsort()[::-1]
resid = resid[idx]
if labels is not None:
labels = np.asarray(labels)[idx]
df = model_results.df_resid - 1
unadj_p = stats.t.sf(np.abs(resid), df) * 2
adj_p = multipletests(unadj_p, alpha=alpha, method=method)
data = np.c_[resid, unadj_p, adj_p[1]]
if cutoff is not None:
mask = data[:, -1] < cutoff
data = data[mask]
else:
mask = slice(None)
if labels is not None:
from pandas import DataFrame
return DataFrame(data,
columns=['student_resid', 'unadj_p', method + "(p)"],
index=np.asarray(labels)[mask])
return data
# influence measures
def reset_ramsey(res, degree=5):
"""Ramsey's RESET specification test for linear models
This is a general specification test, for additional non-linear effects
in a model.
Parameters
----------
degree : int
Maximum power to include in the RESET test. Powers 0 and 1 are
excluded, so that degree tests powers 2, ..., degree of the fitted
values.
Notes
-----
The test fits an auxiliary OLS regression where the design matrix, exog,
is augmented by powers 2 to degree of the fitted values. Then it performs
an F-test whether these additional terms are significant.
If the p-value of the f-test is below a threshold, e.g. 0.1, then this
indicates that there might be additional non-linear effects in the model
and that the linear model is mis-specified.
References
----------
https://en.wikipedia.org/wiki/Ramsey_RESET_test
"""
order = degree + 1
k_vars = res.model.exog.shape[1]
# vander without constant and x, and drop constant
norm_values = np.asarray(res.fittedvalues)
norm_values = norm_values / np.sqrt((norm_values ** 2).mean())
y_fitted_vander = np.vander(norm_values, order)[:, :-2]
exog = np.column_stack((res.model.exog, y_fitted_vander))
exog /= np.sqrt((exog ** 2).mean(0))
endog = res.model.endog / (res.model.endog ** 2).mean()
res_aux = OLS(endog, exog).fit()
# r_matrix = np.eye(degree, exog.shape[1], k_vars)
r_matrix = np.eye(degree - 1, exog.shape[1], k_vars)
# df1 = degree - 1
# df2 = exog.shape[0] - degree - res.df_model (without constant)
return res_aux.f_test(r_matrix) # , r_matrix, res_aux
def variance_inflation_factor(exog, exog_idx):
"""variance inflation factor, VIF, for one exogenous variable
The variance inflation factor is a measure for the increase of the
variance of the parameter estimates if an additional variable, given by
exog_idx is added to the linear regression. It is a measure for
multicollinearity of the design matrix, exog.
One recommendation is that if VIF is greater than 5, then the explanatory
variable given by exog_idx is highly collinear with the other explanatory
variables, and the parameter estimates will have large standard errors
because of this.
Parameters
----------
exog : ndarray
design matrix with all explanatory variables, as for example used in
regression
exog_idx : int
index of the exogenous variable in the columns of exog
Returns
-------
vif : float
variance inflation factor
Notes
-----
This function does not save the auxiliary regression.
See Also
--------
xxx : class for regression diagnostics TODO: does not exist yet
References
----------
https://en.wikipedia.org/wiki/Variance_inflation_factor
"""
k_vars = exog.shape[1]
x_i = exog[:, exog_idx]
mask = np.arange(k_vars) != exog_idx
x_noti = exog[:, mask]
r_squared_i = OLS(x_i, x_noti).fit().rsquared
vif = 1. / (1. - r_squared_i)
return vif
class _BaseInfluenceMixin(object):
"""common methods between OLSInfluence and MLE/GLMInfluence
"""
@Appender(_plot_influence_doc.format(**{'extra_params_doc': ""}))
def plot_influence(self, external=None, alpha=.05, criterion="cooks",
size=48, plot_alpha=.75, ax=None, **kwargs):
if external is None:
external = hasattr(self, '_cache') and 'res_looo' in self._cache
from statsmodels.graphics.regressionplots import _influence_plot
res = _influence_plot(self.results, self, external=external,
alpha=alpha,
criterion=criterion, size=size,
plot_alpha=plot_alpha, ax=ax, **kwargs)
return res
def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,
**kwds):
from statsmodels.graphics import utils
fig, ax = utils.create_mpl_ax(ax)
if title is None:
title = "Index Plot"
nobs = len(self.endog)
index = np.arange(nobs)
ax.scatter(index, y, **kwds)
if threshold == 'all':
large_points = np.ones(nobs, np.bool_)
else:
large_points = np.abs(y) > threshold
psize = 3 * np.ones(nobs)
# add point labels
labels = self.results.model.data.row_labels
if labels is None:
labels = np.arange(nobs)
ax = utils.annotate_axes(np.where(large_points)[0], labels,
lzip(index, y),
lzip(-psize, psize), "large",
ax)
font = {"fontsize": 16, "color": "black"}
ax.set_ylabel(ylabel, **font)
ax.set_xlabel("Observation", **font)
ax.set_title(title, **font)
return fig
def plot_index(self, y_var='cooks', threshold=None, title=None, ax=None,
idx=None, **kwds):
"""index plot for influence attributes
Parameters
----------
y_var : str
Name of attribute or shortcut for predefined attributes that will
be plotted on the y-axis.
threshold : None or float
Threshold for adding annotation with observation labels.
Observations for which the absolute value of the y_var is larger
than the threshold will be annotated. Set to a negative number to
label all observations or to a large number to have no annotation.
title : str
If provided, the title will replace the default "Index Plot" title.
ax : matplolib axis instance
The plot will be added to the `ax` if provided, otherwise a new
figure is created.
idx : {None, int}
Some attributes require an additional index to select the y-var.
In dfbetas this refers to the column indes.
kwds : optional keywords
Keywords will be used in the call to matplotlib scatter function.
"""
criterion = y_var # alias
if threshold is None:
# TODO: criterion specific defaults
threshold = 'all'
if criterion == 'dfbeta':
y = self.dfbetas[:, idx]
ylabel = 'DFBETA for ' + self.results.model.exog_names[idx]
elif criterion.startswith('cook'):
y = self.cooks_distance[0]
ylabel = "Cook's distance"
elif criterion.startswith('hat') or criterion.startswith('lever'):
y = self.hat_matrix_diag
ylabel = "Leverage (diagonal of hat matrix)"
elif criterion.startswith('cook'):
y = self.cooks_distance[0]
ylabel = "Cook's distance"
elif criterion.startswith('resid'):
y = self.resid_studentized
ylabel = "Internally Studentized Residuals"
else:
# assume we have the name of an attribute
y = getattr(self, y_var)
if idx is not None:
y = y[idx]
ylabel = y_var
fig = self._plot_index(y, ylabel, threshold=threshold, title=title,
ax=ax, **kwds)
return fig
class MLEInfluence(_BaseInfluenceMixin):
"""Local Influence and outlier measures (experimental)
This currently subclasses GLMInfluence instead of the other way.
No common superclass yet.
This is another version before checking what is common
Parameters
----------
results : instance of results class
This only works for model and results classes that have the necessary
helper methods.
other arguments are only to override default behavior and are used instead
of the corresponding attribute of the results class.
By default resid_pearson is used as resid.
Attributes
----------
hat_matrix_diag (hii) : This is the generalized leverage computed as the
local derivative of fittedvalues (predicted mean) with respect to the
observed response for each observation.
d_params : Change in parameters computed with one Newton step using the
full Hessian corrected by division by (1 - hii).
dbetas : change in parameters divided by the standard error of parameters
from the full model results, ``bse``.
cooks_distance : quadratic form for change in parameters weighted by
``cov_params`` from the full model divided by the number of variables.
It includes p-values based on the F-distribution which are only
approximate outside of linear Gaussian models.
resid_studentized : In the general MLE case resid_studentized are
computed from the score residuals scaled by hessian factor and
leverage. This does not use ``cov_params``.
d_fittedvalues : local change of expected mean given the change in the
parameters as computed in ``d_params``.
d_fittedvalues_scaled : same as d_fittedvalues but scaled by the standard
errors of a predicted mean of the response.
params_one : is the one step parameter estimate computed as ``params``
from the full sample minus ``d_params``.
Notes
-----
MLEInfluence produces the same results as GLMInfluence (verified for GLM
Loading ...