# -*- coding: utf-8 -*-
"""Influence and Outlier Measures

Created on Sun Jan 29 11:16:09 2012

Author: Josef Perktold
License: BSD-3
"""
from collections import defaultdict

import numpy as np

from statsmodels.compat.python import lzip
from statsmodels.compat.pandas import Appender
from statsmodels.graphics._regressionplots_doc import _plot_influence_doc
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.multitest import multipletests
from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.tools import maybe_unwrap_results


# outliers test convenience wrapper

def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
                 order=False, cutoff=None):
    """
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults
        Linear model results
    method : str
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step-down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    labels : None or array_like
        If `labels` is not None, it will be used as the index of the
        returned pandas DataFrame. See also Returns below.
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals. If labels are provided they will also be sorted.
    cutoff : None or float in [0, 1]
        If cutoff is not None, then the return only includes observations with
        multiple testing corrected p-values strictly below the cutoff. The
        returned array or dataframe can be empty if there are no outlier
        candidates at the specified cutoff.

    Returns
    -------
    table : ndarray or DataFrame
        Returns a DataFrame if labels are available (passed in or obtained
        from model_results), otherwise an ndarray. The columns are the
        studentized residuals, the unadjusted p-values, and the p-values
        corrected according to method.

    Notes
    -----
    The unadjusted p-value is ``stats.t.sf(np.abs(resid), df) * 2`` (a
    two-sided test) where ``df = df_resid - 1``.
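
    Examples
    --------
    A minimal usage sketch on simulated data (the data and variable names
    here are purely illustrative)::

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> np.random.seed(0)
        >>> x = sm.add_constant(np.random.standard_normal(50))
        >>> y = x[:, 1] + np.random.standard_normal(50)
        >>> res = sm.OLS(y, x).fit()
        >>> table = outlier_test(res, method='bonf', alpha=0.05)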
    """
    from scipy import stats  # lazy import
    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)
    infl = getattr(model_results, 'get_influence', None)
    if infl is None:
        results = maybe_unwrap_results(model_results)
        raise AttributeError("model_results object %s does not have a "
                             "get_influence "
                             "method." % results.__class__.__name__)
    resid = infl().resid_studentized_external
    if order:
        idx = np.abs(resid).argsort()[::-1]
        resid = resid[idx]
        if labels is not None:
            labels = np.asarray(labels)[idx]
    df = model_results.df_resid - 1
    unadj_p = stats.t.sf(np.abs(resid), df) * 2
    adj_p = multipletests(unadj_p, alpha=alpha, method=method)

    data = np.c_[resid, unadj_p, adj_p[1]]
    if cutoff is not None:
        mask = data[:, -1] < cutoff
        data = data[mask]
    else:
        mask = slice(None)

    if labels is not None:
        from pandas import DataFrame
        return DataFrame(data,
                         columns=['student_resid', 'unadj_p', method + "(p)"],
                         index=np.asarray(labels)[mask])
    return data


# influence measures

def reset_ramsey(res, degree=5):
    """Ramsey's RESET specification test for linear models

    This is a general specification test for additional non-linear effects
    in a model.

    Parameters
    ----------
    res : RegressionResults
        A fitted linear regression results instance.
    degree : int
        Maximum power to include in the RESET test.  Powers 0 and 1 are
        excluded, so that the test includes powers 2, ..., degree of the
        fitted values.

    Notes
    -----
    The test fits an auxiliary OLS regression where the design matrix, exog,
    is augmented by powers 2 to degree of the fitted values. Then it performs
    an F-test whether these additional terms are significant.

    If the p-value of the F-test is below a threshold, e.g. 0.1, this
    indicates that there might be additional non-linear effects in the model
    and that the linear model is mis-specified.

    References
    ----------
    https://en.wikipedia.org/wiki/Ramsey_RESET_test
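
    Examples
    --------
    A minimal sketch on simulated data (data and names are illustrative
    only); here ``degree=3`` tests the squared and cubed fitted values::

        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> np.random.seed(0)
        >>> x = sm.add_constant(np.linspace(0, 10, 100))
        >>> y = 0.5 * x[:, 1] ** 2 + np.random.standard_normal(100)
        >>> res = sm.OLS(y, x).fit()
        >>> ftest = reset_ramsey(res, degree=3)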
    """
    order = degree + 1
    k_vars = res.model.exog.shape[1]
    # vander without constant and x, and drop constant
    norm_values = np.asarray(res.fittedvalues)
    norm_values = norm_values / np.sqrt((norm_values ** 2).mean())
    y_fitted_vander = np.vander(norm_values, order)[:, :-2]
    exog = np.column_stack((res.model.exog, y_fitted_vander))
    exog /= np.sqrt((exog ** 2).mean(0))
    endog = res.model.endog / (res.model.endog ** 2).mean()
    res_aux = OLS(endog, exog).fit()
    # r_matrix = np.eye(degree, exog.shape[1], k_vars)
    r_matrix = np.eye(degree - 1, exog.shape[1], k_vars)
    # df1 = degree - 1
    # df2 = exog.shape[0] - degree - res.df_model  (without constant)
    return res_aux.f_test(r_matrix)  # , r_matrix, res_aux


def variance_inflation_factor(exog, exog_idx):
    """variance inflation factor, VIF, for one exogenous variable

    The variance inflation factor is a measure of the increase in the
    variance of the parameter estimate of the explanatory variable given by
    exog_idx that is caused by its collinearity with the other explanatory
    variables. It is a measure of multicollinearity of the design matrix,
    exog.

    One recommendation is that if VIF is greater than 5, then the explanatory
    variable given by exog_idx is highly collinear with the other explanatory
    variables, and the parameter estimates will have large standard errors
    because of this.

    Parameters
    ----------
    exog : ndarray
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog

    Returns
    -------
    vif : float
        variance inflation factor

    Notes
    -----
    This function does not save the auxiliary regression.

    See Also
    --------
    OLSInfluence : class for regression influence and outlier diagnostics

    References
    ----------
    https://en.wikipedia.org/wiki/Variance_inflation_factor
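
    Examples
    --------
    An illustrative sketch with a simulated design matrix whose last column
    is nearly a linear combination of the others (data and names are made
    up); the constant column at index 0 is skipped::

        >>> import numpy as np
        >>> np.random.seed(0)
        >>> x = np.random.standard_normal((50, 2))
        >>> x3 = x[:, 0] + x[:, 1] + 0.1 * np.random.standard_normal(50)
        >>> exog = np.column_stack((np.ones(50), x, x3))
        >>> vifs = [variance_inflation_factor(exog, i)
        ...         for i in range(1, exog.shape[1])]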
    """
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    r_squared_i = OLS(x_i, x_noti).fit().rsquared
    vif = 1. / (1. - r_squared_i)
    return vif


class _BaseInfluenceMixin(object):
    """common methods between OLSInfluence and MLE/GLMInfluence
    """

    @Appender(_plot_influence_doc.format(**{'extra_params_doc': ""}))
    def plot_influence(self, external=None, alpha=.05, criterion="cooks",
                       size=48, plot_alpha=.75, ax=None, **kwargs):

        if external is None:
            external = hasattr(self, '_cache') and 'res_looo' in self._cache
        from statsmodels.graphics.regressionplots import _influence_plot
        res = _influence_plot(self.results, self, external=external,
                              alpha=alpha,
                              criterion=criterion, size=size,
                              plot_alpha=plot_alpha, ax=ax, **kwargs)
        return res

    def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,
                    **kwds):
        from statsmodels.graphics import utils
        fig, ax = utils.create_mpl_ax(ax)
        if title is None:
            title = "Index Plot"
        nobs = len(self.endog)
        index = np.arange(nobs)
        ax.scatter(index, y, **kwds)

        if threshold == 'all':
            large_points = np.ones(nobs, np.bool_)
        else:
            large_points = np.abs(y) > threshold
        psize = 3 * np.ones(nobs)
        # add point labels
        labels = self.results.model.data.row_labels
        if labels is None:
            labels = np.arange(nobs)
        ax = utils.annotate_axes(np.where(large_points)[0], labels,
                                 lzip(index, y),
                                 lzip(-psize, psize), "large",
                                 ax)

        font = {"fontsize": 16, "color": "black"}
        ax.set_ylabel(ylabel, **font)
        ax.set_xlabel("Observation", **font)
        ax.set_title(title, **font)
        return fig

    def plot_index(self, y_var='cooks', threshold=None, title=None, ax=None,
                   idx=None, **kwds):
        """index plot for influence attributes

        Parameters
        ----------
        y_var : str
            Name of attribute or shortcut for predefined attributes that will
            be plotted on the y-axis.
        threshold : None or float
            Threshold for adding annotation with observation labels.
            Observations for which the absolute value of the y_var is larger
            than the threshold will be annotated. Set to a negative number to
            label all observations or to a large number to have no annotation.
        title : str
            If provided, the title will replace the default "Index Plot" title.
        ax : matplotlib axis instance
            The plot will be added to the `ax` if provided, otherwise a new
            figure is created.
        idx : {None, int}
            Some attributes require an additional index to select the y-var.
            In dfbetas this refers to the column index.
        kwds : optional keywords
            Keywords will be used in the call to matplotlib scatter function.
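
        Examples
        --------
        A usage sketch; ``res`` is a hypothetical fitted OLS results
        instance::

            >>> infl = res.get_influence()
            >>> fig = infl.plot_index(y_var='cooks', threshold=1.)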
        """
        criterion = y_var  # alias
        if threshold is None:
            # TODO: criterion specific defaults
            threshold = 'all'

        if criterion == 'dfbeta':
            y = self.dfbetas[:, idx]
            ylabel = 'DFBETA for ' + self.results.model.exog_names[idx]
        elif criterion.startswith('cook'):
            y = self.cooks_distance[0]
            ylabel = "Cook's distance"
        elif criterion.startswith('hat') or criterion.startswith('lever'):
            y = self.hat_matrix_diag
            ylabel = "Leverage (diagonal of hat matrix)"
        elif criterion.startswith('resid'):
            y = self.resid_studentized
            ylabel = "Internally Studentized Residuals"
        else:
            # assume we have the name of an attribute
            y = getattr(self, y_var)
            if idx is not None:
                y = y[idx]
            ylabel = y_var

        fig = self._plot_index(y, ylabel, threshold=threshold, title=title,
                               ax=ax, **kwds)
        return fig


class MLEInfluence(_BaseInfluenceMixin):
    """Local Influence and outlier measures (experimental)

    This is experimental: GLMInfluence and MLEInfluence do not share a
    common superclass yet, and this is an interim version pending
    consolidation of the computations they have in common.

    Parameters
    ----------
    results : instance of results class
        This only works for model and results classes that have the necessary
        helper methods.
    other arguments
        Other arguments only override the default behavior and are used
        instead of the corresponding attribute of the results class. By
        default, ``resid_pearson`` is used as resid.

    Attributes
    ----------
    hat_matrix_diag (hii) : This is the generalized leverage computed as the
        local derivative of fittedvalues (predicted mean) with respect to the
        observed response for each observation.
    d_params : Change in parameters computed with one Newton step using the
        full Hessian corrected by division by (1 - hii).
    dfbetas : Change in parameters divided by the standard errors of the
        parameters from the full model results, ``bse``.
    cooks_distance : quadratic form for change in parameters weighted by
        ``cov_params`` from the full model divided by the number of variables.
        It includes p-values based on the F-distribution which are only
        approximate outside of linear Gaussian models.
    resid_studentized : In the general MLE case resid_studentized are
        computed from the score residuals scaled by hessian factor and
        leverage. This does not use ``cov_params``.
    d_fittedvalues : local change of expected mean given the change in the
        parameters as computed in ``d_params``.
    d_fittedvalues_scaled : same as d_fittedvalues but scaled by the standard
        errors of a predicted mean of the response.
    params_one : The one-step parameter estimate, computed as ``params``
        from the full sample minus ``d_params``.

    Notes
    -----
    MLEInfluence produces the same results as GLMInfluence (verified for GLM