Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ genmod / families / family.py

'''
The one parameter exponential family distributions used by GLM.
'''
# TODO: quasi, quasibinomial, quasipoisson
# see
# http://www.biostat.jhsph.edu/~qli/biostatistics_r_doc/library/stats/html/family.html
# for comparison to R, and McCullagh and Nelder


import warnings
import inspect
import numpy as np
from scipy import special
from . import links as L
from . import varfuncs as V

FLOAT_EPS = np.finfo(float).eps


class Family(object):
    """
    The parent class for one-parameter exponential families.

    Parameters
    ----------
    link : a link function instance
        Link is the linear transformation function.
        See the individual families for available links.
    variance : a variance function
        Measures the variance as a function of the mean probabilities.
        See the individual families for the default variance function.

    See Also
    --------
    :ref:`links` : Further details on links.
    """
    # TODO: change these class attributes, use valid somewhere...
    valid = [-np.inf, np.inf]
    links = []

    def _setlink(self, link):
        """
        Helper method to set the link for a family.

        Raises a ``ValueError`` exception if the link is not available. Note
        that  the error message might not be that informative because it tells
        you that the link should be in the base class for the link function.

        See statsmodels.genmod.generalized_linear_model.GLM for a list of
        appropriate links for each family but note that not all of these are
        currently available.
        """
        # TODO: change the links class attribute in the families to hold
        # meaningful information instead of a list of links instances such as
        # [<statsmodels.family.links.Log object at 0x9a4240c>,
        #  <statsmodels.family.links.Power object at 0x9a423ec>,
        #  <statsmodels.family.links.Power object at 0x9a4236c>]
        # for Poisson...
        self._link = link
        if not isinstance(link, L.Link):
            raise TypeError("The input should be a valid Link object.")
        if hasattr(self, "links"):
            validlink = max([isinstance(link, _) for _ in self.links])
            if not validlink:
                errmsg = "Invalid link for family, should be in %s. (got %s)"
                raise ValueError(errmsg % (repr(self.links), link))

    def _getlink(self):
        """
        Helper method to get the link for a family.
        """
        return self._link

    # link property for each family is a pointer to link instance
    link = property(_getlink, _setlink, doc="Link function for family")

    def __init__(self, link, variance):
        if inspect.isclass(link):
            warnmssg = "Calling Family(..) with a link class as argument "
            warnmssg += "is deprecated.\n"
            warnmssg += "Use an instance of a link class instead."
            lvl = 2 if type(self) is Family else 3
            warnings.warn(warnmssg,
                          category=DeprecationWarning, stacklevel=lvl)
            self.link = link()
        else:
            self.link = link
        self.variance = variance

    def starting_mu(self, y):
        r"""
        Starting value for mu in the IRLS algorithm.

        Parameters
        ----------
        y : ndarray
            The untransformed response variable.

        Returns
        -------
        mu_0 : ndarray
            The first guess on the transformed response variable.

        Notes
        -----
        .. math::

           \mu_0 = (Y + \overline{Y})/2

        Only the Binomial family takes a different initial value.
        """
        return (y + y.mean())/2.

    def weights(self, mu):
        r"""
        Weights for IRLS steps

        Parameters
        ----------
        mu : array_like
            The transformed mean response variable in the exponential family

        Returns
        -------
        w : ndarray
            The weights for the IRLS steps

        Notes
        -----
        .. math::

           w = 1 / (g'(\mu)^2  * Var(\mu))
        """
        return 1. / (self.link.deriv(mu)**2 * self.variance(mu))

    def deviance(self, endog, mu, var_weights=1., freq_weights=1., scale=1.):
        r"""
        The deviance function evaluated at (endog, mu, var_weights,
        freq_weights, scale) for the distribution.

        Deviance is usually defined as twice the loglikelihood ratio.

        Parameters
        ----------
        endog : array_like
            The endogenous response variable
        mu : array_like
            The inverse of the link function at the linear predicted values.
        var_weights : array_like
            1d array of variance (analytic) weights. The default is 1.
        freq_weights : array_like
            1d array of frequency weights. The default is 1.
        scale : float, optional
            An optional scale argument. The default is 1.

        Returns
        -------
        Deviance : ndarray
            The value of deviance function defined below.

        Notes
        -----
        Deviance is defined

        .. math::

           D = 2\sum_i (freq\_weights_i * var\_weights *
           (llf(endog_i, endog_i) - llf(endog_i, \mu_i)))

        where y is the endogenous variable. The deviance functions are
        analytically defined for each family.

        Internally, we calculate deviance as:

        .. math::
           D = \sum_i freq\_weights_i * var\_weights * resid\_dev_i  / scale
        """
        resid_dev = self._resid_dev(endog, mu)
        return np.sum(resid_dev * freq_weights * var_weights / scale)

    def resid_dev(self, endog, mu, var_weights=1., scale=1.):
        r"""
        The deviance residuals

        Parameters
        ----------
        endog : array_like
            The endogenous response variable
        mu : array_like
            The inverse of the link function at the linear predicted values.
        var_weights : array_like
            1d array of variance (analytic) weights. The default is 1.
        scale : float, optional
            An optional scale argument. The default is 1.

        Returns
        -------
        resid_dev : float
            Deviance residuals as defined below.

        Notes
        -----
        The deviance residuals are defined by the contribution D_i of
        observation i to the deviance as

        .. math::
           resid\_dev_i = sign(y_i-\mu_i) \sqrt{D_i}

        D_i is calculated from the _resid_dev method in each family.
        Distribution-specific documentation of the calculation is available
        there.
        """
        resid_dev = self._resid_dev(endog, mu)
        resid_dev *= var_weights / scale
        return np.sign(endog - mu) * np.sqrt(np.clip(resid_dev, 0., np.inf))

    def fitted(self, lin_pred):
        r"""
        Fitted values based on linear predictors lin_pred.

        Parameters
        ----------
        lin_pred : ndarray
            Values of the linear predictor of the model.
            :math:`X \cdot \beta` in a classical linear model.

        Returns
        -------
        mu : ndarray
            The mean response variables given by the inverse of the link
            function.
        """
        fits = self.link.inverse(lin_pred)
        return fits

    def predict(self, mu):
        """
        Linear predictors based on given mu values.

        Parameters
        ----------
        mu : ndarray
            The mean response variables

        Returns
        -------
        lin_pred : ndarray
            Linear predictors based on the mean response variables.  The value
            of the link function at the given mu.
        """
        return self.link(mu)

    def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
        r"""
        The log-likelihood function for each observation in terms of the fitted
        mean response for the distribution.

        Parameters
        ----------
        endog : ndarray
            Usually the endogenous response variable.
        mu : ndarray
            Usually but not always the fitted mean response variable.
        var_weights : array_like
            1d array of variance (analytic) weights. The default is 1.
        scale : float
            The scale parameter. The default is 1.

        Returns
        -------
        ll_i : float
            The value of the loglikelihood evaluated at
            (endog, mu, var_weights, scale) as defined below.

        Notes
        -----
        This is defined for each family. endog and mu are not restricted to
        ``endog`` and ``mu`` respectively.  For instance, you could call
        both ``loglike(endog, endog)`` and ``loglike(endog, mu)`` to get the
        log-likelihood ratio.
        """
        raise NotImplementedError

    def loglike(self, endog, mu, var_weights=1., freq_weights=1., scale=1.):
        r"""
        The log-likelihood function in terms of the fitted mean response.

        Parameters
        ----------
        endog : ndarray
            Usually the endogenous response variable.
        mu : ndarray
            Usually but not always the fitted mean response variable.
        var_weights : array_like
            1d array of variance (analytic) weights. The default is 1.
        freq_weights : array_like
            1d array of frequency weights. The default is 1.
        scale : float
            The scale parameter. The default is 1.

        Returns
        -------
        ll : float
            The value of the loglikelihood evaluated at
            (endog, mu, var_weights, freq_weights, scale) as defined below.

        Notes
        -----
        Where :math:`ll_i` is the by-observation log-likelihood:

        .. math::
           ll = \sum(ll_i * freq\_weights_i)

        ``ll_i`` is defined for each family. endog and mu are not restricted
        to ``endog`` and ``mu`` respectively.  For instance, you could call
        both ``loglike(endog, endog)`` and ``loglike(endog, mu)`` to get the
        log-likelihood ratio.
        """
        ll_obs = self.loglike_obs(endog, mu, var_weights, scale)
        return np.sum(ll_obs * freq_weights)

    def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
        r"""
        The Anscombe residuals

        Parameters
        ----------
        endog : ndarray
            The endogenous response variable
        mu : ndarray
            The inverse of the link function at the linear predicted values.
        var_weights : array_like
            1d array of variance (analytic) weights. The default is 1.
        scale : float, optional
            An optional argument to divide the residuals by sqrt(scale).
            The default is 1.

        See Also
        --------
        statsmodels.genmod.families.family.Family : `resid_anscombe` for the
          individual families for more information

        Notes
        -----
        Anscombe residuals are defined by
Loading ...