
'''T-tests and descriptive statistics with weights


Created on 2010-09-18

Author: josef-pktd
License: BSD (3-clause)


References
----------
SPSS manual
SAS manual

This follows, in large part, the SPSS manual, which is largely the same
as the SAS manual but with different, simpler notation.

Freq and Weight in SAS seem redundant, since they always show up as a
product; SPSS has only weights.

Notes
-----

This has potential problems with ddof. I started to follow numpy with
ddof=0 by default, and users can change it, but this might still mess up
the t-tests, since the estimates for the standard deviation will be based
on the ddof that the user chooses.
- fixed ddof for the meandiff ttest; it now matches scipy.stats.ttest_ind

Note: scipy now has a separate pooled-variance option in ttest, but I
have not compared it yet.

'''


import numpy as np
from scipy import stats

from statsmodels.tools.decorators import cache_readonly


class DescrStatsW(object):
    '''descriptive statistics and tests with case weights

    Assumes that the data is 1d or 2d with (nobs, nvars) observations in rows,
    variables in columns, and that the same weight applies to each column.

    If a degrees-of-freedom correction is used, then the weights should
    add up to the number of observations; the t-tests also assume that
    the sum of weights corresponds to the sample size.

    If the weights are integers, this is essentially the same as
    replicating each observation by its weight; such weights are often
    called case or frequency weights.

    Parameters
    ----------
    data : array_like, 1-D or 2-D
        dataset
    weights : None or 1-D ndarray
        weights for each observation, with the same length as the zero
        axis of `data`
    ddof : int
        default is ddof=0; degrees of freedom correction used for second
        moments: var, std, cov, corrcoef. Statistical tests are
        independent of `ddof` and use the standard formulas.

    Examples
    --------

    >>> import numpy as np
    >>> np.random.seed(0)
    >>> x1_2d = 1.0 + np.random.randn(20, 3)
    >>> w1 = np.random.randint(1, 4, 20)
    >>> d1 = DescrStatsW(x1_2d, weights=w1)
    >>> d1.mean
    array([ 1.42739844,  1.23174284,  1.083753  ])
    >>> d1.var
    array([ 0.94855633,  0.52074626,  1.12309325])
    >>> d1.std_mean
    array([ 0.14682676,  0.10878944,  0.15976497])

    >>> tstat, pval, df = d1.ttest_mean(0)
    >>> tstat; pval; df
    array([  9.72165021,  11.32226471,   6.78342055])
    array([  1.58414212e-12,   1.26536887e-14,   2.37623126e-08])
    44.0

    >>> tstat, pval, df = d1.ttest_mean([0, 1, 1])
    >>> tstat; pval; df
    array([ 9.72165021,  2.13019609,  0.52422632])
    array([  1.58414212e-12,   3.87842808e-02,   6.02752170e-01])
    44.0

    If the weights are integers, then ``asrepeats`` can be used:

    >>> x1r = d1.asrepeats()
    >>> x1r.shape
    ...
    >>> stats.ttest_1samp(x1r, [0, 1, 1])
    ...

    '''
    def __init__(self, data, weights=None, ddof=0):

        self.data = np.asarray(data)
        if weights is None:
            self.weights = np.ones(self.data.shape[0])
        else:
            # TODO: why squeeze?
            self.weights = np.asarray(weights).squeeze().astype(float)
        self.ddof = ddof


    @cache_readonly
    def sum_weights(self):
        """Sum of weights"""
        return self.weights.sum(0)

    @cache_readonly
    def nobs(self):
        '''alias for number of observations/cases, equal to sum of weights
        '''
        return self.sum_weights

    @cache_readonly
    def sum(self):
        '''weighted sum of data'''
        return np.dot(self.data.T, self.weights)

    @cache_readonly
    def mean(self):
        '''weighted mean of data'''
        return self.sum / self.sum_weights

    @cache_readonly
    def demeaned(self):
        '''data with weighted mean subtracted'''
        return self.data - self.mean

    @cache_readonly
    def sumsquares(self):
        '''weighted sum of squares of demeaned data'''
        return np.dot((self.demeaned**2).T, self.weights)

    # needs memoizing instead of the cache decorator,
    # since the method takes an argument
    def var_ddof(self, ddof=0):
        '''variance of data given ddof

        Parameters
        ----------
        ddof : int, float
            degrees of freedom correction, independent of attribute ddof

        Returns
        -------
        var : float, ndarray
            variance with denominator ``sum_weights - ddof``
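
        Examples
        --------
        With unit weights, this reduces to ``numpy.var`` with the same
        ``ddof``:

        >>> import numpy as np
        >>> x = np.array([1., 2., 3., 4.])
        >>> np.allclose(DescrStatsW(x).var_ddof(ddof=1), np.var(x, ddof=1))
        True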
        '''
        return self.sumsquares / (self.sum_weights - ddof)

    def std_ddof(self, ddof=0):
        '''standard deviation of data with given ddof

        Parameters
        ----------
        ddof : int, float
            degrees of freedom correction, independent of attribute ddof

        Returns
        -------
        std : float, ndarray
            standard deviation with denominator ``sum_weights - ddof``
        '''
        return np.sqrt(self.var_ddof(ddof=ddof))

    @cache_readonly
    def var(self):
        '''variance with default degrees of freedom correction
        '''
        return self.sumsquares / (self.sum_weights - self.ddof)

    @cache_readonly
    def _var(self):
        '''variance without degrees of freedom correction

        used for statistical tests with controlled ddof
        '''
        return self.sumsquares / self.sum_weights

    @cache_readonly
    def std(self):
        '''standard deviation with default degrees of freedom correction
        '''
        return np.sqrt(self.var)

    @cache_readonly
    def cov(self):
        '''weighted covariance of data if data is 2 dimensional

        assumes variables in columns and observations in rows
        uses default ddof
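
        Examples
        --------
        With unit weights and the default ``ddof=0``, this agrees with
        ``numpy.cov``:

        >>> import numpy as np
        >>> np.random.seed(0)
        >>> x = np.random.randn(50, 3)
        >>> np.allclose(DescrStatsW(x).cov, np.cov(x.T, ddof=0))
        True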
        '''
        cov_ = np.dot(self.weights * self.demeaned.T, self.demeaned)
        cov_ /= (self.sum_weights - self.ddof)
        return cov_

    @cache_readonly
    def corrcoef(self):
        '''weighted correlation with default ddof

        assumes variables in columns and observations in rows
        '''
        # normalize the covariance by the outer product of the
        # standard deviations
        return self.cov / self.std / self.std[:, None]

    @cache_readonly
    def std_mean(self):
        '''standard deviation of weighted mean
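
        With unit weights and the default ``ddof=0``, this is the usual
        standard error of the mean and matches ``scipy.stats.sem``:

        >>> import numpy as np
        >>> from scipy import stats
        >>> np.random.seed(0)
        >>> x = np.random.randn(30)
        >>> np.allclose(DescrStatsW(x).std_mean, stats.sem(x))
        True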
        '''
        std = self.std
        if self.ddof != 0:
            # undo the ddof correction so that the standard error is
            # always based on ddof=0 (multiplying also copies std)
            std = std * np.sqrt((self.sum_weights - self.ddof)
                                / self.sum_weights)

        return std / np.sqrt(self.sum_weights - 1)


    def quantile(self, probs, return_pandas=True):
        """
        Compute quantiles for a weighted sample.

        Parameters
        ----------
        probs : array_like
            A vector of probability points at which to calculate the
            quantiles.  Each element of `probs` should fall in [0, 1].
        return_pandas : bool
            If True, return value is a Pandas DataFrame or Series.
            Otherwise returns a ndarray.

        Returns
        -------
        quantiles : Series, DataFrame, or ndarray
            If `return_pandas` = True, returns one of the following:
              * data are 1d, `return_pandas` = True: a Series indexed by
                the probability points.
              * data are 2d, `return_pandas` = True: a DataFrame with
                the probability points as row index and the variables
                as column index.

            If `return_pandas` = False, returns an ndarray containing the
            same values as the Series/DataFrame.

        Notes
        -----
        To compute the quantiles, first, the weights are summed over
        exact ties yielding distinct data values y_1 < y_2 < ..., and
        corresponding weights w_1, w_2, ....  Let s_j denote the sum
        of the first j weights, and let W denote the sum of all the
        weights.  For a probability point p, if pW falls strictly
        between s_j and s_{j+1} then the estimated quantile is
        y_{j+1}.  If pW = s_j then the estimated quantile is (y_j +
        y_{j+1})/2.  If pW < s_1 then the estimated quantile is y_1.

        References
        ----------
        SAS documentation for weighted quantiles:

        https://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_univariate_sect028.htm
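
        Examples
        --------
        A small check of the tie handling described in the notes above:
        the two observations equal to 2 have combined weight 2, the
        cumulative weights are s_1 = 1, s_2 = 3, s_3 = 4, and for
        p = 0.5 the target pW = 2 falls strictly between s_1 and s_2,
        so the estimated median is y_2 = 2.

        >>> import numpy as np
        >>> d = DescrStatsW(np.array([1., 2., 2., 3.]))
        >>> float(d.quantile(0.5, return_pandas=False)[0])
        2.0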
        """

        import pandas as pd

        probs = np.asarray(probs)
        probs = np.atleast_1d(probs)

        if self.data.ndim == 1:
            rslt = self._quantile(self.data, probs)
            if return_pandas:
                rslt = pd.Series(rslt, index=probs)
        else:
            rslt = []
            for vec in self.data.T:
                rslt.append(self._quantile(vec, probs))
            rslt = np.column_stack(rslt)
            if return_pandas:
                columns = ["col%d" % (j+1) for j in range(rslt.shape[1])]
                rslt = pd.DataFrame(data=rslt, columns=columns, index=probs)

        if return_pandas:
            rslt.index.name = "p"

        return rslt


    def _quantile(self, vec, probs):
        # Helper function to calculate weighted quantiles for one column.
        # Follows definition from SAS documentation.
        # Returns ndarray

        import pandas as pd

        # Aggregate over ties
        df = pd.DataFrame(index=np.arange(len(self.weights)))
        df["weights"] = self.weights
        df["vec"] = vec
        dfg = df.groupby("vec").agg(np.sum)
        weights = dfg.values[:, 0]
        values = np.asarray(dfg.index)

        cweights = np.cumsum(weights)
        totwt = cweights[-1]
        targets = probs * totwt
        ii = np.searchsorted(cweights, targets)

        rslt = values[ii]

        # Exact hits: where pW equals a cumulative weight s_j, average
        # the two adjacent distinct values, following the SAS definition
        jj = np.flatnonzero(np.abs(targets - cweights[ii]) < 1e-10)
        jj = jj[ii[jj] < len(cweights) - 1]
        rslt[jj] = (values[ii[jj]] + values[ii[jj] + 1]) / 2

        return rslt


    def tconfint_mean(self, alpha=0.05, alternative='two-sided'):
        '''confidence interval for the weighted mean of the data

        If the data is 2d, then these are separate confidence intervals
        for each column.

        Parameters
        ----------
        alpha : float
            significance level for the confidence interval, coverage is
            ``1-alpha``
        alternative : str
            This specifies the alternative hypothesis for the test that
            corresponds to the confidence interval.
            The alternative hypothesis, H1, has to be one of the following

              'two-sided': H1: mean not equal to value (default)
              'larger' :   H1: mean larger than value
              'smaller' :  H1: mean smaller than value