'''T-tests and descriptive statistics with weights
Created on 2010-09-18
Author: josef-pktd
License: BSD (3-clause)
References
----------
SPSS manual
SAS manual
This follows the SPSS manual in large part; it is largely the same as
the SAS manual but uses different, simpler notation.
Having both Freq and Weight in SAS seems redundant, since they always
show up as a product; SPSS has only weights.
Notes
-----
This has potential problems with ddof. Following numpy, the default is
ddof=0 and users can change it, but this might still interfere with the
t-tests, since the estimates of the standard deviation depend on the
ddof that the user chooses.
- fixed ddof for the meandiff ttest, now matches scipy.stats.ttest_ind
Note: scipy now has a separate pooled-variance option in its t-test, but
the results have not been compared yet.
'''
import numpy as np
from scipy import stats
from statsmodels.tools.decorators import cache_readonly
class DescrStatsW(object):
'''descriptive statistics and statistical tests for data with case weights
Assumes that the data is 1d or 2d with (nobs, nvars) observations in rows,
variables in columns, and that the same weight applies to each column.
If degrees of freedom correction is used, then weights should add up to the
number of observations. ttest also assumes that the sum of weights
corresponds to the sample size.
This is essentially the same as replicating each observation by its
weight, if the weights are integers; such weights are often called case
or frequency weights.
Parameters
----------
data : array_like, 1-D or 2-D
dataset
weights : None or 1-D ndarray
weights for each observation, with same length as zero axis of data
ddof : int
default is ddof=0; degrees of freedom correction used for second
moments: var, std, cov, corrcoef.
However, statistical tests are independent of ``ddof`` and use the
standard formulas.
Examples
--------
>>> import numpy as np
>>> np.random.seed(0)
>>> x1_2d = 1.0 + np.random.randn(20, 3)
>>> w1 = np.random.randint(1, 4, 20)
>>> d1 = DescrStatsW(x1_2d, weights=w1)
>>> d1.mean
array([ 1.42739844, 1.23174284, 1.083753 ])
>>> d1.var
array([ 0.94855633, 0.52074626, 1.12309325])
>>> d1.std_mean
array([ 0.14682676, 0.10878944, 0.15976497])
>>> tstat, pval, df = d1.ttest_mean(0)
>>> tstat; pval; df
array([ 9.72165021, 11.32226471, 6.78342055])
array([ 1.58414212e-12, 1.26536887e-14, 2.37623126e-08])
44.0
>>> tstat, pval, df = d1.ttest_mean([0, 1, 1])
>>> tstat; pval; df
array([ 9.72165021, 2.13019609, 0.52422632])
array([ 1.58414212e-12, 3.87842808e-02, 6.02752170e-01])
44.0
If weights are integers, then ``asrepeats`` can be used:
>>> x1r = d1.asrepeats()
>>> x1r.shape
...
>>> stats.ttest_1samp(x1r, [0, 1, 1])
...
'''
def __init__(self, data, weights=None, ddof=0):
self.data = np.asarray(data)
if weights is None:
self.weights = np.ones(self.data.shape[0])
else:
# TODO: why squeeze?
self.weights = np.asarray(weights).squeeze().astype(float)
self.ddof = ddof
@cache_readonly
def sum_weights(self):
"""Sum of weights"""
return self.weights.sum(0)
@cache_readonly
def nobs(self):
'''alias for number of observations/cases, equal to sum of weights
'''
return self.sum_weights
@cache_readonly
def sum(self):
'''weighted sum of data'''
return np.dot(self.data.T, self.weights)
@cache_readonly
def mean(self):
'''weighted mean of data'''
return self.sum / self.sum_weights
@cache_readonly
def demeaned(self):
'''data with weighted mean subtracted'''
return self.data - self.mean
@cache_readonly
def sumsquares(self):
'''weighted sum of squares of demeaned data'''
return np.dot((self.demeaned**2).T, self.weights)
# would need memoization keyed on ddof instead of the cache decorator
def var_ddof(self, ddof=0):
'''variance of data given ddof
Parameters
----------
ddof : int, float
degrees of freedom correction, independent of attribute ddof
Returns
-------
var : float, ndarray
variance with denominator ``sum_weights - ddof``
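Examples
--------
A minimal sketch with unit weights; the exact float formatting of the
output depends on the numpy version:
>>> d = DescrStatsW([1., 2., 3., 4.])
>>> d.var_ddof(0)
1.25
>>> d.var_ddof(1)
1.6666666666666667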
'''
return self.sumsquares / (self.sum_weights - ddof)
def std_ddof(self, ddof=0):
'''standard deviation of data with given ddof
Parameters
----------
ddof : int, float
degrees of freedom correction, independent of attribute ddof
Returns
-------
std : float, ndarray
standard deviation with denominator ``sum_weights - ddof``
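Examples
--------
A minimal sketch mirroring ``var_ddof``, with unit weights; float
formatting depends on the numpy version:
>>> d = DescrStatsW([1., 2., 3., 4.])
>>> d.std_ddof(1)
1.2909944487358056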
'''
return np.sqrt(self.var_ddof(ddof=ddof))
@cache_readonly
def var(self):
'''variance with default degrees of freedom correction
'''
return self.sumsquares / (self.sum_weights - self.ddof)
@cache_readonly
def _var(self):
'''variance without degrees of freedom correction
used for statistical tests with controlled ddof
'''
return self.sumsquares / self.sum_weights
@cache_readonly
def std(self):
'''standard deviation with default degrees of freedom correction
'''
return np.sqrt(self.var)
@cache_readonly
def cov(self):
'''weighted covariance of data if data is 2 dimensional
assumes variables in columns and observations in rows
uses default ddof
'''
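# weighted cross-product: demeaned.T @ diag(weights) @ demeaned,
# scaled by 1 / (sum_weights - ddof)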
cov_ = np.dot(self.weights * self.demeaned.T, self.demeaned)
cov_ /= (self.sum_weights - self.ddof)
return cov_
@cache_readonly
def corrcoef(self):
'''weighted correlation with default ddof
assumes variables in columns and observations in rows
'''
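# normalize the covariance by the outer product of the standard deviations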
return self.cov / self.std / self.std[:, None]
@cache_readonly
def std_mean(self):
'''standard deviation of weighted mean
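Notes
-----
The result equals ``sqrt(sumsquares / (sum_weights * (sum_weights - 1)))``
and does not depend on the ``ddof`` attribute; any nonzero ``ddof`` in
``self.std`` is undone before dividing by ``sqrt(sum_weights - 1)``.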
'''
std = self.std
if self.ddof != 0:
# undo the ddof correction so std is based on sumsquares / sum_weights
# (need a copy of std)
std = std * np.sqrt((self.sum_weights - self.ddof)
/ self.sum_weights)
return std / np.sqrt(self.sum_weights - 1)
def quantile(self, probs, return_pandas=True):
"""
Compute quantiles for a weighted sample.
Parameters
----------
probs : array_like
A vector of probability points at which to calculate the
quantiles. Each element of `probs` should fall in [0, 1].
return_pandas : bool
If True, return value is a Pandas DataFrame or Series.
Otherwise returns a ndarray.
Returns
-------
quantiles : Series, DataFrame, or ndarray
If `return_pandas` = True, returns one of the following:
* data are 1d, `return_pandas` = True: a Series indexed by
the probability points.
* data are 2d, `return_pandas` = True: a DataFrame with
the probability points as row index and the variables
as column index.
If `return_pandas` = False, returns an ndarray containing the
same values as the Series/DataFrame.
Notes
-----
To compute the quantiles, first, the weights are summed over
exact ties yielding distinct data values y_1 < y_2 < ..., and
corresponding weights w_1, w_2, .... Let s_j denote the sum
of the first j weights, and let W denote the sum of all the
weights. For a probability point p, if pW falls strictly
between s_j and s_{j+1} then the estimated quantile is
y_{j+1}. If pW = s_j then the estimated quantile is (y_j +
y_{j+1})/2. If pW < s_1 then the estimated quantile is y_1.
References
----------
SAS documentation for weighted quantiles:
https://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_univariate_sect028.htm
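Examples
--------
A small worked instance of the rule in the Notes, with unit weights
(array formatting follows the older numpy repr used elsewhere in this
module):
>>> d = DescrStatsW([1., 2., 3., 4.])
>>> d.quantile([0.25, 0.5, 0.75], return_pandas=False)
array([ 1.5,  2.5,  3.5])
Here W = 4, so p = 0.5 gives pW = 2 = s_2 and the estimated quantile
is (y_2 + y_3)/2 = 2.5.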
"""
import pandas as pd
probs = np.asarray(probs)
probs = np.atleast_1d(probs)
if self.data.ndim == 1:
rslt = self._quantile(self.data, probs)
if return_pandas:
rslt = pd.Series(rslt, index=probs)
else:
rslt = []
for vec in self.data.T:
rslt.append(self._quantile(vec, probs))
rslt = np.column_stack(rslt)
if return_pandas:
columns = ["col%d" % (j+1) for j in range(rslt.shape[1])]
rslt = pd.DataFrame(data=rslt, columns=columns, index=probs)
if return_pandas:
rslt.index.name = "p"
return rslt
def _quantile(self, vec, probs):
# Helper function to calculate weighted quantiles for one column.
# Follows definition from SAS documentation.
# Returns ndarray
import pandas as pd
# Aggregate over ties
df = pd.DataFrame(index=np.arange(len(self.weights)))
df["weights"] = self.weights
df["vec"] = vec
dfg = df.groupby("vec").sum()
weights = dfg.values[:, 0]
values = np.asarray(dfg.index)
cweights = np.cumsum(weights)
totwt = cweights[-1]
targets = probs * totwt
ii = np.searchsorted(cweights, targets)
rslt = values[ii]
# Exact hits
jj = np.flatnonzero(np.abs(targets - cweights[ii]) < 1e-10)
jj = jj[ii[jj] < len(cweights) - 1]
rslt[jj] = (values[ii[jj]] + values[ii[jj]+1]) / 2
return rslt
def tconfint_mean(self, alpha=0.05, alternative='two-sided'):
'''confidence interval for the weighted mean of the data
If the data is 2d, then these are separate confidence intervals
for each column.
Parameters
----------
alpha : float
significance level for the confidence interval, coverage is
``1-alpha``
alternative : str
This specifies the alternative hypothesis for the test that
corresponds to the confidence interval.
The alternative hypothesis, H1, has to be one of the following:
'two-sided' : H1: mean not equal to value (default)
'larger'    : H1: mean larger than value
'smaller'   : H1: mean smaller than value