"""
Methods for analyzing two-way contingency tables (i.e. frequency
tables for observations that are cross-classified with respect to two
categorical variables).
The main classes are:
* Table : implements methods that can be applied to any two-way
contingency table.
* SquareTable : implements methods that can be applied to a square
two-way contingency table.
* Table2x2 : implements methods that can be applied to a 2x2
contingency table.
* StratifiedTable : implements methods that can be applied to a
collection of 2x2 contingency tables.
Also contains functions for conducting McNemar's test and Cochran's Q
test.
Note that the inference procedures may depend on how the data were
sampled. In general the observed units are assumed to be
independent and identically distributed.
"""
from statsmodels.tools.decorators import cache_readonly
import numpy as np
from scipy import stats
import pandas as pd
import warnings
from statsmodels import iolib
from statsmodels.tools import sm_exceptions
def _make_df_square(table):
"""
Reindex a pandas DataFrame so that it becomes square, meaning that
the row and column indices contain the same values, in the same
    order. The row and column indices are extended to achieve this.
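    Examples
    --------
    Illustrative sketch (made-up frame with mismatched labels):

    >>> import pandas as pd
    >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["b", "c"])
    >>> squared = _make_df_square(df)  # rows and columns both become a, b, c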
"""
if not isinstance(table, pd.DataFrame):
return table
# If the table is not square, make it square
if not table.index.equals(table.columns):
ix = list(set(table.index) | set(table.columns))
ix.sort()
table = table.reindex(index=ix, columns=ix, fill_value=0)
# Ensures that the rows and columns are in the same order.
table = table.reindex(table.columns)
return table
class _Bunch(object):
def __repr__(self):
return "<bunch containing results, print to see contents>"
def __str__(self):
        ky = sorted(self.__dict__.keys())
        m = max(len(k) for k in ky)
tab = []
f = "{:" + str(m) + "} {}"
for k in ky:
tab.append(f.format(k, self.__dict__[k]))
return "\n".join(tab)
class Table(object):
"""
A two-way contingency table.
Parameters
----------
table : array_like
A contingency table.
    shift_zeros : bool
        If True and any cell count is zero, replace the zero cells
        with 0.5.
Attributes
----------
table_orig : array_like
The original table is cached as `table_orig`.
See Also
--------
statsmodels.graphics.mosaicplot.mosaic
scipy.stats.chi2_contingency
Notes
-----
The inference procedures used here are all based on a sampling
model in which the units are independent and identically
distributed, with each unit being classified with respect to two
categorical variables.
References
----------
Definitions of residuals:
https://onlinecourses.science.psu.edu/stat504/node/86
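    Examples
    --------
    Illustrative sketch (made-up counts):

    >>> import numpy as np
    >>> from statsmodels.stats.contingency_tables import Table
    >>> counts = np.asarray([[35, 21], [25, 58]])
    >>> tab = Table(counts)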
"""
def __init__(self, table, shift_zeros=True):
self.table_orig = table
self.table = np.asarray(table, dtype=np.float64)
if shift_zeros and (self.table.min() == 0):
self.table[self.table == 0] = 0.5
def __str__(self):
s = ("A %dx%d contingency table with counts:\n" %
tuple(self.table.shape))
s += np.array_str(self.table)
return s
@classmethod
def from_data(cls, data, shift_zeros=True):
"""
Construct a Table object from data.
Parameters
----------
data : array_like
The raw data, from which a contingency table is constructed
using the first two columns.
        shift_zeros : bool
            If True and any cell count is zero, replace the zero cells
            with 0.5.
Returns
-------
A Table instance.
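        Examples
        --------
        Illustrative sketch (made-up raw observations):

        >>> import pandas as pd
        >>> from statsmodels.stats.contingency_tables import Table
        >>> df = pd.DataFrame({"exposure": ["low", "low", "high", "high"],
        ...                    "outcome": ["yes", "no", "yes", "no"]})
        >>> tab = Table.from_data(df)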
"""
if isinstance(data, pd.DataFrame):
table = pd.crosstab(data.iloc[:, 0], data.iloc[:, 1])
else:
table = pd.crosstab(data[:, 0], data[:, 1])
return cls(table, shift_zeros)
def test_nominal_association(self):
"""
Assess independence for nominal factors.
Assessment of independence between rows and columns using
chi^2 testing. The rows and columns are treated as nominal
(unordered) categorical variables.
Returns
-------
A bunch containing the following attributes:
statistic : float
The chi^2 test statistic.
        df : int
            The degrees of freedom of the reference distribution.
pvalue : float
The p-value for the test.
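        Examples
        --------
        Illustrative sketch (made-up counts):

        >>> import numpy as np
        >>> from statsmodels.stats.contingency_tables import Table
        >>> tab = Table(np.asarray([[35, 21], [25, 58]]))
        >>> rslt = tab.test_nominal_association()
        >>> chi2, dof, pvalue = rslt.statistic, rslt.df, rslt.pvalue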
"""
statistic = np.asarray(self.chi2_contribs).sum()
df = np.prod(np.asarray(self.table.shape) - 1)
pvalue = 1 - stats.chi2.cdf(statistic, df)
b = _Bunch()
b.statistic = statistic
b.df = df
b.pvalue = pvalue
return b
def test_ordinal_association(self, row_scores=None, col_scores=None):
"""
Assess independence between two ordinal variables.
This is the 'linear by linear' association test, which uses
weights or scores to target the test to have more power
against ordered alternatives.
Parameters
----------
        row_scores : array_like, optional
            An array of numeric row scores.  Defaults to the equally
            spaced integer scores 0, 1, ..., nrows - 1.
        col_scores : array_like, optional
            An array of numeric column scores.  Defaults to the equally
            spaced integer scores 0, 1, ..., ncols - 1.
Returns
-------
A bunch with the following attributes:
statistic : float
The test statistic.
null_mean : float
The expected value of the test statistic under the null
hypothesis.
null_sd : float
The standard deviation of the test statistic under the
null hypothesis.
zscore : float
The Z-score for the test statistic.
pvalue : float
The p-value for the test.
Notes
-----
The scores define the trend to which the test is most sensitive.
Using the default row and column scores gives the
Cochran-Armitage trend test.
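        Examples
        --------
        Illustrative sketch (made-up counts, default equally spaced scores):

        >>> import numpy as np
        >>> from statsmodels.stats.contingency_tables import Table
        >>> tab = Table(np.asarray([[20, 15, 10], [10, 15, 20]]))
        >>> rslt = tab.test_ordinal_association()
        >>> zscore, pvalue = rslt.zscore, rslt.pvalue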
"""
if row_scores is None:
row_scores = np.arange(self.table.shape[0])
if col_scores is None:
col_scores = np.arange(self.table.shape[1])
if len(row_scores) != self.table.shape[0]:
msg = ("The length of `row_scores` must match the first " +
"dimension of `table`.")
raise ValueError(msg)
if len(col_scores) != self.table.shape[1]:
msg = ("The length of `col_scores` must match the second " +
"dimension of `table`.")
raise ValueError(msg)
# The test statistic
statistic = np.dot(row_scores, np.dot(self.table, col_scores))
# Some needed quantities
n_obs = self.table.sum()
rtot = self.table.sum(1)
um = np.dot(row_scores, rtot)
u2m = np.dot(row_scores**2, rtot)
ctot = self.table.sum(0)
vn = np.dot(col_scores, ctot)
v2n = np.dot(col_scores**2, ctot)
# The null mean and variance of the test statistic
e_stat = um * vn / n_obs
v_stat = (u2m - um**2 / n_obs) * (v2n - vn**2 / n_obs) / (n_obs - 1)
sd_stat = np.sqrt(v_stat)
zscore = (statistic - e_stat) / sd_stat
pvalue = 2 * stats.norm.cdf(-np.abs(zscore))
b = _Bunch()
b.statistic = statistic
b.null_mean = e_stat
b.null_sd = sd_stat
b.zscore = zscore
b.pvalue = pvalue
return b
@cache_readonly
def marginal_probabilities(self):
"""
Estimate marginal probability distributions for the rows and columns.
Returns
-------
        row : ndarray
            Marginal row probabilities.
        col : ndarray
            Marginal column probabilities.
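        Examples
        --------
        Illustrative sketch (made-up counts):

        >>> import numpy as np
        >>> from statsmodels.stats.contingency_tables import Table
        >>> tab = Table(np.asarray([[5, 5], [10, 30]]), shift_zeros=False)
        >>> row, col = tab.marginal_probabilities  # each sums to 1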
"""
n = self.table.sum()
row = self.table.sum(1) / n
col = self.table.sum(0) / n
if isinstance(self.table_orig, pd.DataFrame):
row = pd.Series(row, self.table_orig.index)
col = pd.Series(col, self.table_orig.columns)
return row, col
@cache_readonly
def independence_probabilities(self):
"""
Returns fitted joint probabilities under independence.
The returned table is outer(row, column), where row and
column are the estimated marginal distributions
of the rows and columns.
"""
row, col = self.marginal_probabilities
itab = np.outer(row, col)
if isinstance(self.table_orig, pd.DataFrame):
itab = pd.DataFrame(itab, self.table_orig.index,
self.table_orig.columns)
return itab
@cache_readonly
def fittedvalues(self):
"""
Returns fitted cell counts under independence.
The returned cell counts are estimates under a model
where the rows and columns of the table are independent.
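        Examples
        --------
        Illustrative sketch (made-up counts); the fitted counts equal the
        table total times the outer product of the marginal probabilities:

        >>> import numpy as np
        >>> from statsmodels.stats.contingency_tables import Table
        >>> tab = Table(np.asarray([[10, 10], [20, 40]]), shift_zeros=False)
        >>> expected = tab.fittedvalues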
"""
probs = self.independence_probabilities
fit = self.table.sum() * probs
return fit
@cache_readonly
def resid_pearson(self):
"""
Returns Pearson residuals.
The Pearson residuals are calculated under a model where
the rows and columns of the table are independent.
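        Examples
        --------
        Illustrative sketch (made-up counts); each residual is
        (observed - fitted) / sqrt(fitted):

        >>> import numpy as np
        >>> from statsmodels.stats.contingency_tables import Table
        >>> tab = Table(np.asarray([[10, 10], [20, 40]]), shift_zeros=False)
        >>> resids = tab.resid_pearson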
"""
fit = self.fittedvalues
resids = (self.table - fit) / np.sqrt(fit)
return resids
@cache_readonly
def standardized_resids(self):
"""
Returns standardized residuals under independence.
"""
row, col = self.marginal_probabilities
sresids = self.resid_pearson / np.sqrt(np.outer(1 - row, 1 - col))
return sresids
@cache_readonly
def chi2_contribs(self):
"""
Returns the contributions to the chi^2 statistic for independence.
        """
        return (self.table - self.fittedvalues)**2 / self.fittedvalues