Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ stats / contingency_tables.py

"""
Methods for analyzing two-way contingency tables (i.e. frequency
tables for observations that are cross-classified with respect to two
categorical variables).

The main classes are:

  * Table : implements methods that can be applied to any two-way
  contingency table.

  * SquareTable : implements methods that can be applied to a square
  two-way contingency table.

  * Table2x2 : implements methods that can be applied to a 2x2
  contingency table.

  * StratifiedTable : implements methods that can be applied to a
  collection of 2x2 contingency tables.

Also contains functions for conducting McNemar's test and Cochran's q
test.

Note that the inference procedures may depend on how the data were
sampled.  In general the observed units are independent and
identically distributed.
"""

from statsmodels.tools.decorators import cache_readonly
import numpy as np
from scipy import stats
import pandas as pd
import warnings
from statsmodels import iolib
from statsmodels.tools import sm_exceptions


def _make_df_square(table):
    """
    Reindex a pandas DataFrame so that it becomes square, meaning that
    the row and column indices contain the same values, in the same
    order.  The row and column index are extended to achieve this.
    """

    if not isinstance(table, pd.DataFrame):
        return table

    # If the table is not square, make it square
    if not table.index.equals(table.columns):
        ix = list(set(table.index) | set(table.columns))
        ix.sort()
        table = table.reindex(index=ix, columns=ix, fill_value=0)

    # Ensures that the rows and columns are in the same order.
    table = table.reindex(table.columns)

    return table


class _Bunch(object):

    def __repr__(self):
        return "<bunch containing results, print to see contents>"

    def __str__(self):
        ky = [k for k, _ in self.__dict__.items()]
        ky.sort()
        m = max([len(k) for k in ky])
        tab = []
        f = "{:" + str(m) + "}   {}"
        for k in ky:
            tab.append(f.format(k, self.__dict__[k]))
        return "\n".join(tab)


class Table(object):
    """
    A two-way contingency table.

    Parameters
    ----------
    table : array_like
        A contingency table.
    shift_zeros : bool
        If True and any cell count is zero, add 0.5 to all values
        in the table.

    Attributes
    ----------
    table_orig : array_like
        The original table is cached as `table_orig`.

    See Also
    --------
    statsmodels.graphics.mosaicplot.mosaic
    scipy.stats.chi2_contingency

    Notes
    -----
    The inference procedures used here are all based on a sampling
    model in which the units are independent and identically
    distributed, with each unit being classified with respect to two
    categorical variables.

    References
    ----------
    Definitions of residuals:
        https://onlinecourses.science.psu.edu/stat504/node/86
    """

    def __init__(self, table, shift_zeros=True):

        self.table_orig = table
        self.table = np.asarray(table, dtype=np.float64)

        if shift_zeros and (self.table.min() == 0):
            self.table[self.table == 0] = 0.5

    def __str__(self):
        s = ("A %dx%d contingency table with counts:\n" %
             tuple(self.table.shape))
        s += np.array_str(self.table)
        return s

    @classmethod
    def from_data(cls, data, shift_zeros=True):
        """
        Construct a Table object from data.

        Parameters
        ----------
        data : array_like
            The raw data, from which a contingency table is constructed
            using the first two columns.
        shift_zeros : bool
            If True and any cell count is zero, add 0.5 to all values
            in the table.

        Returns
        -------
        A Table instance.
        """

        if isinstance(data, pd.DataFrame):
            table = pd.crosstab(data.iloc[:, 0], data.iloc[:, 1])
        else:
            table = pd.crosstab(data[:, 0], data[:, 1])

        return cls(table, shift_zeros)

    def test_nominal_association(self):
        """
        Assess independence for nominal factors.

        Assessment of independence between rows and columns using
        chi^2 testing.  The rows and columns are treated as nominal
        (unordered) categorical variables.

        Returns
        -------
        A bunch containing the following attributes:

        statistic : float
            The chi^2 test statistic.
        df : int
            The degrees of freedom of the reference distribution
        pvalue : float
            The p-value for the test.
        """

        statistic = np.asarray(self.chi2_contribs).sum()
        df = np.prod(np.asarray(self.table.shape) - 1)
        pvalue = 1 - stats.chi2.cdf(statistic, df)
        b = _Bunch()
        b.statistic = statistic
        b.df = df
        b.pvalue = pvalue
        return b

    def test_ordinal_association(self, row_scores=None, col_scores=None):
        """
        Assess independence between two ordinal variables.

        This is the 'linear by linear' association test, which uses
        weights or scores to target the test to have more power
        against ordered alternatives.

        Parameters
        ----------
        row_scores : array_like
            An array of numeric row scores
        col_scores : array_like
            An array of numeric column scores

        Returns
        -------
        A bunch with the following attributes:

        statistic : float
            The test statistic.
        null_mean : float
            The expected value of the test statistic under the null
            hypothesis.
        null_sd : float
            The standard deviation of the test statistic under the
            null hypothesis.
        zscore : float
            The Z-score for the test statistic.
        pvalue : float
            The p-value for the test.

        Notes
        -----
        The scores define the trend to which the test is most sensitive.

        Using the default row and column scores gives the
        Cochran-Armitage trend test.
        """

        if row_scores is None:
            row_scores = np.arange(self.table.shape[0])

        if col_scores is None:
            col_scores = np.arange(self.table.shape[1])

        if len(row_scores) != self.table.shape[0]:
            msg = ("The length of `row_scores` must match the first " +
                   "dimension of `table`.")
            raise ValueError(msg)

        if len(col_scores) != self.table.shape[1]:
            msg = ("The length of `col_scores` must match the second " +
                   "dimension of `table`.")
            raise ValueError(msg)

        # The test statistic
        statistic = np.dot(row_scores, np.dot(self.table, col_scores))

        # Some needed quantities
        n_obs = self.table.sum()
        rtot = self.table.sum(1)
        um = np.dot(row_scores, rtot)
        u2m = np.dot(row_scores**2, rtot)
        ctot = self.table.sum(0)
        vn = np.dot(col_scores, ctot)
        v2n = np.dot(col_scores**2, ctot)

        # The null mean and variance of the test statistic
        e_stat = um * vn / n_obs
        v_stat = (u2m - um**2 / n_obs) * (v2n - vn**2 / n_obs) / (n_obs - 1)
        sd_stat = np.sqrt(v_stat)

        zscore = (statistic - e_stat) / sd_stat
        pvalue = 2 * stats.norm.cdf(-np.abs(zscore))

        b = _Bunch()
        b.statistic = statistic
        b.null_mean = e_stat
        b.null_sd = sd_stat
        b.zscore = zscore
        b.pvalue = pvalue
        return b

    @cache_readonly
    def marginal_probabilities(self):
        """
        Estimate marginal probability distributions for the rows and columns.

        Returns
        -------
        row : ndarray
            Marginal row probabilities
        col : ndarray
            Marginal column probabilities
        """

        n = self.table.sum()
        row = self.table.sum(1) / n
        col = self.table.sum(0) / n

        if isinstance(self.table_orig, pd.DataFrame):
            row = pd.Series(row, self.table_orig.index)
            col = pd.Series(col, self.table_orig.columns)

        return row, col

    @cache_readonly
    def independence_probabilities(self):
        """
        Returns fitted joint probabilities under independence.

        The returned table is outer(row, column), where row and
        column are the estimated marginal distributions
        of the rows and columns.
        """

        row, col = self.marginal_probabilities
        itab = np.outer(row, col)

        if isinstance(self.table_orig, pd.DataFrame):
            itab = pd.DataFrame(itab, self.table_orig.index,
                                self.table_orig.columns)

        return itab

    @cache_readonly
    def fittedvalues(self):
        """
        Returns fitted cell counts under independence.

        The returned cell counts are estimates under a model
        where the rows and columns of the table are independent.
        """

        probs = self.independence_probabilities
        fit = self.table.sum() * probs
        return fit

    @cache_readonly
    def resid_pearson(self):
        """
        Returns Pearson residuals.

        The Pearson residuals are calculated under a model where
        the rows and columns of the table are independent.
        """

        fit = self.fittedvalues
        resids = (self.table - fit) / np.sqrt(fit)
        return resids

    @cache_readonly
    def standardized_resids(self):
        """
        Returns standardized residuals under independence.
        """

        row, col = self.marginal_probabilities
        sresids = self.resid_pearson / np.sqrt(np.outer(1 - row, 1 - col))
        return sresids

    @cache_readonly
    def chi2_contribs(self):
        """
        Returns the contributions to the chi^2 statistic for independence.
Loading ...