from __future__ import division, print_function, absolute_import

import math
import warnings
from collections import namedtuple

import numpy as np
from numpy import (isscalar, r_, log, around, unique, asarray,
                   zeros, arange, sort, amin, amax, any, atleast_1d,
                   sqrt, ceil, floor, array, compress,
                   pi, exp, ravel, count_nonzero, sin, cos, arctan2, hypot)

from scipy._lib.six import string_types
from scipy import optimize
from scipy import special
from . import statlib
from . import stats
from .stats import find_repeats, _contains_nan
from .contingency import chi2_contingency
from . import distributions
from ._distn_infrastructure import rv_generic


__all__ = ['mvsdist',
           'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
           'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
           'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
           'fligner', 'mood', 'wilcoxon', 'median_test',
           'circmean', 'circvar', 'circstd', 'anderson_ksamp',
           'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
           'yeojohnson_normplot'
           ]


Mean = namedtuple('Mean', ('statistic', 'minmax'))
Variance = namedtuple('Variance', ('statistic', 'minmax'))
Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))


def bayes_mvs(data, alpha=0.90):
    r"""
    Bayesian confidence intervals for the mean, var, and std.

    Parameters
    ----------
    data : array_like
        Input data; if multi-dimensional, it is flattened to 1-D by
        `bayes_mvs`. Requires 2 or more data points.
    alpha : float, optional
        Probability that the returned confidence interval contains
        the true parameter.

    Returns
    -------
    mean_cntr, var_cntr, std_cntr : tuple
        The three results are for the mean, variance and standard deviation,
        respectively.  Each result is a tuple of the form::

            (center, (lower, upper))

        with `center` the mean of the conditional pdf of the value given the
        data, and `(lower, upper)` a confidence interval, centered on the
        median, containing the estimate to a probability ``alpha``.

    See Also
    --------
    mvsdist

    Notes
    -----
    Each tuple of mean, variance, and standard deviation estimates represents
    ``(center, (lower, upper))``, with ``center`` the mean of the conditional
    pdf of the value given the data, and ``(lower, upper)`` a confidence
    interval centered on the median, containing the estimate to a probability
    ``alpha``.

    Converts data to 1-D and assumes all data has the same mean and variance.
    Uses Jeffreys' prior for variance and std.

    Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``.

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    First a basic example to demonstrate the outputs:

    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.bayes_mvs(data)
    >>> mean
    Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
    >>> var
    Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
    >>> std
    Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))

    Now we generate some normally distributed random data, and get estimates of
    mean and standard deviation with 95% confidence intervals for those
    estimates:

    >>> n_samples = 100000
    >>> data = stats.norm.rvs(size=n_samples)
    >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)

    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
    >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
    >>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
    ...            alpha=0.2, label=r'Estimated mean (95% limits)')
    >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
    >>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
    ...            label=r'Estimated scale (95% limits)')

    >>> ax.legend(fontsize=10)
    >>> ax.set_xlim([-4, 4])
    >>> ax.set_ylim([0, 0.5])
    >>> plt.show()

    """
    if alpha >= 1 or alpha <= 0:
        raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
                         % alpha)

    m, v, s = mvsdist(data)

    m_res = Mean(m.mean(), m.interval(alpha))
    v_res = Variance(v.mean(), v.interval(alpha))
    s_res = Std_dev(s.mean(), s.interval(alpha))

    return m_res, v_res, s_res
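

# Illustrative sketch, not part of scipy's API: ``_check_bayes_mvs_equivalence``
# is a hypothetical helper that verifies the equivalence stated in the Notes
# above, i.e. that ``bayes_mvs(data, alpha)`` matches
# ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``.
def _check_bayes_mvs_equivalence(data=(6, 9, 12, 7, 8, 8, 13), alpha=0.9):
    direct = bayes_mvs(data, alpha=alpha)
    via_dists = tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))
    for res, (center, interval) in zip(direct, via_dists):
        assert np.allclose(res.statistic, center)
        assert np.allclose(res.minmax, interval)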


def mvsdist(data):
    """
    'Frozen' distributions for mean, variance, and standard deviation of data.

    Parameters
    ----------
    data : array_like
        Input array. Converted to 1-D using ravel.
        Requires 2 or more data-points.

    Returns
    -------
    mdist : "frozen" distribution object
        Distribution object representing the mean of the data
    vdist : "frozen" distribution object
        Distribution object representing the variance of the data
    sdist : "frozen" distribution object
        Distribution object representing the standard deviation of the data

    See Also
    --------
    bayes_mvs

    Notes
    -----
    The return values from ``bayes_mvs(data)`` are equivalent to
    ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.

    In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
    on the three distribution objects returned from this function will give
    the same results that are returned from `bayes_mvs`.

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.mvsdist(data)

    We now have frozen distribution objects "mean", "var" and "std" that we can
    examine:

    >>> mean.mean()
    9.0
    >>> mean.interval(0.95)
    (6.6120585482655692, 11.387941451734431)
    >>> mean.std()
    1.1952286093343936

    """
    x = ravel(data)
    n = len(x)
    if n < 2:
        raise ValueError("Need at least 2 data-points.")
    xbar = x.mean()
    C = x.var()
    if n > 1000:  # Gaussian approximations for large n
        mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
        sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
        vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
    else:
        # Small-sample results (see the Oliphant reference): Student t for
        # the mean, generalized gamma for the std, inverse gamma for the
        # variance.
        nm1 = n - 1
        fac = n * C / 2.
        val = nm1 / 2.
        mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
        sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
        vdist = distributions.invgamma(val, scale=fac)
    return mdist, vdist, sdist
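

# Illustrative sketch, not part of scipy's API: ``_check_mean_interval`` is a
# hypothetical helper showing that, in the small-n branch above, the mean
# distribution is a Student t centered on the sample mean with scale
# ``sqrt(C / (n - 1))``, which equals the classic standard error s/sqrt(n)
# (s being the unbiased sample standard deviation), so its interval
# reproduces the textbook t confidence interval for the mean.
def _check_mean_interval(data=(6, 9, 12, 7, 8, 8, 13), alpha=0.9):
    x = np.asarray(data, dtype=float)
    n = len(x)
    mdist, _, _ = mvsdist(x)
    se = x.std(ddof=1) / math.sqrt(n)  # classic standard error of the mean
    t_crit = distributions.t.ppf(0.5 + alpha / 2., n - 1)
    lo, hi = x.mean() - t_crit * se, x.mean() + t_crit * se
    assert np.allclose(mdist.interval(alpha), (lo, hi))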


def kstat(data, n=2):
    r"""
    Return the nth k-statistic (1<=n<=4 so far).

    The nth k-statistic k_n is the unique symmetric unbiased estimator of the
    nth cumulant kappa_n.

    Parameters
    ----------
    data : array_like
        Input array. Note that n-D input gets flattened.
    n : int, {1, 2, 3, 4}, optional
        Default is 2.

    Returns
    -------
    kstat : float
        The nth k-statistic.

    See Also
    --------
    kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
    moment: Returns the n-th central moment about the mean for a sample.

    Notes
    -----
    For a sample size n, the first few k-statistics are given by:

    .. math::

        k_{1} = \mu
        k_{2} = \frac{n}{n-1} m_{2}
        k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
        k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}

    where :math:`\mu` is the sample mean, :math:`m_2` is the sample
    variance, and :math:`m_i` is the i-th sample central moment.

    References
    ----------
    http://mathworld.wolfram.com/k-Statistic.html

    http://mathworld.wolfram.com/Cumulant.html

    Examples
    --------
    >>> from scipy import stats
    >>> import numpy as np
    >>> rndm = np.random.RandomState(1234)

    As the sample size increases, the n-th moment and the n-th k-statistic
    converge to the same number (although they aren't identical). In the case
    of the normal distribution, they converge to zero.

    >>> for n in [2, 3, 4, 5, 6, 7]:
    ...     x = rndm.normal(size=10**n)
    ...     m, k = stats.moment(x, 3), stats.kstat(x, 3)
    ...     print("%.3g %.3g %.3g" % (m, k, m-k))
    -0.631 -0.651 0.0194
    0.0282 0.0283 -8.49e-05
    -0.0454 -0.0454 1.36e-05
    7.53e-05 7.53e-05 -2.26e-09
    0.00166 0.00166 -4.99e-09
    -2.88e-06 -2.88e-06 8.63e-13
    """
    if n > 4 or n < 1:
        raise ValueError("k-statistics only supported for 1<=n<=4")
    n = int(n)
    S = np.zeros(n + 1, np.float64)
    data = ravel(data)
    N = data.size

    # raise ValueError on empty input
    if N == 0:
        raise ValueError("Data input must not be empty")

    # on nan input, return nan without warning
    if np.isnan(np.sum(data)):
        return np.nan

    for k in range(1, n + 1):
        S[k] = np.sum(data**k, axis=0)
    if n == 1:
        return S[1] * 1.0/N
    elif n == 2:
        return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
    elif n == 3:
        return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
    elif n == 4:
        return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
                 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
                 (N*(N-1.0)*(N-2.0)*(N-3.0)))
    else:
        raise ValueError("Should not be here.")


def kstatvar(data, n=2):
    r"""
    Returns an unbiased estimator of the variance of the k-statistic.

    See `kstat` for more details of the k-statistic.

    Parameters
    ----------
    data : array_like
        Input array. Note that n-D input gets flattened.
    n : int, {1, 2}, optional
        Default is 2.

    Returns
    -------
    kstatvar : float
        An unbiased estimate of the variance of the n-th k-statistic.

    See Also
    --------
    kstat: Returns the n-th k-statistic.
    moment: Returns the n-th central moment about the mean for a sample.

    Notes
    -----
    The variances of the first few k-statistics are given by:

    .. math::

        var(k_{1}) = \frac{\kappa_2}{n}
        var(k_{2}) = \frac{\kappa_4}{n} + \frac{2\kappa^2_{2}}{n - 1}
        var(k_{3}) = \frac{\kappa_6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
                     \frac{9 \kappa^2_{3}}{n - 1} +
                     \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
        var(k_{4}) = \frac{\kappa_8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
                     \frac{48 \kappa_{3} \kappa_5}{n - 1} +
                     \frac{34 \kappa^2_{4}}{n-1} +
                     \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
                     \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
                     \frac{24 n (n + 1) \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}