from __future__ import division, print_function, absolute_import
import math
import warnings
from collections import namedtuple
import numpy as np
from numpy import (isscalar, r_, log, around, unique, asarray,
zeros, arange, sort, amin, amax, any, atleast_1d,
sqrt, ceil, floor, array, compress,
pi, exp, ravel, count_nonzero, sin, cos, arctan2, hypot)
from scipy._lib.six import string_types
from scipy import optimize
from scipy import special
from . import statlib
from . import stats
from .stats import find_repeats, _contains_nan
from .contingency import chi2_contingency
from . import distributions
from ._distn_infrastructure import rv_generic
__all__ = ['mvsdist',
'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
'fligner', 'mood', 'wilcoxon', 'median_test',
'circmean', 'circvar', 'circstd', 'anderson_ksamp',
'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
'yeojohnson_normplot'
]
Mean = namedtuple('Mean', ('statistic', 'minmax'))
Variance = namedtuple('Variance', ('statistic', 'minmax'))
Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))
def bayes_mvs(data, alpha=0.90):
r"""
Bayesian confidence intervals for the mean, var, and std.
Parameters
----------
data : array_like
Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`.
Requires 2 or more data points.
alpha : float, optional
Probability that the returned confidence interval contains
the true parameter.
Returns
-------
mean_cntr, var_cntr, std_cntr : tuple
The three results are for the mean, variance and standard deviation,
respectively. Each result is a tuple of the form::
(center, (lower, upper))
with `center` the mean of the conditional pdf of the value given the
data, and `(lower, upper)` a confidence interval, centered on the
median, containing the estimate to a probability ``alpha``.
See Also
--------
mvsdist
Notes
-----
Each tuple of mean, variance, and standard deviation estimates represent
the (center, (lower, upper)) with center the mean of the conditional pdf
of the value given the data and (lower, upper) is a confidence interval
centered on the median, containing the estimate to a probability
``alpha``.
Converts data to 1-D and assumes all data has the same mean and variance.
Uses Jeffrey's prior for variance and std.
Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(dat))``
References
----------
T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
2006.
Examples
--------
First a basic example to demonstrate the outputs:
>>> from scipy import stats
>>> data = [6, 9, 12, 7, 8, 8, 13]
>>> mean, var, std = stats.bayes_mvs(data)
>>> mean
Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
>>> var
Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
>>> std
Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))
Now we generate some normally distributed random data, and get estimates of
mean and standard deviation with 95% confidence intervals for those
estimates:
>>> n_samples = 100000
>>> data = stats.norm.rvs(size=n_samples)
>>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.hist(data, bins=100, density=True, label='Histogram of data')
>>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
>>> ax.axvspan(res_mean.minmax[0],res_mean.minmax[1], facecolor='r',
... alpha=0.2, label=r'Estimated mean (95% limits)')
>>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
>>> ax.axvspan(res_std.minmax[0],res_std.minmax[1], facecolor='g', alpha=0.2,
... label=r'Estimated scale (95% limits)')
>>> ax.legend(fontsize=10)
>>> ax.set_xlim([-4, 4])
>>> ax.set_ylim([0, 0.5])
>>> plt.show()
"""
m, v, s = mvsdist(data)
if alpha >= 1 or alpha <= 0:
raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
% alpha)
m_res = Mean(m.mean(), m.interval(alpha))
v_res = Variance(v.mean(), v.interval(alpha))
s_res = Std_dev(s.mean(), s.interval(alpha))
return m_res, v_res, s_res
def mvsdist(data):
"""
'Frozen' distributions for mean, variance, and standard deviation of data.
Parameters
----------
data : array_like
Input array. Converted to 1-D using ravel.
Requires 2 or more data-points.
Returns
-------
mdist : "frozen" distribution object
Distribution object representing the mean of the data
vdist : "frozen" distribution object
Distribution object representing the variance of the data
sdist : "frozen" distribution object
Distribution object representing the standard deviation of the data
See Also
--------
bayes_mvs
Notes
-----
The return values from ``bayes_mvs(data)`` is equivalent to
``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.
In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
on the three distribution objects returned from this function will give
the same results that are returned from `bayes_mvs`.
References
----------
T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
2006.
Examples
--------
>>> from scipy import stats
>>> data = [6, 9, 12, 7, 8, 8, 13]
>>> mean, var, std = stats.mvsdist(data)
We now have frozen distribution objects "mean", "var" and "std" that we can
examine:
>>> mean.mean()
9.0
>>> mean.interval(0.95)
(6.6120585482655692, 11.387941451734431)
>>> mean.std()
1.1952286093343936
"""
x = ravel(data)
n = len(x)
if n < 2:
raise ValueError("Need at least 2 data-points.")
xbar = x.mean()
C = x.var()
if n > 1000: # gaussian approximations for large n
mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
else:
nm1 = n - 1
fac = n * C / 2.
val = nm1 / 2.
mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
vdist = distributions.invgamma(val, scale=fac)
return mdist, vdist, sdist
def kstat(data, n=2):
r"""
Return the nth k-statistic (1<=n<=4 so far).
The nth k-statistic k_n is the unique symmetric unbiased estimator of the
nth cumulant kappa_n.
Parameters
----------
data : array_like
Input array. Note that n-D input gets flattened.
n : int, {1, 2, 3, 4}, optional
Default is equal to 2.
Returns
-------
kstat : float
The nth k-statistic.
See Also
--------
kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
moment: Returns the n-th central moment about the mean for a sample.
Notes
-----
For a sample size n, the first few k-statistics are given by:
.. math::
k_{1} = \mu
k_{2} = \frac{n}{n-1} m_{2}
k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}
where :math:`\mu` is the sample mean, :math:`m_2` is the sample
variance, and :math:`m_i` is the i-th sample central moment.
References
----------
http://mathworld.wolfram.com/k-Statistic.html
http://mathworld.wolfram.com/Cumulant.html
Examples
--------
>>> from scipy import stats
>>> rndm = np.random.RandomState(1234)
As sample size increases, n-th moment and n-th k-statistic converge to the
same number (although they aren't identical). In the case of the normal
distribution, they converge to zero.
>>> for n in [2, 3, 4, 5, 6, 7]:
... x = rndm.normal(size=10**n)
... m, k = stats.moment(x, 3), stats.kstat(x, 3)
... print("%.3g %.3g %.3g" % (m, k, m-k))
-0.631 -0.651 0.0194
0.0282 0.0283 -8.49e-05
-0.0454 -0.0454 1.36e-05
7.53e-05 7.53e-05 -2.26e-09
0.00166 0.00166 -4.99e-09
-2.88e-06 -2.88e-06 8.63e-13
"""
if n > 4 or n < 1:
raise ValueError("k-statistics only supported for 1<=n<=4")
n = int(n)
S = np.zeros(n + 1, np.float64)
data = ravel(data)
N = data.size
# raise ValueError on empty input
if N == 0:
raise ValueError("Data input must not be empty")
# on nan input, return nan without warning
if np.isnan(np.sum(data)):
return np.nan
for k in range(1, n + 1):
S[k] = np.sum(data**k, axis=0)
if n == 1:
return S[1] * 1.0/N
elif n == 2:
return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
elif n == 3:
return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
elif n == 4:
return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
(N*(N-1.0)*(N-2.0)*(N-3.0)))
else:
raise ValueError("Should not be here.")
def kstatvar(data, n=2):
r"""
Returns an unbiased estimator of the variance of the k-statistic.
See `kstat` for more details of the k-statistic.
Parameters
----------
data : array_like
Input array. Note that n-D input gets flattened.
n : int, {1, 2}, optional
Default is equal to 2.
Returns
-------
kstatvar : float
The nth k-statistic variance.
See Also
--------
kstat: Returns the n-th k-statistic.
moment: Returns the n-th central moment about the mean for a sample.
Notes
-----
The variances of the first few k-statistics are given by:
.. math::
var(k_{1}) = \frac{\kappa^2}{n}
var(k_{2}) = \frac{\kappa^4}{n} + \frac{2\kappa^2_{2}}{n - 1}
var(k_{3}) = \frac{\kappa^6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
\frac{9 \kappa^2_{3}}{n - 1} +
\frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
var(k_{4}) = \frac{\kappa^8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
\frac{48 \kappa_{3} \kappa_5}{n - 1} +
\frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
Loading ...