'''get versions of mstats percentile functions that also work with non-masked arrays
uses dispatch to mstats version for difficult cases:
- data is masked array
- data requires nan handling (masknan=True)
- data should be trimmed (limit is non-empty)
handle simple cases directly, which does not require apply_along_axis
changes compared to mstats: plotting_positions for n-dim with axis argument
addition: plotting_positions_w1d: with weights, 1d ndarray only
TODO:
consistency with scipy.stats versions not checked
docstrings from mstats not updated yet
code duplication, better solutions (?)
convert examples to tests
rename alphap, betap for consistency
timing question: one additional argsort versus apply_along_axis
weighted plotting_positions
- I have not figured out nd version of weighted plotting_positions
- add weighted quantiles
'''
import numpy as np
from numpy import ma
from scipy import stats
#from numpy.ma import nomask
#####--------------------------------------------------------------------------
#---- --- Percentiles ---
#####--------------------------------------------------------------------------
def quantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
limit=(), masknan=False):
"""
Computes empirical quantiles for a data array.
Samples quantile are defined by :math:`Q(p) = (1-g).x[i] +g.x[i+1]`,
where :math:`x[j]` is the *j*th order statistic, and
`i = (floor(n*p+m))`, `m=alpha+p*(1-alpha-beta)` and `g = n*p + m - i`.
Typical values of (alpha,beta) are:
- (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
- (.5,.5) : *p(k) = (k+1/2.)/n* : piecewise linear
function (R, type 5)
- (0,0) : *p(k) = k/(n+1)* : (R type 6)
- (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
That's R default (R type 7)
- (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
The resulting quantile estimates are approximately median-unbiased
regardless of the distribution of x. (R type 8)
- (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
The resulting quantile estimates are approximately unbiased
if x is normally distributed (R type 9)
- (.4,.4) : approximately quantile unbiased (Cunnane)
- (.35,.35): APL, used with PWM ?? JP
- (0.35, 0.65): PWM ?? JP p(k) = (k-0.35)/n
Parameters
----------
a : array_like
Input data, as a sequence or array of dimension at most 2.
prob : array_like, optional
List of quantiles to compute.
alpha : float, optional
Plotting positions parameter, default is 0.4.
beta : float, optional
Plotting positions parameter, default is 0.4.
axis : int, optional
Axis along which to perform the trimming.
If None (default), the input array is first flattened.
limit : tuple
Tuple of (lower, upper) values.
Values of `a` outside this closed interval are ignored.
Returns
-------
quants : MaskedArray
An array containing the calculated quantiles.
Examples
--------
>>> from scipy.stats.mstats import mquantiles
>>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
>>> mquantiles(a)
array([ 19.2, 40. , 42.8])
Using a 2D array, specifying axis and limit.
>>> data = np.array([[ 6., 7., 1.],
[ 47., 15., 2.],
[ 49., 36., 3.],
[ 15., 39., 4.],
[ 42., 40., -999.],
[ 41., 41., -999.],
[ 7., -999., -999.],
[ 39., -999., -999.],
[ 43., -999., -999.],
[ 40., -999., -999.],
[ 36., -999., -999.]])
>>> mquantiles(data, axis=0, limit=(0, 50))
array([[ 19.2 , 14.6 , 1.45],
[ 40. , 37.5 , 2.5 ],
[ 42.8 , 40.05, 3.55]])
>>> data[:, 2] = -999.
>>> mquantiles(data, axis=0, limit=(0, 50))
masked_array(data =
[[19.2 14.6 --]
[40.0 37.5 --]
[42.8 40.05 --]],
mask =
[[False False True]
[False False True]
[False False True]],
fill_value = 1e+20)
"""
if isinstance(a, np.ma.MaskedArray):
return stats.mstats.mquantiles(a, prob=prob, alphap=alphap, betap=alphap, axis=axis,
limit=limit)
if limit:
marr = stats.mstats.mquantiles(a, prob=prob, alphap=alphap, betap=alphap, axis=axis,
limit=limit)
return ma.filled(marr, fill_value=np.nan)
if masknan:
nanmask = np.isnan(a)
if nanmask.any():
marr = ma.array(a, mask=nanmask)
marr = stats.mstats.mquantiles(marr, prob=prob, alphap=alphap, betap=alphap,
axis=axis, limit=limit)
return ma.filled(marr, fill_value=np.nan)
# Initialization & checks ---------
data = np.asarray(a)
p = np.array(prob, copy=False, ndmin=1)
m = alphap + p*(1.-alphap-betap)
isrolled = False
#from _quantiles1d
if (axis is None):
data = data.ravel() #reshape(-1,1)
axis = 0
else:
axis = np.arange(data.ndim)[axis]
data = np.rollaxis(data, axis)
isrolled = True # keep track, maybe can be removed
x = np.sort(data, axis=0)
n = x.shape[0]
returnshape = list(data.shape)
returnshape[axis] = p
#TODO: check these
if n == 0:
return np.empty(len(p), dtype=float)
elif n == 1:
return np.resize(x, p.shape)
aleph = (n*p + m)
k = np.floor(aleph.clip(1, n-1)).astype(int)
ind = [None]*x.ndim
ind[0] = slice(None)
gamma = (aleph-k).clip(0,1)[ind]
q = (1.-gamma)*x[k-1] + gamma*x[k]
if isrolled:
return np.rollaxis(q, 0, axis+1)
else:
return q
def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4, axis=0, masknan=None):
"""Calculate the score at the given 'per' percentile of the
sequence a. For example, the score at per=50 is the median.
This function is a shortcut to mquantile
"""
per = np.asarray(per, float)
if (per < 0).any() or (per > 100.).any():
raise ValueError("The percentile should be between 0. and 100. !"\
" (got %s)" % per)
return quantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
limit=limit, axis=axis, masknan=masknan).squeeze()
def plotting_positions(data, alpha=0.4, beta=0.4, axis=0, masknan=False):
"""Returns the plotting positions (or empirical percentile points) for the
data.
Plotting positions are defined as (i-alpha)/(n+1-alpha-beta), where:
- i is the rank order statistics (starting at 1)
- n is the number of unmasked values along the given axis
- alpha and beta are two parameters.
Typical values for alpha and beta are:
- (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
- (.5,.5) : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
(Bliss 1967: "Rankit")
- (0,0) : *p(k) = k/(n+1)* : Weibull (R type 6), (Van der Waerden 1952)
- (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
That's R default (R type 7)
- (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
The resulting quantile estimates are approximately median-unbiased
regardless of the distribution of x. (R type 8), (Tukey 1962)
- (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*.
The resulting quantile estimates are approximately unbiased
if x is normally distributed (R type 9) (Blom 1958)
- (.4,.4) : approximately quantile unbiased (Cunnane)
- (.35,.35): APL, used with PWM
Parameters
----------
x : sequence
Input data, as a sequence or array of dimension at most 2.
prob : sequence
List of quantiles to compute.
alpha : {0.4, float} optional
Plotting positions parameter.
beta : {0.4, float} optional
Plotting positions parameter.
Notes
-----
I think the adjustments assume that there are no ties in order to be a reasonable
approximation to a continuous density function. TODO: check this
References
----------
unknown,
dates to original papers from Beasley, Erickson, Allison 2009 Behav Genet
"""
if isinstance(data, np.ma.MaskedArray):
if axis is None or data.ndim == 1:
return stats.mstats.plotting_positions(data, alpha=alpha, beta=beta)
else:
return ma.apply_along_axis(stats.mstats.plotting_positions, axis, data, alpha=alpha, beta=beta)
if masknan:
nanmask = np.isnan(data)
if nanmask.any():
marr = ma.array(data, mask=nanmask)
#code duplication:
if axis is None or data.ndim == 1:
marr = stats.mstats.plotting_positions(marr, alpha=alpha, beta=beta)
else:
marr = ma.apply_along_axis(stats.mstats.plotting_positions, axis, marr, alpha=alpha, beta=beta)
return ma.filled(marr, fill_value=np.nan)
data = np.asarray(data)
if data.size == 1: # use helper function instead
data = np.atleast_1d(data)
axis = 0
if axis is None:
data = data.ravel()
axis = 0
n = data.shape[axis]
if data.ndim == 1:
plpos = np.empty(data.shape, dtype=float)
plpos[data.argsort()] = (np.arange(1,n+1) - alpha)/(n+1.-alpha-beta)
else:
#nd assignment instead of second argsort does not look easy
plpos = (data.argsort(axis).argsort(axis) + 1. - alpha)/(n+1.-alpha-beta)
return plpos
meppf = plotting_positions
def plotting_positions_w1d(data, weights=None, alpha=0.4, beta=0.4,
method='notnormed'):
'''Weighted plotting positions (or empirical percentile points) for the data.
observations are weighted and the plotting positions are defined as
(ws-alpha)/(n-alpha-beta), where:
- ws is the weighted rank order statistics or cumulative weighted sum,
normalized to n if method is "normed"
- n is the number of values along the given axis if method is "normed"
and total weight otherwise
- alpha and beta are two parameters.
wtd.quantile in R package Hmisc seems to use the "notnormed" version.
notnormed coincides with unweighted segment in example, drop "normed" version ?
See Also
--------
plotting_positions : unweighted version that works also with more than one
dimension and has other options
'''
x = np.atleast_1d(data)
if x.ndim > 1:
raise ValueError('currently implemented only for 1d')
if weights is None:
weights = np.ones(x.shape)
else:
weights = np.array(weights, float, copy=False, ndmin=1) #atleast_1d(weights)
if weights.shape != x.shape:
raise ValueError('if weights is given, it needs to be the same'
'shape as data')
n = len(x)
xargsort = x.argsort()
ws = weights[xargsort].cumsum()
res = np.empty(x.shape)
if method == 'normed':
res[xargsort] = (1.*ws/ws[-1]*n-alpha)/(n+1.-alpha-beta)
else:
res[xargsort] = (1.*ws-alpha)/(ws[-1]+1.-alpha-beta)
return res
def edf_normal_inverse_transformed(x, alpha=3./8, beta=3./8, axis=0):
'''rank based normal inverse transformed cdf
'''
from scipy import stats
ranks = plotting_positions(x, alpha=alpha, beta=alpha, axis=0, masknan=False)
ranks_transf = stats.norm.ppf(ranks)
return ranks_transf
if __name__ == '__main__':
x = np.arange(5)
print(plotting_positions(x))
x = np.arange(10).reshape(-1,2)
print(plotting_positions(x))
print(quantiles(x, axis=0))
print(quantiles(x, axis=None))
print(quantiles(x, axis=1))
xm = ma.array(x)
x2 = x.astype(float)
x2[1,0] = np.nan
print(plotting_positions(xm, axis=0))
# test 0d, 1d
for sl1 in [slice(None), 0]:
print((plotting_positions(xm[sl1,0]) == plotting_positions(x[sl1,0])).all())
print((quantiles(xm[sl1,0]) == quantiles(x[sl1,0])).all())
print((stats.mstats.mquantiles(ma.fix_invalid(x2[sl1,0])) == quantiles(x2[sl1,0], masknan=1)).all())
#test 2d
for ax in [0, 1, None, -1]:
print((plotting_positions(xm, axis=ax) == plotting_positions(x, axis=ax)).all())
print((quantiles(xm, axis=ax) == quantiles(x, axis=ax)).all())
print((stats.mstats.mquantiles(ma.fix_invalid(x2), axis=ax) == quantiles(x2, axis=ax, masknan=1)).all())
#stats version does not have axis
print((stats.mstats.plotting_positions(ma.fix_invalid(x2)) == plotting_positions(x2, axis=None, masknan=1)).all())
#test 3d
x3 = np.dstack((x,x)).T
for ax in [1,2]:
print((plotting_positions(x3, axis=ax)[0] == plotting_positions(x.T, axis=ax-1)).all())
np.testing.assert_equal(plotting_positions(np.arange(10), alpha=0.35, beta=1-0.35), (1+np.arange(10)-0.35)/10)
np.testing.assert_equal(plotting_positions(np.arange(10), alpha=0.4, beta=0.4), (1+np.arange(10)-0.4)/(10+0.2))
np.testing.assert_equal(plotting_positions(np.arange(10)), (1+np.arange(10)-0.4)/(10+0.2))
print('')
print(scoreatpercentile(x, [10,90]))
print(plotting_positions_w1d(x[:,0]))
print((plotting_positions_w1d(x[:,0]) == plotting_positions(x[:,0])).all())
#weights versus replicating multiple occurencies of same x value
w1 = [1, 1, 2, 1, 1]
plotexample = 1
if plotexample:
import matplotlib.pyplot as plt
plt.figure()
plt.title('ppf, cdf values on horizontal axis')
plt.step(plotting_positions_w1d(x[:,0], weights=w1, method='0'), x[:,0], where='post')
plt.step(stats.mstats.plotting_positions(np.repeat(x[:,0],w1,axis=0)),np.repeat(x[:,0],w1,axis=0),where='post')
plt.plot(plotting_positions_w1d(x[:,0], weights=w1, method='0'), x[:,0], '-o')
plt.plot(stats.mstats.plotting_positions(np.repeat(x[:,0],w1,axis=0)),np.repeat(x[:,0],w1,axis=0), '-o')
plt.figure()
plt.title('cdf, cdf values on vertical axis')
plt.step(x[:,0], plotting_positions_w1d(x[:,0], weights=w1, method='0'),where='post')
plt.step(np.repeat(x[:,0],w1,axis=0), stats.mstats.plotting_positions(np.repeat(x[:,0],w1,axis=0)),where='post')
plt.plot(x[:,0], plotting_positions_w1d(x[:,0], weights=w1, method='0'), '-o')
plt.plot(np.repeat(x[:,0],w1,axis=0), stats.mstats.plotting_positions(np.repeat(x[:,0],w1,axis=0)), '-o')
plt.show()