"""
Histogram-related functions
"""
from __future__ import division, absolute_import, print_function
import functools
import operator
import warnings
import numpy as np
from numpy.compat.py3k import basestring
from numpy.core import overrides
__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']
array_function_dispatch = functools.partial(
overrides.array_function_dispatch, module='numpy')
# range is a keyword argument to many functions, so save the builtin so they can
# use it.
_range = range
def _hist_bin_sqrt(x, range):
"""
Square root histogram bin estimator.
Bin width is inversely proportional to the data size. Used by many
programs for its simplicity.
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return x.ptp() / np.sqrt(x.size)
def _hist_bin_sturges(x, range):
"""
Sturges histogram bin estimator.
A very simplistic estimator based on the assumption of normality of
the data. This estimator has poor performance for non-normal data,
which becomes especially obvious for large data sets. The estimate
depends only on size of the data.
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return x.ptp() / (np.log2(x.size) + 1.0)
def _hist_bin_rice(x, range):
"""
Rice histogram bin estimator.
Another simple estimator with no normality assumption. It has better
performance for large data than Sturges, but tends to overestimate
the number of bins. The number of bins is proportional to the cube
root of data size (asymptotically optimal). The estimate depends
only on size of the data.
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return x.ptp() / (2.0 * x.size ** (1.0 / 3))
def _hist_bin_scott(x, range):
"""
Scott histogram bin estimator.
The binwidth is proportional to the standard deviation of the data
and inversely proportional to the cube root of data size
(asymptotically optimal).
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
def _hist_bin_stone(x, range):
"""
Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
This paper by Stone appears to be the origination of this rule.
http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
range : (float, float)
The lower and upper range of the bins.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
n = x.size
ptp_x = np.ptp(x)
if n <= 1 or ptp_x == 0:
return 0
def jhat(nbins):
hh = ptp_x / nbins
p_k = np.histogram(x, bins=nbins, range=range)[0] / n
return (2 - (n + 1) * p_k.dot(p_k)) / hh
nbins_upper_bound = max(100, int(np.sqrt(n)))
nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
if nbins == nbins_upper_bound:
warnings.warn("The number of bins estimated may be suboptimal.", RuntimeWarning, stacklevel=2)
return ptp_x / nbins
def _hist_bin_doane(x, range):
"""
Doane's histogram bin estimator.
Improved version of Sturges' formula which works better for
non-normal data. See
stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
if x.size > 2:
sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
sigma = np.std(x)
if sigma > 0.0:
# These three operations add up to
# g1 = np.mean(((x - np.mean(x)) / sigma)**3)
# but use only one temp array instead of three
temp = x - np.mean(x)
np.true_divide(temp, sigma, temp)
np.power(temp, 3, temp)
g1 = np.mean(temp)
return x.ptp() / (1.0 + np.log2(x.size) +
np.log2(1.0 + np.absolute(g1) / sg1))
return 0.0
def _hist_bin_fd(x, range):
"""
The Freedman-Diaconis histogram bin estimator.
The Freedman-Diaconis rule uses interquartile range (IQR) to
estimate binwidth. It is considered a variation of the Scott rule
with more robustness as the IQR is less affected by outliers than
the standard deviation. However, the IQR depends on fewer points
than the standard deviation, so it is less accurate, especially for
long tailed distributions.
If the IQR is 0, this function returns 1 for the number of bins.
Binwidth is inversely proportional to the cube root of data size
(asymptotically optimal).
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
iqr = np.subtract(*np.percentile(x, [75, 25]))
return 2.0 * iqr * x.size ** (-1.0 / 3.0)
def _hist_bin_auto(x, range):
"""
Histogram bin estimator that uses the minimum width of the
Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
and the Sturges estimator if the FD bandwidth is 0.
The FD estimator is usually the most robust method, but its width
estimate tends to be too large for small `x` and bad for data with limited
variance. The Sturges estimator is quite good for small (<1000) datasets
and is the default in the R language. This method gives good off the shelf
behaviour.
.. versionchanged:: 1.15.0
If there is limited variance the IQR can be 0, which results in the
FD bin width being 0 too. This is not a valid bin width, so
``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
If the IQR is 0, it's unlikely any variance based estimators will be of
use, so we revert to the sturges estimator, which only uses the size of the
dataset in its calculation.
Parameters
----------
x : array_like
Input data that is to be histogrammed, trimmed to range. May not
be empty.
Returns
-------
h : An estimate of the optimal bin width for the given data.
See Also
--------
_hist_bin_fd, _hist_bin_sturges
"""
fd_bw = _hist_bin_fd(x, range)
sturges_bw = _hist_bin_sturges(x, range)
del range # unused
if fd_bw:
return min(fd_bw, sturges_bw)
else:
# limited variance, so we return a len dependent bw estimator
return sturges_bw
# Private dict initialized at module load time
_hist_bin_selectors = {'stone': _hist_bin_stone,
'auto': _hist_bin_auto,
'doane': _hist_bin_doane,
'fd': _hist_bin_fd,
'rice': _hist_bin_rice,
'scott': _hist_bin_scott,
'sqrt': _hist_bin_sqrt,
'sturges': _hist_bin_sturges}
def _ravel_and_check_weights(a, weights):
""" Check a and weights have matching shapes, and ravel both """
a = np.asarray(a)
# Ensure that the array is a "subtractable" dtype
if a.dtype == np.bool_:
warnings.warn("Converting input from {} to {} for compatibility."
.format(a.dtype, np.uint8),
RuntimeWarning, stacklevel=2)
a = a.astype(np.uint8)
if weights is not None:
weights = np.asarray(weights)
if weights.shape != a.shape:
raise ValueError(
'weights should have the same shape as a.')
weights = weights.ravel()
a = a.ravel()
return a, weights
def _get_outer_edges(a, range):
"""
Determine the outer bin edges to use, from either the data or the range
argument
"""
if range is not None:
first_edge, last_edge = range
if first_edge > last_edge:
raise ValueError(
'max must be larger than min in range parameter.')
if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
raise ValueError(
"supplied range of [{}, {}] is not finite".format(first_edge, last_edge))
elif a.size == 0:
# handle empty arrays. Can't determine range, so use 0-1.
first_edge, last_edge = 0, 1
else:
first_edge, last_edge = a.min(), a.max()
if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
raise ValueError(
"autodetected range of [{}, {}] is not finite".format(first_edge, last_edge))
# expand empty range to avoid divide by zero
if first_edge == last_edge:
first_edge = first_edge - 0.5
last_edge = last_edge + 0.5
return first_edge, last_edge
def _unsigned_subtract(a, b):
"""
Subtract two values where a >= b, and produce an unsigned result
This is needed when finding the difference between the upper and lower
bound of an int16 histogram
"""
# coerce to a single type
signed_to_unsigned = {
np.byte: np.ubyte,
np.short: np.ushort,
np.intc: np.uintc,
np.int_: np.uint,
np.longlong: np.ulonglong
}
dt = np.result_type(a, b)
try:
dt = signed_to_unsigned[dt.type]
except KeyError:
return np.subtract(a, b, dtype=dt)
else:
Loading ...