import numpy as np
def _make_index(prob,size):
"""
Returns a boolean index for given probabilities.
Notes
---------
prob = [.75,.25] means that there is a 75% chance of the first column
being True and a 25% chance of the second column being True. The
columns are mutually exclusive.
"""
rv = np.random.uniform(size=(size,1))
cumprob = np.cumsum(prob)
return np.logical_and(np.r_[0,cumprob[:-1]] <= rv, rv < cumprob)
def mixture_rvs(prob, size, dist, kwargs=None):
"""
Sample from a mixture of distributions.
Parameters
----------
prob : array_like
Probability of sampling from each distribution in dist
size : int
The length of the returned sample.
dist : array_like
An iterable of distributions objects from scipy.stats.
kwargs : tuple of dicts, optional
A tuple of dicts. Each dict in kwargs can have keys loc, scale, and
args to be passed to the respective distribution in dist. If not
provided, the distribution defaults are used.
Examples
--------
Say we want 5000 random variables from mixture of normals with two
distributions norm(-1,.5) and norm(1,.5) and we want to sample from the
first with probability .75 and the second with probability .25.
>>> from scipy import stats
>>> prob = [.75,.25]
>>> Y = mixture_rvs(prob, 5000, dist=[stats.norm, stats.norm],
... kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))
"""
if len(prob) != len(dist):
raise ValueError("You must provide as many probabilities as distributions")
if not np.allclose(np.sum(prob), 1):
raise ValueError("prob does not sum to 1")
if kwargs is None:
kwargs = ({},)*len(prob)
idx = _make_index(prob,size)
sample = np.empty(size)
for i in range(len(prob)):
sample_idx = idx[...,i]
sample_size = sample_idx.sum()
loc = kwargs[i].get('loc',0)
scale = kwargs[i].get('scale',1)
args = kwargs[i].get('args',())
sample[sample_idx] = dist[i].rvs(*args, **dict(loc=loc,scale=scale,
size=sample_size))
return sample
class MixtureDistribution(object):
'''univariate mixture distribution
for simple case for now (unbound support)
does not yet inherit from scipy.stats.distributions
adding pdf to mixture_rvs, some restrictions on broadcasting
Currently it does not hold any state, all arguments included in each method.
'''
#def __init__(self, prob, size, dist, kwargs=None):
def rvs(self, prob, size, dist, kwargs=None):
return mixture_rvs(prob, size, dist, kwargs=kwargs)
def pdf(self, x, prob, dist, kwargs=None):
"""
pdf a mixture of distributions.
Parameters
----------
x : array_like
Array containing locations where the PDF should be evaluated
prob : array_like
Probability of sampling from each distribution in dist
dist : array_like
An iterable of distributions objects from scipy.stats.
kwargs : tuple of dicts, optional
A tuple of dicts. Each dict in kwargs can have keys loc, scale, and
args to be passed to the respective distribution in dist. If not
provided, the distribution defaults are used.
Examples
--------
Say we want 5000 random variables from mixture of normals with two
distributions norm(-1,.5) and norm(1,.5) and we want to sample from the
first with probability .75 and the second with probability .25.
>>> import numpy as np
>>> from scipy import stats
>>> from statsmodels.distributions.mixture_rvs import MixtureDistribution
>>> x = np.arange(-4.0, 4.0, 0.01)
>>> prob = [.75,.25]
>>> mixture = MixtureDistribution()
>>> Y = mixture.pdf(x, prob, dist=[stats.norm, stats.norm],
... kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))
"""
if len(prob) != len(dist):
raise ValueError("You must provide as many probabilities as distributions")
if not np.allclose(np.sum(prob), 1):
raise ValueError("prob does not sum to 1")
if kwargs is None:
kwargs = ({},)*len(prob)
for i in range(len(prob)):
loc = kwargs[i].get('loc',0)
scale = kwargs[i].get('scale',1)
args = kwargs[i].get('args',())
if i == 0: #assume all broadcast the same as the first dist
pdf_ = prob[i] * dist[i].pdf(x, *args, loc=loc, scale=scale)
else:
pdf_ += prob[i] * dist[i].pdf(x, *args, loc=loc, scale=scale)
return pdf_
def cdf(self, x, prob, dist, kwargs=None):
"""
cdf of a mixture of distributions.
Parameters
----------
x : array_like
Array containing locations where the CDF should be evaluated
prob : array_like
Probability of sampling from each distribution in dist
size : int
The length of the returned sample.
dist : array_like
An iterable of distributions objects from scipy.stats.
kwargs : tuple of dicts, optional
A tuple of dicts. Each dict in kwargs can have keys loc, scale, and
args to be passed to the respective distribution in dist. If not
provided, the distribution defaults are used.
Examples
--------
Say we want 5000 random variables from mixture of normals with two
distributions norm(-1,.5) and norm(1,.5) and we want to sample from the
first with probability .75 and the second with probability .25.
>>> import numpy as np
>>> from scipy import stats
>>> from statsmodels.distributions.mixture_rvs import MixtureDistribution
>>> x = np.arange(-4.0, 4.0, 0.01)
>>> prob = [.75,.25]
>>> mixture = MixtureDistribution()
>>> Y = mixture.pdf(x, prob, dist=[stats.norm, stats.norm],
... kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))
"""
if len(prob) != len(dist):
raise ValueError("You must provide as many probabilities as distributions")
if not np.allclose(np.sum(prob), 1):
raise ValueError("prob does not sum to 1")
if kwargs is None:
kwargs = ({},)*len(prob)
for i in range(len(prob)):
loc = kwargs[i].get('loc',0)
scale = kwargs[i].get('scale',1)
args = kwargs[i].get('args',())
if i == 0: #assume all broadcast the same as the first dist
cdf_ = prob[i] * dist[i].cdf(x, *args, loc=loc, scale=scale)
else:
cdf_ += prob[i] * dist[i].cdf(x, *args, loc=loc, scale=scale)
return cdf_
def mv_mixture_rvs(prob, size, dist, nvars, **kwargs):
"""
Sample from a mixture of multivariate distributions.
Parameters
----------
prob : array_like
Probability of sampling from each distribution in dist
size : int
The length of the returned sample.
dist : array_like
An iterable of distributions instances with callable method rvs.
nvargs : int
dimension of the multivariate distribution, could be inferred instead
kwargs : tuple of dicts, optional
ignored
Examples
--------
Say we want 2000 random variables from mixture of normals with two
multivariate normal distributions, and we want to sample from the
first with probability .4 and the second with probability .6.
import statsmodels.sandbox.distributions.mv_normal as mvd
cov3 = np.array([[ 1. , 0.5 , 0.75],
[ 0.5 , 1.5 , 0.6 ],
[ 0.75, 0.6 , 2. ]])
mu = np.array([-1, 0.0, 2.0])
mu2 = np.array([4, 2.0, 2.0])
mvn3 = mvd.MVNormal(mu, cov3)
mvn32 = mvd.MVNormal(mu2, cov3/2., 4)
rvs = mix.mv_mixture_rvs([0.4, 0.6], 2000, [mvn3, mvn32], 3)
"""
if len(prob) != len(dist):
raise ValueError("You must provide as many probabilities as distributions")
if not np.allclose(np.sum(prob), 1):
raise ValueError("prob does not sum to 1")
if kwargs is None:
kwargs = ({},)*len(prob)
idx = _make_index(prob,size)
sample = np.empty((size, nvars))
for i in range(len(prob)):
sample_idx = idx[...,i]
sample_size = sample_idx.sum()
#loc = kwargs[i].get('loc',0)
#scale = kwargs[i].get('scale',1)
#args = kwargs[i].get('args',())
# use int to avoid numpy bug with np.random.multivariate_normal
sample[sample_idx] = dist[i].rvs(size=int(sample_size))
return sample
if __name__ == '__main__':
from scipy import stats
obs_dist = mixture_rvs([.25,.75], size=10000, dist=[stats.norm, stats.beta],
kwargs=(dict(loc=-1,scale=.5),dict(loc=1,scale=1,args=(1,.5))))
nobs = 10000
mix = MixtureDistribution()
## mrvs = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
## kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.75)))
mix_kwds = (dict(loc=-1,scale=.25),dict(loc=1,scale=.75))
mrvs = mix.rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
grid = np.linspace(-4,4, 100)
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
mcdf = mix.cdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
doplot = 1
if doplot:
import matplotlib.pyplot as plt
plt.figure()
plt.hist(mrvs, bins=50, normed=True, color='red')
plt.title('histogram of sample and pdf')
plt.plot(grid, mpdf, lw=2, color='black')
plt.figure()
plt.hist(mrvs, bins=50, normed=True, cumulative=True, color='red')
plt.title('histogram of sample and pdf')
plt.plot(grid, mcdf, lw=2, color='black')
plt.show()