from statsmodels.compat.platform import PLATFORM_OSX
import os
import csv
import warnings
import numpy as np
import pandas as pd
from scipy import sparse
import pytest
from statsmodels.regression.mixed_linear_model import (
MixedLM, MixedLMParams, _smw_solver, _smw_logdet)
from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
assert_)
from statsmodels.base import _penalties as penalties
import statsmodels.tools.numdiff as nd
from .results import lme_r_results
# TODO: add tests with unequal group sizes
class R_Results(object):
    """
    Hold reference results from fitting one simulated data set with
    lmer in R, along with the data set itself.

    Parameters
    ----------
    meth : str
        Either "ml" or "reml".
    irfs : str
        Either "irf", for independent random effects, or "drf" for
        dependent random effects.
    ds_ix : int
        The number of the data set
    """

    def __init__(self, meth, irfs, ds_ix):

        # All R results for this configuration share a common
        # attribute-name suffix in lme_r_results.
        suffix = "_%s_%s_%d" % (meth, irfs, ds_ix)
        self.coef = getattr(lme_r_results, "coef" + suffix)
        self.vcov_r = getattr(lme_r_results, "vcov" + suffix)
        self.cov_re_r = getattr(lme_r_results, "cov_re" + suffix)
        self.scale_r = getattr(lme_r_results, "scale" + suffix)
        self.loglike = getattr(lme_r_results, "loglike" + suffix)

        # Posterior means/conditional variances of the random effects
        # are only available for some configurations.
        if hasattr(lme_r_results, "ranef_mean" + suffix):
            self.ranef_postmean = getattr(
                lme_r_results, "ranef_mean" + suffix)
            self.ranef_condvar = np.atleast_2d(
                getattr(lme_r_results, "ranef_condvar" + suffix))

        # Load the data file
        results_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'results')
        data_path = os.path.join(results_dir, "lme%02d.csv" % ds_ix)
        with open(data_path) as handle:
            reader = csv.reader(handle)
            columns = next(reader)
            rows = [[float(value) for value in row] for row in reader]
        arr = np.asarray(rows)

        # Split into exog, endog, etc.
        self.endog = arr[:, columns.index("endog")]
        self.groups = arr[:, columns.index("groups")]
        fe_cols = [j for j, name in enumerate(columns)
                   if name.startswith("exog_fe")]
        self.exog_fe = arr[:, fe_cols]
        re_cols = [j for j, name in enumerate(columns)
                   if name.startswith("exog_re")]
        self.exog_re = arr[:, re_cols]
def loglike_function(model, profile_fe, has_fe):
    """
    Return a function mapping a packed parameter vector to the
    negative log-likelihood of ``model``.
    """

    def neg_loglike(vec):
        # Unpack the flat vector into a MixedLMParams object before
        # evaluating the likelihood.
        pars = MixedLMParams.from_packed(
            vec, model.k_fe, model.k_re, model.use_sqrt, has_fe=has_fe)
        return -model.loglike(pars, profile_fe=profile_fe)

    return neg_loglike
class TestMixedLM(object):
# Test analytic scores and Hessian using numeric differentiation
@pytest.mark.slow
@pytest.mark.parametrize("use_sqrt", [False, True])
@pytest.mark.parametrize("reml", [False, True])
@pytest.mark.parametrize("profile_fe", [False, True])
    def test_compare_numdiff(self, use_sqrt, reml, profile_fe):
        """
        Check the analytic score and Hessian of MixedLM against
        numerical differentiation of the log-likelihood.
        """
        # Simulated data: k_fe fixed effects, k_re correlated random
        # effects (with an intercept column), and two variance
        # components built from exog_vc.
        n_grp = 200
        grpsize = 5
        k_fe = 3
        k_re = 2
        np.random.seed(3558)
        exog_fe = np.random.normal(size=(n_grp * grpsize, k_fe))
        exog_re = np.random.normal(size=(n_grp * grpsize, k_re))
        exog_re[:, 0] = 1
        exog_vc = np.random.normal(size=(n_grp * grpsize, 3))
        slopes = np.random.normal(size=(n_grp, k_re))
        slopes[:, -1] *= 2
        # Expand per-group slopes to one row per observation.
        slopes = np.kron(slopes, np.ones((grpsize, 1)))
        slopes_vc = np.random.normal(size=(n_grp, 3))
        slopes_vc = np.kron(slopes_vc, np.ones((grpsize, 1)))
        slopes_vc[:, -1] *= 2
        re_values = (slopes * exog_re).sum(1)
        vc_values = (slopes_vc * exog_vc).sum(1)
        err = np.random.normal(size=n_grp * grpsize)
        endog = exog_fe.sum(1) + re_values + vc_values + err
        groups = np.kron(range(n_grp), np.ones(grpsize))
        # Variance-component design matrices keyed by component name,
        # then group label (the deprecated dict-of-dicts format, hence
        # the expected warning below).
        vc = {"a": {}, "b": {}}
        for i in range(n_grp):
            ix = np.flatnonzero(groups == i)
            vc["a"][i] = exog_vc[ix, 0:2]
            vc["b"][i] = exog_vc[ix, 2:3]
        with pytest.warns(UserWarning, match="Using deprecated variance"):
            model = MixedLM(
                endog,
                exog_fe,
                groups,
                exog_re,
                exog_vc=vc,
                use_sqrt=use_sqrt)
        rslt = model.fit(reml=reml)
        # Negative log-likelihood as a function of the packed parameter
        # vector, used as the target for numerical differentiation.
        loglike = loglike_function(
            model, profile_fe=profile_fe, has_fe=not profile_fe)
        try:
            # Test the score at several points.
            for kr in range(5):
                # Draw a random (valid) parameter point: PSD cov_re and
                # nonnegative variance components.
                fe_params = np.random.normal(size=k_fe)
                cov_re = np.random.normal(size=(k_re, k_re))
                cov_re = np.dot(cov_re.T, cov_re)
                vcomp = np.random.normal(size=2)**2
                params = MixedLMParams.from_components(
                    fe_params, cov_re=cov_re, vcomp=vcomp)
                params_vec = params.get_packed(
                    has_fe=not profile_fe, use_sqrt=use_sqrt)
                # Check scores
                gr = -model.score(params, profile_fe=profile_fe)
                ngr = nd.approx_fprime(params_vec, loglike)
                assert_allclose(gr, ngr, rtol=1e-3)
                # Check Hessian matrices at the MLE (we do not have
                # the profile Hessian matrix and we do not care
                # about the Hessian for the square root
                # transformed parameter).
                if (profile_fe is False) and (use_sqrt is False):
                    hess = -model.hessian(rslt.params_object)
                    params_vec = rslt.params_object.get_packed(
                        use_sqrt=False, has_fe=True)
                    loglike_h = loglike_function(
                        model, profile_fe=False, has_fe=True)
                    nhess = nd.approx_hess(params_vec, loglike_h)
                    assert_allclose(hess, nhess, rtol=1e-3)
        except AssertionError:
            # See GH#5628; because this test fails unpredictably but only on
            # OSX, we only xfail it there.
            if PLATFORM_OSX:
                pytest.xfail("fails on OSX due to unresolved "
                             "numerical differences")
            else:
                raise
def test_default_re(self):
np.random.seed(3235)
exog = np.random.normal(size=(300, 4))
groups = np.kron(np.arange(100), [1, 1, 1])
g_errors = np.kron(np.random.normal(size=100), [1, 1, 1])
endog = exog.sum(1) + g_errors + np.random.normal(size=300)
mdf1 = MixedLM(endog, exog, groups).fit()
mdf2 = MixedLM(endog, exog, groups, np.ones(300)).fit()
assert_almost_equal(mdf1.params, mdf2.params, decimal=8)
def test_history(self):
np.random.seed(3235)
exog = np.random.normal(size=(300, 4))
groups = np.kron(np.arange(100), [1, 1, 1])
g_errors = np.kron(np.random.normal(size=100), [1, 1, 1])
endog = exog.sum(1) + g_errors + np.random.normal(size=300)
mod = MixedLM(endog, exog, groups)
rslt = mod.fit(full_output=True)
assert_equal(hasattr(rslt, "hist"), True)
@pytest.mark.slow
@pytest.mark.smoke
def test_profile_inference(self):
np.random.seed(9814)
k_fe = 2
gsize = 3
n_grp = 100
exog = np.random.normal(size=(n_grp * gsize, k_fe))
exog_re = np.ones((n_grp * gsize, 1))
groups = np.kron(np.arange(n_grp), np.ones(gsize))
vca = np.random.normal(size=n_grp * gsize)
vcb = np.random.normal(size=n_grp * gsize)
errors = 0
g_errors = np.kron(np.random.normal(size=100), np.ones(gsize))
errors += g_errors + exog_re[:, 0]
rc = np.random.normal(size=n_grp)
errors += np.kron(rc, np.ones(gsize)) * vca
rc = np.random.normal(size=n_grp)
errors += np.kron(rc, np.ones(gsize)) * vcb
errors += np.random.normal(size=n_grp * gsize)
endog = exog.sum(1) + errors
vc = {"a": {}, "b": {}}
for k in range(n_grp):
ii = np.flatnonzero(groups == k)
vc["a"][k] = vca[ii][:, None]
vc["b"][k] = vcb[ii][:, None]
with pytest.warns(UserWarning, match="Using deprecated variance"):
rslt = MixedLM(endog, exog, groups=groups,
exog_re=exog_re, exog_vc=vc).fit()
rslt.profile_re(0, vtype='re', dist_low=1, num_low=3,
dist_high=1, num_high=3)
rslt.profile_re('b', vtype='vc', dist_low=0.5, num_low=3,
dist_high=0.5, num_high=3)
def test_vcomp_1(self):
# Fit the same model using constrained random effects and
# variance components.
np.random.seed(4279)
exog = np.random.normal(size=(400, 1))
exog_re = np.random.normal(size=(400, 2))
groups = np.kron(np.arange(100), np.ones(4))
slopes = np.random.normal(size=(100, 2))
slopes[:, 1] *= 2
slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
errors = slopes.sum(1) + np.random.normal(size=400)
endog = exog.sum(1) + errors
free = MixedLMParams(1, 2, 0)
free.fe_params = np.ones(1)
free.cov_re = np.eye(2)
free.vcomp = np.zeros(0)
model1 = MixedLM(endog, exog, groups, exog_re=exog_re)
result1 = model1.fit(free=free)
exog_vc = {"a": {}, "b": {}}
for k, group in enumerate(model1.group_labels):
ix = model1.row_indices[group]
exog_vc["a"][group] = exog_re[ix, 0:1]
exog_vc["b"][group] = exog_re[ix, 1:2]
with pytest.warns(UserWarning, match="Using deprecated variance"):
model2 = MixedLM(endog, exog, groups, exog_vc=exog_vc)
result2 = model2.fit()
result2.summary()
assert_allclose(result1.fe_params, result2.fe_params, atol=1e-4)
assert_allclose(
np.diag(result1.cov_re), result2.vcomp, atol=1e-2, rtol=1e-4)
assert_allclose(
result1.bse[[0, 1, 3]], result2.bse, atol=1e-2, rtol=1e-2)
def test_vcomp_2(self):
# Simulated data comparison to R
np.random.seed(6241)
n = 1600
exog = np.random.normal(size=(n, 2))
groups = np.kron(np.arange(n / 16), np.ones(16))
# Build up the random error vector
errors = 0
# The random effects
exog_re = np.random.normal(size=(n, 2))
slopes = np.random.normal(size=(n // 16, 2))
slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
errors += slopes.sum(1)
# First variance component
subgroups1 = np.kron(np.arange(n / 4), np.ones(4))
errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))
# Second variance component
subgroups2 = np.kron(np.arange(n / 2), np.ones(2))
errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))
# iid errors
errors += np.random.normal(size=n)
endog = exog.sum(1) + errors
df = pd.DataFrame(index=range(n))
df["y"] = endog
df["groups"] = groups
df["x1"] = exog[:, 0]
df["x2"] = exog[:, 1]
df["z1"] = exog_re[:, 0]
df["z2"] = exog_re[:, 1]
df["v1"] = subgroups1
df["v2"] = subgroups2
# Equivalent model in R:
# df.to_csv("tst.csv")
# model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 |
# v2), df)
vcf = {"a": "0 + C(v1)", "b": "0 + C(v2)"}
model1 = MixedLM.from_formula(
"y ~ x1 + x2",
groups=groups,
re_formula="0+z1+z2",
vc_formula=vcf,
data=df)
result1 = model1.fit()
# Compare to R
assert_allclose(
result1.fe_params, [0.16527, 0.99911, 0.96217], rtol=1e-4)
assert_allclose(
result1.cov_re, [[1.244, 0.146], [0.146, 1.371]], rtol=1e-3)
assert_allclose(result1.vcomp, [4.024, 3.997], rtol=1e-3)
assert_allclose(
result1.bse.iloc[0:3], [0.12610, 0.03938, 0.03848], rtol=1e-3)
def test_vcomp_3(self):
# Test a model with vcomp but no other random effects, using formulas.
np.random.seed(4279)
x1 = np.random.normal(size=400)
groups = np.kron(np.arange(100), np.ones(4))
slopes = np.random.normal(size=100)
slopes = np.kron(slopes, np.ones(4)) * x1
y = slopes + np.random.normal(size=400)
vc_fml = {"a": "0 + x1"}
df = pd.DataFrame({"y": y, "x1": x1, "groups": groups})
Loading ...