Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ regression / tests / test_lme.py

from statsmodels.compat.platform import PLATFORM_OSX

import os
import csv
import warnings

import numpy as np
import pandas as pd
from scipy import sparse
import pytest

from statsmodels.regression.mixed_linear_model import (
    MixedLM, MixedLMParams, _smw_solver, _smw_logdet)
from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
                           assert_)

from statsmodels.base import _penalties as penalties
import statsmodels.tools.numdiff as nd

from .results import lme_r_results

# TODO: add tests with unequal group sizes


class R_Results(object):
    """
    A class for holding various results obtained from fitting one data
    set using lmer in R.

    Parameters
    ----------
    meth : str
        Either "ml" or "reml".
    irfs : str
        Either "irf", for independent random effects, or "drf" for
        dependent random effects.
    ds_ix : int
        The number of the data set
    """

    def __init__(self, meth, irfs, ds_ix):

        bname = "_%s_%s_%d" % (meth, irfs, ds_ix)

        self.coef = getattr(lme_r_results, "coef" + bname)
        self.vcov_r = getattr(lme_r_results, "vcov" + bname)
        self.cov_re_r = getattr(lme_r_results, "cov_re" + bname)
        self.scale_r = getattr(lme_r_results, "scale" + bname)
        self.loglike = getattr(lme_r_results, "loglike" + bname)

        if hasattr(lme_r_results, "ranef_mean" + bname):
            self.ranef_postmean = getattr(lme_r_results, "ranef_mean" + bname)
            self.ranef_condvar = getattr(lme_r_results,
                                         "ranef_condvar" + bname)
            self.ranef_condvar = np.atleast_2d(self.ranef_condvar)

        # Load the data file
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        rdir = os.path.join(cur_dir, 'results')
        fname = os.path.join(rdir, "lme%02d.csv" % ds_ix)
        with open(fname) as fid:
            rdr = csv.reader(fid)
            header = next(rdr)
            data = [[float(x) for x in line] for line in rdr]
        data = np.asarray(data)

        # Split into exog, endog, etc.
        self.endog = data[:, header.index("endog")]
        self.groups = data[:, header.index("groups")]
        ii = [i for i, x in enumerate(header) if x.startswith("exog_fe")]
        self.exog_fe = data[:, ii]
        ii = [i for i, x in enumerate(header) if x.startswith("exog_re")]
        self.exog_re = data[:, ii]


def loglike_function(model, profile_fe, has_fe):
    # Returns a function that evaluates the negative log-likelihood for
    # the given model.

    def f(x):
        params = MixedLMParams.from_packed(
            x, model.k_fe, model.k_re, model.use_sqrt, has_fe=has_fe)
        return -model.loglike(params, profile_fe=profile_fe)

    return f


class TestMixedLM(object):

    # Test analytic scores and Hessian using numeric differentiation
    @pytest.mark.slow
    @pytest.mark.parametrize("use_sqrt", [False, True])
    @pytest.mark.parametrize("reml", [False, True])
    @pytest.mark.parametrize("profile_fe", [False, True])
    def test_compare_numdiff(self, use_sqrt, reml, profile_fe):

        n_grp = 200
        grpsize = 5
        k_fe = 3
        k_re = 2

        np.random.seed(3558)
        exog_fe = np.random.normal(size=(n_grp * grpsize, k_fe))
        exog_re = np.random.normal(size=(n_grp * grpsize, k_re))
        exog_re[:, 0] = 1
        exog_vc = np.random.normal(size=(n_grp * grpsize, 3))
        slopes = np.random.normal(size=(n_grp, k_re))
        slopes[:, -1] *= 2
        slopes = np.kron(slopes, np.ones((grpsize, 1)))
        slopes_vc = np.random.normal(size=(n_grp, 3))
        slopes_vc = np.kron(slopes_vc, np.ones((grpsize, 1)))
        slopes_vc[:, -1] *= 2
        re_values = (slopes * exog_re).sum(1)
        vc_values = (slopes_vc * exog_vc).sum(1)
        err = np.random.normal(size=n_grp * grpsize)
        endog = exog_fe.sum(1) + re_values + vc_values + err
        groups = np.kron(range(n_grp), np.ones(grpsize))

        vc = {"a": {}, "b": {}}
        for i in range(n_grp):
            ix = np.flatnonzero(groups == i)
            vc["a"][i] = exog_vc[ix, 0:2]
            vc["b"][i] = exog_vc[ix, 2:3]

        with pytest.warns(UserWarning, match="Using deprecated variance"):
            model = MixedLM(
                endog,
                exog_fe,
                groups,
                exog_re,
                exog_vc=vc,
                use_sqrt=use_sqrt)
        rslt = model.fit(reml=reml)

        loglike = loglike_function(
            model, profile_fe=profile_fe, has_fe=not profile_fe)

        try:
            # Test the score at several points.
            for kr in range(5):
                fe_params = np.random.normal(size=k_fe)
                cov_re = np.random.normal(size=(k_re, k_re))
                cov_re = np.dot(cov_re.T, cov_re)
                vcomp = np.random.normal(size=2)**2
                params = MixedLMParams.from_components(
                    fe_params, cov_re=cov_re, vcomp=vcomp)
                params_vec = params.get_packed(
                    has_fe=not profile_fe, use_sqrt=use_sqrt)

                # Check scores
                gr = -model.score(params, profile_fe=profile_fe)
                ngr = nd.approx_fprime(params_vec, loglike)
                assert_allclose(gr, ngr, rtol=1e-3)

            # Check Hessian matrices at the MLE (we do not have
            # the profile Hessian matrix and we do not care
            # about the Hessian for the square root
            # transformed parameter).
            if (profile_fe is False) and (use_sqrt is False):
                hess = -model.hessian(rslt.params_object)
                params_vec = rslt.params_object.get_packed(
                    use_sqrt=False, has_fe=True)
                loglike_h = loglike_function(
                    model, profile_fe=False, has_fe=True)
                nhess = nd.approx_hess(params_vec, loglike_h)
                assert_allclose(hess, nhess, rtol=1e-3)
        except AssertionError:
            # See GH#5628; because this test fails unpredictably but only on
            #  OSX, we only xfail it there.
            if PLATFORM_OSX:
                pytest.xfail("fails on OSX due to unresolved "
                             "numerical differences")
            else:
                raise

    def test_default_re(self):

        np.random.seed(3235)
        exog = np.random.normal(size=(300, 4))
        groups = np.kron(np.arange(100), [1, 1, 1])
        g_errors = np.kron(np.random.normal(size=100), [1, 1, 1])
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)
        mdf1 = MixedLM(endog, exog, groups).fit()
        mdf2 = MixedLM(endog, exog, groups, np.ones(300)).fit()
        assert_almost_equal(mdf1.params, mdf2.params, decimal=8)

    def test_history(self):

        np.random.seed(3235)
        exog = np.random.normal(size=(300, 4))
        groups = np.kron(np.arange(100), [1, 1, 1])
        g_errors = np.kron(np.random.normal(size=100), [1, 1, 1])
        endog = exog.sum(1) + g_errors + np.random.normal(size=300)
        mod = MixedLM(endog, exog, groups)
        rslt = mod.fit(full_output=True)
        assert_equal(hasattr(rslt, "hist"), True)

    @pytest.mark.slow
    @pytest.mark.smoke
    def test_profile_inference(self):
        np.random.seed(9814)
        k_fe = 2
        gsize = 3
        n_grp = 100
        exog = np.random.normal(size=(n_grp * gsize, k_fe))
        exog_re = np.ones((n_grp * gsize, 1))
        groups = np.kron(np.arange(n_grp), np.ones(gsize))
        vca = np.random.normal(size=n_grp * gsize)
        vcb = np.random.normal(size=n_grp * gsize)
        errors = 0
        g_errors = np.kron(np.random.normal(size=100), np.ones(gsize))
        errors += g_errors + exog_re[:, 0]
        rc = np.random.normal(size=n_grp)
        errors += np.kron(rc, np.ones(gsize)) * vca
        rc = np.random.normal(size=n_grp)
        errors += np.kron(rc, np.ones(gsize)) * vcb
        errors += np.random.normal(size=n_grp * gsize)

        endog = exog.sum(1) + errors
        vc = {"a": {}, "b": {}}
        for k in range(n_grp):
            ii = np.flatnonzero(groups == k)
            vc["a"][k] = vca[ii][:, None]
            vc["b"][k] = vcb[ii][:, None]
        with pytest.warns(UserWarning, match="Using deprecated variance"):
            rslt = MixedLM(endog, exog, groups=groups,
                           exog_re=exog_re, exog_vc=vc).fit()
        rslt.profile_re(0, vtype='re', dist_low=1, num_low=3,
                        dist_high=1, num_high=3)
        rslt.profile_re('b', vtype='vc', dist_low=0.5, num_low=3,
                        dist_high=0.5, num_high=3)

    def test_vcomp_1(self):
        # Fit the same model using constrained random effects and
        # variance components.

        np.random.seed(4279)
        exog = np.random.normal(size=(400, 1))
        exog_re = np.random.normal(size=(400, 2))
        groups = np.kron(np.arange(100), np.ones(4))
        slopes = np.random.normal(size=(100, 2))
        slopes[:, 1] *= 2
        slopes = np.kron(slopes, np.ones((4, 1))) * exog_re
        errors = slopes.sum(1) + np.random.normal(size=400)
        endog = exog.sum(1) + errors

        free = MixedLMParams(1, 2, 0)
        free.fe_params = np.ones(1)
        free.cov_re = np.eye(2)
        free.vcomp = np.zeros(0)

        model1 = MixedLM(endog, exog, groups, exog_re=exog_re)
        result1 = model1.fit(free=free)

        exog_vc = {"a": {}, "b": {}}
        for k, group in enumerate(model1.group_labels):
            ix = model1.row_indices[group]
            exog_vc["a"][group] = exog_re[ix, 0:1]
            exog_vc["b"][group] = exog_re[ix, 1:2]
        with pytest.warns(UserWarning, match="Using deprecated variance"):
            model2 = MixedLM(endog, exog, groups, exog_vc=exog_vc)
        result2 = model2.fit()
        result2.summary()

        assert_allclose(result1.fe_params, result2.fe_params, atol=1e-4)
        assert_allclose(
            np.diag(result1.cov_re), result2.vcomp, atol=1e-2, rtol=1e-4)
        assert_allclose(
            result1.bse[[0, 1, 3]], result2.bse, atol=1e-2, rtol=1e-2)

    def test_vcomp_2(self):
        # Simulated data comparison to R

        np.random.seed(6241)
        n = 1600
        exog = np.random.normal(size=(n, 2))
        groups = np.kron(np.arange(n / 16), np.ones(16))

        # Build up the random error vector
        errors = 0

        # The random effects
        exog_re = np.random.normal(size=(n, 2))
        slopes = np.random.normal(size=(n // 16, 2))
        slopes = np.kron(slopes, np.ones((16, 1))) * exog_re
        errors += slopes.sum(1)

        # First variance component
        subgroups1 = np.kron(np.arange(n / 4), np.ones(4))
        errors += np.kron(2 * np.random.normal(size=n // 4), np.ones(4))

        # Second variance component
        subgroups2 = np.kron(np.arange(n / 2), np.ones(2))
        errors += np.kron(2 * np.random.normal(size=n // 2), np.ones(2))

        # iid errors
        errors += np.random.normal(size=n)

        endog = exog.sum(1) + errors

        df = pd.DataFrame(index=range(n))
        df["y"] = endog
        df["groups"] = groups
        df["x1"] = exog[:, 0]
        df["x2"] = exog[:, 1]
        df["z1"] = exog_re[:, 0]
        df["z2"] = exog_re[:, 1]
        df["v1"] = subgroups1
        df["v2"] = subgroups2

        # Equivalent model in R:
        # df.to_csv("tst.csv")
        # model = lmer(y ~ x1 + x2 + (0 + z1 + z2 | groups) + (1 | v1) + (1 |
        # v2), df)

        vcf = {"a": "0 + C(v1)", "b": "0 + C(v2)"}
        model1 = MixedLM.from_formula(
            "y ~ x1 + x2",
            groups=groups,
            re_formula="0+z1+z2",
            vc_formula=vcf,
            data=df)
        result1 = model1.fit()

        # Compare to R
        assert_allclose(
            result1.fe_params, [0.16527, 0.99911, 0.96217], rtol=1e-4)
        assert_allclose(
            result1.cov_re, [[1.244, 0.146], [0.146, 1.371]], rtol=1e-3)
        assert_allclose(result1.vcomp, [4.024, 3.997], rtol=1e-3)
        assert_allclose(
            result1.bse.iloc[0:3], [0.12610, 0.03938, 0.03848], rtol=1e-3)

    def test_vcomp_3(self):
        # Test a model with vcomp but no other random effects, using formulas.

        np.random.seed(4279)
        x1 = np.random.normal(size=400)
        groups = np.kron(np.arange(100), np.ones(4))
        slopes = np.random.normal(size=100)
        slopes = np.kron(slopes, np.ones(4)) * x1
        y = slopes + np.random.normal(size=400)
        vc_fml = {"a": "0 + x1"}
        df = pd.DataFrame({"y": y, "x1": x1, "groups": groups})
Loading ...