"""
Tests for weights in GLM, Poisson and OLS/WLS; a continuation of test_glm.py.
Below is a table outlining the test coverage.
================================= ====================== ====== ===================== === ======= ======== ============== ============= ============== ============= ============== ==== =========
Test Compared To params normalized_cov_params bse loglike deviance resid_response resid_pearson resid_deviance resid_working resid_anscombe chi2 optimizer
================================= ====================== ====== ===================== === ======= ======== ============== ============= ============== ============= ============== ==== =========
TestGlmPoissonPlain stata X X X X X X X X X X bfgs
TestGlmPoissonFwNr stata X X X X X X X X X X bfgs
TestGlmPoissonAwNr stata X X X X X X X X X X bfgs
TestGlmPoissonFwHC stata X X X X X
TestGlmPoissonAwHC stata X X X X X
TestGlmPoissonFwClu stata X X X X X
TestGlmTweedieAwNr R X X X X X X X newton
TestGlmGammaAwNr R X X special X X X X X bfgs
TestGlmGaussianAwNr R X X special X X X X X bfgs
TestRepeatedvsAggregated statsmodels.GLM X X bfgs
TestRepeatedvsAverage statsmodels.GLM X X bfgs
TestTweedieRepeatedvsAggregated statsmodels.GLM X X bfgs
TestTweedieRepeatedvsAverage statsmodels.GLM X X bfgs
TestBinomial0RepeatedvsAverage statsmodels.GLM X X
TestBinomial0RepeatedvsDuplicated statsmodels.GLM X X bfgs
TestBinomialVsVarWeights statsmodels.GLM X X X bfgs
TestGlmGaussianWLS statsmodels.WLS X X X bfgs
================================= ====================== ====== ===================== === ======= ======== ============== ============= ============== ============= ============== ==== =========
""" # noqa: E501
import warnings
import numpy as np
from numpy.testing import (assert_raises, assert_allclose)
import pandas as pd
import pytest
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.tools.tools import add_constant
from statsmodels.discrete import discrete_model as discrete
from statsmodels.tools.sm_exceptions import SpecificationWarning
from .results import results_glm_poisson_weights as res_stata
from .results import res_R_var_weight as res_r
# load data into module namespace
from statsmodels.datasets.cpunish import load
cpunish_data = load(as_pandas=False)
cpunish_data.exog[:, 3] = np.log(cpunish_data.exog[:, 3])
cpunish_data.exog = add_constant(cpunish_data.exog, prepend=False)
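
# Illustrative sketch, not part of the original suite: the invariant behind
# the freq_weights tests below is that a fit with frequency weights should
# match a fit on the physically duplicated rows. The helper name is ours;
# it is a minimal demonstration using the cpunish data loaded above.
def _example_freq_weights_match_duplication():
    fweights = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3])
    res_w = GLM(cpunish_data.endog, cpunish_data.exog,
                family=sm.families.Poisson(), freq_weights=fweights).fit()
    # duplicate each observation according to its frequency weight
    endog_dup = np.repeat(cpunish_data.endog, fweights)
    exog_dup = np.repeat(cpunish_data.exog, fweights, axis=0)
    res_dup = GLM(endog_dup, exog_dup, family=sm.families.Poisson()).fit()
    # params, bse, and llf should agree between the two parameterizations
    assert_allclose(res_w.params, res_dup.params, rtol=1e-8)
    assert_allclose(res_w.bse, res_dup.bse, rtol=1e-8)
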
class CheckWeight(object):
def test_basic(self):
res1 = self.res1
res2 = self.res2
assert_allclose(res1.params, res2.params, atol=1e-6, rtol=2e-6)
corr_fact = getattr(self, 'corr_fact', 1)
if hasattr(res2, 'normalized_cov_params'):
assert_allclose(res1.normalized_cov_params,
res2.normalized_cov_params,
atol=1e-8, rtol=2e-6)
if isinstance(self, (TestRepeatedvsAggregated, TestRepeatedvsAverage,
TestTweedieRepeatedvsAggregated,
TestTweedieRepeatedvsAverage,
TestBinomial0RepeatedvsAverage,
TestBinomial0RepeatedvsDuplicated)):
            # Loglikelihood, scale, and deviance differ between repeated vs.
            # exposure/average setups
return None
assert_allclose(res1.bse, corr_fact * res2.bse, atol=1e-6, rtol=2e-6)
if isinstance(self, TestBinomialVsVarWeights):
# Binomial ll and deviance are different for 1d vs. counts...
return None
if isinstance(self, TestGlmGaussianWLS):
# This will not work right now either
return None
if not isinstance(self, (TestGlmGaussianAwNr, TestGlmGammaAwNr)):
# Matching R is hard
assert_allclose(res1.llf, res2.ll, atol=1e-6, rtol=1e-7)
assert_allclose(res1.deviance, res2.deviance, atol=1e-6, rtol=1e-7)
def test_residuals(self):
if isinstance(self, (TestRepeatedvsAggregated, TestRepeatedvsAverage,
TestTweedieRepeatedvsAggregated,
TestTweedieRepeatedvsAverage,
TestBinomial0RepeatedvsAverage,
TestBinomial0RepeatedvsDuplicated)):
# This will not match as different number of records
return None
res1 = self.res1
res2 = self.res2
if not hasattr(res2, 'resids'):
            return None  # TODO: use pytest.skip instead
resid_all = dict(zip(res2.resids_colnames, res2.resids.T))
        assert_allclose(res1.resid_response, resid_all['resid_response'],
                        atol=1e-6, rtol=2e-6)
        assert_allclose(res1.resid_pearson, resid_all['resid_pearson'],
                        atol=1e-6, rtol=2e-6)
        assert_allclose(res1.resid_deviance, resid_all['resid_deviance'],
                        atol=1e-6, rtol=2e-6)
        assert_allclose(res1.resid_working, resid_all['resid_working'],
                        atol=1e-6, rtol=2e-6)
if resid_all.get('resid_anscombe') is None:
return None
# Stata does not use var_weights in anscombe residuals, it seems.
# Adjust residuals to match our approach.
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
resid_a = res1.resid_anscombe
        assert_allclose(resid_a,
                        resid_all['resid_anscombe'] *
                        np.sqrt(res1._var_weights),
                        atol=1e-6, rtol=2e-6)
def test_compare_optimizers(self):
res1 = self.res1
if isinstance(res1.model.family, sm.families.Tweedie):
method = 'newton'
optim_hessian = 'eim'
else:
method = 'bfgs'
optim_hessian = 'oim'
if isinstance(self, (TestGlmPoissonFwHC, TestGlmPoissonAwHC,
TestGlmPoissonFwClu,
TestBinomial0RepeatedvsAverage)):
return None
start_params = res1.params
res2 = self.res1.model.fit(start_params=start_params, method=method,
optim_hessian=optim_hessian)
assert_allclose(res1.params, res2.params, atol=1e-3, rtol=2e-3)
H = res2.model.hessian(res2.params, observed=False)
res2_bse = np.sqrt(-np.diag(np.linalg.inv(H)))
assert_allclose(res1.bse, res2_bse, atol=1e-3, rtol=1e-3)
def test_pearson_chi2(self):
if hasattr(self.res2, 'chi2'):
assert_allclose(self.res1.pearson_chi2, self.res2.deviance_p,
atol=1e-6, rtol=1e-6)
class TestGlmPoissonPlain(CheckWeight):
@classmethod
def setup_class(cls):
cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson()).fit()
# compare with discrete, start close to save time
modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
cls.res2 = res_stata.results_poisson_none_nonrobust
class TestGlmPoissonFwNr(CheckWeight):
@classmethod
def setup_class(cls):
fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
fweights = np.array(fweights)
cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(), freq_weights=fweights).fit()
# compare with discrete, start close to save time
modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
cls.res2 = res_stata.results_poisson_fweight_nonrobust
class TestGlmPoissonAwNr(CheckWeight):
@classmethod
def setup_class(cls):
fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
# faking aweights by using normalized freq_weights
fweights = np.array(fweights)
wsum = fweights.sum()
nobs = len(cpunish_data.endog)
aweights = fweights / wsum * nobs
cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(), var_weights=aweights).fit()
# compare with discrete, start close to save time
modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
# Need to copy to avoid inplace adjustment
from copy import copy
cls.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
cls.res2.resids = cls.res2.resids.copy()
# Need to adjust resids for pearson and deviance to add weights
cls.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])
# prob_weights fail with HC, not properly implemented yet
class TestGlmPoissonPwNr(CheckWeight):
@classmethod
def setup_class(cls):
fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
# faking aweights by using normalized freq_weights
fweights = np.array(fweights)
wsum = fweights.sum()
nobs = len(cpunish_data.endog)
aweights = fweights / wsum * nobs
        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                       family=sm.families.Poisson(),
                       freq_weights=fweights).fit(cov_type='HC1')
# compare with discrete, start close to save time
#modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
cls.res2 = res_stata.results_poisson_pweight_nonrobust
# TODO: find more informative reasons why these fail
@pytest.mark.xfail(reason='Known to fail', strict=True)
def test_basic(self):
super(TestGlmPoissonPwNr, self).test_basic()
@pytest.mark.xfail(reason='Known to fail', strict=True)
def test_compare_optimizers(self):
super(TestGlmPoissonPwNr, self).test_compare_optimizers()
class TestGlmPoissonFwHC(CheckWeight):
@classmethod
def setup_class(cls):
fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
# faking aweights by using normalized freq_weights
fweights = np.array(fweights)
wsum = fweights.sum()
nobs = len(cpunish_data.endog)
aweights = fweights / wsum * nobs
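        # Note (assumption): Stata applies an n / (n - 1) small-sample factor
        # to the robust variance with n = sum of fweights, so test_basic uses
        # corr_fact to scale Stata's bse down for the comparison against HC0.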
cls.corr_fact = np.sqrt((wsum - 1.) / wsum)
mod = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(),
freq_weights=fweights)
        cls.res1 = mod.fit(cov_type='HC0')  # cov_kwds={'use_correction': False}
# compare with discrete, start close to save time
#modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
cls.res2 = res_stata.results_poisson_fweight_hc1
# var_weights (aweights) fail with HC, not properly implemented yet
class TestGlmPoissonAwHC(CheckWeight):
@classmethod
def setup_class(cls):
fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
# faking aweights by using normalized freq_weights
fweights = np.array(fweights)
wsum = fweights.sum()
nobs = len(cpunish_data.endog)
aweights = fweights / wsum * nobs
        # This is really close when corr_fact = (wsum - 1.) / wsum, but to
        # avoid having to loosen the precision of assert_allclose, the factor
        # is adjusted manually. It's *possible* that lowering the IRLS
        # convergence criterion in Stata and here would make this less sketchy.
cls.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609
mod = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(),
var_weights=aweights)
        cls.res1 = mod.fit(cov_type='HC0')  # cov_kwds={'use_correction': False}
# compare with discrete, start close to save time
# modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
cls.res2 = res_stata.results_poisson_aweight_hc1
class TestGlmPoissonFwClu(CheckWeight):
@classmethod
def setup_class(cls):
fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
# faking aweights by using normalized freq_weights
fweights = np.array(fweights)
wsum = fweights.sum()
nobs = len(cpunish_data.endog)
aweights = fweights / wsum * nobs
gid = np.arange(1, 17 + 1) // 2
n_groups = len(np.unique(gid))
        # no wnobs yet in sandwich covariance calculation
        cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))
        # (rather than np.sqrt((wsum - 1.) / wsum))
cov_kwds = {'groups': gid, 'use_correction':False}
with pytest.warns(None):
mod = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(),
freq_weights=fweights)
cls.res1 = mod.fit(cov_type='cluster', cov_kwds=cov_kwds)
# compare with discrete, start close to save time
#modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
cls.res2 = res_stata.results_poisson_fweight_clu1
class TestGlmTweedieAwNr(CheckWeight):
@classmethod
def setup_class(cls):
import statsmodels.formula.api as smf
data = sm.datasets.fair.load_pandas()
endog = data.endog
data = data.exog
data['fair'] = endog
aweights = np.repeat(1, len(data.index))
aweights[::5] = 5
aweights[::13] = 3
model = smf.glm(
'fair ~ age + yrs_married',
data=data,
family=sm.families.Tweedie(
var_power=1.55,
link=sm.families.links.log()
),
var_weights=aweights
)
cls.res1 = model.fit(rtol=1e-25, atol=0)
cls.res2 = res_r.results_tweedie_aweights_nonrobust
class TestGlmGammaAwNr(CheckWeight):
@classmethod
def setup_class(cls):
from .results.results_glm import CancerLog
res2 = CancerLog()
endog = res2.endog
exog = res2.exog[:, :-1]
exog = sm.add_constant(exog, prepend=True)
aweights = np.repeat(1, len(endog))
aweights[::5] = 5
aweights[::13] = 3
model = sm.GLM(endog, exog,
family=sm.families.Gamma(link=sm.families.links.log()),
var_weights=aweights)
cls.res1 = model.fit(rtol=1e-25, atol=0)
cls.res2 = res_r.results_gamma_aweights_nonrobust
def test_r_llf(self):
scale = self.res1.deviance / self.res1._iweights.sum()
ll = self.res1.family.loglike(self.res1.model.endog,
self.res1.mu,
freq_weights=self.res1._var_weights,
scale=scale)
assert_allclose(ll, self.res2.ll, atol=1e-6, rtol=1e-7)
class TestGlmGaussianAwNr(CheckWeight):
@classmethod
def setup_class(cls):
import statsmodels.formula.api as smf
data = sm.datasets.cpunish.load_pandas()
endog = data.endog
data = data.exog
data['EXECUTIONS'] = endog
data['INCOME'] /= 1000
aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2,
1])
model = smf.glm(
'EXECUTIONS ~ INCOME + SOUTH - 1',
data=data,
family=sm.families.Gaussian(link=sm.families.links.log()),
var_weights=aweights
)
cls.res1 = model.fit(rtol=1e-25, atol=0)
cls.res2 = res_r.results_gaussian_aweights_nonrobust
def test_r_llf(self):
res1 = self.res1
res2 = self.res2
model = self.res1.model
# Need to make a few adjustments...
# First, calculate scale using nobs as denominator
scale = res1.scale * model.df_resid / model.wnobs
# Calculate llf using adj scale and wts = freq_weights
wts = model.freq_weights
llf = model.family.loglike(model.endog, res1.mu,
freq_weights=wts,
scale=scale)
        # SM uses (essentially) Stata's loglike formula: the first term is
        # (endog - mu) ** 2 / scale
        adj_sm = -1 / 2 * ((model.endog - res1.mu) ** 2).sum() / scale
        # R has these 2 terms that Stata/SM do not
        adj_r = -model.wnobs / 2 + np.sum(np.log(model.var_weights)) / 2
llf_adj = llf - adj_sm + adj_r
assert_allclose(llf_adj, res2.ll, atol=1e-6, rtol=1e-7)
def gen_endog(lin_pred, family_class, link, binom_version=0):
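    """Generate a synthetic endog draw for the given family and link.

    For Binomial, binom_version=0 yields 0/1 outcomes, while binom_version=1
    yields (successes, failures) count pairs from n=10 trials per row.
    """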
np.random.seed(872)
fam = sm.families
mu = link().inverse(lin_pred)
if family_class == fam.Binomial:
if binom_version == 0:
endog = 1*(np.random.uniform(size=len(lin_pred)) < mu)
else:
endog = np.empty((len(lin_pred), 2))
n = 10
            endog[:, 0] = (np.random.uniform(size=(len(lin_pred), n)) <
                           mu[:, None]).sum(1)
endog[:, 1] = n - endog[:, 0]
elif family_class == fam.Poisson:
endog = np.random.poisson(mu)
elif family_class == fam.Gamma:
endog = np.random.gamma(2, mu)
elif family_class == fam.Gaussian:
endog = mu + np.random.normal(size=len(lin_pred))
elif family_class == fam.NegativeBinomial:
from scipy.stats.distributions import nbinom
endog = nbinom.rvs(mu, 0.5)
elif family_class == fam.InverseGaussian:
from scipy.stats.distributions import invgauss
endog = invgauss.rvs(mu)
elif family_class == fam.Tweedie:
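        # crude stand-in for a compound Poisson-gamma (Tweedie) draw: a
        # Poisson count times an independent gamma variate, giving mean mu
        # and positive mass at zero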
rate = 1
shape = 1.0
scale = mu / (rate * shape)
endog = (np.random.poisson(rate, size=scale.shape[0]) *
np.random.gamma(shape * scale))
else:
raise ValueError
return endog
def test_wtd_gradient_irls():
# Compare the results when using gradient optimization and IRLS.
# TODO: Find working examples for inverse_squared link
np.random.seed(87342)
fam = sm.families
lnk = sm.families.links
families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog, lnk.log,
lnk.cauchy]),
(fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]),
(fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]),
(fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]),
(fam.InverseGaussian, [lnk.log, lnk.identity,
lnk.inverse_power,
lnk.inverse_squared]),
(fam.NegativeBinomial, [lnk.log, lnk.inverse_power,
lnk.inverse_squared, lnk.identity])]
n = 100
p = 3
exog = np.random.normal(size=(n, p))
exog[:, 0] = 1
skip_one = False
for family_class, family_links in families:
for link in family_links:
for binom_version in 0, 1:
method = 'bfgs'
if family_class != fam.Binomial and binom_version == 1:
continue
elif family_class == fam.Binomial and link == lnk.cloglog:
                    # Cannot get gradient to converge with var_weights here
continue
elif family_class == fam.Binomial and link == lnk.log:
                    # Cannot get gradient to converge with var_weights here
continue
elif (family_class, link) == (fam.Poisson, lnk.identity):
lin_pred = 20 + exog.sum(1)
elif (family_class, link) == (fam.Binomial, lnk.log):
lin_pred = -1 + exog.sum(1) / 8
elif (family_class, link) == (fam.Poisson, lnk.sqrt):
lin_pred = -2 + exog.sum(1)
elif (family_class, link) == (fam.Gamma, lnk.log):
# Cannot get gradient to converge with var_weights here
continue
elif (family_class, link) == (fam.Gamma, lnk.identity):
                    # Cannot get gradient to converge with var_weights here
continue
elif (family_class, link) == (fam.Gamma, lnk.inverse_power):
                    # Cannot get gradient to converge with var_weights here
continue
elif (family_class, link) == (fam.Gaussian, lnk.log):
                    # Cannot get gradient to converge with var_weights here
continue
elif (family_class, link) == (fam.Gaussian, lnk.inverse_power):
                    # Cannot get gradient to converge with var_weights here
continue
elif (family_class, link) == (fam.InverseGaussian, lnk.log):
                    # Cannot get gradient to converge with var_weights here
lin_pred = -1 + exog.sum(1)
continue
elif (family_class, link) == (fam.InverseGaussian,
lnk.identity):
                    # Cannot get gradient to converge with var_weights here
lin_pred = 20 + 5*exog.sum(1)
lin_pred = np.clip(lin_pred, 1e-4, np.inf)
continue
elif (family_class, link) == (fam.InverseGaussian,
lnk.inverse_squared):
lin_pred = 0.5 + exog.sum(1) / 5
continue # skip due to non-convergence
elif (family_class, link) == (fam.InverseGaussian,
lnk.inverse_power):
lin_pred = 1 + exog.sum(1) / 5
method = 'newton'
elif (family_class, link) == (fam.NegativeBinomial,
lnk.identity):
lin_pred = 20 + 5*exog.sum(1)
lin_pred = np.clip(lin_pred, 1e-3, np.inf)
method = 'newton'
elif (family_class, link) == (fam.NegativeBinomial,
lnk.inverse_squared):
lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
continue # skip due to non-convergence
elif (family_class, link) == (fam.NegativeBinomial,
lnk.inverse_power):
                    # Cannot get gradient to converge with var_weights here
lin_pred = 1 + exog.sum(1) / 5
continue
elif (family_class, link) == (fam.Gaussian, lnk.inverse_power):
# adding skip because of convergence failure
skip_one = True
else:
lin_pred = np.random.uniform(size=exog.shape[0])
endog = gen_endog(lin_pred, family_class, link, binom_version)
if binom_version == 0:
wts = np.ones_like(endog)
tmp = np.random.randint(
2,
5,
size=(endog > endog.mean()).sum()
)
wts[endog > endog.mean()] = tmp
else:
wts = np.ones(shape=endog.shape[0])
y = endog[:, 0] / endog.sum(axis=1)
tmp = np.random.gamma(2, size=(y > y.mean()).sum())
wts[y > y.mean()] = tmp
with warnings.catch_warnings():
warnings.simplefilter("ignore")
mod_irls = sm.GLM(endog, exog, var_weights=wts,
family=family_class(link=link()))
rslt_irls = mod_irls.fit(method="IRLS", atol=1e-10,
tol_criterion='params')
# Try with and without starting values.
for max_start_irls, start_params in ((0, rslt_irls.params),
(3, None)):
# TODO: skip convergence failures for now
if max_start_irls > 0 and skip_one:
continue
with warnings.catch_warnings():
warnings.simplefilter("ignore")
mod_gradient = sm.GLM(endog, exog, var_weights=wts,
family=family_class(link=link()))
rslt_gradient = mod_gradient.fit(
max_start_irls=max_start_irls,
start_params=start_params,
method=method
)
assert_allclose(rslt_gradient.params,
rslt_irls.params, rtol=1e-6, atol=5e-5)
assert_allclose(rslt_gradient.llf, rslt_irls.llf,
rtol=1e-6, atol=1e-6)
assert_allclose(rslt_gradient.scale, rslt_irls.scale,
rtol=1e-6, atol=1e-6)
                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
assert_allclose(gradient_bse, rslt_irls.bse, rtol=1e-6,
atol=5e-5)
def get_dummies(x):
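    """Return a dummy matrix for x, dropping the first (reference) level."""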
values = np.sort(np.unique(x))
out = np.zeros(shape=(x.shape[0], len(values) - 1))
for i, v in enumerate(values):
if i == 0:
continue
out[:, i - 1] = np.where(v == x, 1, 0)
return out
class TestRepeatedvsAggregated(CheckWeight):
@classmethod
def setup_class(cls):
np.random.seed(4321)
n = 100
p = 5
exog = np.empty((n, p))
exog[:, 0] = 1
exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
exog[:, 2:] = get_dummies(x)
beta = np.array([-1, 0.1, -0.05, .2, 0.35])
lin_pred = (exog * beta).sum(axis=1)
family = sm.families.Poisson
link = sm.families.links.log
endog = gen_endog(lin_pred, family, link)
mod1 = sm.GLM(endog, exog, family=family(link=link()))
cls.res1 = mod1.fit()
agg = pd.DataFrame(exog)
agg['endog'] = endog
agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
agg_exog = np.array(agg_endog.index.tolist())
agg_wt = agg_wt['endog']
agg_endog = agg_endog['endog']
mod2 = sm.GLM(agg_endog, agg_exog, family=family(link=link()),
exposure=agg_wt)
cls.res2 = mod2.fit()
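
# The classes below exercise a standard equivalence: for groups of identical
# covariate rows, modeling the group *average* response with var_weights
# equal to the group size yields the same params as modeling each record
# separately (loglike, scale, and deviance differ, which test_basic skips).
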
class TestRepeatedvsAverage(CheckWeight):
@classmethod
def setup_class(cls):
np.random.seed(4321)
n = 10000
p = 5
exog = np.empty((n, p))
exog[:, 0] = 1
exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
exog[:, 2:] = get_dummies(x)
beta = np.array([-1, 0.1, -0.05, .2, 0.35])
lin_pred = (exog * beta).sum(axis=1)
family = sm.families.Poisson
link = sm.families.links.log
endog = gen_endog(lin_pred, family, link)
mod1 = sm.GLM(endog, exog, family=family(link=link()))
cls.res1 = mod1.fit()
agg = pd.DataFrame(exog)
agg['endog'] = endog
agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
agg_exog = np.array(agg_endog.index.tolist())
agg_wt = agg_wt['endog']
avg_endog = agg_endog['endog'] / agg_wt
mod2 = sm.GLM(avg_endog, agg_exog, family=family(link=link()),
var_weights=agg_wt)
cls.res2 = mod2.fit()
class TestTweedieRepeatedvsAggregated(CheckWeight):
@classmethod
def setup_class(cls):
np.random.seed(4321)
n = 10000
p = 5
exog = np.empty((n, p))
exog[:, 0] = 1
exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
exog[:, 2:] = get_dummies(x)
beta = np.array([7, 0.1, -0.05, .2, 0.35])
lin_pred = (exog * beta).sum(axis=1)
family = sm.families.Tweedie
link = sm.families.links.log
endog = gen_endog(lin_pred, family, link)
mod1 = sm.GLM(endog, exog, family=family(link=link(), var_power=1.5))
cls.res1 = mod1.fit(rtol=1e-20, atol=0, tol_criterion='params')
agg = pd.DataFrame(exog)
agg['endog'] = endog
agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
agg_exog = np.array(agg_endog.index.tolist())
agg_wt = agg_wt['endog']
agg_endog = agg_endog['endog']
mod2 = sm.GLM(agg_endog, agg_exog,
family=family(link=link(), var_power=1.5),
exposure=agg_wt, var_weights=agg_wt ** 0.5)
cls.res2 = mod2.fit(rtol=1e-20, atol=0, tol_criterion='params')
class TestTweedieRepeatedvsAverage(CheckWeight):
@classmethod
def setup_class(cls):
np.random.seed(4321)
n = 1000
p = 5
exog = np.empty((n, p))
exog[:, 0] = 1
exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
exog[:, 2:] = get_dummies(x)
beta = np.array([7, 0.1, -0.05, .2, 0.35])
lin_pred = (exog * beta).sum(axis=1)
family = sm.families.Tweedie
link = sm.families.links.log
endog = gen_endog(lin_pred, family, link)
mod1 = sm.GLM(endog, exog, family=family(link=link(), var_power=1.5))
cls.res1 = mod1.fit(rtol=1e-10, atol=0, tol_criterion='params',
scaletype='x2')
agg = pd.DataFrame(exog)
agg['endog'] = endog
agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
agg_exog = np.array(agg_endog.index.tolist())
agg_wt = agg_wt['endog']
avg_endog = agg_endog['endog'] / agg_wt
mod2 = sm.GLM(avg_endog, agg_exog,
family=family(link=link(), var_power=1.5),
var_weights=agg_wt)
cls.res2 = mod2.fit(rtol=1e-10, atol=0, tol_criterion='params')
class TestBinomial0RepeatedvsAverage(CheckWeight):
@classmethod
def setup_class(cls):
np.random.seed(4321)
n = 20
p = 5
exog = np.empty((n, p))
exog[:, 0] = 1
exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
exog[:, 2:] = get_dummies(x)
beta = np.array([-1, 0.1, -0.05, .2, 0.35])
lin_pred = (exog * beta).sum(axis=1)
family = sm.families.Binomial
link = sm.families.links.logit
endog = gen_endog(lin_pred, family, link, binom_version=0)
mod1 = sm.GLM(endog, exog, family=family(link=link()))
cls.res1 = mod1.fit(rtol=1e-10, atol=0, tol_criterion='params',
scaletype='x2')
agg = pd.DataFrame(exog)
agg['endog'] = endog
agg_endog = agg.groupby([0, 1, 2, 3, 4]).sum()[['endog']]
agg_wt = agg.groupby([0, 1, 2, 3, 4]).count()[['endog']]
agg_exog = np.array(agg_endog.index.tolist())
agg_wt = agg_wt['endog']
avg_endog = agg_endog['endog'] / agg_wt
mod2 = sm.GLM(avg_endog, agg_exog,
family=family(link=link()),
var_weights=agg_wt)
cls.res2 = mod2.fit(rtol=1e-10, atol=0, tol_criterion='params')
class TestBinomial0RepeatedvsDuplicated(CheckWeight):
@classmethod
def setup_class(cls):
np.random.seed(4321)
n = 10000
p = 5
exog = np.empty((n, p))
exog[:, 0] = 1
exog[:, 1] = np.random.randint(low=-5, high=5, size=n)
        x = np.repeat(np.array([1, 2, 3, 4]), n // 4)
exog[:, 2:] = get_dummies(x)
beta = np.array([-1, 0.1, -0.05, .2, 0.35])
lin_pred = (exog * beta).sum(axis=1)
family = sm.families.Binomial
link = sm.families.links.logit
endog = gen_endog(lin_pred, family, link, binom_version=0)
wt = np.random.randint(1, 5, n)
mod1 = sm.GLM(endog, exog, family=family(link=link()), freq_weights=wt)
cls.res1 = mod1.fit()
exog_dup = np.repeat(exog, wt, axis=0)
endog_dup = np.repeat(endog, wt)
mod2 = sm.GLM(endog_dup, exog_dup, family=family(link=link()))
cls.res2 = mod2.fit()
def test_warnings_raised():
weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
# faking aweights by using normalized freq_weights
weights = np.array(weights)
gid = np.arange(1, 17 + 1) // 2
cov_kwds = {'groups': gid, 'use_correction': False}
with pytest.warns(SpecificationWarning):
res1 = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(), freq_weights=weights
).fit(cov_type='cluster', cov_kwds=cov_kwds)
res1.summary()
with pytest.warns(SpecificationWarning):
res1 = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(), var_weights=weights
).fit(cov_type='cluster', cov_kwds=cov_kwds)
res1.summary()
weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
@pytest.mark.parametrize('formatted',
                         [weights, np.asarray(weights), pd.Series(weights)],
                         ids=['list', 'ndarray', 'Series'])
def test_weights_different_formats(formatted):
check_weights_as_formats(formatted)
def check_weights_as_formats(weights):
res = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(), freq_weights=weights
).fit()
assert isinstance(res._freq_weights, np.ndarray)
assert isinstance(res._var_weights, np.ndarray)
assert isinstance(res._iweights, np.ndarray)
res = GLM(cpunish_data.endog, cpunish_data.exog,
family=sm.families.Poisson(), var_weights=weights
).fit()
assert isinstance(res._freq_weights, np.ndarray)
assert isinstance(res._var_weights, np.ndarray)
assert isinstance(res._iweights, np.ndarray)
class TestBinomialVsVarWeights(CheckWeight):
@classmethod
def setup_class(cls):
from statsmodels.datasets.star98 import load
data = load(as_pandas=False)
data.exog /= data.exog.std(0)
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = GLM(data.endog, data.exog,
family=sm.families.Binomial()).fit()
weights = data.endog.sum(axis=1)
endog2 = data.endog[:, 0] / weights
cls.res2 = GLM(endog2, data.exog,
family=sm.families.Binomial(),
var_weights=weights).fit()
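
# A Gaussian GLM with identity link and var_weights solves the same weighted
# least squares problem as WLS with those weights, so params, cov_params, and
# bse should agree; loglike and deviance are defined differently and are
# skipped in test_basic.
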
class TestGlmGaussianWLS(CheckWeight):
@classmethod
def setup_class(cls):
import statsmodels.formula.api as smf
data = sm.datasets.cpunish.load_pandas()
endog = data.endog
data = data.exog
data['EXECUTIONS'] = endog
data['INCOME'] /= 1000
aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2,
1])
model = smf.glm(
'EXECUTIONS ~ INCOME + SOUTH - 1',
data=data,
family=sm.families.Gaussian(link=sm.families.links.identity()),
var_weights=aweights
)
wlsmodel = smf.wls(
'EXECUTIONS ~ INCOME + SOUTH - 1',
data=data,
weights=aweights)
cls.res1 = model.fit(rtol=1e-25, atol=1e-25)
cls.res2 = wlsmodel.fit()
def test_incompatible_input():
weights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
exog = cpunish_data.exog
endog = cpunish_data.endog
family = sm.families.Poisson()
# Too short
assert_raises(ValueError, GLM, endog, exog, family=family,
freq_weights=weights[:-1])
assert_raises(ValueError, GLM, endog, exog, family=family,
var_weights=weights[:-1])
# Too long
assert_raises(ValueError, GLM, endog, exog, family=family,
freq_weights=weights + [3])
assert_raises(ValueError, GLM, endog, exog, family=family,
var_weights=weights + [3])
# Too many dimensions
assert_raises(ValueError, GLM, endog, exog, family=family,
freq_weights=[weights, weights])
assert_raises(ValueError, GLM, endog, exog, family=family,
var_weights=[weights, weights])
def test_poisson_residuals():
nobs, k_exog = 100, 5
np.random.seed(987125)
x = np.random.randn(nobs, k_exog - 1)
x = add_constant(x)
y_true = x.sum(1) / 2
y = y_true + 2 * np.random.randn(nobs)
exposure = 1 + np.arange(nobs) // 4
yp = np.random.poisson(np.exp(y_true) * exposure)
yp[10:15] += 10
fam = sm.families.Poisson()
mod_poi_e = GLM(yp, x, family=fam, exposure=exposure)
res_poi_e = mod_poi_e.fit()
mod_poi_w = GLM(yp / exposure, x, family=fam, var_weights=exposure)
res_poi_w = mod_poi_w.fit()
assert_allclose(res_poi_e.resid_response / exposure,
res_poi_w.resid_response)
assert_allclose(res_poi_e.resid_pearson, res_poi_w.resid_pearson)
assert_allclose(res_poi_e.resid_deviance, res_poi_w.resid_deviance)
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
assert_allclose(res_poi_e.resid_anscombe, res_poi_w.resid_anscombe)
        assert_allclose(res_poi_e.resid_anscombe_unscaled,
                        res_poi_w.resid_anscombe_unscaled)