Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ genmod / tests / test_glm_weights.py

"""
Test for weights in GLM, Poisson and OLS/WLS, continuous test_glm.py


Below is a table outlining the test coverage.

================================= ====================== ====== ===================== === ======= ======== ============== ============= ============== ============= ============== ==== =========
Test                              Compared To            params normalized_cov_params bse loglike deviance resid_response resid_pearson resid_deviance resid_working resid_anscombe chi2 optimizer
================================= ====================== ====== ===================== === ======= ======== ============== ============= ============== ============= ============== ==== =========
TestGlmPoissonPlain               stata                  X                            X   X       X        X              X             X              X             X              X    bfgs
TestGlmPoissonFwNr                stata                  X                            X   X       X        X              X             X              X             X              X    bfgs
TestGlmPoissonAwNr                stata                  X                            X   X       X        X              X             X              X             X              X    bfgs
TestGlmPoissonFwHC                stata                  X                            X   X       X                                                                                 X
TestGlmPoissonAwHC                stata                  X                            X   X       X                                                                                 X
TestGlmPoissonFwClu               stata                  X                            X   X       X                                                                                 X
TestGlmTweedieAwNr                R                      X                            X           X        X              X             X              X                                 newton
TestGlmGammaAwNr                  R                      X                            X   special X        X              X             X              X                                 bfgs
TestGlmGaussianAwNr               R                      X                            X   special X        X              X             X              X                                 bfgs
TestRepeatedvsAggregated          statsmodels.GLM        X      X                                                                                                                        bfgs
TestRepeatedvsAverage             statsmodels.GLM        X      X                                                                                                                        bfgs
TestTweedieRepeatedvsAggregated   statsmodels.GLM        X      X                                                                                                                        bfgs
TestTweedieRepeatedvsAverage      statsmodels.GLM        X      X                                                                                                                        bfgs
TestBinomial0RepeatedvsAverage    statsmodels.GLM        X      X
TestBinomial0RepeatedvsDuplicated statsmodels.GLM        X      X                                                                                                                        bfgs
TestBinomialVsVarWeights          statsmodels.GLM        X      X                     X                                                                                                  bfgs
TestGlmGaussianWLS                statsmodels.WLS        X      X                     X                                                                                                  bfgs
================================= ====================== ====== ===================== === ======= ======== ============== ============= ============== ============= ============== ==== =========
"""  # noqa: E501
import warnings

import numpy as np
from numpy.testing import (assert_raises, assert_allclose)
import pandas as pd
import pytest

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.tools.tools import add_constant
from statsmodels.discrete import discrete_model as discrete
from statsmodels.tools.sm_exceptions import SpecificationWarning

from .results import results_glm_poisson_weights as res_stata
from .results import res_R_var_weight as res_r

# load data into module namespace
from statsmodels.datasets.cpunish import load

cpunish_data = load(as_pandas=False)
cpunish_data.exog[:, 3] = np.log(cpunish_data.exog[:, 3])
cpunish_data.exog = add_constant(cpunish_data.exog, prepend=False)


class CheckWeight(object):
    def test_basic(self):
        res1 = self.res1
        res2 = self.res2

        assert_allclose(res1.params, res2.params, atol=1e-6, rtol=2e-6)
        corr_fact = getattr(self, 'corr_fact', 1)
        if hasattr(res2, 'normalized_cov_params'):
            assert_allclose(res1.normalized_cov_params,
                            res2.normalized_cov_params,
                            atol=1e-8, rtol=2e-6)
        if isinstance(self, (TestRepeatedvsAggregated, TestRepeatedvsAverage,
                             TestTweedieRepeatedvsAggregated,
                             TestTweedieRepeatedvsAverage,
                             TestBinomial0RepeatedvsAverage,
                             TestBinomial0RepeatedvsDuplicated)):
            # Loglikelihood, scale, deviance is different between repeated vs.
            # exposure/average
            return None
        assert_allclose(res1.bse, corr_fact * res2.bse, atol=1e-6, rtol=2e-6)
        if isinstance(self, TestBinomialVsVarWeights):
            # Binomial ll and deviance are different for 1d vs. counts...
            return None
        if isinstance(self, TestGlmGaussianWLS):
            # This will not work right now either
            return None
        if not isinstance(self, (TestGlmGaussianAwNr, TestGlmGammaAwNr)):
            # Matching R is hard
            assert_allclose(res1.llf, res2.ll, atol=1e-6, rtol=1e-7)
        assert_allclose(res1.deviance, res2.deviance, atol=1e-6, rtol=1e-7)

    def test_residuals(self):
        if isinstance(self, (TestRepeatedvsAggregated, TestRepeatedvsAverage,
                             TestTweedieRepeatedvsAggregated,
                             TestTweedieRepeatedvsAverage,
                             TestBinomial0RepeatedvsAverage,
                             TestBinomial0RepeatedvsDuplicated)):
            # This will not match as different number of records
            return None
        res1 = self.res1
        res2 = self.res2
        if not hasattr(res2, 'resids'):
            return None  # use SkipError instead
        resid_all = dict(zip(res2.resids_colnames, res2.resids.T))

        assert_allclose(res1.resid_response, resid_all['resid_response'], atol= 1e-6, rtol=2e-6)
        assert_allclose(res1.resid_pearson, resid_all['resid_pearson'], atol= 1e-6, rtol=2e-6)
        assert_allclose(res1.resid_deviance, resid_all['resid_deviance'], atol= 1e-6, rtol=2e-6)
        assert_allclose(res1.resid_working, resid_all['resid_working'], atol= 1e-6, rtol=2e-6)
        if resid_all.get('resid_anscombe') is None:
            return None
        # Stata does not use var_weights in anscombe residuals, it seems.
        # Adjust residuals to match our approach.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            resid_a = res1.resid_anscombe

        assert_allclose(resid_a, resid_all['resid_anscombe'] * np.sqrt(res1._var_weights), atol= 1e-6, rtol=2e-6)

    def test_compare_optimizers(self):
        res1 = self.res1
        if isinstance(res1.model.family, sm.families.Tweedie):
            method = 'newton'
            optim_hessian = 'eim'
        else:
            method = 'bfgs'
            optim_hessian = 'oim'
        if isinstance(self, (TestGlmPoissonFwHC, TestGlmPoissonAwHC,
                             TestGlmPoissonFwClu,
                             TestBinomial0RepeatedvsAverage)):
            return None

        start_params = res1.params
        res2 = self.res1.model.fit(start_params=start_params, method=method,
                                   optim_hessian=optim_hessian)
        assert_allclose(res1.params, res2.params, atol=1e-3, rtol=2e-3)
        H = res2.model.hessian(res2.params, observed=False)
        res2_bse = np.sqrt(-np.diag(np.linalg.inv(H)))
        assert_allclose(res1.bse, res2_bse, atol=1e-3, rtol=1e-3)

    def test_pearson_chi2(self):
        if hasattr(self.res2, 'chi2'):
            assert_allclose(self.res1.pearson_chi2, self.res2.deviance_p,
                            atol=1e-6, rtol=1e-6)


class TestGlmPoissonPlain(CheckWeight):
    @classmethod
    def setup_class(cls):
        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                       family=sm.families.Poisson()).fit()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_none_nonrobust


class TestGlmPoissonFwNr(CheckWeight):
    @classmethod
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        fweights = np.array(fweights)

        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                    family=sm.families.Poisson(), freq_weights=fweights).fit()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_fweight_nonrobust


class TestGlmPoissonAwNr(CheckWeight):
    @classmethod
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                    family=sm.families.Poisson(), var_weights=aweights).fit()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)

        # Need to copy to avoid inplace adjustment
        from copy import copy
        cls.res2 = copy(res_stata.results_poisson_aweight_nonrobust)
        cls.res2.resids = cls.res2.resids.copy()

        # Need to adjust resids for pearson and deviance to add weights
        cls.res2.resids[:, 3:5] *= np.sqrt(aweights[:, np.newaxis])


# prob_weights fail with HC, not properly implemented yet
class TestGlmPoissonPwNr(CheckWeight):
    @classmethod
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        cls.res1 = GLM(cpunish_data.endog, cpunish_data.exog,
                    family=sm.families.Poisson(), freq_weights=fweights).fit(cov_type='HC1')
        # compare with discrete, start close to save time
        #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_pweight_nonrobust

    # TODO: find more informative reasons why these fail
    @pytest.mark.xfail(reason='Known to fail', strict=True)
    def test_basic(self):
        super(TestGlmPoissonPwNr, self).test_basic()

    @pytest.mark.xfail(reason='Known to fail', strict=True)
    def test_compare_optimizers(self):
        super(TestGlmPoissonPwNr, self).test_compare_optimizers()


class TestGlmPoissonFwHC(CheckWeight):
    @classmethod
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs
        cls.corr_fact = np.sqrt((wsum - 1.) / wsum)

        mod = GLM(cpunish_data.endog, cpunish_data.exog,
                  family=sm.families.Poisson(),
                  freq_weights=fweights)
        cls.res1 = mod.fit(cov_type='HC0') #, cov_kwds={'use_correction':False})
        # compare with discrete, start close to save time
        #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_fweight_hc1


# var_weights (aweights fail with HC, not properly implemented yet
class TestGlmPoissonAwHC(CheckWeight):
    @classmethod
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        # This is really close when corr_fact = (wsum - 1.) / wsum, but to
        # avoid having loosen precision of the assert_allclose, I'm doing this
        # manually. Its *possible* lowering the IRLS convergence criterion
        # in stata and here will make this less sketchy.
        cls.corr_fact = np.sqrt((wsum - 1.) / wsum) * 0.98518473599905609
        mod = GLM(cpunish_data.endog, cpunish_data.exog,
                  family=sm.families.Poisson(),
                  var_weights=aweights)
        cls.res1 = mod.fit(cov_type='HC0') #, cov_kwds={'use_correction':False})
        # compare with discrete, start close to save time
        # modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_aweight_hc1


class TestGlmPoissonFwClu(CheckWeight):
    @classmethod
    def setup_class(cls):
        fweights = [1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3]
        # faking aweights by using normalized freq_weights
        fweights = np.array(fweights)
        wsum = fweights.sum()
        nobs = len(cpunish_data.endog)
        aweights = fweights / wsum * nobs

        gid = np.arange(1, 17 + 1) // 2
        n_groups = len(np.unique(gid))

        # no wnobs yet in sandwich covariance calcualtion
        cls.corr_fact = 1 / np.sqrt(n_groups / (n_groups - 1))   #np.sqrt((wsum - 1.) / wsum)
        cov_kwds = {'groups': gid, 'use_correction':False}
        with pytest.warns(None):
            mod = GLM(cpunish_data.endog, cpunish_data.exog,
                      family=sm.families.Poisson(),
                      freq_weights=fweights)
            cls.res1 = mod.fit(cov_type='cluster', cov_kwds=cov_kwds)

        # compare with discrete, start close to save time
        #modd = discrete.Poisson(cpunish_data.endog, cpunish_data.exog)
        cls.res2 = res_stata.results_poisson_fweight_clu1


class TestGlmTweedieAwNr(CheckWeight):
    @classmethod
    def setup_class(cls):
        import statsmodels.formula.api as smf

        data = sm.datasets.fair.load_pandas()
        endog = data.endog
        data = data.exog
        data['fair'] = endog
        aweights = np.repeat(1, len(data.index))
        aweights[::5] = 5
        aweights[::13] = 3
        model = smf.glm(
                'fair ~ age + yrs_married',
                data=data,
                family=sm.families.Tweedie(
                    var_power=1.55,
                    link=sm.families.links.log()
                    ),
                var_weights=aweights
        )
        cls.res1 = model.fit(rtol=1e-25, atol=0)
        cls.res2 = res_r.results_tweedie_aweights_nonrobust


class TestGlmGammaAwNr(CheckWeight):
    @classmethod
    def setup_class(cls):
        from .results.results_glm import CancerLog
        res2 = CancerLog()
        endog = res2.endog
        exog = res2.exog[:, :-1]
        exog = sm.add_constant(exog, prepend=True)

        aweights = np.repeat(1, len(endog))
        aweights[::5] = 5
        aweights[::13] = 3
        model = sm.GLM(endog, exog,
                       family=sm.families.Gamma(link=sm.families.links.log()),
                       var_weights=aweights)
        cls.res1 = model.fit(rtol=1e-25, atol=0)
        cls.res2 = res_r.results_gamma_aweights_nonrobust

    def test_r_llf(self):
        scale = self.res1.deviance / self.res1._iweights.sum()
        ll = self.res1.family.loglike(self.res1.model.endog,
                                      self.res1.mu,
                                      freq_weights=self.res1._var_weights,
                                      scale=scale)
        assert_allclose(ll, self.res2.ll, atol=1e-6, rtol=1e-7)


class TestGlmGaussianAwNr(CheckWeight):
    @classmethod
    def setup_class(cls):
        import statsmodels.formula.api as smf

        data = sm.datasets.cpunish.load_pandas()
        endog = data.endog
        data = data.exog
        data['EXECUTIONS'] = endog
Loading ...