Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ nonparametric / tests / test_kernel_regression.py


import pytest
import numpy as np
import numpy.testing as npt

import statsmodels.api as sm
nparam = sm.nonparametric


class KernelRegressionTestBase(object):
    @classmethod
    def setup_class(cls):
        nobs = 60
        np.random.seed(123456)
        cls.o = np.random.binomial(2, 0.7, size=(nobs, 1))
        cls.o2 = np.random.binomial(3, 0.7, size=(nobs, 1))
        cls.c1 = np.random.normal(size=(nobs, 1))
        cls.c2 = np.random.normal(10, 1, size=(nobs, 1))
        cls.c3 = np.random.normal(10, 2, size=(nobs, 1))
        cls.noise = np.random.normal(size=(nobs, 1))
        b0 = 0.3
        b1 = 1.2
        b2 = 3.7  # regression coefficients
        cls.y = b0 + b1 * cls.c1 + b2 * cls.c2 + cls.noise
        cls.y2 = b0 + b1 * cls.c1 + b2 * cls.c2 + cls.o + cls.noise
        # Italy data from R's np package (the first 50 obs) R>> data (Italy)

        cls.Italy_gdp = \
            [8.556, 12.262, 9.587, 8.119, 5.537, 6.796, 8.638,
             6.483, 6.212, 5.111, 6.001, 7.027, 4.616, 3.922,
             4.688, 3.957, 3.159, 3.763, 3.829, 5.242, 6.275,
             8.518, 11.542, 9.348, 8.02, 5.527, 6.865, 8.666,
             6.672, 6.289, 5.286, 6.271, 7.94, 4.72, 4.357,
             4.672, 3.883, 3.065, 3.489, 3.635, 5.443, 6.302,
             9.054, 12.485, 9.896, 8.33, 6.161, 7.055, 8.717,
             6.95]

        cls.Italy_year = \
            [1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951,
             1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1952,
             1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952,
             1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1953, 1953,
             1953, 1953, 1953, 1953, 1953, 1953]

        # OECD panel data from NP  R>> data(oecdpanel)
        cls.growth = \
            [-0.0017584, 0.00740688, 0.03424461, 0.03848719, 0.02932506,
             0.03769199, 0.0466038, 0.00199456, 0.03679607, 0.01917304,
             -0.00221, 0.00787269, 0.03441118, -0.0109228, 0.02043064,
             -0.0307962, 0.02008947, 0.00580313, 0.00344502, 0.04706358,
             0.03585851, 0.01464953, 0.04525762, 0.04109222, -0.0087903,
             0.04087915, 0.04551403, 0.036916, 0.00369293, 0.0718669,
             0.02577732, -0.0130759, -0.01656641, 0.00676429, 0.08833017,
             0.05092105, 0.02005877, 0.00183858, 0.03903173, 0.05832116,
             0.0494571, 0.02078484, 0.09213897, 0.0070534, 0.08677202,
             0.06830603, -0.00041, 0.0002856, 0.03421225, -0.0036825]

        cls.oecd = \
            [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
             0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
             0, 0, 0, 0]

    def write2file(self, file_name, data):  # pragma: no cover
        """Write some data to a csv file.  Only use for debugging!"""
        import csv

        data_file = csv.writer(open(file_name, "w"))
        data = np.column_stack(data)
        nobs = max(np.shape(data))
        K = min(np.shape(data))
        data = np.reshape(data, (nobs,K))
        for i in range(nobs):
            data_file.writerow(list(data[i, :]))


class TestKernelReg(KernelRegressionTestBase):
    def test_ordered_lc_cvls(self):
        model = nparam.KernelReg(endog=[self.Italy_gdp],
                                 exog=[self.Italy_year], reg_type='lc',
                                 var_type='o', bw='cv_ls')
        sm_bw = model.bw
        R_bw = 0.1390096

        sm_mean, sm_mfx = model.fit()
        sm_mean = sm_mean[0:5]
        sm_mfx = sm_mfx[0:5]
        R_mean = 6.190486

        sm_R2 = model.r_squared()
        R_R2 = 0.1435323

        ## CODE TO REPRODUCE IN R
        ## library(np)
        ## data(Italy)
        ## attach(Italy)
        ## bw <- npregbw(formula=gdp[1:50]~ordered(year[1:50]))
        npt.assert_allclose(sm_bw, R_bw, atol=1e-2)
        npt.assert_allclose(sm_mean, R_mean, atol=1e-2)
        npt.assert_allclose(sm_R2, R_R2, atol=1e-2)

    def test_continuousdata_lc_cvls(self):
        model = nparam.KernelReg(endog=[self.y], exog=[self.c1, self.c2],
                                 reg_type='lc', var_type='cc', bw='cv_ls')
        # Bandwidth
        sm_bw = model.bw
        R_bw = [0.6163835, 0.1649656]
        # Conditional Mean
        sm_mean, sm_mfx = model.fit()
        sm_mean = sm_mean[0:5]
        sm_mfx = sm_mfx[0:5]
        R_mean = [31.49157, 37.29536, 43.72332, 40.58997, 36.80711]
        # R-Squared
        sm_R2 = model.r_squared()
        R_R2 = 0.956381720885

        npt.assert_allclose(sm_bw, R_bw, atol=1e-2)
        npt.assert_allclose(sm_mean, R_mean, atol=1e-2)
        npt.assert_allclose(sm_R2, R_R2, atol=1e-2)

    def test_continuousdata_ll_cvls(self):
        model = nparam.KernelReg(endog=[self.y], exog=[self.c1, self.c2],
                                 reg_type='ll', var_type='cc', bw='cv_ls')

        sm_bw = model.bw
        R_bw = [1.717891, 2.449415]
        sm_mean, sm_mfx = model.fit()
        sm_mean = sm_mean[0:5]
        sm_mfx = sm_mfx[0:5]
        R_mean = [31.16003, 37.30323, 44.49870, 40.73704, 36.19083]

        sm_R2 = model.r_squared()
        R_R2 = 0.9336019

        npt.assert_allclose(sm_bw, R_bw, atol=1e-2)
        npt.assert_allclose(sm_mean, R_mean, atol=1e-2)
        npt.assert_allclose(sm_R2, R_R2, atol=1e-2)

    def test_continuous_mfx_ll_cvls(self, file_name='RegData.csv'):
        nobs = 200
        np.random.seed(1234)
        C1 = np.random.normal(size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        C3 = np.random.beta(0.5,0.2, size=(nobs,))
        noise = np.random.normal(size=(nobs, ))
        b0 = 3
        b1 = 1.2
        b2 = 3.7  # regression coefficients
        b3 = 2.3
        Y = b0+ b1 * C1 + b2*C2+ b3 * C3 + noise
        bw_cv_ls = np.array([0.96075, 0.5682, 0.29835])
        model = nparam.KernelReg(endog=[Y], exog=[C1, C2, C3],
                                 reg_type='ll', var_type='ccc', bw=bw_cv_ls)
        sm_mean, sm_mfx = model.fit()
        sm_mean = sm_mean[0:5]
        npt.assert_allclose(sm_mfx[0,:], [b1,b2,b3], rtol=2e-1)

    def test_mixed_mfx_ll_cvls(self, file_name='RegData.csv'):
        nobs = 200
        np.random.seed(1234)
        ovals = np.random.binomial(2, 0.5, size=(nobs, ))
        C1 = np.random.normal(size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        noise = np.random.normal(size=(nobs, ))
        b0 = 3
        b1 = 1.2
        b2 = 3.7  # regression coefficients
        b3 = 2.3
        Y = b0+ b1 * C1 + b2*C2+ b3 * ovals + noise
        bw_cv_ls = np.array([1.04726, 1.67485, 0.39852])
        model = nparam.KernelReg(endog=[Y], exog=[C1, C2, ovals],
                                 reg_type='ll', var_type='cco', bw=bw_cv_ls)
        sm_mean, sm_mfx = model.fit()
        # TODO: add expected result
        sm_R2 = model.r_squared()  # noqa: F841
        npt.assert_allclose(sm_mfx[0, :], [b1, b2, b3], rtol=2e-1)

    @pytest.mark.slow
    @pytest.mark.xfail(reason="Test does not make much sense - always passes "
                              "with very small bw.")
    def test_mfx_nonlinear_ll_cvls(self, file_name='RegData.csv'):
        nobs = 200
        np.random.seed(1234)
        C1 = np.random.normal(size=(nobs,))
        C2 = np.random.normal(2, 1, size=(nobs,))
        C3 = np.random.beta(0.5,0.2, size=(nobs,))
        noise = np.random.normal(size=(nobs,))
        b0 = 3
        b1 = 1.2
        b3 = 2.3
        Y = b0+ b1 * C1 * C2 + b3 * C3 + noise
        model = nparam.KernelReg(endog=[Y], exog=[C1, C2, C3],
                                 reg_type='ll', var_type='ccc', bw='cv_ls')
        sm_bw = model.bw
        sm_mean, sm_mfx = model.fit()
        sm_R2 = model.r_squared()
        # Theoretical marginal effects
        mfx1 = b1 * C2
        mfx2 = b1 * C1
        npt.assert_allclose(sm_mean, Y, rtol = 2e-1)

        npt.assert_allclose(sm_mfx[:, 0], mfx1, rtol=2e-1)
        npt.assert_allclose(sm_mfx[0:10, 1], mfx2[0:10], rtol=2e-1)

    @pytest.mark.slow
    def test_continuous_cvls_efficient(self):
        nobs = 500
        np.random.seed(12345)
        C1 = np.random.normal(size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        b0 = 3
        b1 = 1.2
        b2 = 3.7  # regression coefficients
        Y = b0+ b1 * C1 + b2*C2

        model_efficient = nparam.KernelReg(endog=[Y], exog=[C1], reg_type='lc',
                              var_type='c', bw='cv_ls',
                              defaults=nparam.EstimatorSettings(efficient=True,
                                                                n_sub=100))

        model = nparam.KernelReg(endog=[Y], exog=[C1], reg_type='ll',
                                 var_type='c', bw='cv_ls')
        npt.assert_allclose(model.bw, model_efficient.bw, atol=5e-2, rtol=1e-1)

    @pytest.mark.slow
    def test_censored_ll_cvls(self):
        nobs = 200
        np.random.seed(1234)
        C1 = np.random.normal(size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        noise = np.random.normal(size=(nobs, ))
        Y = 0.3 +1.2 * C1 - 0.9 * C2 + noise
        Y[Y>0] = 0  # censor the data
        model = nparam.KernelCensoredReg(endog=[Y], exog=[C1, C2],
                                         reg_type='ll', var_type='cc',
                                         bw='cv_ls', censor_val=0)
        sm_mean, sm_mfx = model.fit()
        npt.assert_allclose(sm_mfx[0,:], [1.2, -0.9], rtol = 2e-1)

    @pytest.mark.slow
    def test_continuous_lc_aic(self):
        nobs = 200
        np.random.seed(1234)
        C1 = np.random.normal(size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        noise = np.random.normal(size=(nobs, ))
        Y = 0.3 +1.2 * C1 - 0.9 * C2 + noise
        #self.write2file('RegData.csv', (Y, C1, C2))

        #CODE TO PRODUCE BANDWIDTH ESTIMATION IN R
        #library(np)
        #data <- read.csv('RegData.csv', header=FALSE)
        #bw <- npregbw(formula=data$V1 ~ data$V2 + data$V3,
        #                bwmethod='cv.aic', regtype='lc')
        model = nparam.KernelReg(endog=[Y], exog=[C1, C2],
                                 reg_type='lc', var_type='cc', bw='aic')
        #R_bw = [0.4017893, 0.4943397]  # Bandwidth obtained in R
        bw_expected = [0.3987821, 0.50933458]
        npt.assert_allclose(model.bw, bw_expected, rtol=1e-3)

    @pytest.mark.slow
    def test_significance_continuous(self):
        nobs = 250
        np.random.seed(12345)
        C1 = np.random.normal(size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        C3 = np.random.beta(0.5,0.2, size=(nobs,))
        noise = np.random.normal(size=(nobs, ))
        b1 = 1.2
        b2 = 3.7  # regression coefficients
        Y = b1 * C1 + b2 * C2 + noise

        # This is the cv_ls bandwidth estimated earlier
        bw=[11108137.1087194, 1333821.85150218]
        model = nparam.KernelReg(endog=[Y], exog=[C1, C3],
                                 reg_type='ll', var_type='cc', bw=bw)
        nboot = 45  # Number of bootstrap samples
        sig_var12 = model.sig_test([0,1], nboot=nboot)  # H0: b1 = 0 and b2 = 0
        npt.assert_equal(sig_var12 == 'Not Significant', False)
        sig_var1 = model.sig_test([0], nboot=nboot)  # H0: b1 = 0
        npt.assert_equal(sig_var1 == 'Not Significant', False)
        sig_var2 = model.sig_test([1], nboot=nboot)  # H0: b2 = 0
        npt.assert_equal(sig_var2 == 'Not Significant', True)

    @pytest.mark.slow
    def test_significance_discrete(self):
        nobs = 200
        np.random.seed(12345)
        ovals = np.random.binomial(2, 0.5, size=(nobs, ))
        C2 = np.random.normal(2, 1, size=(nobs, ))
        C3 = np.random.beta(0.5,0.2, size=(nobs,))
        noise = np.random.normal(size=(nobs, ))
        b1 = 1.2
        b2 = 3.7  # regression coefficients
        Y = b1 * ovals + b2 * C2 + noise

        bw= [3.63473198e+00, 1.21404803e+06]
        # This is the cv_ls bandwidth estimated earlier
        # The cv_ls bandwidth was estimated earlier to save time
        model = nparam.KernelReg(endog=[Y], exog=[ovals, C3],
                                 reg_type='ll', var_type='oc', bw=bw)
        # This was also tested with local constant estimator
        nboot = 45  # Number of bootstrap samples
        sig_var1 = model.sig_test([0], nboot=nboot)  # H0: b1 = 0
        npt.assert_equal(sig_var1 == 'Not Significant', False)
        sig_var2 = model.sig_test([1], nboot=nboot)  # H0: b2 = 0
        npt.assert_equal(sig_var2 == 'Not Significant', True)

    def test_user_specified_kernel(self):
        model = nparam.KernelReg(endog=[self.y], exog=[self.c1, self.c2],
                                 reg_type='ll', var_type='cc', bw='cv_ls',
                                 ckertype='tricube')
        # Bandwidth
        sm_bw = model.bw
        R_bw = [0.581663, 0.5652]
        # Conditional Mean
        sm_mean, sm_mfx = model.fit()
        sm_mean = sm_mean[0:5]
        sm_mfx = sm_mfx[0:5]
        R_mean = [30.926714, 36.994604, 44.438358, 40.680598, 35.961593]
        # R-Squared
        sm_R2 = model.r_squared()
        R_R2 = 0.934825

        npt.assert_allclose(sm_bw, R_bw, atol=1e-2)
        npt.assert_allclose(sm_mean, R_mean, atol=1e-2)
        npt.assert_allclose(sm_R2, R_R2, atol=1e-2)

    def test_censored_user_specified_kernel(self):
        model = nparam.KernelCensoredReg(endog=[self.y], exog=[self.c1, self.c2],
                                 reg_type='ll', var_type='cc', bw='cv_ls',
                                 censor_val=0, ckertype='tricube')
        # Bandwidth
        sm_bw = model.bw
        R_bw = [0.581663, 0.5652]
        # Conditional Mean
        sm_mean, sm_mfx = model.fit()
        sm_mean = sm_mean[0:5]
        sm_mfx = sm_mfx[0:5]
        R_mean = [29.205526, 29.538008, 31.667581, 31.978866, 30.926714]
        # R-Squared
        sm_R2 = model.r_squared()
        R_R2 = 0.934825

        npt.assert_allclose(sm_bw, R_bw, atol=1e-2)
        npt.assert_allclose(sm_mean, R_mean, atol=1e-2)
Loading ...