"""
Test functions for models.regression
"""
# TODO: Test for LM
from statsmodels.compat.python import long, lrange
import warnings
import pandas
import numpy as np
from numpy.testing import (assert_almost_equal, assert_,
assert_raises, assert_equal, assert_allclose)
import pytest
from scipy.linalg import toeplitz
from statsmodels.tools.tools import add_constant, categorical
from statsmodels.regression.linear_model import (OLS, WLS, GLS, yule_walker,
burg)
from statsmodels.datasets import longley
from scipy.stats import t as student_t
DECIMAL_4 = 4
DECIMAL_3 = 3
DECIMAL_2 = 2
DECIMAL_1 = 1
DECIMAL_7 = 7
DECIMAL_0 = 0
try:
import cvxopt # noqa:F401
has_cvxopt = True
except ImportError:
has_cvxopt = False
class CheckRegressionResults(object):
"""
    res2 contains results from Rmodelwrap or from statistical packages
    such as R, Stata, or SAS, written to model_results
"""
decimal_params = DECIMAL_4
def test_params(self):
assert_almost_equal(self.res1.params, self.res2.params,
self.decimal_params)
decimal_standarderrors = DECIMAL_4
def test_standarderrors(self):
assert_almost_equal(self.res1.bse, self.res2.bse,
self.decimal_standarderrors)
decimal_confidenceintervals = DECIMAL_4
def test_confidenceintervals(self):
        # NOTE: Stata rounds residuals (at least) to significant digits,
        # so use an approximate comparison
conf1 = self.res1.conf_int()
conf2 = self.res2.conf_int()
for i in range(len(conf1)):
assert_allclose(conf1[i][0], conf2[i][0],
rtol=10**-self.decimal_confidenceintervals)
assert_allclose(conf1[i][1], conf2[i][1],
rtol=10**-self.decimal_confidenceintervals)
decimal_conf_int_subset = DECIMAL_4
def test_conf_int_subset(self):
if len(self.res1.params) > 1:
ci1 = self.res1.conf_int(cols=(1, 2))
ci2 = self.res1.conf_int()[1:3]
assert_almost_equal(ci1, ci2, self.decimal_conf_int_subset)
else:
pass
decimal_scale = DECIMAL_4
def test_scale(self):
assert_almost_equal(self.res1.scale, self.res2.scale,
self.decimal_scale)
decimal_rsquared = DECIMAL_4
def test_rsquared(self):
assert_almost_equal(self.res1.rsquared, self.res2.rsquared,
self.decimal_rsquared)
decimal_rsquared_adj = DECIMAL_4
def test_rsquared_adj(self):
assert_almost_equal(self.res1.rsquared_adj, self.res2.rsquared_adj,
self.decimal_rsquared_adj)
def test_degrees(self):
assert_equal(self.res1.model.df_model, self.res2.df_model)
assert_equal(self.res1.model.df_resid, self.res2.df_resid)
decimal_ess = DECIMAL_4
def test_ess(self):
# Explained Sum of Squares
assert_almost_equal(self.res1.ess, self.res2.ess,
self.decimal_ess)
decimal_ssr = DECIMAL_4
def test_sumof_squaredresids(self):
assert_almost_equal(self.res1.ssr, self.res2.ssr, self.decimal_ssr)
    decimal_mse_resid = DECIMAL_4
    def test_mse_resid(self):
        # Mean squared error of the residuals
        assert_almost_equal(self.res1.mse_resid, self.res2.mse_resid,
                            self.decimal_mse_resid)
    decimal_mse_model = DECIMAL_4
    def test_mse_model(self):
        # Mean squared error of the model
        assert_almost_equal(self.res1.mse_model, self.res2.mse_model,
                            self.decimal_mse_model)
decimal_mse_total = DECIMAL_4
def test_mse_total(self):
assert_almost_equal(self.res1.mse_total, self.res2.mse_total,
self.decimal_mse_total,
err_msg="Test class %s" % self)
decimal_fvalue = DECIMAL_4
def test_fvalue(self):
        # Note: when both fvalues are -inf, assert_almost_equal treats them
        # as equal, so no special-casing is needed:
        # if not (np.isinf(self.res1.fvalue) and np.isinf(self.res2.fvalue)):
assert_almost_equal(self.res1.fvalue, self.res2.fvalue,
self.decimal_fvalue)
decimal_loglike = DECIMAL_4
def test_loglike(self):
assert_almost_equal(self.res1.llf, self.res2.llf, self.decimal_loglike)
decimal_aic = DECIMAL_4
def test_aic(self):
assert_almost_equal(self.res1.aic, self.res2.aic, self.decimal_aic)
decimal_bic = DECIMAL_4
def test_bic(self):
assert_almost_equal(self.res1.bic, self.res2.bic, self.decimal_bic)
decimal_pvalues = DECIMAL_4
def test_pvalues(self):
assert_almost_equal(self.res1.pvalues, self.res2.pvalues,
self.decimal_pvalues)
decimal_wresid = DECIMAL_4
def test_wresid(self):
assert_almost_equal(self.res1.wresid, self.res2.wresid,
self.decimal_wresid)
decimal_resids = DECIMAL_4
def test_resids(self):
assert_almost_equal(self.res1.resid, self.res2.resid,
self.decimal_resids)
decimal_norm_resids = DECIMAL_4
def test_norm_resids(self):
assert_almost_equal(self.res1.resid_pearson, self.res2.resid_pearson,
self.decimal_norm_resids)
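    def test_fittedvalues(self):
        # Hedged sketch covering the fittedvalues part of the TODO below:
        # fittedvalues and resid are both derived from model.predict on the
        # unwhitened data, so their sum must reproduce endog exactly.
        res1 = self.res1
        assert_allclose(res1.fittedvalues + res1.resid, res1.model.endog)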
# TODO: what else should be tested here?
class TestOLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
from .results.results_regression import Longley
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
res2 = Longley()
res2.wresid = res1.wresid # workaround hack
cls.res1 = res1
cls.res2 = res2
res_qr = OLS(data.endog, data.exog).fit(method="qr")
model_qr = OLS(data.endog, data.exog)
Q, R = np.linalg.qr(data.exog)
model_qr.exog_Q, model_qr.exog_R = Q, R
model_qr.normalized_cov_params = np.linalg.inv(np.dot(R.T, R))
model_qr.rank = np.linalg.matrix_rank(R)
res_qr2 = model_qr.fit(method="qr")
cls.res_qr = res_qr
cls.res_qr_manual = res_qr2
def test_eigenvalues(self):
eigenval_perc_diff = (self.res_qr.eigenvals -
self.res_qr_manual.eigenvals)
eigenval_perc_diff /= self.res_qr.eigenvals
zeros = np.zeros_like(eigenval_perc_diff)
assert_almost_equal(eigenval_perc_diff, zeros, DECIMAL_7)
# Robust error tests. Compare values computed with SAS
def test_HC0_errors(self):
        # Split up because the copied reference results carry fewer than
        # DECIMAL_4 decimal places for the last standard error.
assert_almost_equal(self.res1.HC0_se[:-1],
self.res2.HC0_se[:-1], DECIMAL_4)
assert_allclose(np.round(self.res1.HC0_se[-1]),
self.res2.HC0_se[-1])
def test_HC1_errors(self):
assert_almost_equal(self.res1.HC1_se[:-1],
self.res2.HC1_se[:-1], DECIMAL_4)
# Note: tolerance is tight; rtol=3e-7 fails while 4e-7 passes
assert_allclose(self.res1.HC1_se[-1], self.res2.HC1_se[-1],
rtol=4e-7)
def test_HC2_errors(self):
assert_almost_equal(self.res1.HC2_se[:-1],
self.res2.HC2_se[:-1], DECIMAL_4)
# Note: tolerance is tight; rtol=4e-7 fails while 5e-7 passes
assert_allclose(self.res1.HC2_se[-1], self.res2.HC2_se[-1],
rtol=5e-7)
def test_HC3_errors(self):
assert_almost_equal(self.res1.HC3_se[:-1],
self.res2.HC3_se[:-1], DECIMAL_4)
# Note: tolerance is tight; rtol=1e-7 fails while 1.5e-7 passes
assert_allclose(self.res1.HC3_se[-1], self.res2.HC3_se[-1],
rtol=1.5e-7)
def test_qr_params(self):
assert_almost_equal(self.res1.params,
self.res_qr.params, 6)
def test_qr_normalized_cov_params(self):
        # TODO: could use assert_allclose directly instead of the ratio
assert_almost_equal(np.ones_like(self.res1.normalized_cov_params),
self.res1.normalized_cov_params /
self.res_qr.normalized_cov_params, 5)
def test_missing(self):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
data.endog[[3, 7, 14]] = np.nan
mod = OLS(data.endog, data.exog, missing='drop')
assert_equal(mod.endog.shape[0], 13)
assert_equal(mod.exog.shape[0], 13)
def test_rsquared_adj_overfit(self):
        # Test that if df_resid = 0, rsquared_adj is np.nan.
# This is a regression test for user issue:
# https://github.com/statsmodels/statsmodels/issues/868
with warnings.catch_warnings(record=True):
x = np.random.randn(5)
y = np.random.randn(5, 6)
results = OLS(x, y).fit()
rsquared_adj = results.rsquared_adj
assert_equal(rsquared_adj, np.nan)
def test_qr_alternatives(self):
assert_allclose(self.res_qr.params, self.res_qr_manual.params,
rtol=5e-12)
def test_norm_resid(self):
resid = self.res1.wresid
norm_resid = resid / np.sqrt(np.sum(resid**2.0) / self.res1.df_resid)
model_norm_resid = self.res1.resid_pearson
assert_almost_equal(model_norm_resid, norm_resid, DECIMAL_7)
def test_norm_resid_zero_variance(self):
with warnings.catch_warnings(record=True):
y = self.res1.model.endog
res = OLS(y, y).fit()
assert_allclose(res.scale, 0, atol=1e-20)
assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)
class TestRTO(CheckRegressionResults):
@classmethod
def setup_class(cls):
from .results.results_regression import LongleyRTO
data = longley.load(as_pandas=False)
res1 = OLS(data.endog, data.exog).fit()
res2 = LongleyRTO()
res2.wresid = res1.wresid # workaround hack
cls.res1 = res1
cls.res2 = res2
res_qr = OLS(data.endog, data.exog).fit(method="qr")
cls.res_qr = res_qr
class TestFtest(object):
"""
Tests f_test vs. RegressionResults
"""
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = OLS(data.endog, data.exog).fit()
R = np.identity(7)[:-1, :]
cls.Ftest = cls.res1.f_test(R)
def test_F(self):
assert_almost_equal(self.Ftest.fvalue, self.res1.fvalue, DECIMAL_4)
def test_p(self):
assert_almost_equal(self.Ftest.pvalue, self.res1.f_pvalue, DECIMAL_4)
def test_Df_denom(self):
assert_equal(self.Ftest.df_denom, self.res1.model.df_resid)
def test_Df_num(self):
assert_equal(self.Ftest.df_num, 6)
class TestFTest2(object):
"""
A joint test that the coefficient on
GNP = the coefficient on UNEMP and that the coefficient on
POP = the coefficient on YEAR for the Longley dataset.
Ftest1 is from statsmodels. Results are from Rpy using R's car library.
"""
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]]
cls.Ftest1 = res1.f_test(R2)
hyp = 'x2 = x3, x5 = x6'
cls.NewFtest1 = res1.f_test(hyp)
def test_new_ftest(self):
assert_equal(self.NewFtest1.fvalue, self.Ftest1.fvalue)
def test_fvalue(self):
assert_almost_equal(self.Ftest1.fvalue, 9.7404618732968196, DECIMAL_4)
def test_pvalue(self):
assert_almost_equal(self.Ftest1.pvalue, 0.0056052885317493459,
DECIMAL_4)
def test_df_denom(self):
assert_equal(self.Ftest1.df_denom, 9)
def test_df_num(self):
assert_equal(self.Ftest1.df_num, 2)
class TestFtestQ(object):
"""
A joint hypothesis test that Rb = q. Coefficient tests are essentially
made up. Test values taken from Stata.
"""
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
R = np.array([[0, 1, 1, 0, 0, 0, 0],
[0, 1, 0, 1, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 1, 0]])
q = np.array([0, 0, 0, 1, 0])
cls.Ftest1 = res1.f_test((R, q))
def test_fvalue(self):
assert_almost_equal(self.Ftest1.fvalue, 70.115557, 5)
def test_pvalue(self):
assert_almost_equal(self.Ftest1.pvalue, 6.229e-07, 10)
def test_df_denom(self):
assert_equal(self.Ftest1.df_denom, 9)
def test_df_num(self):
assert_equal(self.Ftest1.df_num, 5)
class TestTtest(object):
"""
    Test individual t-tests, i.e. whether each coefficient is
    significantly different from zero.
"""
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = OLS(data.endog, data.exog).fit()
R = np.identity(7)
cls.Ttest = cls.res1.t_test(R)
hyp = 'x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0'
cls.NewTTest = cls.res1.t_test(hyp)
def test_new_tvalue(self):
assert_equal(self.NewTTest.tvalue, self.Ttest.tvalue)
def test_tvalue(self):
assert_almost_equal(self.Ttest.tvalue, self.res1.tvalues, DECIMAL_4)
def test_sd(self):
assert_almost_equal(self.Ttest.sd, self.res1.bse, DECIMAL_4)
def test_pvalue(self):
assert_almost_equal(self.Ttest.pvalue, student_t.sf(
np.abs(self.res1.tvalues), self.res1.model.df_resid)*2,
DECIMAL_4)
def test_df_denom(self):
assert_equal(self.Ttest.df_denom, self.res1.model.df_resid)
def test_effect(self):
assert_almost_equal(self.Ttest.effect, self.res1.params)
class TestTtest2(object):
"""
Tests the hypothesis that the coefficients on POP and YEAR
are equal.
Results from RPy using 'car' package.
"""
@classmethod
def setup_class(cls):
R = np.zeros(7)
R[4:6] = [1, -1]
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
cls.Ttest1 = res1.t_test(R)
def test_tvalue(self):
assert_almost_equal(self.Ttest1.tvalue, -4.0167754636397284,
DECIMAL_4)
def test_sd(self):
assert_almost_equal(self.Ttest1.sd, 455.39079425195314, DECIMAL_4)
def test_pvalue(self):
assert_almost_equal(self.Ttest1.pvalue, 2*0.0015163772380932246,
DECIMAL_4)
def test_df_denom(self):
assert_equal(self.Ttest1.df_denom, 9)
def test_effect(self):
assert_almost_equal(self.Ttest1.effect, -1829.2025687186533, DECIMAL_4)
class TestGLS(object):
"""
These test results were obtained by replication with R.
"""
@classmethod
def setup_class(cls):
from .results.results_regression import LongleyGls
data = longley.load(as_pandas=False)
exog = add_constant(np.column_stack(
(data.exog[:, 1], data.exog[:, 4])), prepend=False)
tmp_results = OLS(data.endog, exog).fit()
rho = np.corrcoef(tmp_results.resid[1:],
tmp_results.resid[:-1])[0][1] # by assumption
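        # build the AR(1) correlation pattern: sigma[i, j] = rho ** |i - j|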
order = toeplitz(np.arange(16))
sigma = rho**order
GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
cls.res1 = GLS_results
cls.res2 = LongleyGls()
# attach for test_missing
cls.sigma = sigma
cls.exog = exog
cls.endog = data.endog
def test_aic(self):
# Note: tolerance is tight; rtol=3e-3 fails while 4e-3 passes
assert_allclose(self.res1.aic+2, self.res2.aic, rtol=4e-3)
def test_bic(self):
# Note: tolerance is tight; rtol=1e-2 fails while 1.5e-2 passes
assert_allclose(self.res1.bic, self.res2.bic, rtol=1.5e-2)
def test_loglike(self):
assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_0)
def test_params(self):
assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_1)
def test_resid(self):
assert_almost_equal(self.res1.resid, self.res2.resid, DECIMAL_4)
def test_scale(self):
assert_almost_equal(self.res1.scale, self.res2.scale, DECIMAL_4)
def test_tvalues(self):
assert_almost_equal(self.res1.tvalues, self.res2.tvalues, DECIMAL_4)
def test_standarderrors(self):
assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4)
def test_fittedvalues(self):
assert_almost_equal(self.res1.fittedvalues, self.res2.fittedvalues,
DECIMAL_4)
def test_pvalues(self):
assert_almost_equal(self.res1.pvalues, self.res2.pvalues, DECIMAL_4)
def test_missing(self):
endog = self.endog.copy() # copy or changes endog for other methods
endog[[4, 7, 14]] = np.nan
mod = GLS(endog, self.exog, sigma=self.sigma, missing='drop')
assert_equal(mod.endog.shape[0], 13)
assert_equal(mod.exog.shape[0], 13)
assert_equal(mod.sigma.shape, (13, 13))
class TestGLS_alt_sigma(CheckRegressionResults):
"""
Test that GLS with no argument is equivalent to OLS.
"""
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
ols_res = OLS(data.endog, data.exog).fit()
gls_res = GLS(data.endog, data.exog).fit()
gls_res_scalar = GLS(data.endog, data.exog, sigma=1)
cls.endog = data.endog
cls.exog = data.exog
cls.res1 = gls_res
cls.res2 = ols_res
cls.res3 = gls_res_scalar
# self.res2.conf_int = self.res2.conf_int()
def test_wrong_size_sigma_1d(self):
n = len(self.endog)
assert_raises(ValueError, GLS, self.endog, self.exog,
sigma=np.ones(n-1))
def test_wrong_size_sigma_2d(self):
n = len(self.endog)
assert_raises(ValueError, GLS, self.endog, self.exog,
sigma=np.ones((n-1, n-1)))
# def check_confidenceintervals(self, conf1, conf2):
# assert_almost_equal(conf1, conf2, DECIMAL_4)
class TestLM(object):
@classmethod
def setup_class(cls):
# TODO: Test HAC method
X = np.random.randn(100, 3)
b = np.ones((3, 1))
e = np.random.randn(100, 1)
y = np.dot(X, b) + e
# Cases?
# Homoskedastic
# HC0
cls.res1_full = OLS(y, X).fit()
cls.res1_restricted = OLS(y, X[:, 0]).fit()
cls.res2_full = cls.res1_full.get_robustcov_results('HC0')
cls.res2_restricted = cls.res1_restricted.get_robustcov_results('HC0')
cls.X = X
cls.Y = y
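    # The tests below construct the score-form LM statistic by hand,
    # LM = n * s' S^{-1} s, where s is the mean score of the restricted
    # model and S is an estimate of the score covariance, and check it
    # against compare_lm_test.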
def test_LM_homoskedastic(self):
resid = self.res1_restricted.wresid
n = resid.shape[0]
X = self.X
S = np.dot(resid, resid) / n * np.dot(X.T, X) / n
Sinv = np.linalg.inv(S)
s = np.mean(X * resid[:, None], 0)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res1_full.compare_lm_test(self.res1_restricted)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_heteroskedastic_nodemean(self):
resid = self.res1_restricted.wresid
n = resid.shape[0]
X = self.X
scores = X * resid[:, None]
S = np.dot(scores.T, scores) / n
Sinv = np.linalg.inv(S)
s = np.mean(scores, 0)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted,
demean=False)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_heteroskedastic_demean(self):
resid = self.res1_restricted.wresid
n = resid.shape[0]
X = self.X
scores = X * resid[:, None]
scores_demean = scores - scores.mean(0)
S = np.dot(scores_demean.T, scores_demean) / n
Sinv = np.linalg.inv(S)
s = np.mean(scores, 0)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_heteroskedastic_LRversion(self):
resid = self.res1_restricted.wresid
resid_full = self.res1_full.wresid
n = resid.shape[0]
X = self.X
scores = X * resid[:, None]
s = np.mean(scores, 0)
scores = X * resid_full[:, None]
S = np.dot(scores.T, scores) / n
Sinv = np.linalg.inv(S)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted,
use_lr=True)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_nonnested(self):
assert_raises(ValueError, self.res2_restricted.compare_lm_test,
self.res2_full)
class TestOLS_GLS_WLS_equivalence(object):
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
y = data.endog
X = data.exog
n = y.shape[0]
w = np.ones(n)
cls.results = []
cls.results.append(OLS(y, X).fit())
cls.results.append(WLS(y, X, w).fit())
cls.results.append(GLS(y, X, 100*w).fit())
cls.results.append(GLS(y, X, np.diag(0.1*w)).fit())
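    # With unit weights, or any scalar multiple of the identity as sigma,
    # WLS and GLS solve the same normal equations as OLS, so parameters,
    # standard errors, and likelihood-based statistics should all agree.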
def test_ll(self):
llf = np.array([r.llf for r in self.results])
llf_1 = np.ones_like(llf) * self.results[0].llf
assert_almost_equal(llf, llf_1, DECIMAL_7)
ic = np.array([r.aic for r in self.results])
ic_1 = np.ones_like(ic) * self.results[0].aic
assert_almost_equal(ic, ic_1, DECIMAL_7)
ic = np.array([r.bic for r in self.results])
ic_1 = np.ones_like(ic) * self.results[0].bic
assert_almost_equal(ic, ic_1, DECIMAL_7)
def test_params(self):
params = np.array([r.params for r in self.results])
params_1 = np.array([self.results[0].params] * len(self.results))
assert_allclose(params, params_1)
def test_ss(self):
bse = np.array([r.bse for r in self.results])
bse_1 = np.array([self.results[0].bse] * len(self.results))
assert_allclose(bse, bse_1)
def test_rsquared(self):
rsquared = np.array([r.rsquared for r in self.results])
rsquared_1 = np.array([self.results[0].rsquared] * len(self.results))
assert_almost_equal(rsquared, rsquared_1, DECIMAL_7)
class TestGLS_WLS_equivalence(TestOLS_GLS_WLS_equivalence):
# reuse test methods
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
y = data.endog
X = data.exog
n = y.shape[0]
np.random.seed(5)
w = np.random.uniform(0.5, 1, n)
w_inv = 1. / w
cls.results = []
cls.results.append(WLS(y, X, w).fit())
cls.results.append(WLS(y, X, 0.01 * w).fit())
cls.results.append(GLS(y, X, 100 * w_inv).fit())
cls.results.append(GLS(y, X, np.diag(0.1 * w_inv)).fit())
class TestNonFit(object):
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
cls.endog = data.endog
cls.exog = data.exog
cls.ols_model = OLS(data.endog, data.exog)
    def test_df_resid(self):
        df_resid = self.endog.shape[0] - self.exog.shape[1]
        assert_equal(self.ols_model.df_resid, long(df_resid))
class TestWLS_CornerCases(object):
@classmethod
def setup_class(cls):
cls.exog = np.ones((1,))
cls.endog = np.ones((1,))
weights = 1
cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit()
def test_wrong_size_weights(self):
weights = np.ones((10, 10))
assert_raises(ValueError, WLS, self.endog, self.exog, weights=weights)
class TestWLSExogWeights(CheckRegressionResults):
# Test WLS with Greene's credit card data
# reg avgexp age income incomesq ownrent [aw=1/incomesq]
@classmethod
def setup_class(cls):
from .results.results_regression import CCardWLS
from statsmodels.datasets.ccard import load
dta = load(as_pandas=False)
dta.exog = add_constant(dta.exog, prepend=False)
nobs = 72.
weights = 1 / dta.exog[:, 2]
# for comparison with stata analytic weights
scaled_weights = ((weights * nobs) / weights.sum())
cls.res1 = WLS(dta.endog, dta.exog, weights=scaled_weights).fit()
cls.res2 = CCardWLS()
cls.res2.wresid = scaled_weights ** .5 * cls.res2.resid
# correction because we use different definition for loglike/llf
corr_ic = 2 * (cls.res1.llf - cls.res2.llf)
cls.res2.aic -= corr_ic
cls.res2.bic -= corr_ic
cls.res2.llf += 0.5 * np.sum(np.log(cls.res1.model.weights))
def test_wls_example():
    # example from the WLS docstring; the bug noted there should now be
    # fixed
Y = [1, 3, 4, 5, 2, 3, 4]
X = lrange(1, 8)
X = add_constant(X, prepend=False)
wls_model = WLS(Y, X, weights=lrange(1, 8)).fit()
# taken from R lm.summary
assert_almost_equal(wls_model.fvalue, 0.127337843215, 6)
assert_almost_equal(wls_model.scale, 2.44608530786**2, 6)
def test_wls_tss():
y = np.array([22, 22, 22, 23, 23, 23])
X = [[1, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 1]]
ols_mod = OLS(y, add_constant(X, prepend=False)).fit()
yw = np.array([22, 22, 23.])
Xw = [[1, 0], [1, 1], [0, 1]]
w = np.array([2, 1, 3.])
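    # (yw, Xw, w) collapse the repeated rows of (y, X) into frequency
    # weights, so the weighted centered TSS should equal the OLS one.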
wls_mod = WLS(yw, add_constant(Xw, prepend=False), weights=w).fit()
assert_equal(ols_mod.centered_tss, wls_mod.centered_tss)
class TestWLSScalarVsArray(CheckRegressionResults):
@classmethod
def setup_class(cls):
from statsmodels.datasets.longley import load
dta = load(as_pandas=False)
dta.exog = add_constant(dta.exog, prepend=True)
wls_scalar = WLS(dta.endog, dta.exog, weights=1./3).fit()
weights = [1/3.] * len(dta.endog)
wls_array = WLS(dta.endog, dta.exog, weights=weights).fit()
cls.res1 = wls_scalar
cls.res2 = wls_array
#class TestWLS_GLS(CheckRegressionResults):
# @classmethod
# def setup_class(cls):
# from statsmodels.datasets.ccard import load
# data = load(as_pandas=False)
# cls.res1 = WLS(data.endog, data.exog, weights = 1/data.exog[:,2]).fit()
# cls.res2 = GLS(data.endog, data.exog, sigma = data.exog[:,2]).fit()
#
# def check_confidenceintervals(self, conf1, conf2):
# assert_almost_equal(conf1, conf2(), DECIMAL_4)
def test_wls_missing():
from statsmodels.datasets.ccard import load
data = load(as_pandas=False)
    # insert NaNs directly into data.endog, which the model below shares
    data.endog[[10, 25]] = np.nan
mod = WLS(data.endog, data.exog, weights=1 / data.exog[:, 2],
missing='drop')
assert_equal(mod.endog.shape[0], 70)
assert_equal(mod.exog.shape[0], 70)
assert_equal(mod.weights.shape[0], 70)
class TestWLS_OLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = OLS(data.endog, data.exog).fit()
cls.res2 = WLS(data.endog, data.exog).fit()
def check_confidenceintervals(self, conf1, conf2):
assert_almost_equal(conf1, conf2(), DECIMAL_4)
class TestGLS_OLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
data = longley.load(as_pandas=False)
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = GLS(data.endog, data.exog).fit()
cls.res2 = OLS(data.endog, data.exog).fit()
def check_confidenceintervals(self, conf1, conf2):
assert_almost_equal(conf1, conf2(), DECIMAL_4)
# TODO: test AR
# why the two-stage in AR?
# class test_ar(object):
# from statsmodels.datasets.sunspots import load
# data = load(as_pandas=False)
# model = AR(data.endog, rho=4).fit()
# R_res = RModel(data.endog, aic="FALSE", order_max=4)#
# def test_params(self):
# assert_almost_equal(self.model.rho,
# pass
# def test_order(self):
# In R this can be defined or chosen by minimizing the AIC if aic=True
# pass
class TestYuleWalker(object):
@classmethod
def setup_class(cls):
from statsmodels.datasets.sunspots import load
data = load(as_pandas=False)
cls.rho, cls.sigma = yule_walker(data.endog, order=4,
method="mle")
cls.R_params = [1.2831003105694765, -0.45240924374091945,
-0.20770298557575195, 0.047943648089542337]
def test_params(self):
assert_almost_equal(self.rho, self.R_params, DECIMAL_4)
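def test_yule_walker_ar1_sketch():
    # Hedged sketch (not one of the original reference-based tests): on a
    # long simulated AR(1) series, yule_walker should recover the
    # autoregressive coefficient to within sampling error.
    rs = np.random.RandomState(12345)
    e = rs.standard_normal(5000)
    y = np.zeros_like(e)
    for t in range(1, y.shape[0]):
        y[t] = 0.6 * y[t - 1] + e[t]
    rho, sigma = yule_walker(y, order=1, method="mle")
    assert_allclose(rho, [0.6], atol=0.05)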
class TestDataDimensions(CheckRegressionResults):
@classmethod
def setup_class(cls):
np.random.seed(54321)
cls.endog_n_ = np.random.uniform(0, 20, size=30)
cls.endog_n_one = cls.endog_n_[:, None]
cls.exog_n_ = np.random.uniform(0, 20, size=30)
cls.exog_n_one = cls.exog_n_[:, None]
cls.degen_exog = cls.exog_n_one[:-1]
cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one)
cls.mod1.df_model += 1
cls.res1 = cls.mod1.fit()
        # Note: these are re-created for every subclass, adding a little
        # extra overhead
cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
def check_confidenceintervals(self, conf1, conf2):
assert_almost_equal(conf1, conf2(), DECIMAL_4)
class TestGLS_large_data(TestDataDimensions):
@classmethod
def setup_class(cls):
super(TestGLS_large_data, cls).setup_class()
nobs = 1000
y = np.random.randn(nobs, 1)
X = np.random.randn(nobs, 20)
sigma = np.ones_like(y)
cls.gls_res = GLS(y, X, sigma=sigma).fit()
cls.gls_res_scalar = GLS(y, X, sigma=1).fit()
cls.gls_res_none = GLS(y, X).fit()
cls.ols_res = OLS(y, X).fit()
def test_large_equal_params(self):
assert_almost_equal(self.ols_res.params, self.gls_res.params, DECIMAL_7)
def test_large_equal_loglike(self):
assert_almost_equal(self.ols_res.llf, self.gls_res.llf, DECIMAL_7)
def test_large_equal_params_none(self):
assert_almost_equal(self.gls_res.params, self.gls_res_none.params,
DECIMAL_7)
class TestNxNx(TestDataDimensions):
@classmethod
def setup_class(cls):
super(TestNxNx, cls).setup_class()
cls.mod2 = OLS(cls.endog_n_, cls.exog_n_)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
class TestNxOneNx(TestDataDimensions):
@classmethod
def setup_class(cls):
super(TestNxOneNx, cls).setup_class()
cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
class TestNxNxOne(TestDataDimensions):
@classmethod
def setup_class(cls):
super(TestNxNxOne, cls).setup_class()
cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
def test_bad_size():
np.random.seed(54321)
data = np.random.uniform(0, 20, 31)
assert_raises(ValueError, OLS, data, data[1:])
def test_const_indicator():
np.random.seed(12345)
X = np.random.randint(0, 3, size=30)
X = categorical(X, drop=True)
y = np.dot(X, [1., 2., 3.]) + np.random.normal(size=30)
resc = OLS(y, add_constant(X[:, 1:], prepend=True)).fit()
res = OLS(y, X, hasconst=True).fit()
assert_almost_equal(resc.rsquared, res.rsquared, 12)
assert res.model.data.k_constant == 1
assert resc.model.data.k_constant == 1
def test_fvalue_const_only():
np.random.seed(12345)
x = np.random.randint(0, 3, size=30)
x = categorical(x, drop=True)
x[:, 0] = 1
y = np.dot(x, [1., 2., 3.]) + np.random.normal(size=30)
res = OLS(y, x, hasconst=True).fit(cov_type='HC1')
assert not np.isnan(res.fvalue)
def test_conf_int_single_regressor():
    # GH#706: a single-regressor model (i.e. no intercept) fit with 1D
    # pandas exog should still return conf_int as a DataFrame
y = pandas.Series(np.random.randn(10))
x = pandas.Series(np.ones(10))
res = OLS(y, x).fit()
conf_int = res.conf_int()
np.testing.assert_equal(conf_int.shape, (1, 2))
np.testing.assert_(isinstance(conf_int, pandas.DataFrame))
def test_summary_as_latex():
# GH#734
import re
dta = longley.load_pandas()
X = dta.exog
X["constant"] = 1
y = dta.endog
res = OLS(y, X).fit()
with pytest.warns(UserWarning):
table = res.summary().as_latex()
# replace the date and time
table = re.sub("(?<=\n\\\\textbf\\{Date:\\} &).+?&",
" Sun, 07 Apr 2013 &", table)
table = re.sub("(?<=\n\\\\textbf\\{Time:\\} &).+?&",
" 13:46:07 &", table)
expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\
\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\
\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\
\\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\
\\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\
\\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\
\\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\
\\textbf{Df Model:} & 6 & \\textbf{ } & \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
& \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\
\\midrule
\\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\
\\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\
\\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\
\\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\
\\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\
\\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\
\\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\
\\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\
\\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\
\\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}
Warnings: \\newline
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
[2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
strong multicollinearity or other numerical problems."""
assert_equal(table, expected)
class TestRegularizedFit(object):
# Make sure there are no problems when no variables are selected.
def test_empty_model(self):
np.random.seed(742)
n = 100
endog = np.random.normal(size=n)
exog = np.random.normal(size=(n, 3))
for cls in OLS, WLS, GLS:
model = cls(endog, exog)
result = model.fit_regularized(alpha=1000)
assert_equal(result.params, 0.)
def test_regularized(self):
import os
from .results import glmnet_r_results
cur_dir = os.path.dirname(os.path.abspath(__file__))
data = np.loadtxt(os.path.join(cur_dir, "results", "lasso_data.csv"),
delimiter=",")
tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]
for test in tests:
vec = getattr(glmnet_r_results, test)
n = vec[0]
p = vec[1]
L1_wt = float(vec[2])
lam = float(vec[3])
params = vec[4:].astype(np.float64)
endog = data[0:int(n), 0]
exog = data[0:int(n), 1:(int(p)+1)]
endog = endog - endog.mean()
endog /= endog.std(ddof=1)
exog = exog - exog.mean(0)
exog /= exog.std(0, ddof=1)
for cls in OLS, WLS, GLS:
mod = cls(endog, exog)
rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
assert_almost_equal(rslt.params, params, decimal=3)
                # Smoke test for the profile likelihood
mod.fit_regularized(L1_wt=L1_wt, alpha=lam,
profile_scale=True)
def test_regularized_weights(self):
np.random.seed(1432)
exog1 = np.random.normal(size=(100, 3))
endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
exog2 = np.random.normal(size=(100, 3))
endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)
exog_a = np.vstack((exog1, exog1, exog2))
endog_a = np.concatenate((endog1, endog1, endog2))
# Should be equivalent to exog_a, endog_a.
exog_b = np.vstack((exog1, exog2))
endog_b = np.concatenate((endog1, endog2))
wgts = np.ones(200)
wgts[0:100] = 2
sigma = np.diag(1/wgts)
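        # A case weight of 2 on the first block mimics entering those rows
        # twice, which is how (endog_a, exog_a) were built; GLS with
        # sigma = diag(1 / w) encodes the same weighting.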
for L1_wt in 0, 0.5, 1:
for alpha in 0, 1:
mod1 = OLS(endog_a, exog_a)
rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)
mod2 = WLS(endog_b, exog_b, weights=wgts)
rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)
mod3 = GLS(endog_b, exog_b, sigma=sigma)
rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)
assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
def test_formula_missing_cat():
# gh-805
import statsmodels.api as sm
from statsmodels.formula.api import ols
from patsy import PatsyError
dta = sm.datasets.grunfeld.load_pandas().data
dta.loc[dta.index[0], 'firm'] = np.nan
mod = ols(formula='value ~ invest + capital + firm + year',
data=dta.dropna())
res = mod.fit()
mod2 = ols(formula='value ~ invest + capital + firm + year',
data=dta)
res2 = mod2.fit()
assert_almost_equal(res.params.values, res2.params.values)
assert_raises(PatsyError, ols, 'value ~ invest + capital + firm + year',
data=dta, missing='raise')
def test_missing_formula_predict():
# see 2171
nsample = 30
data = np.linspace(0, 10, nsample)
null = np.array([np.nan])
data = pandas.DataFrame({'x': np.concatenate((data, null))})
beta = np.array([1, 0.1])
e = np.random.normal(size=nsample+1)
data['y'] = beta[0] + beta[1] * data['x'] + e
model = OLS.from_formula('y ~ x', data=data)
fit = model.fit()
fit.predict(exog=data[:-1])
def test_fvalue_implicit_constant():
# if constant is implicit, return nan see #2444
nobs = 100
np.random.seed(2)
x = np.random.randn(nobs, 1)
x = ((x > 0) == [True, False]).astype(int)
y = x.sum(1) + np.random.randn(nobs)
from statsmodels.regression.linear_model import OLS, WLS
res = OLS(y, x).fit(cov_type='HC1')
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
res = WLS(y, x).fit(cov_type='HC1')
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
def test_fvalue_only_constant():
# if only constant in model, return nan see #3642
nobs = 20
np.random.seed(2)
x = np.ones(nobs)
y = np.random.randn(nobs)
from statsmodels.regression.linear_model import OLS, WLS
res = OLS(y, x).fit(cov_type='hac', cov_kwds={'maxlags': 3})
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
res = WLS(y, x).fit(cov_type='HC1')
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
def test_ridge():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
yvec = xmat.sum(1) + np.random.normal(size=n)
v = np.ones(p)
v[0] = 0
for a in (0, 1, 10):
for alpha in (a, a*np.ones(p), a*v):
model1 = OLS(yvec, xmat)
result1 = model1._fit_ridge(alpha=alpha)
model2 = OLS(yvec, xmat)
result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
assert_allclose(result1.params, result2.params)
model3 = OLS(yvec, xmat)
result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10)
assert_allclose(result1.params, result3.params)
fv1 = result1.fittedvalues
fv2 = np.dot(xmat, result1.params)
assert_allclose(fv1, fv2)
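def test_ridge_closed_form():
    # Hedged sketch, assuming fit_regularized with L1_wt=0 minimizes the
    # documented objective 0.5 * RSS / n + 0.5 * alpha * ||params||^2,
    # whose minimizer is (X'X + n * alpha * I)^{-1} X'y.
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    alpha = 1.0
    direct = np.linalg.solve(np.dot(xmat.T, xmat) + n * alpha * np.eye(p),
                             np.dot(xmat.T, yvec))
    rslt = OLS(yvec, xmat).fit_regularized(alpha=alpha, L1_wt=0)
    assert_allclose(rslt.params, direct, rtol=1e-6)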
def test_regularized_refit():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
# covariates 0 and 2 matter
yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)
model1 = OLS(yvec, xmat)
result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
model2 = OLS(yvec, xmat[:, [0, 2]])
result2 = model2.fit()
ii = [0, 2]
assert_allclose(result1.params[ii], result2.params)
assert_allclose(result1.bse[ii], result2.bse)
def test_regularized_predict():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
yvec = xmat.sum(1) + np.random.normal(size=n)
wgt = np.random.uniform(1, 2, n)
for klass in WLS, GLS:
model1 = klass(yvec, xmat, weights=wgt)
result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
params = result1.params
fittedvalues = np.dot(xmat, params)
pr = model1.predict(result1.params)
assert_allclose(fittedvalues, pr)
assert_allclose(result1.fittedvalues, pr)
pr = result1.predict()
assert_allclose(fittedvalues, pr)
def test_regularized_options():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
yvec = xmat.sum(1) + np.random.normal(size=n)
model1 = OLS(yvec - 1, xmat)
result1 = model1.fit_regularized(alpha=1., L1_wt=0.5)
model2 = OLS(yvec, xmat, offset=1)
result2 = model2.fit_regularized(alpha=1., L1_wt=0.5,
start_params=np.zeros(5))
assert_allclose(result1.params, result2.params)
def test_burg():
rnd = np.random.RandomState(12345)
e = rnd.randn(10001)
y = e[1:] + 0.5 * e[:-1]
# R, ar.burg
expected = [
[0.3909931],
[0.4602607, -0.1771582],
[0.47473245, -0.21475602, 0.08168813],
[0.4787017, -0.2251910, 0.1047554, -0.0485900],
        [0.47975462, -0.22746106, 0.10963527, -0.05896347, 0.02167001]
]
for i in range(1, 6):
ar, _ = burg(y, i)
assert_allclose(ar, expected[i - 1], atol=1e-6)
as_nodemean, _ = burg(1 + y, i, False)
assert np.all(ar != as_nodemean)
def test_burg_errors():
with pytest.raises(ValueError):
burg(np.ones((100, 2)))
with pytest.raises(ValueError):
burg(np.random.randn(100), 0)
with pytest.raises(ValueError):
burg(np.random.randn(100), 'apple')
@pytest.mark.skipif(not has_cvxopt, reason="sqrt_lasso requires cvxopt")
def test_sqrt_lasso():
np.random.seed(234923)
# Based on the example in the Belloni paper
n = 100
p = 500
ii = np.arange(p)
cx = 0.5 ** np.abs(np.subtract.outer(ii, ii))
cxr = np.linalg.cholesky(cx)
x = np.dot(np.random.normal(size=(n, p)), cxr.T)
b = np.zeros(p)
b[0:5] = [1, 1, 1, 1, 1]
from scipy.stats.distributions import norm
alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p))
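    # This is the pivotal penalty level recommended by Belloni et al.,
    # chosen so the penalty dominates the noise with high probability.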
# Use very low noise level for a unit test
y = np.dot(x, b) + 0.25 * np.random.normal(size=n)
# At low noise levels, the sqrt lasso should be around a
# factor of 3 from the oracle without refit, and should
# almost equal the oracle with refit.
expected_oracle = {False: 3, True: 1}
# Used for regression testing
    expected_params = {
        False: np.r_[0.87397122, 0.96051874, 0.9905915,
                     0.93868953, 0.90771773],
        True: np.r_[0.95114241, 1.0302987, 1.01723074,
                    0.97587343, 0.99846403]}
for refit in False, True:
rslt = OLS(y, x).fit_regularized(method="sqrt_lasso", alpha=alpha, refit=refit)
err = rslt.params - b
numer = np.sqrt(np.dot(err, np.dot(cx, err)))
oracle = OLS(y, x[:, 0:5]).fit()
oracle_err = np.zeros(p)
oracle_err[0:5] = oracle.params - b[0:5]
denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err)))
        # Check performance relative to the oracle: should be around a
        # factor of 3 without refit and near 1 with refit
assert_allclose(numer / denom, expected_oracle[refit],
rtol=0.5, atol=0.1)
# Regression test the parameters
assert_allclose(rslt.params[0:5], expected_params[refit],
rtol=1e-5, atol=1e-5)