import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
import pytest
from statsmodels.genmod.qif import (QIF, QIFIndependence, QIFExchangeable,
QIFAutoregressive)
from statsmodels.tools.numdiff import approx_fprime
from statsmodels.genmod import families
@pytest.mark.parametrize("fam", [families.Gaussian(), families.Poisson(),
families.Binomial()])
@pytest.mark.parametrize("cov_struct", [QIFIndependence(), QIFExchangeable(),
QIFAutoregressive()])
def test_qif_numdiff(fam, cov_struct):
# Test the analytic scores against numeric derivatives
np.random.seed(234234)
n = 200
q = 4
x = np.random.normal(size=(n, 3))
if isinstance(fam, families.Gaussian):
e = np.kron(np.random.normal(size=n//q), np.ones(q))
e = np.sqrt(0.5)*e + np.sqrt(1 - 0.5**2)*np.random.normal(size=n)
y = x.sum(1) + e
elif isinstance(fam, families.Poisson):
y = np.random.poisson(5, size=n)
elif isinstance(fam, families.Binomial):
y = np.random.randint(0, 2, size=n)
g = np.kron(np.arange(n//q), np.ones(q)).astype(np.int)
model = QIF(y, x, groups=g, family=fam, cov_struct=cov_struct)
for _ in range(5):
pt = np.random.normal(size=3)
# Check the Jacobian of the vector of estimating equations.
_, grad, _, _, gn_deriv = model.objective(pt)
def llf_gn(params):
return model.objective(params)[3]
gn_numdiff = approx_fprime(pt, llf_gn, 1e-7)
assert_allclose(gn_deriv, gn_numdiff, 1e-4)
# Check the gradient of the QIF
def llf(params):
return model.objective(params)[0]
grad_numdiff = approx_fprime(pt, llf, 1e-7)
assert_allclose(grad, grad_numdiff, 1e-4)
@pytest.mark.parametrize("fam", [families.Gaussian(), families.Poisson(),
families.Binomial()])
@pytest.mark.parametrize("cov_struct", [QIFIndependence(), QIFExchangeable(),
QIFAutoregressive()])
def test_qif_fit(fam, cov_struct):
np.random.seed(234234)
n = 1000
q = 4
params = np.r_[1, -0.5, 0.2]
x = np.random.normal(size=(n, len(params)))
if isinstance(fam, families.Gaussian):
e = np.kron(np.random.normal(size=n//q), np.ones(q))
e = np.sqrt(0.5)*e + np.sqrt(1 - 0.5**2)*np.random.normal(size=n)
y = np.dot(x, params) + e
elif isinstance(fam, families.Poisson):
lpr = np.dot(x, params)
mean = np.exp(lpr)
y = np.random.poisson(mean)
elif isinstance(fam, families.Binomial):
lpr = np.dot(x, params)
mean = 1 / (1 + np.exp(-lpr))
y = (np.random.uniform(0, 1, size=n) < mean).astype(np.int)
g = np.kron(np.arange(n // q), np.ones(q)).astype(np.int)
model = QIF(y, x, groups=g, family=fam, cov_struct=cov_struct)
rslt = model.fit()
# Slack comparison to population values
assert_allclose(rslt.params, params, atol=0.05, rtol=0.05)
# Smoke test
_ = rslt.summary()
@pytest.mark.parametrize("cov_struct", [QIFIndependence(), QIFExchangeable(),
QIFAutoregressive()])
def test_formula(cov_struct):
np.random.seed(3423)
y = np.random.normal(size=100)
x = np.random.normal(size=(100, 2))
groups = np.kron(np.arange(25), np.ones(4))
model1 = QIF(y, x, groups=groups, cov_struct=cov_struct)
result1 = model1.fit()
df = pd.DataFrame({"y": y, "x1": x[:, 0], "x2": x[:, 1], "groups": groups})
model2 = QIF.from_formula("y ~ 0 + x1 + x2", groups="groups",
cov_struct=cov_struct, data=df)
result2 = model2.fit()
assert_allclose(result1.params, result2.params)
assert_allclose(result1.bse, result2.bse)
if not isinstance(cov_struct, QIFIndependence):
_ = result2.bic
_ = result2.aic