import numpy as np
import pandas as pd
import pytest
from statsmodels.regression.dimred import (
SlicedInverseReg, SAVE, PHD, CORE)
from numpy.testing import (assert_equal, assert_allclose)
from statsmodels.tools.numdiff import approx_fprime
def test_poisson():
np.random.seed(43242)
# Generate a non-orthogonal design matrix
xmat = np.random.normal(size=(500, 5))
xmat[:, 1] = 0.5*xmat[:, 0] + np.sqrt(1 - 0.5**2) * xmat[:, 1]
xmat[:, 3] = 0.5*xmat[:, 2] + np.sqrt(1 - 0.5**2) * xmat[:, 3]
b = np.r_[0, 1, -1, 0, 0.5]
lpr = np.dot(xmat, b)
ev = np.exp(lpr)
y = np.random.poisson(ev)
for method in range(6):
if method == 0:
model = SlicedInverseReg(y, xmat)
rslt = model.fit()
elif method == 1:
model = SAVE(y, xmat)
rslt = model.fit(slice_n=100)
elif method == 2:
model = SAVE(y, xmat, bc=True)
rslt = model.fit(slice_n=100)
elif method == 3:
df = pd.DataFrame({"y": y,
"x0": xmat[:, 0],
"x1": xmat[:, 1],
"x2": xmat[:, 2],
"x3": xmat[:, 3],
"x4": xmat[:, 4]})
model = SlicedInverseReg.from_formula(
"y ~ 0 + x0 + x1 + x2 + x3 + x4", data=df)
rslt = model.fit()
elif method == 4:
model = PHD(y, xmat)
rslt = model.fit()
elif method == 5:
model = PHD(y, xmat)
rslt = model.fit(resid=True)
# Check for concentration in one direction (this is
# a single index model)
assert_equal(np.abs(rslt.eigs[0] / rslt.eigs[1]) > 5, True)
# Check that the estimated direction aligns with the true
# direction
params = np.asarray(rslt.params)
q = np.dot(params[:, 0], b)
q /= np.sqrt(np.sum(params[:, 0]**2))
q /= np.sqrt(np.sum(b**2))
assert_equal(np.abs(q) > 0.95, True)
def test_sir_regularized_numdiff():
# Use numeric gradients to check the analytic gradient
# for the regularized SIRobjective function.
np.random.seed(93482)
n = 1000
p = 10
xmat = np.random.normal(size=(n, p))
y1 = np.dot(xmat, np.linspace(-1, 1, p))
y2 = xmat.sum(1)
y = y2 / (1 + y1**2) + np.random.normal(size=n)
model = SlicedInverseReg(y, xmat)
_ = model.fit()
# Second difference penalty matrix.
fmat = np.zeros((p-2, p))
for i in range(p-2):
fmat[i, i:i+3] = [1, -2, 1]
with pytest.warns(UserWarning, match="SIR.fit_regularized did not"):
_ = model.fit_regularized(2, 3*fmat)
# Compare the gradients to the numerical derivatives
for _ in range(5):
pa = np.random.normal(size=(p, 2))
pa, _, _ = np.linalg.svd(pa, 0)
gn = approx_fprime(pa.ravel(), model._regularized_objective, 1e-7)
gr = model._regularized_grad(pa.ravel())
assert_allclose(gn, gr, atol=1e-5, rtol=1e-4)
def test_sir_regularized_1d():
# Compare regularized SIR to traditional SIR, in a setting where the
# regularization is compatible with the true parameters (i.e. there
# is no regularization bias).
np.random.seed(93482)
n = 1000
p = 10
xmat = np.random.normal(size=(n, p))
y = np.dot(xmat[:, 0:4], np.r_[1, 1, -1, -1]) + np.random.normal(size=n)
model = SlicedInverseReg(y, xmat)
rslt = model.fit()
# The penalty drives p[0] ~ p[1] and p[2] ~ p[3]]
fmat = np.zeros((2, p))
fmat[0, 0:2] = [1, -1]
fmat[1, 2:4] = [1, -1]
rslt2 = model.fit_regularized(1, 3*fmat)
pa0 = np.zeros(p)
pa0[0:4] = [1, 1, -1, -1]
pa1 = rslt.params[:, 0]
pa2 = rslt2.params[:, 0:2]
# Compare two 1d subspaces
def sim(x, y):
x = x / np.sqrt(np.sum(x * x))
y = y / np.sqrt(np.sum(y * y))
return 1 - np.abs(np.dot(x, y))
# Regularized SIRshould be closer to the truth than traditional SIR
assert_equal(sim(pa0, pa1) > sim(pa0, pa2), True)
# Regularized SIR should be close to the truth
assert_equal(sim(pa0, pa2) < 1e-3, True)
# Regularized SIR should have a smaller penalty value than traditional SIR
assert_equal(np.sum(np.dot(fmat, pa1)**2) > np.sum(np.dot(fmat, pa2)**2),
True)
def test_sir_regularized_2d():
# Compare regularized SIR to traditional SIR when there is no penalty.
# The two procedures should agree exactly.
np.random.seed(93482)
n = 1000
p = 10
xmat = np.random.normal(size=(n, p))
y1 = np.dot(xmat[:, 0:4], np.r_[1, 1, -1, -1])
y2 = np.dot(xmat[:, 4:8], np.r_[1, 1, -1, -1])
y = y1 + np.arctan(y2) + np.random.normal(size=n)
model = SlicedInverseReg(y, xmat)
rslt1 = model.fit()
fmat = np.zeros((1, p))
for d in 1, 2, 3, 4:
if d < 3:
rslt2 = model.fit_regularized(d, fmat)
else:
with pytest.warns(UserWarning, match="SIR.fit_regularized did"):
rslt2 = model.fit_regularized(d, fmat)
pa1 = rslt1.params[:, 0:d]
pa1, _, _ = np.linalg.svd(pa1, 0)
pa2 = rslt2.params
_, s, _ = np.linalg.svd(np.dot(pa1.T, pa2))
assert_allclose(np.sum(s), d, atol=1e-1, rtol=1e-1)
def test_covreduce():
np.random.seed(34324)
p = 4
endog = []
exog = []
for k in range(3):
c = np.eye(p)
x = np.random.normal(size=(2, 2))
# The differences between the covariance matrices
# are all in the first 2 rows/columns.
c[0:2, 0:2] = np.dot(x.T, x)
cr = np.linalg.cholesky(c)
m = 1000*k + 50*k
x = np.random.normal(size=(m, p))
x = np.dot(x, cr.T)
exog.append(x)
endog.append(k * np.ones(m))
endog = np.concatenate(endog)
exog = np.concatenate(exog, axis=0)
for dim in 1, 2, 3:
cr = CORE(endog, exog, dim)
pt = np.random.normal(size=(p, dim))
pt, _, _ = np.linalg.svd(pt, 0)
gn = approx_fprime(pt.ravel(), cr.loglike, 1e-7)
g = cr.score(pt.ravel())
assert_allclose(g, gn, 1e-5, 1e-5)
rslt = cr.fit()
proj = rslt.params
assert_equal(proj.shape[0], p)
assert_equal(proj.shape[1], dim)
assert_allclose(np.dot(proj.T, proj), np.eye(dim), 1e-8, 1e-8)
if dim == 2:
# Here we know the approximate truth
projt = np.zeros((p, 2))
projt[0:2, 0:2] = np.eye(2)
assert_allclose(np.trace(np.dot(proj.T, projt)), 2,
rtol=1e-3, atol=1e-3)