Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ regression / tests / test_dimred.py

import numpy as np
import pandas as pd
import pytest

from statsmodels.regression.dimred import (
     SlicedInverseReg, SAVE, PHD, CORE)
from numpy.testing import (assert_equal, assert_allclose)
from statsmodels.tools.numdiff import approx_fprime


def test_poisson():

    np.random.seed(43242)

    # Generate a non-orthogonal design matrix
    xmat = np.random.normal(size=(500, 5))
    xmat[:, 1] = 0.5*xmat[:, 0] + np.sqrt(1 - 0.5**2) * xmat[:, 1]
    xmat[:, 3] = 0.5*xmat[:, 2] + np.sqrt(1 - 0.5**2) * xmat[:, 3]

    b = np.r_[0, 1, -1, 0, 0.5]
    lpr = np.dot(xmat, b)
    ev = np.exp(lpr)
    y = np.random.poisson(ev)

    for method in range(6):

        if method == 0:
            model = SlicedInverseReg(y, xmat)
            rslt = model.fit()
        elif method == 1:
            model = SAVE(y, xmat)
            rslt = model.fit(slice_n=100)
        elif method == 2:
            model = SAVE(y, xmat, bc=True)
            rslt = model.fit(slice_n=100)
        elif method == 3:
            df = pd.DataFrame({"y": y,
                               "x0": xmat[:, 0],
                               "x1": xmat[:, 1],
                               "x2": xmat[:, 2],
                               "x3": xmat[:, 3],
                               "x4": xmat[:, 4]})
            model = SlicedInverseReg.from_formula(
                        "y ~ 0 + x0 + x1 + x2 + x3 + x4", data=df)
            rslt = model.fit()
        elif method == 4:
            model = PHD(y, xmat)
            rslt = model.fit()
        elif method == 5:
            model = PHD(y, xmat)
            rslt = model.fit(resid=True)

        # Check for concentration in one direction (this is
        # a single index model)
        assert_equal(np.abs(rslt.eigs[0] / rslt.eigs[1]) > 5, True)

        # Check that the estimated direction aligns with the true
        # direction
        params = np.asarray(rslt.params)
        q = np.dot(params[:, 0], b)
        q /= np.sqrt(np.sum(params[:, 0]**2))
        q /= np.sqrt(np.sum(b**2))
        assert_equal(np.abs(q) > 0.95, True)


def test_sir_regularized_numdiff():
    # Use numeric gradients to check the analytic gradient
    # for the regularized SIRobjective function.

    np.random.seed(93482)

    n = 1000
    p = 10
    xmat = np.random.normal(size=(n, p))
    y1 = np.dot(xmat, np.linspace(-1, 1, p))
    y2 = xmat.sum(1)
    y = y2 / (1 + y1**2) + np.random.normal(size=n)
    model = SlicedInverseReg(y, xmat)
    _ = model.fit()

    # Second difference penalty matrix.
    fmat = np.zeros((p-2, p))
    for i in range(p-2):
        fmat[i, i:i+3] = [1, -2, 1]

    with pytest.warns(UserWarning, match="SIR.fit_regularized did not"):
        _ = model.fit_regularized(2, 3*fmat)

    # Compare the gradients to the numerical derivatives
    for _ in range(5):
        pa = np.random.normal(size=(p, 2))
        pa, _, _ = np.linalg.svd(pa, 0)
        gn = approx_fprime(pa.ravel(), model._regularized_objective, 1e-7)
        gr = model._regularized_grad(pa.ravel())
        assert_allclose(gn, gr, atol=1e-5, rtol=1e-4)


def test_sir_regularized_1d():
    # Compare regularized SIR to traditional SIR, in a setting where the
    # regularization is compatible with the true parameters (i.e. there
    # is no regularization bias).

    np.random.seed(93482)

    n = 1000
    p = 10
    xmat = np.random.normal(size=(n, p))
    y = np.dot(xmat[:, 0:4], np.r_[1, 1, -1, -1]) + np.random.normal(size=n)
    model = SlicedInverseReg(y, xmat)
    rslt = model.fit()

    # The penalty drives p[0] ~ p[1] and p[2] ~ p[3]]
    fmat = np.zeros((2, p))
    fmat[0, 0:2] = [1, -1]
    fmat[1, 2:4] = [1, -1]

    rslt2 = model.fit_regularized(1, 3*fmat)

    pa0 = np.zeros(p)
    pa0[0:4] = [1, 1, -1, -1]
    pa1 = rslt.params[:, 0]
    pa2 = rslt2.params[:, 0:2]

    # Compare two 1d subspaces
    def sim(x, y):
        x = x / np.sqrt(np.sum(x * x))
        y = y / np.sqrt(np.sum(y * y))
        return 1 - np.abs(np.dot(x, y))

    # Regularized SIRshould be closer to the truth than traditional SIR
    assert_equal(sim(pa0, pa1) > sim(pa0, pa2), True)

    # Regularized SIR should be close to the truth
    assert_equal(sim(pa0, pa2) < 1e-3, True)

    # Regularized SIR should have a smaller penalty value than traditional SIR
    assert_equal(np.sum(np.dot(fmat, pa1)**2) > np.sum(np.dot(fmat, pa2)**2),
                 True)


def test_sir_regularized_2d():
    # Compare regularized SIR to traditional SIR when there is no penalty.
    # The two procedures should agree exactly.

    np.random.seed(93482)

    n = 1000
    p = 10
    xmat = np.random.normal(size=(n, p))
    y1 = np.dot(xmat[:, 0:4], np.r_[1, 1, -1, -1])
    y2 = np.dot(xmat[:, 4:8], np.r_[1, 1, -1, -1])
    y = y1 + np.arctan(y2) + np.random.normal(size=n)
    model = SlicedInverseReg(y, xmat)
    rslt1 = model.fit()

    fmat = np.zeros((1, p))

    for d in 1, 2, 3, 4:
        if d < 3:
            rslt2 = model.fit_regularized(d, fmat)
        else:
            with pytest.warns(UserWarning, match="SIR.fit_regularized did"):
                rslt2 = model.fit_regularized(d, fmat)
        pa1 = rslt1.params[:, 0:d]
        pa1, _, _ = np.linalg.svd(pa1, 0)
        pa2 = rslt2.params
        _, s, _ = np.linalg.svd(np.dot(pa1.T, pa2))
        assert_allclose(np.sum(s), d, atol=1e-1, rtol=1e-1)


def test_covreduce():

    np.random.seed(34324)

    p = 4
    endog = []
    exog = []
    for k in range(3):
        c = np.eye(p)
        x = np.random.normal(size=(2, 2))
        # The differences between the covariance matrices
        # are all in the first 2 rows/columns.
        c[0:2, 0:2] = np.dot(x.T, x)

        cr = np.linalg.cholesky(c)
        m = 1000*k + 50*k
        x = np.random.normal(size=(m, p))
        x = np.dot(x, cr.T)
        exog.append(x)
        endog.append(k * np.ones(m))

    endog = np.concatenate(endog)
    exog = np.concatenate(exog, axis=0)

    for dim in 1, 2, 3:

        cr = CORE(endog, exog, dim)

        pt = np.random.normal(size=(p, dim))
        pt, _, _ = np.linalg.svd(pt, 0)
        gn = approx_fprime(pt.ravel(), cr.loglike, 1e-7)
        g = cr.score(pt.ravel())

        assert_allclose(g, gn, 1e-5, 1e-5)

        rslt = cr.fit()
        proj = rslt.params
        assert_equal(proj.shape[0], p)
        assert_equal(proj.shape[1], dim)
        assert_allclose(np.dot(proj.T, proj), np.eye(dim), 1e-8, 1e-8)

        if dim == 2:
            # Here we know the approximate truth
            projt = np.zeros((p, 2))
            projt[0:2, 0:2] = np.eye(2)
            assert_allclose(np.trace(np.dot(proj.T, projt)), 2,
                            rtol=1e-3, atol=1e-3)