Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ multivariate / tests / test_factor.py

# -*- coding: utf-8 -*-

import os

import numpy as np
import pandas as pd
from statsmodels.multivariate.factor import Factor
from numpy.testing import (assert_equal, assert_array_almost_equal, assert_,
                           assert_raises, assert_array_equal,
                           assert_array_less, assert_allclose)
import pytest

try:
    import matplotlib.pyplot as plt
    missing_matplotlib = False
    plt.switch_backend('Agg')

except ImportError:
    missing_matplotlib = True

# Example data
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
#     viewer.htm#statug_introreg_sect012.htm
X = pd.DataFrame([['Minas Graes', 2.068, 2.070, 1.580, 1, 0],
                  ['Minas Graes', 2.068, 2.074, 1.602, 2, 1],
                  ['Minas Graes', 2.090, 2.090, 1.613, 3, 0],
                  ['Minas Graes', 2.097, 2.093, 1.613, 4, 1],
                  ['Minas Graes', 2.117, 2.125, 1.663, 5, 0],
                  ['Minas Graes', 2.140, 2.146, 1.681, 6, 1],
                  ['Matto Grosso', 2.045, 2.054, 1.580, 7, 0],
                  ['Matto Grosso', 2.076, 2.088, 1.602, 8, 1],
                  ['Matto Grosso', 2.090, 2.093, 1.643, 9, 0],
                  ['Matto Grosso', 2.111, 2.114, 1.643, 10, 1],
                  ['Santa Cruz', 2.093, 2.098, 1.653, 11, 0],
                  ['Santa Cruz', 2.100, 2.106, 1.623, 12, 1],
                  ['Santa Cruz', 2.104, 2.101, 1.653, 13, 0]],
                 columns=['Loc', 'Basal', 'Occ', 'Max', 'id', 'alt'])


def test_auto_col_name():
    # Test auto generated variable names when endog_names is None
    mod = Factor(None, 2, corr=np.eye(11), endog_names=None,
                 smc=False)
    assert_array_equal(mod.endog_names,
                       ['var00', 'var01', 'var02', 'var03', 'var04', 'var05',
                        'var06', 'var07', 'var08', 'var09', 'var10'])


def test_direct_corr_matrix():
    # Test specifying the correlation matrix directly
    mod = Factor(None, 2, corr=np.corrcoef(X.iloc[:, 1:-1], rowvar=0),
                 smc=False)
    results = mod.fit(tol=1e-10)
    a = np.array([[0.965392158864, 0.225880658666255],
                  [0.967587154301, 0.212758741910989],
                  [0.929891035996, -0.000603217967568],
                  [0.486822656362, -0.869649573289374]])
    assert_array_almost_equal(results.loadings, a, decimal=8)
    # Test set and get endog_names
    mod.endog_names = X.iloc[:, 1:-1].columns
    assert_array_equal(mod.endog_names, ['Basal', 'Occ', 'Max', 'id'])

    # Test set endog_names with the wrong number of elements
    assert_raises(ValueError, setattr, mod, 'endog_names',
                  X.iloc[:, :1].columns)


def test_unknown_fa_method_error():
    # Test raise error if an unkonwn FA method is specified in fa.method
    mod = Factor(X.iloc[:, 1:-1], 2, method='ab')
    assert_raises(ValueError, mod.fit)


def test_example_compare_to_R_output():
    # Testing basic functions and compare to R output

    # R code for producing the results:
    # library(psych)
    # library(GPArotation)
    # Basal = c(2.068,	2.068,	2.09,	2.097,	2.117,	2.14,	2.045,	2.076,	2.09,	2.111,	2.093,	2.1,	2.104)
    # Occ = c(2.07,	2.074,	2.09,	2.093,	2.125,	2.146,	2.054,	2.088,	2.093,	2.114,	2.098,	2.106,	2.101)
    # Max = c(1.58,	1.602,	1.613,	1.613,	1.663,	1.681,	1.58,	1.602,	1.643,	1.643,	1.653,	1.623,	1.653)
    # id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
    # Y <- cbind(Basal, Occ, Max, id)
    # a <- fa(Y, nfactors=2, fm="pa", rotate="none", SMC=FALSE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], -a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="Promax", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="Varimax", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="quartimax", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], -a$loadings[,2])
    # b
    # a <- fa(Y, nfactors=2, fm="pa", rotate="oblimin", SMC=TRUE, min.err=1e-10)
    # b <- cbind(a$loadings[,1], a$loadings[,2])
    # b

    # No rotation without squared multiple correlations prior
    # produce same results as in R `fa`
    mod = Factor(X.iloc[:, 1:-1], 2, smc=False)
    results = mod.fit(tol=1e-10)
    a = np.array([[0.965392158864, 0.225880658666255],
                  [0.967587154301, 0.212758741910989],
                  [0.929891035996, -0.000603217967568],
                  [0.486822656362, -0.869649573289374]])
    assert_array_almost_equal(results.loadings, a, decimal=8)

    # No rotation WITH squared multiple correlations prior
    # produce same results as in R `fa`
    mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
    results = mod.fit()
    a = np.array([[0.97541115, 0.20280987],
                  [0.97113975, 0.17207499],
                  [0.9618705, -0.2004196],
                  [0.37570708, -0.45821379]])
    assert_array_almost_equal(results.loadings, a, decimal=8)

    # Same as R GRArotation
    results.rotate('varimax')
    a = np.array([[0.98828898, -0.12587155],
                  [0.97424206, -0.15354033],
                  [0.84418097, -0.502714],
                  [0.20601929, -0.55558235]])
    assert_array_almost_equal(results.loadings, a, decimal=8)

    results.rotate('quartimax')  # Same as R fa
    a = np.array([[0.98935598, 0.98242714, 0.94078972, 0.33442284],
                  [0.117190049, 0.086943252, -0.283332952, -0.489159543]])
    assert_array_almost_equal(results.loadings, a.T, decimal=8)

    results.rotate('equamax')  # Not the same as R fa

    results.rotate('promax')  # Not the same as R fa

    results.rotate('biquartimin')  # Not the same as R fa

    results.rotate('oblimin')  # Same as R fa
    a = np.array([[1.02834170170, 1.00178840104, 0.71824931384,
                   -0.00013510048],
                  [0.06563421, 0.03096076, -0.39658839, -0.59261944]])
    assert_array_almost_equal(results.loadings, a.T, decimal=8)

    # Testing result summary string
    results.rotate('varimax')
    desired = (
"""   Factor analysis results
=============================
      Eigenvalues
-----------------------------
 Basal   Occ    Max      id
-----------------------------
 2.9609 0.3209 0.0000 -0.0000
-----------------------------

-----------------------------
      Communality
-----------------------------
  Basal   Occ    Max     id
-----------------------------
  0.9926 0.9727 0.9654 0.3511
-----------------------------

-----------------------------
   Pre-rotated loadings
-----------------------------------
            factor 0       factor 1
-----------------------------------
Basal         0.9754         0.2028
Occ           0.9711         0.1721
Max           0.9619        -0.2004
id            0.3757        -0.4582
-----------------------------

-----------------------------
   varimax rotated loadings
-----------------------------------
            factor 0       factor 1
-----------------------------------
Basal         0.9883        -0.1259
Occ           0.9742        -0.1535
Max           0.8442        -0.5027
id            0.2060        -0.5556
=============================
""")
    actual = results.summary().as_text()
    actual = "\n".join(line.rstrip() for line in actual.splitlines()) + "\n"
    assert_equal(actual, desired)


@pytest.mark.skipif(missing_matplotlib, reason='matplotlib not available')
def test_plots(close_figures):
    mod = Factor(X.iloc[:, 1:], 3)
    results = mod.fit()
    results.rotate('oblimin')
    fig = results.plot_scree()

    fig_loadings = results.plot_loadings()
    assert_equal(3, len(fig_loadings))


@pytest.mark.smoke
def test_getframe_smoke():
    #  mostly smoke tests for now
    mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
    res = mod.fit()

    df = res.get_loadings_frame(style='raw')
    assert_(isinstance(df, pd.DataFrame))

    lds = res.get_loadings_frame(style='strings', decimals=3, threshold=0.3)
    lds.to_latex()

    # The Styler option require jinja2, skip if not available
    try:
        from jinja2 import Template  # noqa:F401
    except ImportError:
        return
        # TODO: separate this and do pytest.skip?

    try:
        from pandas.io import formats as pd_formats
    except ImportError:
        from pandas import formats as pd_formats

    ldf = res.get_loadings_frame(style='display')
    assert_(isinstance(ldf, pd_formats.style.Styler))
    assert_(isinstance(ldf.data, pd.DataFrame))

    res.get_loadings_frame(style='display', decimals=3, threshold=0.2)

    res.get_loadings_frame(style='display', decimals=3, color_max='GAINSBORO')

    res.get_loadings_frame(style='display', decimals=3, threshold=0.45, highlight_max=False, sort_=False)


def test_factor_missing():
    xm = X.iloc[:, 1:-1].copy()
    nobs, k_endog = xm.shape
    xm.iloc[2,2] = np.nan
    mod = Factor(xm, 2)
    assert_equal(mod.nobs, nobs - 1)
    assert_equal(mod.k_endog, k_endog)
    assert_equal(mod.endog.shape, (nobs - 1, k_endog))


def _zscore(x):
    # helper function
    return (x - x.mean(0)) / x.std(0)


@pytest.mark.smoke
def test_factor_scoring():
    path = os.path.abspath(__file__)
    dir_path = os.path.dirname(path)
    csv_path = os.path.join(dir_path, 'results', 'factor_data.csv')
    y = pd.read_csv(csv_path)
    csv_path = os.path.join(dir_path, 'results', 'factors_stata.csv')
    f_s = pd.read_csv(csv_path)
    #  mostly smoke tests for now
    mod = Factor(y, 2)
    res = mod.fit(maxiter=1)
    res.rotate('varimax')
    f_reg = res.factor_scoring(method='reg')
    assert_allclose(f_reg * [1, -1], f_s[["f1", 'f2']].values,
                    atol=1e-4, rtol=1e-3)
    f_bart = res.factor_scoring()
    assert_allclose(f_bart * [1, -1], f_s[["f1b", 'f2b']].values,
                    atol=1e-4, rtol=1e-3)

    # check we have high correlation to ols and gls
    f_ols = res.factor_scoring(method='ols')
    f_gls = res.factor_scoring(method='gls')
    f_reg_z = _zscore(f_reg)
    f_ols_z = _zscore(f_ols)
    f_gls_z = _zscore(f_gls)
    assert_array_less(0.98, (f_ols_z * f_reg_z).mean(0))
    assert_array_less(0.999, (f_gls_z * f_reg_z).mean(0))

    # with oblique rotation
    res.rotate('oblimin')
    # Note: Stata has second factor with flipped sign compared to statsmodels
    assert_allclose(res._corr_factors()[0, 1],  (-1) * 0.25651037, rtol=1e-3)
    f_reg = res.factor_scoring(method='reg')
    assert_allclose(f_reg * [1, -1], f_s[["f1o", 'f2o']].values,
                    atol=1e-4, rtol=1e-3)
    f_bart = res.factor_scoring()
    assert_allclose(f_bart * [1, -1], f_s[["f1ob", 'f2ob']].values,
                    atol=1e-4, rtol=1e-3)

    # check we have high correlation to ols and gls
    f_ols = res.factor_scoring(method='ols')
    f_gls = res.factor_scoring(method='gls')
    f_reg_z = _zscore(f_reg)
    f_ols_z = _zscore(f_ols)
    f_gls_z = _zscore(f_gls)
    assert_array_less(0.97, (f_ols_z * f_reg_z).mean(0))
    assert_array_less(0.999, (f_gls_z * f_reg_z).mean(0))

    # check provided endog
    f_ols2 = res.factor_scoring(method='ols', endog=res.model.endog)
    assert_allclose(f_ols2, f_ols, rtol=1e-13)