Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Bower components, Debian packages, RPM packages, and NuGet packages.

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ formula / tests / test_formula.py

from statsmodels.compat.python import iteritems
from statsmodels.compat.pandas import assert_series_equal

from io import StringIO
import warnings

from statsmodels.formula.api import ols
from statsmodels.formula.formulatools import make_hypotheses_matrices
from statsmodels.tools import add_constant
from statsmodels.datasets.longley import load, load_pandas
from statsmodels.datasets import cpunish

import numpy.testing as npt
from statsmodels.tools.testing import assert_equal
import numpy as np
import pandas as pd
import patsy
import pytest


# Formula regressing TOTEMP on six regressors; used by TestFormulaPandas and
# TestFormulaDict to build the OLS model under test.
longley_formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'


class CheckFormulaOLS:
    """Shared checks for an OLS model built from a formula.

    Subclasses assign ``cls.model`` in their own ``setup_class`` and call
    this class's ``setup_class``, which stores the reference dataset on
    ``cls.data`` for the comparisons below.
    """

    @classmethod
    def setup_class(cls):
        # Reference data requested in non-pandas form.
        cls.data = load(as_pandas=False)

    def test_endog_names(self):
        """The response name is taken from the left-hand side."""
        assert self.model.endog_names == 'TOTEMP'

    def test_exog_names(self):
        """An intercept column precedes the six named regressors."""
        expected_names = ['Intercept', 'GNPDEFL', 'GNP',
                          'UNEMP', 'ARMED', 'POP', 'YEAR']
        assert self.model.exog_names == expected_names

    def test_design(self):
        """The model's exog equals the reference exog with a constant prepended."""
        reference_design = add_constant(self.data.exog, prepend=True)
        npt.assert_equal(self.model.exog, reference_design)

    def test_endog(self):
        """The model's endog equals the reference endog."""
        npt.assert_equal(self.model.endog, self.data.endog)

    @pytest.mark.smoke
    def test_summary(self):
        """Smoke test: fitting and summarizing must not raise."""
        with warnings.catch_warnings():
            # Silence the small-sample kurtosis-test warning.
            warnings.filterwarnings(
                "ignore",
                "kurtosistest only valid for n>=20",
            )
            fitted = self.model.fit()
            fitted.summary()


class TestFormulaPandas(CheckFormulaOLS):
    """Run the shared checks on a model built from ``load_pandas().data``."""

    @classmethod
    def setup_class(cls):
        # The base class attaches the reference dataset as ``cls.data``.
        super().setup_class()
        frame = load_pandas().data
        cls.model = ols(longley_formula, frame)


class TestFormulaDict(CheckFormulaOLS):
    """Run the shared checks on a model built from a dict of lists."""

    @classmethod
    def setup_class(cls):
        # The base class attaches the reference dataset as ``cls.data``.
        super().setup_class()
        frame = load_pandas().data
        # Convert every column to a plain Python list keyed by column name.
        columns = {name: values.tolist() for name, values in iteritems(frame)}
        cls.model = ols(longley_formula, columns)


def test_tests():
    """Check the coefficient and constant matrices built from a hypothesis string."""
    fitted = ols('TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR',
                 load_pandas().data).fit()
    hypotheses = '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)'
    constraints = make_hypotheses_matrices(fitted, hypotheses)

    # One expected row per hypothesis in the string above.
    expected_coefs = [
        [0, 1, -1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1./1829],
    ]
    expected_constants = [[0], [2], [1]]

    npt.assert_almost_equal(constraints.coefs, expected_coefs, decimal=8)
    npt.assert_array_equal(constraints.constants, expected_constants)


def test_formula_labels():
    """Fitted values keep the index of the data frame the model was built from."""
    # make sure labels pass through patsy as expected
    # data(Duncan) from car in R
    # NOTE(review): each data row has one more field than the header, so the
    # leading occupation name presumably becomes the row index on read_csv --
    # confirm against the pandas documentation.
    duncan_csv = StringIO('"type","income","education","prestige"\n'
                          '"accountant","prof",62,86,82\n'
                          '"pilot","prof",72,76,83\n'
                          '"architect","prof",75,92,90\n'
                          '"author","prof",55,90,76\n'
                          '"chemist","prof",64,86,90\n'
                          '"minister","prof",21,84,87\n'
                          '"professor","prof",64,93,93\n'
                          '"dentist","prof",80,100,90\n'
                          '"reporter","wc",67,87,52\n'
                          '"engineer","prof",72,86,88\n'
                          '"undertaker","prof",42,74,57\n'
                          '"lawyer","prof",76,98,89\n'
                          '"physician","prof",76,97,97\n'
                          '"welfare.worker","prof",41,84,59\n'
                          '"teacher","prof",48,91,73\n'
                          '"conductor","wc",76,34,38\n'
                          '"contractor","prof",53,45,76\n'
                          '"factory.owner","prof",60,56,81\n'
                          '"store.manager","prof",42,44,45\n'
                          '"banker","prof",78,82,92\n'
                          '"bookkeeper","wc",29,72,39\n'
                          '"mail.carrier","wc",48,55,34\n'
                          '"insurance.agent","wc",55,71,41\n'
                          '"store.clerk","wc",29,50,16\n'
                          '"carpenter","bc",21,23,33\n'
                          '"electrician","bc",47,39,53\n'
                          '"RR.engineer","bc",81,28,67\n'
                          '"machinist","bc",36,32,57\n'
                          '"auto.repairman","bc",22,22,26\n'
                          '"plumber","bc",44,25,29\n'
                          '"gas.stn.attendant","bc",15,29,10\n'
                          '"coal.miner","bc",7,7,15\n'
                          '"streetcar.motorman","bc",42,26,19\n'
                          '"taxi.driver","bc",9,19,10\n'
                          '"truck.driver","bc",21,15,13\n'
                          '"machine.operator","bc",21,20,24\n'
                          '"barber","bc",16,26,20\n'
                          '"bartender","bc",16,28,7\n'
                          '"shoe.shiner","bc",9,17,3\n'
                          '"cook","bc",14,22,16\n'
                          '"soda.clerk","bc",12,30,6\n'
                          '"watchman","bc",17,25,11\n'
                          '"janitor","bc",7,20,8\n'
                          '"policeman","bc",34,47,41\n'
                          '"waiter","bc",8,32,10')
    duncan = pd.read_csv(duncan_csv)
    fitted = ols("prestige ~ income + education", duncan).fit()
    assert_equal(fitted.fittedvalues.index, duncan.index)


def test_formula_predict():
    """Predicting on the dataset's exog must match the fitted values.

    The formula applies ``log`` to two regressors, so ``predict`` has to
    re-apply the formula to the raw columns it is given.
    """
    # `log` is needed in the namespace for patsy to find
    from numpy import log  # noqa:F401
    formula = """TOTEMP ~ log(GNPDEFL) + log(GNP) + UNEMP + ARMED +
                    POP + YEAR"""
    # Load once; both the full frame and the exog frame come from this object.
    dataset = load_pandas()
    results = ols(formula, dataset.data).fit()
    npt.assert_almost_equal(results.fittedvalues.values,
                            results.predict(dataset.exog), 8)


def test_formula_predict_series():
    """Check the index of the Series returned by ``predict`` for several inputs."""
    frame = pd.DataFrame({"y": [1, 2, 3], "x": [1, 2, 3]}, index=[5, 3, 1])
    fitted = ols('y ~ x', frame).fit()

    # (input handed to predict, index expected on the returned Series)
    cases = [
        (frame, [5, 3, 1]),
        (frame.x, [5, 3, 1]),
        (pd.Series([1, 2, 3], index=[1, 2, 3], name='x'), [1, 2, 3]),
        ({"x": [1, 2, 3]}, [0, 1, 2]),
    ]
    for exog, expected_index in cases:
        expected = pd.Series([1., 2., 3.], index=expected_index)
        assert_series_equal(fitted.predict(exog), expected)


def test_patsy_lazy_dict():
    """Fit and predict from a dict subclass that serves columns via ``__missing__``.

    The first half uses complete data; the second half blanks one ``INCOME``
    value and checks that exactly one record is lost.
    """
    class LazyDict(dict):
        # The dict itself stays empty, so every key lookup falls through to
        # ``__missing__`` and is served from the wrapped data as an ndarray.
        def __init__(self, data):
            self.data = data

        def __missing__(self, key):
            return np.array(self.data[key])

    data = cpunish.load_pandas().data
    data = LazyDict(data)
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    npt.assert_allclose(res.fittedvalues, res2)

    data = cpunish.load_pandas().data
    # Assign in a single .loc step. The chained form
    # ``data['INCOME'].loc[0] = None`` writes to an intermediate Series and
    # is not guaranteed to modify ``data`` itself.
    data.loc[0, 'INCOME'] = None

    data = LazyDict(data)
    data.index = cpunish.load_pandas().data.index
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    assert_equal(res.fittedvalues, res2)  # Should lose a record
    assert_equal(len(res2) + 1, len(cpunish.load_pandas().data))


def test_patsy_missing_data():
    """Check how a missing ``INCOME`` value is handled by fit and predict.

    Covers a pandas DataFrame first, then the same data converted to a
    record array.
    """
    # Test pandas-style first
    data = cpunish.load_pandas().data
    # Assign in a single .loc step. The chained form
    # ``data['INCOME'].loc[0] = None`` writes to an intermediate Series and
    # is not guaranteed to modify ``data`` itself.
    data.loc[0, 'INCOME'] = None
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()
    res2 = res.predict(data)
    # First record will be dropped during fit, but not during predict
    assert_equal(res.fittedvalues, res2[1:])

    # Non-pandas version
    data = cpunish.load_pandas().data
    data.loc[0, 'INCOME'] = None
    data = data.to_records(index=False)
    # Record every warning so the last one raised by predict can be inspected.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        res2 = res.predict(data)
        assert 'ValueWarning' in repr(w[-1].message)
        assert 'nan values have been dropped' in repr(w[-1].message)
    # First record will be dropped in both cases
    assert_equal(res.fittedvalues, res2)


def test_predict_nondataframe():
    """Passing a bare list to ``predict`` on a formula model raises PatsyError."""
    readings = pd.DataFrame(
        [[3, 0.030], [10, 0.060], [20, 0.120]],
        columns=['BSA', 'Absorbance'],
    )
    fitted = ols('Absorbance ~ BSA', data=readings).fit()

    with pytest.raises(patsy.PatsyError):
        fitted.predict([0.25])


def test_formula_environment():
    """Variables can come from an explicit ``eval_env``; a string is rejected."""
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [2, 4, 6]})

    # ``z`` is not a column of the frame; it is supplied through the
    # evaluation environment only.
    extra_namespace = {'z': [3, 6, 9]}
    environment = patsy.EvalEnvironment([extra_namespace])
    fitted_spec = ols('y ~ x + z', eval_env=environment, data=frame)
    assert 'z' in fitted_spec.exog_names

    # A plain string as eval_env must raise TypeError.
    with pytest.raises(TypeError):
        ols('y ~ x', eval_env='env', data=frame)