# Package version: 0.10.2
"""
Tests for the generic MLEModel
Author: Chad Fulton
License: Simplified-BSD
"""
from __future__ import division, absolute_import, print_function
import os
import re
import warnings
import numpy as np
import pandas as pd
import pytest
from statsmodels.tsa.statespace import (sarimax, varmax, kalman_filter,
kalman_smoother)
from statsmodels.tsa.statespace.mlemodel import MLEModel, MLEResultsWrapper
from statsmodels.datasets import nile
from numpy.testing import (
assert_almost_equal, assert_equal, assert_allclose, assert_raises)
from statsmodels.tsa.statespace.tests.results import (
results_sarimax, results_var_misc)
current_path = os.path.dirname(os.path.abspath(__file__))
# Basic kwargs
# Minimal local-level state space specification shared by most tests below:
# a single state, all-identity scalar system matrices, and approximate
# diffuse initialization.
kwargs = {
    'k_states': 1, 'design': [[1]], 'transition': [[1]],
    'selection': [[1]], 'state_cov': [[1]],
    'initialization': 'approximate_diffuse'
}
def get_dummy_mod(fit=True, pandas=False):
    # Construct a time-varying-parameters regression in which the parameters
    # are actually constant and the regression fit is perfect: the regressor
    # is exactly 2x the dependent variable.
    endog = np.arange(100) * 1.0
    exog = 2 * endog

    if pandas:
        index = pd.date_range('1960-01-01', periods=100, freq='MS')
        endog = pd.Series(endog, index=index)
        exog = pd.Series(exog, index=index)

    mod = sarimax.SARIMAX(
        endog, exog=exog, order=(0, 0, 0),
        time_varying_regression=True, mle_regression=False)

    res = None
    if fit:
        # Silence convergence / numerical warnings during fitting
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res = mod.fit(disp=-1)

    return mod, res
def test_init_matrices_time_invariant():
    # System matrices passed through __init__ (time-invariant case) should
    # be stored unchanged in the state space representation.
    k_endog, k_states, k_posdef = 2, 3, 1
    endog = np.zeros((10, 2))

    matrices = {
        'obs_intercept': np.arange(k_endog) * 1.0,
        'design': (np.arange(k_endog * k_states) * 1.0).reshape(
            k_endog, k_states),
        'obs_cov': (np.arange(k_endog**2) * 1.0).reshape(k_endog, k_endog),
        'state_intercept': np.arange(k_states) * 1.0,
        'transition': (np.arange(k_states**2) * 1.0).reshape(
            k_states, k_states),
        'selection': (np.arange(k_states * k_posdef) * 1.0).reshape(
            k_states, k_posdef),
        'state_cov': (np.arange(k_posdef**2) * 1.0).reshape(
            k_posdef, k_posdef),
    }

    mod = MLEModel(endog, k_states=k_states, k_posdef=k_posdef, **matrices)

    # Each matrix should round-trip through the model unchanged
    for name, value in matrices.items():
        assert_allclose(mod[name], value)
def test_init_matrices_time_varying():
    # System matrices passed through __init__ (time-varying case, trailing
    # time dimension of length nobs) should be stored unchanged in the
    # state space representation.
    nobs = 10
    k_endog, k_states, k_posdef = 2, 3, 1
    endog = np.zeros((10, 2))

    matrices = {
        'obs_intercept': (np.arange(k_endog * nobs) * 1.0).reshape(
            k_endog, nobs),
        'design': (np.arange(k_endog * k_states * nobs) * 1.0).reshape(
            k_endog, k_states, nobs),
        'obs_cov': (np.arange(k_endog**2 * nobs) * 1.0).reshape(
            k_endog, k_endog, nobs),
        'state_intercept': (np.arange(k_states * nobs) * 1.0).reshape(
            k_states, nobs),
        'transition': (np.arange(k_states**2 * nobs) * 1.0).reshape(
            k_states, k_states, nobs),
        'selection': (np.arange(k_states * k_posdef * nobs) * 1.0).reshape(
            k_states, k_posdef, nobs),
        'state_cov': (np.arange(k_posdef**2 * nobs) * 1.0).reshape(
            k_posdef, k_posdef, nobs),
    }

    mod = MLEModel(endog, k_states=k_states, k_posdef=k_posdef, **matrices)

    # Each matrix should round-trip through the model unchanged
    for name, value in matrices.items():
        assert_allclose(mod[name], value)
def test_wrapping():
    # Test the wrapping of various Representation / KalmanFilter /
    # KalmanSmoother methods / attributes
    mod, _ = get_dummy_mod(fit=False)

    # Test that we can get the design matrix
    # (the time-varying regression puts the exog values, 2 * arange(100),
    # into the design matrix)
    assert_equal(mod['design', 0, 0], 2.0 * np.arange(100))

    # Test that we can set individual elements of the design matrix
    mod['design', 0, 0, :] = 2
    assert_equal(mod.ssm['design', 0, 0, :], 2)
    assert_equal(mod.ssm['design'].shape, (1, 1, 100))

    # Test that we can set the entire design matrix
    mod['design'] = [[3.]]
    assert_equal(mod.ssm['design', 0, 0], 3.)
    # (Now it's no longer time-varying, so only 2-dim)
    assert_equal(mod.ssm['design'].shape, (1, 1))

    # Test that we can change the following properties: loglikelihood_burn,
    # initial_variance, tolerance
    assert_equal(mod.loglikelihood_burn, 1)
    mod.loglikelihood_burn = 0
    assert_equal(mod.ssm.loglikelihood_burn, 0)
    assert_equal(mod.tolerance, mod.ssm.tolerance)
    mod.tolerance = 0.123
    assert_equal(mod.ssm.tolerance, 0.123)
    assert_equal(mod.initial_variance, 1e10)
    mod.initial_variance = 1e12
    assert_equal(mod.ssm.initial_variance, 1e12)

    # Test that we can use the following wrappers: initialization,
    # initialize_known, initialize_stationary, initialize_approximate_diffuse
    # Initialization starts off as none
    # NOTE(review): `isinstance(..., object)` is True for any value, so this
    # assertion cannot fail -- confirm what the intended check was.
    assert_equal(isinstance(mod.initialization, object), True)

    # Since the SARIMAX model may be fully stationary or may have diffuse
    # elements, it uses a custom initialization by default, but it can be
    # overridden by users
    mod.initialize_default()  # no-op here

    mod.initialize_approximate_diffuse(1e5)
    assert_equal(mod.initialization.initialization_type, 'approximate_diffuse')
    assert_equal(mod.initialization.approximate_diffuse_variance, 1e5)

    mod.initialize_known([5.], [[40]])
    assert_equal(mod.initialization.initialization_type, 'known')
    assert_equal(mod.initialization.constant, [5.])
    assert_equal(mod.initialization.stationary_cov, [[40]])

    mod.initialize_stationary()
    assert_equal(mod.initialization.initialization_type, 'stationary')

    # Test that we can use the following wrapper methods: set_filter_method,
    # set_stability_method, set_conserve_memory, set_smoother_output
    # The defaults are as follows:
    assert_equal(mod.ssm.filter_method, kalman_filter.FILTER_CONVENTIONAL)
    assert_equal(
        mod.ssm.stability_method,
        kalman_filter.STABILITY_FORCE_SYMMETRY)
    assert_equal(mod.ssm.conserve_memory, kalman_filter.MEMORY_STORE_ALL)
    assert_equal(mod.ssm.smoother_output, kalman_smoother.SMOOTHER_ALL)

    # Now, create the Cython filter object and assert that they have
    # transferred correctly
    mod.ssm._initialize_filter()
    kf = mod.ssm._kalman_filter
    assert_equal(kf.filter_method, kalman_filter.FILTER_CONVENTIONAL)
    assert_equal(kf.stability_method, kalman_filter.STABILITY_FORCE_SYMMETRY)
    assert_equal(kf.conserve_memory, kalman_filter.MEMORY_STORE_ALL)
    # (the smoother object is so far not in Cython, so there is no
    # transferring)

    # Change the attributes in the model class
    mod.set_filter_method(100)
    mod.set_stability_method(101)
    mod.set_conserve_memory(102)
    mod.set_smoother_output(103)

    # Assert that the changes have occurred in the ssm class
    assert_equal(mod.ssm.filter_method, 100)
    assert_equal(mod.ssm.stability_method, 101)
    assert_equal(mod.ssm.conserve_memory, 102)
    assert_equal(mod.ssm.smoother_output, 103)

    # Assert that the changes have *not yet* occurred in the filter object
    # (they are only propagated when the filter is re-initialized)
    assert_equal(kf.filter_method, kalman_filter.FILTER_CONVENTIONAL)
    assert_equal(kf.stability_method, kalman_filter.STABILITY_FORCE_SYMMETRY)
    assert_equal(kf.conserve_memory, kalman_filter.MEMORY_STORE_ALL)

    # Now, test the setting of the other two methods by resetting the
    # filter method to a valid value
    mod.set_filter_method(1)
    mod.ssm._initialize_filter()
    # Retrieve the new kalman filter object (a new object had to be created
    # due to the changing filter method)
    kf = mod.ssm._kalman_filter
    assert_equal(kf.filter_method, 1)
    assert_equal(kf.stability_method, 101)
    assert_equal(kf.conserve_memory, 102)
def test_fit_misc():
    true = results_sarimax.wpi1_stationary
    endog = np.diff(true['data'])[1:]
    mod = sarimax.SARIMAX(endog, order=(1, 0, 1), trend='c')

    # Fitting with the 'opg' vs 'oim' analytic Hessians should reach broadly
    # the same optimum.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        res_opg = mod.fit(method='ncg', disp=0, optim_hessian='opg',
                          optim_complex_step=False)
        res_oim = mod.fit(method='ncg', disp=0, optim_hessian='oim',
                          optim_complex_step=False)
    assert_allclose(res_opg.llf, res_oim.llf, rtol=1e-2)

    # With return_params=True, fit hands back just the parameter vector
    mod, _ = get_dummy_mod(fit=False)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        res_params = mod.fit(disp=-1, return_params=True)

    # 5 digits necessary to accommodate 32-bit numpy/scipy with OpenBLAS 0.2.18
    assert_almost_equal(res_params, [0, 0], 5)
@pytest.mark.smoke
def test_score_misc():
    # Smoke test: the score function evaluates without error at the fitted
    # parameters.
    mod, res = get_dummy_mod()
    mod.score(res.params)
def test_from_formula():
    # MLEModel does not support the formula interface
    with pytest.raises(NotImplementedError):
        MLEModel.from_formula(1, 2, 3)
def test_score_analytic_ar1():
    # Test the score against the analytic score for an AR(1) model with 2
    # observations
    # Let endog = [1, 0.5], params=[0, 1]
    mod = sarimax.SARIMAX([1, 0.5], order=(1, 0, 0))

    # Analytic partial derivatives of the loglikelihood for this model
    def partial_phi(phi, sigma2):
        return -0.5 * (phi**2 + 2*phi*sigma2 - 1) / (sigma2 * (1 - phi**2))

    def partial_sigma2(phi, sigma2):
        return -0.5 * (2*sigma2 + phi - 1.25) / (sigma2**2)

    params = np.r_[0., 2]

    # Compute the analytic score
    analytic_score = np.r_[
        partial_phi(params[0], params[1]),
        partial_sigma2(params[0], params[1])]

    # Check each of the approximations, transformed parameters
    approx_cs = mod.score(params, transformed=True, approx_complex_step=True)
    assert_allclose(approx_cs, analytic_score)

    approx_fd = mod.score(params, transformed=True, approx_complex_step=False)
    assert_allclose(approx_fd, analytic_score, atol=1e-5)

    approx_fd_centered = (
        mod.score(params, transformed=True, approx_complex_step=False,
                  approx_centered=True))
    # BUG FIX: this previously re-asserted `approx_fd`, so the centered
    # finite-difference approximation was never actually checked.
    assert_allclose(approx_fd_centered, analytic_score, atol=1e-5)

    harvey_cs = mod.score(params, transformed=True, method='harvey',
                          approx_complex_step=True)
    assert_allclose(harvey_cs, analytic_score)

    harvey_fd = mod.score(params, transformed=True, method='harvey',
                          approx_complex_step=False)
    assert_allclose(harvey_fd, analytic_score, atol=1e-5)

    harvey_fd_centered = mod.score(params, transformed=True, method='harvey',
                                   approx_complex_step=False,
                                   approx_centered=True)
    assert_allclose(harvey_fd_centered, analytic_score, atol=1e-5)

    # Check the approximations for untransformed parameters. The analytic
    # check now comes from chain rule with the analytic derivative of the
    # transformation
    # if L* is the likelihood evaluated at untransformed parameters and
    # L is the likelihood evaluated at transformed parameters, then we have:
    # L*(u) = L(t(u))
    # and then
    # L'*(u) = L'(t(u)) * t'(u)
    def partial_transform_phi(phi):
        return -1. / (1 + phi**2)**(3./2)

    def partial_transform_sigma2(sigma2):
        return 2. * sigma2

    uparams = mod.untransform_params(params)

    analytic_score = np.dot(
        np.diag(np.r_[partial_transform_phi(uparams[0]),
                      partial_transform_sigma2(uparams[1])]),
        np.r_[partial_phi(params[0], params[1]),
              partial_sigma2(params[0], params[1])])

    approx_cs = mod.score(uparams, transformed=False, approx_complex_step=True)
    assert_allclose(approx_cs, analytic_score)

    approx_fd = mod.score(uparams, transformed=False,
                          approx_complex_step=False)
    assert_allclose(approx_fd, analytic_score, atol=1e-5)

    approx_fd_centered = (
        mod.score(uparams, transformed=False, approx_complex_step=False,
                  approx_centered=True))
    assert_allclose(approx_fd_centered, analytic_score, atol=1e-5)

    harvey_cs = mod.score(uparams, transformed=False, method='harvey',
                          approx_complex_step=True)
    assert_allclose(harvey_cs, analytic_score)

    harvey_fd = mod.score(uparams, transformed=False, method='harvey',
                          approx_complex_step=False)
    assert_allclose(harvey_fd, analytic_score, atol=1e-5)

    harvey_fd_centered = mod.score(uparams, transformed=False, method='harvey',
                                   approx_complex_step=False,
                                   approx_centered=True)
    assert_allclose(harvey_fd_centered, analytic_score, atol=1e-5)

    # Check the Hessian: these approximations are not very good, particularly
    # when phi is close to 0
    params = np.r_[0.5, 1.]

    def hessian(phi, sigma2):
        hessian = np.zeros((2, 2))
        hessian[0, 0] = (-phi**2 - 1) / (phi**2 - 1)**2
        hessian[1, 0] = hessian[0, 1] = -1 / (2 * sigma2**2)
        hessian[1, 1] = (sigma2 + phi - 1.25) / sigma2**3
        return hessian

    analytic_hessian = hessian(params[0], params[1])

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # The factor of 2 presumably rescales the per-observation (averaged)
        # Hessian to the full-sample Hessian (nobs = 2) -- TODO confirm.
        assert_allclose(mod._hessian_complex_step(params) * 2,
                        analytic_hessian, atol=1e-1)
        assert_allclose(mod._hessian_finite_difference(params) * 2,
                        analytic_hessian, atol=1e-1)
def test_cov_params():
    # Smoke test for each of the covariance types, plus a check of the
    # human-readable description attached to the results for each one.
    mod, res = get_dummy_mod()

    # Smoke test for each of the covariance types
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        res = mod.fit(res.params, disp=-1, cov_type='none')
        assert_equal(
            res.cov_kwds['description'],
            'Covariance matrix not calculated.')

        res = mod.fit(res.params, disp=-1, cov_type='approx')
        assert_equal(res.cov_type, 'approx')
        assert_equal(
            res.cov_kwds['description'],
            'Covariance matrix calculated using numerical (complex-step) '
            'differentiation.')

        res = mod.fit(res.params, disp=-1, cov_type='oim')
        assert_equal(res.cov_type, 'oim')
        assert_equal(
            res.cov_kwds['description'],
            'Covariance matrix calculated using the observed information '
            'matrix (complex-step) described in Harvey (1989).')

        res = mod.fit(res.params, disp=-1, cov_type='opg')
        assert_equal(res.cov_type, 'opg')
        assert_equal(
            res.cov_kwds['description'],
            'Covariance matrix calculated using the outer product of '
            'gradients (complex-step).')

        # 'robust' is an alias for 'robust_oim': same description
        res = mod.fit(res.params, disp=-1, cov_type='robust')
        assert_equal(res.cov_type, 'robust')
        assert_equal(
            res.cov_kwds['description'],
            'Quasi-maximum likelihood covariance matrix used for robustness '
            'to some misspecifications; calculated using the observed '
            'information matrix (complex-step) described in Harvey (1989).')

        res = mod.fit(res.params, disp=-1, cov_type='robust_oim')
        assert_equal(res.cov_type, 'robust_oim')
        assert_equal(
            res.cov_kwds['description'],
            'Quasi-maximum likelihood covariance matrix used for robustness '
            'to some misspecifications; calculated using the observed '
            'information matrix (complex-step) described in Harvey (1989).')

        res = mod.fit(res.params, disp=-1, cov_type='robust_approx')
        assert_equal(res.cov_type, 'robust_approx')
        assert_equal(
            res.cov_kwds['description'],
            'Quasi-maximum likelihood covariance matrix used for robustness '
            'to some misspecifications; calculated using numerical '
            '(complex-step) differentiation.')

        # An unknown covariance type raises NotImplementedError
        with pytest.raises(NotImplementedError):
            mod.fit(res.params, disp=-1, cov_type='invalid_cov_type')
def test_transform():
    # MLEModel's transform/untransform are identity mappings
    mod = MLEModel([1, 2], **kwargs)
    assert_allclose(mod.transform_params([2, 3]), [2, 3])
    assert_allclose(mod.untransform_params([2, 3]), [2, 3])

    # Smoke test: transformed=False is accepted by each of these methods
    for method in (mod.filter, mod.update, mod.loglike, mod.loglikeobs):
        method([], transformed=False)

    # The dummy SARIMAX model's two parameters are variances, so its
    # transformation squares them
    mod, _ = get_dummy_mod(fit=False)
    assert_allclose(mod.transform_params([2, 3]), [4, 9])
    assert_allclose(mod.untransform_params([4, 9]), [2, 3])

    # `filter` applies the transformation only when transformed=False
    res = mod.filter([2, 3], transformed=True)
    assert_allclose(res.params, [2, 3])
    res = mod.filter([2, 3], transformed=False)
    assert_allclose(res.params, [4, 9])
def test_filter():
    mod = MLEModel(np.array([1., 2.]), **kwargs)

    # return_ssm=True gives back the low-level FilterResults object
    res = mod.filter([], return_ssm=True)
    assert_equal(isinstance(res, kalman_filter.FilterResults), True)

    # Otherwise a full wrapped results object is returned, with the 'opg'
    # covariance type by default
    res = mod.filter([])
    assert_equal(isinstance(res, MLEResultsWrapper), True)
    assert_equal(res.cov_type, 'opg')

    # A specific covariance type can be requested
    res = mod.filter([], cov_type='oim')
    assert_equal(isinstance(res, MLEResultsWrapper), True)
    assert_equal(res.cov_type, 'oim')
def test_params():
    mod = MLEModel([1, 2], **kwargs)

    # start_params is abstract by default ...
    assert_raises(NotImplementedError, lambda: mod.start_params)
    # ... while param_names defaults to an empty list
    assert_equal(mod.param_names, [])

    # Both are backed by settable private attributes
    mod._start_params = [1]
    mod._param_names = ['a']
    assert_equal(mod.start_params, [1])
    assert_equal(mod.param_names, ['a'])
def check_results(pandas):
    # Because the dummy regression fit is perfect, fitted values reproduce
    # the data and the residuals are zero (once past the burn-in period).
    mod, res = get_dummy_mod(pandas=pandas)

    assert_almost_equal(res.fittedvalues[2:], mod.endog[2:].squeeze())
    assert_almost_equal(res.resid[2:], np.zeros(mod.nobs - 2))

    # One burned loglikelihood observation
    assert_equal(res.loglikelihood_burn, 1)
def test_results():
    # Run the results checks for both numpy- and pandas-based data.
    # (Previously this accepted an unused `pandas` argument, which was
    # misleading since both cases were always exercised; it has been
    # removed.)
    check_results(pandas=False)
    check_results(pandas=True)
def test_predict():
    dates = pd.date_range(start='1980-01-01', end='1981-01-01', freq='AS')
    endog = pd.Series([1, 2], index=dates)
    mod = MLEModel(endog, **kwargs)
    res = mod.filter([])

    # With start=None, end=None, prediction covers the full dataset
    predict = res.predict()
    assert_equal(predict.shape, (mod.nobs,))
    assert_allclose(res.get_prediction().predicted_mean, predict)

    # The dynamic option accepts a date string
    assert_allclose(res.predict(dynamic='1981-01-01'), res.predict())

    # Test an invalid date string value to the dynamic option
    # assert_raises(ValueError, res.predict, dynamic='1982-01-01')

    # Without a date index, a string `dynamic` cannot be resolved
    mod = MLEModel([1, 2], **kwargs)
    res = mod.filter([])
    assert_raises(KeyError, res.predict, dynamic='string')
def test_forecast():
    # The local-level model forecasts a flat continuation of the last level
    expected = np.ones((10,)) * 2

    # Numpy-based data
    mod = MLEModel([1, 2], **kwargs)
    res = mod.filter([])
    forecast = res.forecast(steps=10)
    assert_allclose(forecast, expected)
    assert_allclose(res.get_forecast(steps=10).predicted_mean, forecast)

    # Pandas-based data: `steps` may also be given as a date string
    index = pd.date_range('1960-01-01', periods=2, freq='MS')
    mod = MLEModel(pd.Series([1, 2], index=index), **kwargs)
    res = mod.filter([])
    assert_allclose(res.forecast(steps=10), expected)
    assert_allclose(res.forecast(steps='1960-12-01'), expected)
    assert_allclose(res.get_forecast(steps=10).predicted_mean, expected)
def test_summary():
    # Check the text produced by res.summary(): the sample dates and the
    # default model name, plus smoke tests with corrupted diagnostics.
    dates = pd.date_range(start='1980-01-01', end='1984-01-01', freq='AS')
    endog = pd.Series([1, 2, 3, 4, 5], index=dates)
    mod = MLEModel(endog, **kwargs)
    res = mod.filter([])

    # Get the summary
    txt = str(res.summary())

    # Test res.summary when the model has dates
    assert_equal(re.search(r'Sample:\s+01-01-1980', txt) is not None, True)
    assert_equal(re.search(r'\s+- 01-01-1984', txt) is not None, True)

    # Test res.summary when `model_name` was not provided
    assert_equal(re.search(r'Model:\s+MLEModel', txt) is not None, True)

    # Smoke test that summary still works when diagnostic tests fail
    # (the standardized forecast errors are deliberately corrupted with
    # NaNs, a scalar, and a non-numeric value in turn)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        res.filter_results._standardized_forecasts_error[:] = np.nan
        res.summary()
        res.filter_results._standardized_forecasts_error = 1
        res.summary()
        res.filter_results._standardized_forecasts_error = 'a'
        res.summary()
def check_endog(endog, nobs=2, k_endog=1, **kwargs):
    # Build the model and verify the two copies of the data it maintains.
    mod = MLEModel(endog, **kwargs)

    # The Statsmodels-side data: 2-dim, C-contiguous, long-shaped
    # (nobs, k_endog) == (2, 1)
    assert_equal(mod.endog.ndim, 2)
    assert_equal(mod.endog.flags['C_CONTIGUOUS'], True)
    assert_equal(mod.endog.shape, (nobs, k_endog))

    # The state-space-side data in `ssm`: 2-dim, F-contiguous, wide-shaped
    # (k_endog, nobs) == (1, 2), and sharing memory with mod.endog
    ss_endog = mod.ssm.endog
    assert_equal(ss_endog.ndim, 2)
    assert_equal(ss_endog.flags['F_CONTIGUOUS'], True)
    assert_equal(ss_endog.shape, (k_endog, nobs))
    assert_equal(ss_endog.base is mod.endog, True)

    return mod
def test_basic_endog():
    # Test various types of basic python endog inputs (e.g. lists, scalars...)

    # Check cannot call with non-array-like
    # fails due to checks in Statsmodels base classes
    assert_raises(ValueError, MLEModel, endog=1, k_states=1)
    assert_raises(ValueError, MLEModel, endog='a', k_states=1)
    assert_raises(ValueError, MLEModel, endog=True, k_states=1)

    # Check behavior with different types: ints, floats, and bools are all
    # coerced to float data
    mod = MLEModel([1], **kwargs)
    res = mod.filter([])
    assert_equal(res.filter_results.endog, [[1]])

    mod = MLEModel([1.], **kwargs)
    res = mod.filter([])
    assert_equal(res.filter_results.endog, [[1]])

    mod = MLEModel([True], **kwargs)
    res = mod.filter([])
    assert_equal(res.filter_results.endog, [[1]])

    mod = MLEModel(['a'], **kwargs)
    # raises error due to inability coerce string to numeric
    assert_raises(ValueError, mod.filter, [])

    # Check that different iterable types give the expected result
    endog = [1., 2.]
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    endog = [[1.], [2.]]
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    endog = (1., 2.)
    mod = check_endog(endog, **kwargs)
    mod.filter([])
def test_numpy_endog():
    # Test various types of numpy endog inputs

    # Check behavior of the link maintained between passed `endog` and
    # `mod.endog` arrays
    endog = np.array([1., 2.])
    mod = MLEModel(endog, **kwargs)
    assert_equal(mod.endog.base is not mod.data.orig_endog, True)
    assert_equal(mod.endog.base is not endog, True)
    assert_equal(mod.data.orig_endog.base is not endog, True)
    endog[0] = 2
    # there is no link to mod.endog
    assert_equal(mod.endog, np.r_[1, 2].reshape(2, 1))
    # there remains a link to mod.data.orig_endog
    assert_equal(mod.data.orig_endog, endog)

    # Check behavior with different memory layouts / shapes

    # Example (failure): 0-dim array
    endog = np.array(1.)
    # raises error due to len(endog) failing in Statsmodels base classes
    assert_raises(TypeError, check_endog, endog, **kwargs)

    # Example : 1-dim array, both C- and F-contiguous, length 2
    endog = np.array([1., 2.])
    assert_equal(endog.ndim, 1)
    assert_equal(endog.flags['C_CONTIGUOUS'], True)
    assert_equal(endog.flags['F_CONTIGUOUS'], True)
    assert_equal(endog.shape, (2,))
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    # Example : 2-dim array, C-contiguous, long-shaped: (nobs, k_endog)
    endog = np.array([1., 2.]).reshape(2, 1)
    assert_equal(endog.ndim, 2)
    assert_equal(endog.flags['C_CONTIGUOUS'], True)
    # On newer numpy (>= 0.10), this array is (rightly) both C and F contiguous
    # assert_equal(endog.flags['F_CONTIGUOUS'], False)
    assert_equal(endog.shape, (2, 1))
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    # Example : 2-dim array, C-contiguous, wide-shaped: (k_endog, nobs)
    endog = np.array([1., 2.]).reshape(1, 2)
    assert_equal(endog.ndim, 2)
    assert_equal(endog.flags['C_CONTIGUOUS'], True)
    # On newer numpy (>= 0.10), this array is (rightly) both C and F contiguous
    # assert_equal(endog.flags['F_CONTIGUOUS'], False)
    assert_equal(endog.shape, (1, 2))
    # raises error because arrays are always interpreted as
    # (nobs, k_endog), which means that k_endog=2 is incompatible with shape
    # of design matrix (1, 1)
    assert_raises(ValueError, check_endog, endog, **kwargs)

    # Example : 2-dim array, F-contiguous, long-shaped (nobs, k_endog)
    endog = np.array([1., 2.]).reshape(1, 2).transpose()
    assert_equal(endog.ndim, 2)
    # On newer numpy (>= 0.10), this array is (rightly) both C and F contiguous
    # assert_equal(endog.flags['C_CONTIGUOUS'], False)
    assert_equal(endog.flags['F_CONTIGUOUS'], True)
    assert_equal(endog.shape, (2, 1))
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    # Example : 2-dim array, F-contiguous, wide-shaped (k_endog, nobs)
    endog = np.array([1., 2.]).reshape(2, 1).transpose()
    assert_equal(endog.ndim, 2)
    # On newer numpy (>= 0.10), this array is (rightly) both C and F contiguous
    # assert_equal(endog.flags['C_CONTIGUOUS'], False)
    assert_equal(endog.flags['F_CONTIGUOUS'], True)
    assert_equal(endog.shape, (1, 2))
    # raises error because arrays are always interpreted as
    # (nobs, k_endog), which means that k_endog=2 is incompatible with shape
    # of design matrix (1, 1)
    assert_raises(ValueError, check_endog, endog, **kwargs)

    # Example (failure): 3-dim array
    endog = np.array([1., 2.]).reshape(2, 1, 1)
    # raises error due to direct ndim check in Statsmodels base classes
    assert_raises(ValueError, check_endog, endog, **kwargs)

    # Example : np.array with 2 columns
    # Update kwargs for k_endog=2
    kwargs2 = {
        'k_states': 1, 'design': [[1], [0.]], 'obs_cov': [[1, 0], [0, 1]],
        'transition': [[1]], 'selection': [[1]], 'state_cov': [[1]],
        'initialization': 'approximate_diffuse'
    }
    endog = np.array([[1., 2.], [3., 4.]])
    mod = check_endog(endog, k_endog=2, **kwargs2)
    mod.filter([])
def test_pandas_endog():
    # Test various types of pandas endog inputs (e.g. TimeSeries, etc.)

    # Example (failure): pandas.Series, no dates
    endog = pd.Series([1., 2.])
    # raises error due to no dates
    # NOTE(review): this mutates the process-global warnings filter with no
    # catch_warnings guard -- presumably left over from the commented-out
    # check below; confirm before removing.
    warnings.simplefilter('always')
    # assert_raises(ValueError, check_endog, endog, **kwargs)

    # Example : pandas.Series
    dates = pd.date_range(start='1980-01-01', end='1981-01-01', freq='AS')
    endog = pd.Series([1., 2.], index=dates)
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    # Example : pandas.Series, string datatype
    endog = pd.Series(['a', 'b'], index=dates)
    # raises error due to direct type casting check in Statsmodels base classes
    assert_raises(ValueError, check_endog, endog, **kwargs)

    # Example : pandas.Series
    endog = pd.Series([1., 2.], index=dates)
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    # Example : pandas.DataFrame with 1 column
    endog = pd.DataFrame({'a': [1., 2.]}, index=dates)
    mod = check_endog(endog, **kwargs)
    mod.filter([])

    # Example (failure): pandas.DataFrame with 2 columns
    endog = pd.DataFrame({'a': [1., 2.], 'b': [3., 4.]}, index=dates)
    # raises error because 2-columns means k_endog=2, but the design matrix
    # set in **kwargs is shaped (1, 1)
    assert_raises(ValueError, check_endog, endog, **kwargs)

    # Check behavior of the link maintained between passed `endog` and
    # `mod.endog` arrays
    endog = pd.DataFrame({'a': [1., 2.]}, index=dates)
    mod = check_endog(endog, **kwargs)
    assert_equal(mod.endog.base is not mod.data.orig_endog, True)
    assert_equal(mod.endog.base is not endog, True)
    assert_equal(mod.data.orig_endog.values.base is not endog, True)
    endog.iloc[0, 0] = 2
    # there is no link to mod.endog
    assert_equal(mod.endog, np.r_[1, 2].reshape(2, 1))
    # there remains a link to mod.data.orig_endog
    assert_allclose(mod.data.orig_endog, endog)

    # Example : pandas.DataFrame with 2 columns
    # Update kwargs for k_endog=2
    kwargs2 = {
        'k_states': 1, 'design': [[1], [0.]], 'obs_cov': [[1, 0], [0, 1]],
        'transition': [[1]], 'selection': [[1]], 'state_cov': [[1]],
        'initialization': 'approximate_diffuse'
    }
    endog = pd.DataFrame({'a': [1., 2.], 'b': [3., 4.]}, index=dates)
    mod = check_endog(endog, k_endog=2, **kwargs2)
    mod.filter([])
def test_diagnostics():
    mod, res = get_dummy_mod()

    # Override the standardized forecasts errors to get more reasonable values
    # for the tests to run (not necessary, but prevents some annoying warnings)
    shape = res.filter_results._standardized_forecasts_error.shape
    res.filter_results._standardized_forecasts_error = (
        np.random.normal(size=shape))

    # Make sure method=None selects the appropriate test
    actual = res.test_normality(method=None)
    desired = res.test_normality(method='jarquebera')
    assert_allclose(actual, desired)

    assert_raises(NotImplementedError, res.test_normality, method='invalid')

    actual = res.test_heteroskedasticity(method=None)
    desired = res.test_heteroskedasticity(method='breakvar')
    assert_allclose(actual, desired)

    with pytest.raises(ValueError):
        res.test_heteroskedasticity(method=None, alternative='invalid')
    with pytest.raises(NotImplementedError):
        res.test_heteroskedasticity(method='invalid')

    actual = res.test_serial_correlation(method=None)
    desired = res.test_serial_correlation(method='ljungbox')
    assert_allclose(actual, desired)
    with pytest.raises(NotImplementedError):
        res.test_serial_correlation(method='invalid')

    # Smoke tests for other options
    # NOTE: the two results below are intentionally unused -- these calls
    # only check that the options run without error.
    actual = res.test_heteroskedasticity(method=None, alternative='d',
                                         use_f=False)
    desired = res.test_serial_correlation(method='boxpierce')
def test_diagnostics_nile_eviews():
    # Diagnostic tests on the Nile data, compared against "Fitting State
    # Space Models with EViews" (Van den Bossche 2011, Journal of
    # Statistical Software). Parameter values: Figure 2; Ljung-Box and
    # Jarque-Bera statistics and p-values: Figure 5. The Heteroskedasticity
    # statistic is not provided in that paper.
    niledata = nile.data.load_pandas().data
    niledata.index = pd.date_range('1871-01-01', '1970-01-01', freq='AS')

    # Local level model with the EViews-estimated (log-)variances
    mod = MLEModel(
        niledata['volume'], k_states=1,
        initialization='approximate_diffuse', initial_variance=1e15,
        loglikelihood_burn=1)
    for name, value in [('design', 1),
                        ('obs_cov', np.exp(9.600350)),
                        ('transition', 1),
                        ('selection', 1),
                        ('state_cov', np.exp(7.348705))]:
        mod.ssm[name, 0, 0] = value
    res = mod.filter([])

    # Ljung-Box (only 3 digits provided in the reference paper)
    actual = res.test_serial_correlation(method='ljungbox', lags=10)[0, :, -1]
    assert_allclose(actual, [13.117, 0.217], atol=1e-3)

    # Jarque-Bera
    actual = res.test_normality(method='jarquebera')[0, :2]
    assert_allclose(actual, [0.041686, 0.979373], atol=1e-5)
def test_diagnostics_nile_durbinkoopman():
    # Test the diagnostic tests using the Nile dataset. Results are from
    # Durbin and Koopman (2012); parameter values reported on page 37; test
    # statistics on page 40
    niledata = nile.data.load_pandas().data
    niledata.index = pd.date_range('1871-01-01', '1970-01-01', freq='AS')

    # Local level model with the book's estimated variances
    mod = MLEModel(
        niledata['volume'], k_states=1,
        initialization='approximate_diffuse', initial_variance=1e15,
        loglikelihood_burn=1)
    mod.ssm['design', 0, 0] = 1
    mod.ssm['obs_cov', 0, 0] = 15099.
    mod.ssm['transition', 0, 0] = 1
    mod.ssm['selection', 0, 0] = 1
    mod.ssm['state_cov', 0, 0] = 1469.1
    res = mod.filter([])

    # Test Ljung-Box
    # Note: only 3 digits provided in the reference paper
    actual = res.test_serial_correlation(method='ljungbox', lags=9)[0, 0, -1]
    assert_allclose(actual, [8.84], atol=1e-2)

    # Test Jarque-Bera
    # Note: The book reports 0.09 for Kurtosis, because it is reporting the
    # statistic less the mean of the Kurtosis distribution (which is 3).
    # (indices: 0 = statistic, 2 = skew, 3 = kurtosis)
    norm = res.test_normality(method='jarquebera')[0]
    actual = [norm[0], norm[2], norm[3]]
    assert_allclose(actual, [0.05, -0.03, 3.09], atol=1e-2)

    # Test Heteroskedasticity
    # Note: only 2 digits provided in the book
    actual = res.test_heteroskedasticity(method='breakvar')[0, 0]
    assert_allclose(actual, [0.61], atol=1e-2)
@pytest.mark.smoke
def test_prediction_results():
    # Smoke test for the PredictionResults class (copied from elsewhere in
    # Statsmodels): building a summary frame should not error.
    _, res = get_dummy_mod()
    res.get_prediction().summary_frame()
def test_lutkepohl_information_criteria():
    # Setup dataset, use Lutkepohl data
    dta = pd.DataFrame(
        results_var_misc.lutkepohl_data, columns=['inv', 'inc', 'consump'],
        index=pd.date_range('1960-01-01', '1982-10-01', freq='QS'))

    dta['dln_inv'] = np.log(dta['inv']).diff()
    dta['dln_inc'] = np.log(dta['inc']).diff()
    dta['dln_consump'] = np.log(dta['consump']).diff()

    endog = dta.loc['1960-04-01':'1978-10-01',
                    ['dln_inv', 'dln_inc', 'dln_consump']]

    # AR model - SARIMAX
    # (use loglikelihood_burn=1 to mimic conditional MLE used by Stata's var
    # command).
    true = results_var_misc.lutkepohl_ar1_lustats
    mod = sarimax.SARIMAX(endog['dln_inv'], order=(1, 0, 0), trend='c',
                          loglikelihood_burn=1)
    res = mod.filter(true['params'])
    assert_allclose(res.llf, true['loglike'])

    # Test the Lutkepohl ICs
    # Note: for the Lutkepohl ICs, Stata only counts the AR coefficients as
    # estimated parameters for the purposes of information criteria, whereas we
    # count all parameters including scale and constant, so we need to adjust
    # for that
    aic = (res.info_criteria('aic', method='lutkepohl') -
           2 * 2 / res.nobs_effective)
    bic = (res.info_criteria('bic', method='lutkepohl') -
           2 * np.log(res.nobs_effective) / res.nobs_effective)
    hqic = (res.info_criteria('hqic', method='lutkepohl') -
            2 * 2 * np.log(np.log(res.nobs_effective)) / res.nobs_effective)
    assert_allclose(aic, true['aic'])
    assert_allclose(bic, true['bic'])
    assert_allclose(hqic, true['hqic'])

    # Test the non-Lutkepohl ICs
    # Note: for the non-Lutkepohl ICs, Stata does not count the scale as an
    # estimated parameter, but does count the constant term, for the
    # purposes of information criteria, whereas we count both, so we need to
    # adjust for that
    true = results_var_misc.lutkepohl_ar1
    aic = res.aic - 2
    bic = res.bic - np.log(res.nobs_effective)
    assert_allclose(aic, true['estat_aic'])
    assert_allclose(bic, true['estat_bic'])
    aic = res.info_criteria('aic') - 2
    bic = res.info_criteria('bic') - np.log(res.nobs_effective)
    assert_allclose(aic, true['estat_aic'])
    assert_allclose(bic, true['estat_bic'])

    # Note: could also test the "dfk" (degree of freedom corrections), but not
    # really necessary since they just rescale things a bit

    # VAR model - VARMAX
    # (use loglikelihood_burn=1 to mimic conditional MLE used by Stata's var
    # command).
    true = results_var_misc.lutkepohl_var1_lustats
    mod = varmax.VARMAX(endog, order=(1, 0), trend='n',
                        error_cov_type='unstructured', loglikelihood_burn=1,)
    res = mod.filter(true['params'])
    assert_allclose(res.llf, true['loglike'])

    # Test the Lutkepohl ICs
    # Note: for the Lutkepohl ICs, Stata only counts the AR coefficients as
    # estimated parameters for the purposes of information criteria, whereas we
    # count all parameters including the elements of the covariance matrix, so
    # we need to adjust for that
    # (here the adjustment is 6: the covariance matrix has 6 free elements)
    aic = (res.info_criteria('aic', method='lutkepohl') -
           2 * 6 / res.nobs_effective)
    bic = (res.info_criteria('bic', method='lutkepohl') -
           6 * np.log(res.nobs_effective) / res.nobs_effective)
    hqic = (res.info_criteria('hqic', method='lutkepohl') -
            2 * 6 * np.log(np.log(res.nobs_effective)) / res.nobs_effective)
    assert_allclose(aic, true['aic'])
    assert_allclose(bic, true['bic'])
    assert_allclose(hqic, true['hqic'])

    # Test the non-Lutkepohl ICs
    # Note: for the non-Lutkepohl ICs, Stata does not count the elements of the
    # covariance matrix as estimated parameters for the purposes of information
    # criteria, whereas we count both, so we need to adjust for that
    true = results_var_misc.lutkepohl_var1
    aic = res.aic - 2 * 6
    bic = res.bic - 6 * np.log(res.nobs_effective)
    assert_allclose(aic, true['estat_aic'])
    assert_allclose(bic, true['estat_bic'])
    aic = res.info_criteria('aic') - 2 * 6
    bic = res.info_criteria('bic') - 6 * np.log(res.nobs_effective)
    assert_allclose(aic, true['estat_aic'])
    assert_allclose(bic, true['estat_bic'])