Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
scikits.statsmodels / statsmodels / sandbox / examples / example_sysreg.py
Size: Mime:
"""Example: scikits.statsmodels.sandbox.sysreg
"""
#TODO: this is going to change significantly once we have a panel data structure

import numpy as np
import scikits.statsmodels.api as sm
from scikits.statsmodels.sandbox.sysreg import *

#for Python 3 compatibility
from scikits.statsmodels.compatnp.py3k import asbytes

# Seemingly Unrelated Regressions (SUR) Model

# This example uses the subset of the Grunfeld data in Greene's Econometric
# Analysis Chapter 14 (5th Edition)

grun_data = sm.datasets.grunfeld.load()

firms = ['General Motors', 'Chrysler', 'General Electric', 'Westinghouse',
        'US Steel']
#for Python 3 compatibility
firms = map(asbytes, firms)

grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now takes SUR takes a list of arrays
# The array alternates between the LHS of an equation and RHS side of an
# equation
# This is very likely to change
grun_sys = []
for i in firms:
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    exog = grun_exog[index][['value','capital']].view(float).reshape(-1,2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)

# Note that the results in Greene (5th edition) uses a slightly different
# version of the Grunfeld data. To reproduce Table 14.1 the following changes
# are necessary.
grun_sys[-2][5] = 261.6
grun_sys[-2][-3] = 645.2
grun_sys[-1][11,2] = 232.6

grun_mod = SUR(grun_sys)
grun_res = grun_mod.fit()
print "Results for the 2-step GLS"
print "Compare to Greene Table 14.1, 5th edition"
print grun_res.params
# or you can do an iterative fit
# you have to define a new model though this will be fixed
# TODO: note the above
print "Results for iterative GLS (equivalent to MLE)"
print "Compare to Greene Table 14.3"
#TODO: these are slightly off, could be a convergence issue
# or might use a different default DOF correction?
grun_imod = SUR(grun_sys)
grun_ires = grun_imod.fit(igls=True)
print grun_ires.params

# Two-Stage Least Squares for Simultaneous Equations
#TODO: we are going to need *some kind* of formula framework

# This follows the simple macroeconomic model given in
# Greene Example 15.1 (5th Edition)
# The data however is from statsmodels and is not the same as
# Greene's

# The model is
# consumption: c_{t} = \alpha_{0} + \alpha_{1}y_{t} + \alpha_{2}c_{t-1} + \epsilon_{t1}
# investment: i_{t} = \beta_{0} + \beta_{1}r_{t} + \beta_{2}\left(y_{t}-y_{t-1}\right) + \epsilon_{t2}
# demand: y_{t} = c_{t} + I_{t} + g_{t}

# See Greene's Econometric Analysis for more information

# Load the data
macrodata = sm.datasets.macrodata.load().data

# Not needed, but make sure the data is sorted
macrodata = np.sort(macrodata, order=['year','quarter'])

# Impose the demand restriction
y = macrodata['realcons'] + macrodata['realinv'] + macrodata['realgovt']

# Build the system
macro_sys = []
# First equation LHS
macro_sys.append(macrodata['realcons'][1:]) # leave off first date
# First equation RHS
exog1 = np.column_stack((y[1:],macrodata['realcons'][:-1]))
#TODO: it might be nice to have "lag" and "lead" functions
exog1 = sm.add_constant(exog1, prepend=True)
macro_sys.append(exog1)
# Second equation LHS
macro_sys.append(macrodata['realinv'][1:])
# Second equation RHS
exog2 = np.column_stack((macrodata['tbilrate'][1:], np.diff(y)))
exog2 = sm.add_constant(exog2, prepend=True)
macro_sys.append(exog2)

# We need to say that y_{t} in the RHS of equation 1 is an endogenous regressor
# We will call these independent endogenous variables
# Right now, we use a dictionary to declare these
indep_endog = {0 : [1]}

# We also need to create a design of our instruments
# This will be done automatically in the future
instruments = np.column_stack((macrodata[['realgovt',
    'tbilrate']][1:].view(float).reshape(-1,2),macrodata['realcons'][:-1],
    y[:-1]))
instruments = sm.add_constant(instruments, prepend=True)
macro_mod = Sem2SLS(macro_sys, indep_endog=indep_endog, instruments=instruments)
# Right now this only returns parameters
macro_params = macro_mod.fit()
print "The parameters for the first equation are correct."
print "The parameters for the second equation are not."
print macro_params

#TODO: Note that the above is incorrect, because we have no way of telling the
# model that *part* of the y_{t} - y_{t-1} is an independent endogenous variable
# To correct for this we would have to do the following
y_instrumented = macro_mod.wexog[0][:,1]
whitened_ydiff = y_instrumented - y[:-1]
wexog = np.column_stack((macrodata['tbilrate'][1:],whitened_ydiff))
wexog = sm.add_constant(wexog, prepend=True)
correct_params = sm.GLS(macrodata['realinv'][1:], wexog).fit().params

print "If we correctly instrument everything, then these are the parameters"
print "for the second equation"
print correct_params
print "Compare to output of R script statsmodels/sandbox/tests/macrodata.s"


### Below is the same example using Greene's data ###

run_greene = 0
if run_greene:
    try:
        data3 = np.genfromtxt('/home/skipper/school/MetricsII/Greene \
TableF5-1.txt', names=True)
    except:
        raise ValueError, "Based on Greene TableF5-1.  You should download it \
from his web site and edit this script accordingly."

    # Example 15.1 in Greene 5th Edition
# c_t = constant + y_t + c_t-1
# i_t = constant + r_t + (y_t - y_t-1)
# y_t = c_t + i_t + g_t
    sys3 = []
    sys3.append(data3['realcons'][1:])  # have to leave off a beg. date
# impose 3rd equation on y
    y = data3['realcons'] + data3['realinvs'] + data3['realgovt']

    exog1 = np.column_stack((y[1:],data3['realcons'][:-1]))
    exog1 = sm.add_constant(exog1)
    sys3.append(exog1)
    sys3.append(data3['realinvs'][1:])
    exog2 = np.column_stack((data3['tbilrate'][1:],
        np.diff(y)))
    # realint is missing 1st observation
    exog2 = sm.add_constant(exog2)
    sys3.append(exog2)
    indep_endog = {0 : [0]} # need to be able to say that y_1 is an instrument..
    instruments = np.column_stack((data3[['realgovt',
        'tbilrate']][1:].view(float).reshape(-1,2),data3['realcons'][:-1],
        y[:-1]))
    instruments = sm.add_constant(instruments)
    sem_mod = Sem2SLS(sys3, indep_endog = indep_endog, instruments=instruments)
    sem_params = sem_mod.fit() # first equation is right, but not second?
                               # should y_t in the diff be instrumented?
                               # how would R know this in the script?
    # well, let's check...
    y_instr = sem_mod.wexog[0][:,0]
    wyd = y_instr - y[:-1]
    wexog = np.column_stack((data3['tbilrate'][1:],wyd))
    wexog = sm.add_constant(wexog)
    params = sm.GLS(data3['realinvs'][1:], wexog).fit().params

    print "These are the simultaneous equation estimates for Greene's \
example 13-1 (Also application 13-1 in 6th edition."
    print sem_params
    print "The first set of parameters is correct.  The second set is not."
    print "Compare to the solution manual at \
http://pages.stern.nyu.edu/~wgreene/Text/econometricanalysis.htm"
    print "The reason is the restriction on (y_t - y_1)"
    print "Compare to R script GreeneEx15_1.s"
    print "Somehow R carries y.1 in yd to know that it needs to be \
instrumented"
    print "If we replace our estimate with the instrumented one"
    print params
    print "We get the right estimate"
    print "Without a formula framework we have to be able to do restrictions."
# yep!, but how in the world does R know this when we just fed it yd??
# must be implicit in the formula framework...
# we are going to need to keep the two equations separate and use
# a restrictions matrix.  Ugh, is a formula framework really, necessary to get
# around this?