sandbox/datarich/factormodels.py · alkaline-ml/statsmodels

Learn more » Push, build, and install RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages
alkaline-ml / statsmodels python

Repository URL to install this package:
Version: 0.11.1

/ sandbox / datarich / factormodels.py

# -*- coding: utf-8 -*-
"""
Created on Sun Nov 14 08:21:41 2010

Author: josef-pktd
License: BSD (3-clause)
"""

import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.tools import pca
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

#converting example Principal Component Regression to a class
#from sandbox/example_pca_regression.py


class FactorModelUnivariate(object):
    '''

    Todo:
    check treatment of const, make it optional ?
        add hasconst (0 or 1), needed when selecting nfact+hasconst
    options are arguments in calc_factors, should be more public instead
    cross-validation is slow for large number of observations
    '''
    def __init__(self, endog, exog):
        #do this in a superclass?
        self.endog = np.asarray(endog)
        self.exog = np.asarray(exog)


    def calc_factors(self, x=None, keepdim=0, addconst=True):
        '''get factor decomposition of exogenous variables

        This uses principal component analysis to obtain the factors. The number
        of factors kept is the maximum that will be considered in the regression.
        '''
        if x is None:
            x = self.exog
        else:
            x = np.asarray(x)
        xred, fact, evals, evecs  = pca(x, keepdim=keepdim, normalize=1)
        self.exog_reduced = xred
        #self.factors = fact
        if addconst:
            self.factors = sm.add_constant(fact, prepend=True)
            self.hasconst = 1  #needs to be int
        else:
            self.factors = fact
            self.hasconst = 0  #needs to be int

        self.evals = evals
        self.evecs = evecs

    def fit_fixed_nfact(self, nfact):
        if not hasattr(self, 'factors_wconst'):
            self.calc_factors()
        return sm.OLS(self.endog, self.factors[:,:nfact+1]).fit()

    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj. and
        additionally cross-validation prediction error sum of squares if `skip_crossval`
        is false. Cross-validation is not used by default because it can be
        time consuming to calculate.

        By default the cross-validation method is Leave-one-out on the full dataset.
        A different cross-validation sample can be specified as an argument to
        cv_iter.

        Results are attached in `results_find_nfact`



        '''
        #print 'OLS on Factors'
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact+hasconst) < 1:
            raise ValueError('nothing to do, number of factors (incl. constant) should ' +
                             'be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        #xred, fact, eva, eve  = pca(x0, keepdim=0, normalize=1)
        for k in range(1, maxfact+hasconst): #k includes now the constnat
            #xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
            # this is faster and same result
            fact = self.factors[:,:k]
            res = sm.OLS(y0, fact).fit()
        ##    print 'k =', k
        ##    print res.params
        ##    print 'aic:  ', res.aic
        ##    print 'bic:  ', res.bic
        ##    print 'llf:  ', res.llf
        ##    print 'R2    ', res.rsquared
        ##    print 'R2 adj', res.rsquared_adj

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx,:]).fit()
                    #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
                    prederr2 += (y0[outidx] -
                                 res_l1o.model.predict(res_l1o.params, fact[outidx,:]))**2.
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        self.best_nfact = np.r_[(np.argmin(results[:,1:3],0), np.argmax(results[:,3],0),
                     np.argmin(results[:,-1],0))]

    def summary_find_nfact(self):
        '''provides a summary for the selection of the number of factors

        Returns
        -------
        sumstr : str
            summary of the results for selecting the number of factors

        '''
        if not hasattr(self, 'results_find_nfact'):
            self.fit_find_nfact()


        results = self.results_find_nfact
        sumstr = ''
        sumstr += '\n' + 'Best result for k, by AIC, BIC, R2_adj, L1O'
#        best = np.r_[(np.argmin(results[:,1:3],0), np.argmax(results[:,3],0),
#                     np.argmin(results[:,-1],0))]

        sumstr += '\n' + ' '*19 + '%5d %4d %6d %5d' % tuple(self.best_nfact)

        from statsmodels.iolib.table import SimpleTable

        headers = 'k, AIC, BIC, R2_adj, L1O'.split(', ')
        numformat = ['%6d'] + ['%10.3f']*4 #'%10.4f'
        txt_fmt1 = dict(data_fmts = numformat)
        tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)

        sumstr += '\n' + "PCA regression on simulated data,"
        sumstr += '\n' + "DGP: 2 factors and 4 explanatory variables"
        sumstr += '\n' + tabl.__str__()
        sumstr += '\n' + "Notes: k is number of components of PCA,"
        sumstr += '\n' + "       constant is added additionally"
        sumstr += '\n' + "       k=0 means regression on constant only"
        sumstr += '\n' + "       L1O: sum of squared prediction errors for leave-one-out"
        return sumstr


if __name__ == '__main__':

    examples = [1]
    if 1 in examples:
        nobs = 500
        f0 = np.c_[np.random.normal(size=(nobs,2)), np.ones((nobs,1))]
        f2xcoef = np.c_[np.repeat(np.eye(2),2,0),np.arange(4)[::-1]].T
        f2xcoef = np.array([[ 1.,  1.,  0.,  0.],
                            [ 0.,  0.,  1.,  1.],
                            [ 3.,  2.,  1.,  0.]])
        f2xcoef = np.array([[ 0.1,  3.,  1.,    0.],
                            [ 0.,  0.,  1.5,   0.1],
                            [ 3.,  2.,  1.,    0.]])
        x0 = np.dot(f0, f2xcoef)
        x0 += 0.1*np.random.normal(size=x0.shape)
        ytrue = np.dot(f0,[1., 1., 1.])
        y0 = ytrue + 0.1*np.random.normal(size=ytrue.shape)

        mod = FactorModelUnivariate(y0, x0)
        print(mod.summary_find_nfact())
        print("with cross validation - slower")
        mod.fit_find_nfact(maxfact=None, skip_crossval=False, cv_iter=None)
        print(mod.summary_find_nfact())
alkaline-ml / statsmodels python

Version: 0.11.1

/ sandbox / datarich / factormodels.py

Products

About

Resources

Contact Gemfury