# statsmodels/sandbox/regression/ols_anova_original.py
''' convenience functions for ANOVA type analysis with OLS

Note: the statistical results of the ANOVA are not checked; the OLS
estimation is checked, but not whether the reported results are the
ones used in the ANOVA.

'''

from __future__ import print_function
import numpy as np

import statsmodels.api as sm


dt_b = np.dtype([('breed', int), ('sex', int), ('litter', int),
               ('pen', int), ('pig', int), ('age', float),
               ('bage', float), ('y', float)])
''' too much work using structured masked arrays
dta = np.mafromtxt('dftest3.data', dtype=dt_b)

dta_use = np.ma.column_stack([dta[col] for col in 'y sex age'.split()])
'''


dta = np.genfromtxt('dftest3.data')
print(dta.shape)
mask = np.isnan(dta)
print("rows with missing values", mask.any(1).sum())
vars = dict((v[0], (idx, v[1])) for idx, v in enumerate((('breed', int),
                                                         ('sex', int),
                                                         ('litter', int),
                                                         ('pen', int),
                                                         ('pig', int),
                                                         ('age', float),
                                                         ('bage', float),
                                                         ('y', float))))

datavarnames = 'y sex age'.split()
#possible to avoid temporary array ?
dta_use = dta[:, [vars[col][0] for col in datavarnames]]
keeprows = ~np.isnan(dta_use).any(1)
print('number of complete observations', keeprows.sum())
dta_used = dta_use[keeprows,:]

varsused = dict((k, [dta_used[:,idx], idx, vars[k][1]]) for idx, k in enumerate(datavarnames))

# use function for dummy
#sexgroups = np.unique(dta_used[:,1])
#sexdummy = (dta_used[:,1][:, None] == sexgroups).astype(int)

def data2dummy(x, returnall=False):
    '''convert an array of categories to dummy variables

    By default the dummy variable for the last category is dropped.
    Uses ravel, 1d only.'''
    x = x.ravel()
    groups = np.unique(x)
    if returnall:
        return (x[:, None] == groups).astype(int)
    else:
        return (x[:, None] == groups).astype(int)[:,:-1]
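
# Added illustration (not in the original script): a quick check of
# data2dummy on a toy array; the names below are for this example only.
# With three categories the default call returns two indicator columns
# (the last category is dropped), while returnall=True keeps all three.
if 0:
    _x_demo = np.array([0, 1, 2, 1])
    print(data2dummy(_x_demo))                  # shape (4, 2)
    print(data2dummy(_x_demo, returnall=True))  # shape (4, 3)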

def data2proddummy(x):
    '''create product dummy variables from 2 columns of a 2d array

    Drops the last dummy variable overall, but not one per category, so the
    result is singular in combination with the simple dummy variables, but
    not with a constant alone.

    quickly written, no safeguards

    '''
    #brute force, assumes x is 2d
    #replace with encoding if possible
    groups = np.unique(x, axis=0)   # unique rows, i.e. category combinations
    #includes singularity with additive factors
    return (x==groups[:,None,:]).all(-1).T.astype(int)[:,:-1]
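
# Added illustration (not in the original script): product dummies for a
# tiny two-column category array; _xp_demo is a made-up name.  There are
# four distinct (col1, col2) combinations, so three columns remain after
# the last one is dropped.
if 0:
    _xp_demo = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [0, 1]])
    print(data2proddummy(_xp_demo))   # shape (5, 3)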

def data2groupcont(x1,x2):
    '''create a group-specific continuous variable (dummy * continuous interaction)

    Parameters
    ----------
    x1 : 1d array
        label or group array
    x2 : 1d array (float)
        continuous variable

    Notes
    -----
    useful for group specific slope coefficients in regression
    '''
    if x2.ndim == 1:
        x2 = x2[:,None]
    dummy = data2dummy(x1, returnall=True)
    return dummy * x2
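
# Added illustration (not in the original script): a group-specific version
# of a continuous variable, i.e. the continuous column multiplied with the
# full set of group dummies; the names below are for this example only.
if 0:
    _g_demo = np.array([0, 0, 1, 1])
    _z_demo = np.array([1.5, 2.0, 3.0, 4.0])
    # one column per group: each row carries the continuous value in the
    # column of its group and zero elsewhere
    print(data2groupcont(_g_demo, _z_demo))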

sexdummy = data2dummy(dta_used[:,1])
factors = ['sex']
for k in factors:
    varsused[k][0] = data2dummy(varsused[k][0])

products = [('sex', 'age')]
for k in products:
    varsused[''.join(k)] = data2proddummy(np.c_[varsused[k[0]][0],varsused[k[1]][0]])

# make dictionary of variables with dummies as one variable
#vars_to_use = {name: data or dummy variables}

X_b0 = np.c_[sexdummy, dta_used[:,2], np.ones((dta_used.shape[0],1))]
y_b0 = dta_used[:,0]
res_b0 = sm.OLS(y_b0, X_b0).fit()
print(res_b0.params)
print(res_b0.ssr)

anova_str0 = '''
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i        %(ess)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

anova_str = '''
ANOVA statistics (model sum of squares includes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i      %(ssmwithmean)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

#print(anova_str % dict([('df_model', res.df_model)])
#anovares = ['df_model' , 'df_resid'

def anovadict(res):
    '''update regression results dictionary with ANOVA specific statistics

    not checked for completeness
    '''
    ad = {}
    ad.update(res.__dict__)
    anova_attr = ['df_model', 'df_resid', 'ess', 'ssr','uncentered_tss',
                 'mse_model', 'mse_resid', 'mse_total', 'fvalue', 'f_pvalue',
                  'rsquared']
    for key in anova_attr:
        ad[key] = getattr(res, key)
    ad['nobs'] = res.model.nobs
    ad['ssmwithmean'] = res.uncentered_tss - res.ssr
    return ad


print(anova_str0 % anovadict(res_b0))
# the following leaves the constant in, unlike the NIST regression convention,
# but something is fishy: res.ess is negative in some examples
print(anova_str % anovadict(res_b0))

print('using sex only')
X2 = np.c_[sexdummy, np.ones((dta_used.shape[0],1))]
res2 = sm.OLS(y_b0, X2).fit()
print(res2.params)
print(res2.ssr)
print(anova_str % anovadict(res2))

print('using age only')
X3 = np.c_[ dta_used[:,2], np.ones((dta_used.shape[0],1))]
res3 = sm.OLS(y_b0, X3).fit()
print(res3.params)
print(res3.ssr)
print(anova_str % anovadict(res3))


def form2design(ss, data):
    '''convert string formula to data dictionary

    Parameters
    ----------
    ss : string
     * I : add constant
     * varname : for simple varnames data is used as is
     * F:varname : create dummy variables for factor varname
     * P:varname1*varname2 : create product dummy variables for
       varnames
     * G:varname1*varname2 : create product between factor and
       continuous variable
    data : dict or structured array
       data set, access of variables by name as in dictionaries

    Returns
    -------
    vars : dictionary
        dictionary of variables with converted dummy variables
    names : list
        list of names, product (P:) and grouped continuous
        variables (G:) have name by joining individual names
        sorted according to input

    Examples
    --------
    >>> xx, n = form2design('I a F:b P:c*d G:c*f', testdata)
    >>> xx.keys()
    ['a', 'b', 'const', 'cf', 'cd']
    >>> n
    ['const', 'a', 'b', 'cd', 'cf']

    Notes
    -----

    with sorted dict, separate name list wouldn't be necessary
    '''
    vars = {}
    names = []
    for item in ss.split():
        if item == 'I':
            vars['const'] = np.ones(data.shape[0])
            names.append('const')
        elif ':' not in item:
            vars[item] = data[item]
            names.append(item)
        elif item[:2] == 'F:':
            v = item.split(':')[1]
            vars[v] = data2dummy(data[v])
            names.append(v)
        elif item[:2] == 'P:':
            v = item.split(':')[1].split('*')
            vars[''.join(v)] = data2proddummy(np.c_[data[v[0]],data[v[1]]])
            names.append(''.join(v))
        elif item[:2] == 'G:':
            v = item.split(':')[1].split('*')
            vars[''.join(v)] = data2groupcont(data[v[0]], data[v[1]])
            names.append(''.join(v))
        else:
            raise ValueError('unknown expression in formula')
    return vars, names
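
# Added illustration (not in the original script): a minimal form2design
# call on a small structured array; the field names 'g' and 'z' are chosen
# for this example only.
if 0:
    _demo = np.zeros(6, dtype=[('g', int), ('z', float)])
    _demo['g'] = [0, 1, 2, 0, 1, 2]
    _demo['z'] = np.arange(6.)
    _vars_demo, _names_demo = form2design('I z F:g', _demo)
    print(_names_demo)             # ['const', 'z', 'g']
    print(_vars_demo['g'].shape)   # (6, 2), last category dropped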

nobs = 1000
testdataint = np.random.randint(3, size=(nobs,4)).view([('a',int),('b',int),('c',int),('d',int)])
testdatacont = np.random.normal( size=(nobs,2)).view([('e',float), ('f',float)])
# combine the two flat structured dtypes into a single description
dt2 = testdataint.dtype.descr + testdatacont.dtype.descr
# concatenate structured arrays
testdata = np.empty((nobs,1), dt2)
for name in testdataint.dtype.names:
    testdata[name] = testdataint[name]
for name in testdatacont.dtype.names:
    testdata[name] = testdatacont[name]


#print(form2design('a',testdata))

if 0:
    xx, n = form2design('F:a',testdata)
    print(xx)
    print(form2design('P:a*b',testdata))
    print(data2proddummy((np.c_[testdata['a'],testdata['b']])))

    xx, names = form2design('a F:b P:c*d',testdata)

#xx, names = form2design('I a F:b F:c F:d P:c*d',testdata)
xx, names = form2design('I a F:b P:c*d', testdata)
xx, names = form2design('I a F:b P:c*d G:a*e f', testdata)


X = np.column_stack([xx[nn] for nn in names])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01*np.random.normal(size=(nobs))
rest1 = sm.OLS(y, X).fit()
print(rest1.params)
print(anova_str % anovadict(rest1))

def dropname(ss, li):
    '''drop names from a list of strings,
    names to drop are in a space delimited list
    does not change the original list
    '''
    newli = li[:]
    for item in ss.split():
        newli.remove(item)
    return newli
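
# Added illustration (not in the original script): dropname removes the
# space separated names without modifying the input list.
if 0:
    _li_demo = ['const', 'a', 'b', 'ae', 'f']
    print(dropname('ae f', _li_demo))   # ['const', 'a', 'b']
    print(_li_demo)                     # unchanged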

X = np.column_stack([xx[nn] for nn in dropname('ae f', names)])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01*np.random.normal(size=(nobs))
rest1 = sm.OLS(y, X).fit()
print(rest1.params)
print(anova_str % anovadict(rest1))


# Example: from Bruce
# -------------------

# read data set and drop rows with missing data
dta = np.genfromtxt('dftest3.data', dtype=dt_b, missing_values='.', usemask=True)
print('missing', [dta.mask[k].sum() for k in dta.dtype.names])
m = dta.mask.view(bool)
droprows = m.reshape(-1,len(dta.dtype.names)).any(1)
# get complete data as plain structured array
# maybe doesn't work with masked arrays
dta_use_b1 = dta[~droprows].data
print(dta_use_b1.shape)
print(dta_use_b1.dtype)

#Example b1: variables from Bruce's glm

# prepare data and dummy variables
xx_b1, names_b1 = form2design('I F:sex age', dta_use_b1)
# create design matrix
X_b1 = np.column_stack([xx_b1[nn] for nn in dropname('', names_b1)])
y_b1 = dta_use_b1['y']
# estimate using OLS
rest_b1 = sm.OLS(y_b1, X_b1).fit()
# print(results)
print(rest_b1.params)
print(anova_str % anovadict(rest_b1))
# for comparison, repeat the ANOVA table of the original version (res_b0)
print(anova_str % anovadict(res_b0))

# Example: use all variables except pig identifier

allexog = ' '.join(dta.dtype.names[:-1])  # all columns except the response 'y'
#'breed sex litter pen pig age bage'

xx_b1a, names_b1a = form2design('I F:breed F:sex F:litter F:pen age bage', dta_use_b1)
X_b1a = np.column_stack([xx_b1a[nn] for nn in dropname('', names_b1a)])
y_b1a = dta_use_b1['y']
rest_b1a = sm.OLS(y_b1a, X_b1a).fit()
print(rest_b1a.params)
print(anova_str % anovadict(rest_b1a))

for dropn in names_b1a:
    print('\nResults dropping', dropn)
    X_b1a_ = np.column_stack([xx_b1a[nn] for nn in dropname(dropn, names_b1a)])
    y_b1a_ = dta_use_b1['y']
    rest_b1a_ = sm.OLS(y_b1a_, X_b1a_).fit()
    #print(rest_b1a_.params)
    print(anova_str % anovadict(rest_b1a_))