# statsmodels 0.11.1 -- sandbox/regression/ols_anova_original.py

''' convenience functions for ANOVA-type analysis with OLS

Note: the statistical results of the ANOVA are not checked; OLS itself is
checked, but not whether the reported results are the ones that should be
used in the ANOVA.

'''

import numpy as np
import numpy.lib.recfunctions

from statsmodels.compat.python import lmap
import statsmodels.api as sm


dt_b = np.dtype([('breed', int), ('sex', int), ('litter', int),
               ('pen', int), ('pig', int), ('age', float),
               ('bage', float), ('y', float)])
''' too much work using structured masked arrays
dta = np.mafromtxt('dftest3.data', dtype=dt_b)

dta_use = np.ma.column_stack([dta[col] for col in 'y sex age'.split()])
'''


dta = np.genfromtxt('dftest3.data')
print(dta.shape)
mask = np.isnan(dta)
print("rows with missing values", mask.any(1).sum())
# maps variable name -> (column index, dtype), matching the columns in the text file
vars = dict((v[0], (idx, v[1])) for idx, v in enumerate((('breed', int),
                                                         ('sex', int),
                                                         ('litter', int),
                                                         ('pen', int),
                                                         ('pig', int),
                                                         ('age', float),
                                                         ('bage', float),
                                                         ('y', float))))

datavarnames = 'y sex age'.split()
#possible to avoid temporary array ?
dta_use = dta[:, [vars[col][0] for col in datavarnames]]
keeprows = ~np.isnan(dta_use).any(1)
print('number of complete observations', keeprows.sum())
dta_used = dta_use[keeprows,:]

varsused = dict((k, [dta_used[:, idx], idx, vars[k][1]])
                for idx, k in enumerate(datavarnames))

# use function for dummy
#sexgroups = np.unique(dta_used[:,1])
#sexdummy = (dta_used[:,1][:, None] == sexgroups).astype(int)

def data2dummy(x, returnall=False):
    '''convert array of categories to dummy variables
    by default drops dummy variable for last category
    uses ravel, 1d only'''
    x = x.ravel()
    groups = np.unique(x)
    if returnall:
        return (x[:, None] == groups).astype(int)
    else:
        return (x[:, None] == groups).astype(int)[:,:-1]
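
# quick illustration with made-up labels (kept behind `if 0:` so it is not run):
# data2dummy maps a 1d label array to indicator columns, dropping the last level
if 0:
    x_demo = np.array([1, 2, 1, 3])
    print(data2dummy(x_demo))                  # 2 columns, for levels 1 and 2
    print(data2dummy(x_demo, returnall=True))  # all 3 levels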

def data2proddummy(x):
    '''creates product dummy variables from 2 columns of 2d array

    drops last dummy variable, but not from each category
    singular with simple dummy variable but not with constant

    quickly written, no safeguards

    '''
    #brute force, assumes x is 2d
    #replace with encoding if possible
    # unique rows of the two columns; axis=0 keeps the row structure
    # (without it np.unique flattens and the broadcasting below fails)
    groups = np.unique(lmap(tuple, x.tolist()), axis=0)
    #includes singularity with additive factors
    return (x == groups[:, None, :]).all(-1).T.astype(int)[:, :-1]
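
# quick illustration with made-up labels: each column of the result marks one
# combination of the two input columns, with the last combination dropped
if 0:
    a_demo = np.array([0, 0, 1, 1, 0])
    b_demo = np.array([0, 1, 0, 1, 1])
    print(data2proddummy(np.c_[a_demo, b_demo]))
    # columns for (0,0), (0,1), (1,0); (1,1) is dropped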

def data2groupcont(x1,x2):
    '''create interaction of dummy variables with a continuous variable

    Parameters
    ----------
    x1 : 1d array
        label or group array
    x2 : 1d array (float)
        continuous variable

    Notes
    -----
    useful for group specific slope coefficients in regression
    '''
    if x2.ndim == 1:
        x2 = x2[:,None]
    dummy = data2dummy(x1, returnall=True)
    return dummy * x2
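
# quick illustration with made-up data: one column per level of the label
# array, holding the continuous values within that group and zero elsewhere
if 0:
    g_demo = np.array([0, 0, 1, 1])
    z_demo = np.array([1.5, 2.0, 3.0, 4.0])
    print(data2groupcont(g_demo, z_demo))
    # rows: [1.5, 0], [2.0, 0], [0, 3.0], [0, 4.0]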

sexdummy = data2dummy(dta_used[:,1])
factors = ['sex']
for k in factors:
    varsused[k][0] = data2dummy(varsused[k][0])

products = [('sex', 'age')]
for k in products:
    varsused[''.join(k)] = data2proddummy(np.c_[varsused[k[0]][0],varsused[k[1]][0]])

# make dictionary of variables with dummies as one variable
#vars_to_use = {name: data or dummy variables}

X_b0 = np.c_[sexdummy, dta_used[:,2], np.ones((dta_used.shape[0],1))]
y_b0 = dta_used[:,0]
res_b0 = sm.OLS(y_b0, X_b0).fit()
print(res_b0.params)
print(res_b0.ssr)

anova_str0 = '''
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i        %(ess)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

anova_str = '''
ANOVA statistics (model sum of squares includes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i      %(ssmwithmean)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

#print(anova_str % dict([('df_model', res.df_model)])
#anovares = ['df_model' , 'df_resid'

def anovadict(res):
    '''update regression results dictionary with ANOVA specific statistics

    not checked for completeness
    '''
    ad = {}
    ad.update(res.__dict__)
    anova_attr = ['df_model', 'df_resid', 'ess', 'ssr','uncentered_tss',
                 'mse_model', 'mse_resid', 'mse_total', 'fvalue', 'f_pvalue',
                  'rsquared']
    for key in anova_attr:
        ad[key] = getattr(res, key)
    ad['nobs'] = res.model.nobs
    ad['ssmwithmean'] = res.uncentered_tss - res.ssr
    return ad


print(anova_str0 % anovadict(res_b0))
# the following leaves the constant in, which does not match the NIST regression examples,
# but something is fishy: res.ess comes out negative in some examples
print(anova_str % anovadict(res_b0))

print('using sex only')
X2 = np.c_[sexdummy, np.ones((dta_used.shape[0],1))]
res2 = sm.OLS(y_b0, X2).fit()
print(res2.params)
print(res2.ssr)
print(anova_str % anovadict(res2))

print('using age only')
X3 = np.c_[ dta_used[:,2], np.ones((dta_used.shape[0],1))]
res3 = sm.OLS(y_b0, X3).fit()
print(res3.params)
print(res3.ssr)
print(anova_str % anovadict(res3))


def form2design(ss, data):
    '''convert a string formula to a dictionary of design variables

    Parameters
    ----------
    ss : str
        space separated terms:

        * I : add a constant
        * varname : simple varnames are taken from the data as is
        * F:varname : create dummy variables for the factor varname
        * P:varname1*varname2 : create product dummy variables for
          the two varnames
        * G:varname1*varname2 : create the product between a factor and
          a continuous variable
    data : dict or structured array
        data set, access of variables by name as in dictionaries

    Returns
    -------
    vars : dictionary
        dictionary of variables with converted dummy variables
    names : list
        list of names; product (P:) and grouped continuous
        variables (G:) are named by joining the individual names,
        ordered according to the input

    Examples
    --------
    >>> xx, n = form2design('I a F:b P:c*d G:c*f', testdata)
    >>> sorted(xx.keys())
    ['a', 'b', 'cd', 'cf', 'const']
    >>> n
    ['const', 'a', 'b', 'cd', 'cf']

    Notes
    -----

    with an ordered dict, the separate name list would not be necessary
    '''
    vars = {}
    names = []
    for item in ss.split():
        if item == 'I':
            vars['const'] = np.ones(data.shape[0])
            names.append('const')
        elif ':' not in item:
            vars[item] = data[item]
            names.append(item)
        elif item[:2] == 'F:':
            v = item.split(':')[1]
            vars[v] = data2dummy(data[v])
            names.append(v)
        elif item[:2] == 'P:':
            v = item.split(':')[1].split('*')
            vars[''.join(v)] = data2proddummy(np.c_[data[v[0]],data[v[1]]])
            names.append(''.join(v))
        elif item[:2] == 'G:':
            v = item.split(':')[1].split('*')
            vars[''.join(v)] = data2groupcont(data[v[0]], data[v[1]])
            names.append(''.join(v))
        else:
            raise ValueError('unknown expression in formula')
    return vars, names

nobs = 1000
testdataint = np.random.randint(3, size=(nobs, 4)).view(
    [('a', int), ('b', int), ('c', int), ('d', int)])
testdatacont = np.random.normal(size=(nobs, 2)).view(
    [('e', float), ('f', float)])
# combined flat dtype description of both sources
# (equivalent to np.lib.recfunctions.zip_descr(..., flatten=True))
dt2 = testdataint.dtype.descr + testdatacont.dtype.descr
# concatenate structured arrays
testdata = np.empty((nobs,1), dt2)
for name in testdataint.dtype.names:
    testdata[name] = testdataint[name]
for name in testdatacont.dtype.names:
    testdata[name] = testdatacont[name]
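
# sanity check (illustrative, not run): the combined structured array exposes
# all six fields by name
if 0:
    print(testdata.dtype.names)    # ('a', 'b', 'c', 'd', 'e', 'f')
    print(testdata['a'].shape)     # (1000, 1)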


#print(form2design('a',testdata))

if 0:
    xx, n = form2design('F:a',testdata)
    print(xx)
    print(form2design('P:a*b',testdata))
    print(data2proddummy((np.c_[testdata['a'],testdata['b']])))

    xx, names = form2design('a F:b P:c*d',testdata)

#xx, names = form2design('I a F:b F:c F:d P:c*d',testdata)
xx, names = form2design('I a F:b P:c*d', testdata)
xx, names = form2design('I a F:b P:c*d G:a*e f', testdata)


X = np.column_stack([xx[nn] for nn in names])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01*np.random.normal(size=(nobs))
rest1 = sm.OLS(y, X).fit()
print(rest1.params)
print(anova_str % anovadict(rest1))

def dropname(ss, li):
    '''drop names from a list of strings,
    names to drop are in space delimited list
    does not change original list
    '''
    newli = li[:]
    for item in ss.split():
        newli.remove(item)
    return newli
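
# quick illustration: the names to drop are given as one space delimited string
if 0:
    print(dropname('ae f', ['const', 'a', 'b', 'cd', 'ae', 'f']))
    # -> ['const', 'a', 'b', 'cd']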

X = np.column_stack([xx[nn] for nn in dropname('ae f', names)])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01*np.random.normal(size=(nobs))
rest1 = sm.OLS(y, X).fit()
print(rest1.params)
print(anova_str % anovadict(rest1))


# Example: from Bruce
# -------------------

# read data set and drop rows with missing data
dta = np.genfromtxt('dftest3.data', dt_b, missing_values='.', usemask=True)
print('missing', [dta.mask[k].sum() for k in dta.dtype.names])
m = dta.mask.view(bool)
droprows = m.reshape(-1,len(dta.dtype.names)).any(1)
# get complete data as plain structured array
# maybe does not work with masked arrays
dta_use_b1 = dta[~droprows].data
print(dta_use_b1.shape)
print(dta_use_b1.dtype)

#Example b1: variables from Bruce's glm

# prepare data and dummy variables
xx_b1, names_b1 = form2design('I F:sex age', dta_use_b1)
# create design matrix
X_b1 = np.column_stack([xx_b1[nn] for nn in dropname('', names_b1)])
y_b1 = dta_use_b1['y']
# estimate using OLS
rest_b1 = sm.OLS(y_b1, X_b1).fit()
# print(results)
print(rest_b1.params)
print(anova_str % anovadict(rest_b1))
# for comparison: the earlier, manually constructed version (res_b0)
print(anova_str % anovadict(res_b0))

# Example: use all variables except pig identifier

allexog = ' '.join(dta.dtype.names[:-1])
#'breed sex litter pen pig age bage'  (not used below; the formula string is written out explicitly)

xx_b1a, names_b1a = form2design('I F:breed F:sex F:litter F:pen age bage', dta_use_b1)
X_b1a = np.column_stack([xx_b1a[nn] for nn in dropname('', names_b1a)])
y_b1a = dta_use_b1['y']
rest_b1a = sm.OLS(y_b1a, X_b1a).fit()
print(rest_b1a.params)
print(anova_str % anovadict(rest_b1a))

for dropn in names_b1a:
    print('\nResults dropping', dropn)
    X_b1a_ = np.column_stack([xx_b1a[nn] for nn in dropname(dropn, names_b1a)])
    y_b1a_ = dta_use_b1['y']
    rest_b1a_ = sm.OLS(y_b1a_, X_b1a_).fit()
    #print(rest_b1a_.params)
    print(anova_str % anovadict(rest_b1a_))