Repository URL to install this package:
|
Version:
0.10.2 ▾
|
"""
Assesment of Generalized Estimating Equations using simulation.
This script checks ordinal and nominal models for multinomial data.
See the generated file "gee_categorical_simulation_check.txt" for
results.
"""
from statsmodels.compat.python import lrange
import numpy as np
from scipy import stats
from statsmodels.genmod.generalized_estimating_equations import GEE,\
gee_setup_ordinal, gee_setup_nominal,\
gee_ordinal_starting_values, Multinomial
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.cov_struct import GlobalOddsRatio
from .gee_gaussian_simulation_check import GEE_simulator
class ordinal_simulator(GEE_simulator):
# The thresholds where the latent continuous process is cut to
# obtain the categorical values.
thresholds = None
def true_params(self):
return np.concatenate((self.thresholds, self.params))
def starting_values(self, nconstraints):
beta = gee_ordinal_starting_values(self.endog,
len(self.params))
if nconstraints > 0:
m = self.exog_ex.shape[1] - nconstraints
beta = beta[0:m]
return beta
def print_dparams(self, dparams_est):
OUT.write("Odds ratio estimate: %8.4f\n" % dparams_est[0])
OUT.write("Odds ratio truth: %8.4f\n" %
self.dparams[0])
OUT.write("\n")
def simulate(self):
endog, exog, group, time = [], [], [], []
for i in range(self.ngroups):
gsize = np.random.randint(self.group_size_range[0],
self.group_size_range[1])
group.append([i,] * gsize)
time1 = np.random.normal(size=(gsize,2))
time.append(time1)
exog1 = np.random.normal(size=(gsize, len(self.params)))
exog.append(exog1)
lp = np.dot(exog1, self.params)
z = np.random.uniform(size=gsize)
z = np.log(z / (1 - z)) + lp
endog1 = np.array([np.sum(x > self.thresholds) for x in z])
endog.append(endog1)
self.exog = np.concatenate(exog, axis=0)
self.endog = np.concatenate(endog)
self.time = np.concatenate(time, axis=0)
self.group = np.concatenate(group)
self.offset = np.zeros(len(self.endog), dtype=np.float64)
class nominal_simulator(GEE_simulator):
def starting_values(self, nconstraints):
return None
def true_params(self):
return np.concatenate(self.params[:-1])
def print_dparams(self, dparams_est):
OUT.write("Odds ratio estimate: %8.4f\n" % dparams_est[0])
OUT.write("Odds ratio truth: %8.4f\n" % self.dparams[0])
OUT.write("\n")
def simulate(self):
endog, exog, group, time = [], [], [], []
for i in range(self.ngroups):
gsize = np.random.randint(self.group_size_range[0],
self.group_size_range[1])
group.append([i,] * gsize)
time1 = np.random.normal(size=(gsize,2))
time.append(time1)
exog1 = np.random.normal(size=(gsize, len(self.params[0])))
exog.append(exog1)
# Probabilities for each outcome
prob = [np.exp(np.dot(exog1, p)) for p in self.params]
prob = np.vstack(prob).T
prob /= prob.sum(1)[:, None]
m = len(self.params)
endog1 = []
for k in range(gsize):
pdist = stats.rv_discrete(values=(lrange(m),
prob[k,:]))
endog1.append(pdist.rvs())
endog.append(np.asarray(endog1))
self.exog = np.concatenate(exog, axis=0)
self.endog = np.concatenate(endog).astype(np.int32)
self.time = np.concatenate(time, axis=0)
self.group = np.concatenate(group)
self.offset = np.zeros(len(self.endog), dtype=np.float64)
def gendat_ordinal():
os = ordinal_simulator()
os.params = np.r_[0., 1]
os.ngroups = 200
os.thresholds = [1, 0, -1]
os.dparams = [1.,]
os.simulate()
data = np.concatenate((os.endog[:,None], os.exog,
os.group[:,None]), axis=1)
os.endog_ex, os.exog_ex, os.intercepts, os.nthresh = \
gee_setup_ordinal(data, 0)
os.group_ex = os.exog_ex[:,-1]
os.exog_ex = os.exog_ex[:,0:-1]
os.exog_ex = np.concatenate((os.intercepts, os.exog_ex),
axis=1)
va = GlobalOddsRatio(4, "ordinal")
lhs = np.array([[0., 0., 0, 1., 0.], [0., 0, 0, 0, 1]])
rhs = np.r_[0., 1]
return os, va, Binomial(), (lhs, rhs)
def gendat_nominal():
ns = nominal_simulator()
# The last component of params must be identically zero
ns.params = [np.r_[0., 1], np.r_[-1., 0], np.r_[0., 0]]
ns.ngroups = 200
ns.dparams = [1., ]
ns.simulate()
data = np.concatenate((ns.endog[:,None], ns.exog,
ns.group[:,None]), axis=1)
ns.endog_ex, ns.exog_ex, ns.exog_ne, ns.nlevel = \
gee_setup_nominal(data, 0, [3,])
ns.group_ex = ns.exog_ne[:,0]
va = GlobalOddsRatio(3, "nominal")
lhs = np.array([[0., 1., 1, 0],])
rhs = np.r_[0.,]
return ns, va, Multinomial(3), (lhs, rhs)
if __name__ == '__main__':
nrep = 100
OUT = open("gee_categorical_simulation_check.txt", "w")
np.set_printoptions(formatter={'all': lambda x: "%8.3f" % x},
suppress=True)
# Loop over data generating models
gendats = [gendat_nominal, gendat_ordinal]
for jg,gendat in enumerate(gendats):
dparams = []
params = []
std_errors = []
pvalues = []
for j in range(nrep):
da, va, mt, constraint = gendat()
beta = da.starting_values(0)
md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None,
mt, va)
mdf = md.fit(start_params = beta)
if mdf is None:
continue
scale_inv = 1 / md.estimate_scale()
dparams.append(np.r_[va.dparams, scale_inv])
params.append(np.asarray(mdf.params))
std_errors.append(np.asarray(mdf.standard_errors))
da, va, mt, constraint = gendat()
beta = da.starting_values(constraint[0].shape[0])
md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None,
mt, va, constraint=constraint)
mdf = md.fit(start_params = beta)
if mdf is None:
continue
score = md.score_test_results
pvalues.append(score["p-value"])
dparams_mean = np.array(sum(dparams) / len(dparams))
OUT.write("%s data.\n" % ("Nominal", "Ordinal")[jg])
OUT.write("%d runs converged successfully.\n" % len(pvalues))
OUT.write("Checking dependence parameters:\n")
da.print_dparams(dparams_mean)
params = np.array(params)
eparams = params.mean(0)
sdparams = params.std(0)
std_errors = np.array(std_errors)
std_errors = std_errors.mean(0)
true_params = da.true_params()
OUT.write("Checking parameter values:\n")
OUT.write("Observed: ")
OUT.write(np.array_str(eparams) + "\n")
OUT.write("Expected: ")
OUT.write(np.array_str(true_params) + "\n")
OUT.write("Absolute difference: ")
OUT.write(np.array_str(eparams - true_params) + "\n")
OUT.write("Relative difference: ")
OUT.write(np.array_str((eparams - true_params) / true_params)
+ "\n")
OUT.write("\n")
OUT.write("Checking standard errors:\n")
OUT.write("Observed: ")
OUT.write(np.array_str(sdparams) + "\n")
OUT.write("Expected: ")
OUT.write(np.array_str(std_errors) + "\n")
OUT.write("Absolute difference: ")
OUT.write(np.array_str(sdparams - std_errors) + "\n")
OUT.write("Relative difference: ")
OUT.write(np.array_str((sdparams - std_errors) / std_errors)
+ "\n")
OUT.write("\n")
OUT.write("Checking constrained estimation:\n")
OUT.write("Observed Expected\n")
pvalues.sort()
for q in np.arange(0.1, 0.91, 0.1):
OUT.write("%10.3f %10.3f\n" %
(pvalues[int(q*len(pvalues))], q))
OUT.write("=" * 80 + "\n\n")
OUT.close()