"""
Implementation of proportional hazards regression models for duration
data that may be censored ("Cox models").
References
----------
T Therneau (1996). Extending the Cox model. Technical report.
http://www.mayo.edu/research/documents/biostat-58pdf/DOC-10027288
G Rodriguez (2005). Non-parametric estimation in survival models.
http://data.princeton.edu/pop509/NonParametricSurvival.pdf
B Gillespie (2006). Checking the assumptions in the Cox proportional
hazards model.
http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf
"""
import numpy as np
from statsmodels.base import model
import statsmodels.base.model as base
from statsmodels.tools.decorators import cache_readonly
from statsmodels.compat.pandas import Appender
_predict_docstring = """
Returns predicted values from the proportional hazards
regression model.
Parameters
----------%(params_doc)s
exog : array_like
Data to use as `exog` in forming predictions. If not
provided, the `exog` values from the model used to fit the
data are used.%(cov_params_doc)s
endog : array_like
Duration (time) values at which the predictions are made.
Only used if pred_type is either 'cumhaz' or 'surv'. If
using model `exog`, defaults to model `endog` (time), but
may be provided explicitly to make predictions at
alternative times.
strata : array_like
A vector of stratum values used to form the predictions.
Not used (may be 'None') if pred_type is 'lhr' or 'hr'.
If `exog` is None, the model stratum values are used. If
`exog` is not None and pred_type is 'surv' or 'cumhaz',
stratum values must be provided (unless there is only one
stratum).
offset : array_like
Offset values used to create the predicted values.
pred_type : str
If 'lhr', returns log hazard ratios, if 'hr' returns
hazard ratios, if 'surv' returns the survival function, if
'cumhaz' returns the cumulative hazard function.
Returns
-------
A bunch containing two fields: `predicted_values` and
`standard_errors`.
Notes
-----
Standard errors are only returned when predicting the log
hazard ratio (pred_type is 'lhr').
Types `surv` and `cumhaz` require estimation of the cumulative
hazard function.
"""
_predict_params_doc = """
params : array_like
The proportional hazards model parameters."""
_predict_cov_params_docstring = """
cov_params : array_like
The covariance matrix of the estimated `params` vector,
used to obtain prediction errors if pred_type='lhr',
otherwise optional."""
class PHSurvivalTime(object):
def __init__(self, time, status, exog, strata=None, entry=None,
offset=None):
"""
Represent a collection of survival times with possible
stratification and left truncation.
Parameters
----------
time : array_like
The times at which either the event (failure) occurs or
the observation is censored.
status : array_like
Indicates whether the event (failure) occurs at `time`
(`status` is 1), or if `time` is a censoring time (`status`
is 0).
exog : array_like
The exogeneous (covariate) data matrix, cases are rows and
variables are columns.
strata : array_like
Grouping variable defining the strata. If None, all
observations are in a single stratum.
entry : array_like
Entry (left truncation) times. The observation is not
part of the risk set for times before the entry time. If
None, the entry time is treated as being zero, which
gives no left truncation. The entry time must be less
than or equal to `time`.
offset : array_like
An optional array of offsets
"""
# Default strata
if strata is None:
strata = np.zeros(len(time), dtype=np.int32)
# Default entry times
if entry is None:
entry = np.zeros(len(time))
# Parameter validity checks.
n1, n2, n3, n4 = len(time), len(status), len(strata),\
len(entry)
nv = [n1, n2, n3, n4]
if max(nv) != min(nv):
raise ValueError("endog, status, strata, and " +
"entry must all have the same length")
if min(time) < 0:
raise ValueError("endog must be non-negative")
if min(entry) < 0:
raise ValueError("entry time must be non-negative")
# In Stata, this is entry >= time, in R it is >.
if np.any(entry > time):
raise ValueError("entry times may not occur " +
"after event or censoring times")
# Get the row indices for the cases in each stratum
stu = np.unique(strata)
#sth = {x: [] for x in stu} # needs >=2.7
sth = dict([(x, []) for x in stu])
for i,k in enumerate(strata):
sth[k].append(i)
stratum_rows = [np.asarray(sth[k], dtype=np.int32) for k in stu]
stratum_names = stu
# Remove strata with no events
ix = [i for i,ix in enumerate(stratum_rows) if status[ix].sum() > 0]
self.nstrat_orig = len(stratum_rows)
stratum_rows = [stratum_rows[i] for i in ix]
stratum_names = [stratum_names[i] for i in ix]
# The number of strata
nstrat = len(stratum_rows)
self.nstrat = nstrat
# Remove subjects whose entry time occurs after the last event
# in their stratum.
for stx,ix in enumerate(stratum_rows):
last_failure = max(time[ix][status[ix] == 1])
# Stata uses < here, R uses <=
ii = [i for i,t in enumerate(entry[ix]) if
t <= last_failure]
stratum_rows[stx] = stratum_rows[stx][ii]
# Remove subjects who are censored before the first event in
# their stratum.
for stx,ix in enumerate(stratum_rows):
first_failure = min(time[ix][status[ix] == 1])
ii = [i for i,t in enumerate(time[ix]) if
t >= first_failure]
stratum_rows[stx] = stratum_rows[stx][ii]
# Order by time within each stratum
for stx,ix in enumerate(stratum_rows):
ii = np.argsort(time[ix])
stratum_rows[stx] = stratum_rows[stx][ii]
if offset is not None:
self.offset_s = []
for stx in range(nstrat):
self.offset_s.append(offset[stratum_rows[stx]])
else:
self.offset_s = None
# Number of informative subjects
self.n_obs = sum([len(ix) for ix in stratum_rows])
# Split everything by stratum
self.time_s = []
self.exog_s = []
self.status_s = []
self.entry_s = []
for ix in stratum_rows:
self.time_s.append(time[ix])
self.exog_s.append(exog[ix,:])
self.status_s.append(status[ix])
self.entry_s.append(entry[ix])
self.stratum_rows = stratum_rows
self.stratum_names = stratum_names
# Precalculate some indices needed to fit Cox models.
# Distinct failure times within a stratum are always taken to
# be sorted in ascending order.
#
# ufailt_ix[stx][k] is a list of indices for subjects who fail
# at the k^th sorted unique failure time in stratum stx
#
# risk_enter[stx][k] is a list of indices for subjects who
# enter the risk set at the k^th sorted unique failure time in
# stratum stx
#
# risk_exit[stx][k] is a list of indices for subjects who exit
# the risk set at the k^th sorted unique failure time in
# stratum stx
self.ufailt_ix, self.risk_enter, self.risk_exit, self.ufailt =\
[], [], [], []
for stx in range(self.nstrat):
# All failure times
ift = np.flatnonzero(self.status_s[stx] == 1)
ft = self.time_s[stx][ift]
# Unique failure times
uft = np.unique(ft)
nuft = len(uft)
# Indices of cases that fail at each unique failure time
#uft_map = {x:i for i,x in enumerate(uft)} # requires >=2.7
uft_map = dict([(x, i) for i,x in enumerate(uft)]) # 2.6
uft_ix = [[] for k in range(nuft)]
for ix,ti in zip(ift,ft):
uft_ix[uft_map[ti]].append(ix)
# Indices of cases (failed or censored) that enter the
# risk set at each unique failure time.
risk_enter1 = [[] for k in range(nuft)]
for i,t in enumerate(self.time_s[stx]):
ix = np.searchsorted(uft, t, "right") - 1
if ix >= 0:
risk_enter1[ix].append(i)
# Indices of cases (failed or censored) that exit the
# risk set at each unique failure time.
risk_exit1 = [[] for k in range(nuft)]
for i,t in enumerate(self.entry_s[stx]):
ix = np.searchsorted(uft, t)
risk_exit1[ix].append(i)
self.ufailt.append(uft)
self.ufailt_ix.append([np.asarray(x, dtype=np.int32)
for x in uft_ix])
self.risk_enter.append([np.asarray(x, dtype=np.int32)
for x in risk_enter1])
self.risk_exit.append([np.asarray(x, dtype=np.int32)
for x in risk_exit1])
class PHReg(model.LikelihoodModel):
"""
Cox Proportional Hazards Regression Model
The Cox PH Model is for right censored data.
Parameters
----------
endog : array_like
The observed times (event or censoring)
exog : 2D array_like
The covariates or exogeneous variables
status : array_like
The censoring status values; status=1 indicates that an
event occurred (e.g. failure or death), status=0 indicates
that the observation was right censored. If None, defaults
to status=1 for all cases.
entry : array_like
The entry times, if left truncation occurs
strata : array_like
Stratum labels. If None, all observations are taken to be
in a single stratum.
ties : str
The method used to handle tied times, must be either 'breslow'
or 'efron'.
offset : array_like
Array of offset values
missing : str
The method used to handle missing data
Notes
-----
Proportional hazards regression models should not include an
explicit or implicit intercept. The effect of an intercept is
not identified using the partial likelihood approach.
`endog`, `event`, `strata`, `entry`, and the first dimension
of `exog` all must have the same length
"""
def __init__(self, endog, exog, status=None, entry=None,
strata=None, offset=None, ties='breslow',
missing='drop', **kwargs):
# Default is no censoring
if status is None:
status = np.ones(len(endog))
super(PHReg, self).__init__(endog, exog, status=status,
entry=entry, strata=strata,
offset=offset, missing=missing,
**kwargs)
# endog and exog are automatically converted, but these are
# not
if self.status is not None:
self.status = np.asarray(self.status)
if self.entry is not None:
self.entry = np.asarray(self.entry)
if self.strata is not None:
self.strata = np.asarray(self.strata)
if self.offset is not None:
self.offset = np.asarray(self.offset)
self.surv = PHSurvivalTime(self.endog, self.status,
self.exog, self.strata,
self.entry, self.offset)
self.nobs = len(self.endog)
self.groups = None
# TODO: not used?
self.missing = missing
self.df_resid = (np.float(self.exog.shape[0] -
np.linalg.matrix_rank(self.exog)))
self.df_model = np.float(np.linalg.matrix_rank(self.exog))
ties = ties.lower()
if ties not in ("efron", "breslow"):
raise ValueError("`ties` must be either `efron` or " +
"`breslow`")
Loading ...