Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ duration / hazard_regression.py

"""
Implementation of proportional hazards regression models for duration
data that may be censored ("Cox models").

References
----------
T Therneau (1996).  Extending the Cox model.  Technical report.
http://www.mayo.edu/research/documents/biostat-58pdf/DOC-10027288

G Rodriguez (2005).  Non-parametric estimation in survival models.
http://data.princeton.edu/pop509/NonParametricSurvival.pdf

B Gillespie (2006).  Checking the assumptions in the Cox proportional
hazards model.
http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf
"""
import numpy as np

from statsmodels.base import model
import statsmodels.base.model as base
from statsmodels.tools.decorators import cache_readonly
from statsmodels.compat.pandas import Appender


_predict_docstring = """
    Returns predicted values from the proportional hazards
    regression model.

    Parameters
    ----------%(params_doc)s
    exog : array_like
        Data to use as `exog` in forming predictions.  If not
        provided, the `exog` values from the model used to fit the
        data are used.%(cov_params_doc)s
    endog : array_like
        Duration (time) values at which the predictions are made.
        Only used if pred_type is either 'cumhaz' or 'surv'.  If
        using model `exog`, defaults to model `endog` (time), but
        may be provided explicitly to make predictions at
        alternative times.
    strata : array_like
        A vector of stratum values used to form the predictions.
        Not used (may be 'None') if pred_type is 'lhr' or 'hr'.
        If `exog` is None, the model stratum values are used.  If
        `exog` is not None and pred_type is 'surv' or 'cumhaz',
        stratum values must be provided (unless there is only one
        stratum).
    offset : array_like
        Offset values used to create the predicted values.
    pred_type : str
        If 'lhr', returns log hazard ratios, if 'hr' returns
        hazard ratios, if 'surv' returns the survival function, if
        'cumhaz' returns the cumulative hazard function.

    Returns
    -------
    A bunch containing two fields: `predicted_values` and
    `standard_errors`.

    Notes
    -----
    Standard errors are only returned when predicting the log
    hazard ratio (pred_type is 'lhr').

    Types `surv` and `cumhaz` require estimation of the cumulative
    hazard function.
"""

_predict_params_doc = """
    params : array_like
        The proportional hazards model parameters."""

_predict_cov_params_docstring = """
    cov_params : array_like
        The covariance matrix of the estimated `params` vector,
        used to obtain prediction errors if pred_type='lhr',
        otherwise optional."""



class PHSurvivalTime(object):

    def __init__(self, time, status, exog, strata=None, entry=None,
                 offset=None):
        """
        Represent a collection of survival times with possible
        stratification and left truncation.

        Parameters
        ----------
        time : array_like
            The times at which either the event (failure) occurs or
            the observation is censored.
        status : array_like
            Indicates whether the event (failure) occurs at `time`
            (`status` is 1), or if `time` is a censoring time (`status`
            is 0).
        exog : array_like
            The exogeneous (covariate) data matrix, cases are rows and
            variables are columns.
        strata : array_like
            Grouping variable defining the strata.  If None, all
            observations are in a single stratum.
        entry : array_like
            Entry (left truncation) times.  The observation is not
            part of the risk set for times before the entry time.  If
            None, the entry time is treated as being zero, which
            gives no left truncation.  The entry time must be less
            than or equal to `time`.
        offset : array_like
            An optional array of offsets
        """

        # Default strata
        if strata is None:
            strata = np.zeros(len(time), dtype=np.int32)

        # Default entry times
        if entry is None:
            entry = np.zeros(len(time))

        # Parameter validity checks.
        n1, n2, n3, n4 = len(time), len(status), len(strata),\
            len(entry)
        nv = [n1, n2, n3, n4]
        if max(nv) != min(nv):
            raise ValueError("endog, status, strata, and " +
                             "entry must all have the same length")
        if min(time) < 0:
            raise ValueError("endog must be non-negative")
        if min(entry) < 0:
            raise ValueError("entry time must be non-negative")

        # In Stata, this is entry >= time, in R it is >.
        if np.any(entry > time):
            raise ValueError("entry times may not occur " +
                             "after event or censoring times")

        # Get the row indices for the cases in each stratum
        stu = np.unique(strata)
        #sth = {x: [] for x in stu} # needs >=2.7
        sth = dict([(x, []) for x in stu])
        for i,k in enumerate(strata):
            sth[k].append(i)
        stratum_rows = [np.asarray(sth[k], dtype=np.int32) for k in stu]
        stratum_names = stu

        # Remove strata with no events
        ix = [i for i,ix in enumerate(stratum_rows) if status[ix].sum() > 0]
        self.nstrat_orig = len(stratum_rows)
        stratum_rows = [stratum_rows[i] for i in ix]
        stratum_names = [stratum_names[i] for i in ix]

        # The number of strata
        nstrat = len(stratum_rows)
        self.nstrat = nstrat

        # Remove subjects whose entry time occurs after the last event
        # in their stratum.
        for stx,ix in enumerate(stratum_rows):
            last_failure = max(time[ix][status[ix] == 1])

            # Stata uses < here, R uses <=
            ii = [i for i,t in enumerate(entry[ix]) if
                  t <= last_failure]
            stratum_rows[stx] = stratum_rows[stx][ii]

        # Remove subjects who are censored before the first event in
        # their stratum.
        for stx,ix in enumerate(stratum_rows):
            first_failure = min(time[ix][status[ix] == 1])

            ii = [i for i,t in enumerate(time[ix]) if
                  t >= first_failure]
            stratum_rows[stx] = stratum_rows[stx][ii]

        # Order by time within each stratum
        for stx,ix in enumerate(stratum_rows):
            ii = np.argsort(time[ix])
            stratum_rows[stx] = stratum_rows[stx][ii]

        if offset is not None:
            self.offset_s = []
            for stx in range(nstrat):
                self.offset_s.append(offset[stratum_rows[stx]])
        else:
            self.offset_s = None

        # Number of informative subjects
        self.n_obs = sum([len(ix) for ix in stratum_rows])

        # Split everything by stratum
        self.time_s = []
        self.exog_s = []
        self.status_s = []
        self.entry_s = []
        for ix in stratum_rows:
            self.time_s.append(time[ix])
            self.exog_s.append(exog[ix,:])
            self.status_s.append(status[ix])
            self.entry_s.append(entry[ix])

        self.stratum_rows = stratum_rows
        self.stratum_names = stratum_names

        # Precalculate some indices needed to fit Cox models.
        # Distinct failure times within a stratum are always taken to
        # be sorted in ascending order.
        #
        # ufailt_ix[stx][k] is a list of indices for subjects who fail
        # at the k^th sorted unique failure time in stratum stx
        #
        # risk_enter[stx][k] is a list of indices for subjects who
        # enter the risk set at the k^th sorted unique failure time in
        # stratum stx
        #
        # risk_exit[stx][k] is a list of indices for subjects who exit
        # the risk set at the k^th sorted unique failure time in
        # stratum stx
        self.ufailt_ix, self.risk_enter, self.risk_exit, self.ufailt =\
            [], [], [], []

        for stx in range(self.nstrat):

            # All failure times
            ift = np.flatnonzero(self.status_s[stx] == 1)
            ft = self.time_s[stx][ift]

            # Unique failure times
            uft = np.unique(ft)
            nuft = len(uft)

            # Indices of cases that fail at each unique failure time
            #uft_map = {x:i for i,x in enumerate(uft)} # requires >=2.7
            uft_map = dict([(x, i) for i,x in enumerate(uft)]) # 2.6
            uft_ix = [[] for k in range(nuft)]
            for ix,ti in zip(ift,ft):
                uft_ix[uft_map[ti]].append(ix)

            # Indices of cases (failed or censored) that enter the
            # risk set at each unique failure time.
            risk_enter1 = [[] for k in range(nuft)]
            for i,t in enumerate(self.time_s[stx]):
                ix = np.searchsorted(uft, t, "right") - 1
                if ix >= 0:
                    risk_enter1[ix].append(i)

            # Indices of cases (failed or censored) that exit the
            # risk set at each unique failure time.
            risk_exit1 = [[] for k in range(nuft)]
            for i,t in enumerate(self.entry_s[stx]):
                ix = np.searchsorted(uft, t)
                risk_exit1[ix].append(i)

            self.ufailt.append(uft)
            self.ufailt_ix.append([np.asarray(x, dtype=np.int32)
                                   for x in uft_ix])
            self.risk_enter.append([np.asarray(x, dtype=np.int32)
                                    for x in risk_enter1])
            self.risk_exit.append([np.asarray(x, dtype=np.int32)
                                   for x in risk_exit1])


class PHReg(model.LikelihoodModel):
    """
    Cox Proportional Hazards Regression Model

    The Cox PH Model is for right censored data.

    Parameters
    ----------
    endog : array_like
        The observed times (event or censoring)
    exog : 2D array_like
        The covariates or exogeneous variables
    status : array_like
        The censoring status values; status=1 indicates that an
        event occurred (e.g. failure or death), status=0 indicates
        that the observation was right censored. If None, defaults
        to status=1 for all cases.
    entry : array_like
        The entry times, if left truncation occurs
    strata : array_like
        Stratum labels.  If None, all observations are taken to be
        in a single stratum.
    ties : str
        The method used to handle tied times, must be either 'breslow'
        or 'efron'.
    offset : array_like
        Array of offset values
    missing : str
        The method used to handle missing data

    Notes
    -----
    Proportional hazards regression models should not include an
    explicit or implicit intercept.  The effect of an intercept is
    not identified using the partial likelihood approach.

    `endog`, `event`, `strata`, `entry`, and the first dimension
    of `exog` all must have the same length
    """

    def __init__(self, endog, exog, status=None, entry=None,
                 strata=None, offset=None, ties='breslow',
                 missing='drop', **kwargs):

        # Default is no censoring
        if status is None:
            status = np.ones(len(endog))

        super(PHReg, self).__init__(endog, exog, status=status,
                                    entry=entry, strata=strata,
                                    offset=offset, missing=missing,
                                    **kwargs)

        # endog and exog are automatically converted, but these are
        # not
        if self.status is not None:
            self.status = np.asarray(self.status)
        if self.entry is not None:
            self.entry = np.asarray(self.entry)
        if self.strata is not None:
            self.strata = np.asarray(self.strata)
        if self.offset is not None:
            self.offset = np.asarray(self.offset)

        self.surv = PHSurvivalTime(self.endog, self.status,
                                    self.exog, self.strata,
                                    self.entry, self.offset)
        self.nobs = len(self.endog)
        self.groups = None

        # TODO: not used?
        self.missing = missing

        self.df_resid = (np.float(self.exog.shape[0] -
                                  np.linalg.matrix_rank(self.exog)))
        self.df_model = np.float(np.linalg.matrix_rank(self.exog))

        ties = ties.lower()
        if ties not in ("efron", "breslow"):
            raise ValueError("`ties` must be either `efron` or " +
                             "`breslow`")
Loading ...