Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / statsmodels   python

Repository URL to install this package:

Version: 0.11.1 

/ datasets / china_smoking / data.py

"""Smoking and lung cancer in eight cities in China."""
from statsmodels.datasets import utils as du

# Markup format used by the docstrings in this module.
__docformat__ = 'restructuredtext'

# --- Dataset metadata -------------------------------------------------------
# Plain string constants describing the dataset; none of them is read by the
# loader functions in this module.

COPYRIGHT = "Intern. J. Epidemiol. (1992)"

# The title is taken from the module docstring.
TITLE = __doc__

SOURCE = (
    "\n"
    "Transcribed from Z. Liu, Smoking and Lung Cancer Incidence in China,\n"
    "Intern. J. Epidemiol., 21:197-201, (1992).\n"
)

DESCRSHORT = "Co-occurrence of lung cancer and smoking in 8 Chinese cities."

DESCRLONG = (
    "This is a series of 8 2x2 contingency tables showing the co-occurrence\n"
    "of lung cancer and smoking in 8 Chinese cities.\n"
)

# NOTE(review): load_pandas() indexes the loaded data on a 'Location' column,
# which is not one of the variable names listed below -- confirm the variable
# list against china_smoking.csv.
NOTE = """::

    Number of Observations - 8
    Number of Variables - 3
    Variable name definitions::

        city_name - name of the city
        smoking - yes or no, according to a person's smoking behavior
        lung_cancer - yes or no, according to a person's lung cancer status
"""


def load_pandas():
    """
    Build the China smoking/lung cancer Dataset.

    The data are obtained with ``du.load_csv(__file__, 'china_smoking.csv')``.
    A copy indexed by the ``Location`` column is passed to the Dataset as
    ``data``, and the object returned by ``load_csv`` is attached to the
    result as ``raw_data``.

    Returns
    -------
    Dataset
        See DATASET_PROPOSAL.txt for more information.
    """
    frame = du.load_csv(__file__, 'china_smoking.csv')
    dataset = du.Dataset(
        data=frame.set_index('Location'),
        title="Smoking and lung cancer in Chinese regions",
    )
    # Keep the frame as loaded (before indexing) available to callers.
    dataset.raw_data = frame
    return dataset


def load(as_pandas=None):
    """
    Load the China smoking/lung cancer data and return a Dataset class.

    The dataset produced by ``load_pandas`` is passed through
    ``du.as_numpy_dataset`` with ``retain_index=True``.

    Parameters
    ----------
    as_pandas : bool, optional
        Flag indicating whether to return pandas DataFrames and Series
        or numpy recarrays and arrays.  If True, returns pandas.  The value
        (including the default ``None``) is forwarded unchanged to
        ``du.as_numpy_dataset``.

    Returns
    -------
    Dataset
        See DATASET_PROPOSAL.txt for more information.
    """
    pandas_dataset = load_pandas()
    return du.as_numpy_dataset(
        pandas_dataset,
        as_pandas=as_pandas,
        retain_index=True,
    )