Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

/ io / stata.py

"""
Module contains tools for processing Stata files into DataFrames

The StataReader below was originally written by Joe Presbrey as part of PyDTA.
It has been extended and improved by Skipper Seabold from the Statsmodels
project who also developed the StataWriter and was finally added to pandas in
a once again improved version.

You can find more information on http://presbrey.mit.edu/PyDTA and
http://www.statsmodels.org/devel/
"""

from collections import OrderedDict
import datetime
import os
import struct
import sys
import warnings

from dateutil.relativedelta import relativedelta
import numpy as np

from pandas._libs.lib import infer_dtype
from pandas._libs.tslibs import NaT, Timestamp
from pandas._libs.writers import max_len_string_array
from pandas.compat import (
    BytesIO, ResourceWarning, lmap, lrange, lzip, range, string_types,
    text_type, zip)
from pandas.util._decorators import Appender, deprecate_kwarg

from pandas.core.dtypes.common import (
    ensure_object, is_categorical_dtype, is_datetime64_dtype)

from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
from pandas.core.arrays import Categorical
from pandas.core.base import StringMixin
from pandas.core.frame import DataFrame
from pandas.core.series import Series

from pandas.io.common import (
    BaseIterator, _stringify_path, get_filepath_or_buffer)

_version_error = ("Version of given Stata file is not 104, 105, 108, "
                  "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
                  "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")

_statafile_processing_params1 = """\
convert_dates : boolean, defaults to True
    Convert date variables to DataFrame time values.
convert_categoricals : boolean, defaults to True
    Read value labels and convert columns to Categorical/Factor variables."""

_encoding_params = """\
encoding : string, None or encoding
    Encoding used to parse the files. None defaults to latin-1."""

_statafile_processing_params2 = """\
index_col : string, optional, default: None
    Column to set as index.
convert_missing : boolean, defaults to False
    Flag indicating whether to convert missing values to their Stata
    representations.  If False, missing values are replaced with nan.
    If True, columns containing missing values are returned with
    object data types and missing values are represented by
    StataMissingValue objects.
preserve_dtypes : boolean, defaults to True
    Preserve Stata datatypes. If False, numeric data are upcast to pandas
    default types for foreign data (float64 or int64).
columns : list or None
    Columns to retain.  Columns will be returned in the given order.  None
    returns all columns.
order_categoricals : boolean, defaults to True
    Flag indicating whether converted categorical data are ordered."""

_chunksize_params = """\
chunksize : int, default None
    Return StataReader object for iterations, returns chunks with
    given number of lines."""

_iterator_params = """\
iterator : boolean, default False
    Return StataReader object."""

_read_stata_doc = """
Read Stata file into DataFrame.

Parameters
----------
filepath_or_buffer : string or file-like object
    Path to .dta file or object implementing a binary read() functions.
%s
%s
%s
%s
%s

Returns
-------
DataFrame or StataReader

See Also
--------
pandas.io.stata.StataReader : Low-level reader for Stata data files.
pandas.DataFrame.to_stata: Export Stata data files.

Examples
--------
Read a Stata dta file:

>>> df = pd.read_stata('filename.dta')

Read a Stata dta file in 10,000 line chunks:

>>> itr = pd.read_stata('filename.dta', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)
""" % (_statafile_processing_params1, _encoding_params,
       _statafile_processing_params2, _chunksize_params,
       _iterator_params)

_data_method_doc = """\
Reads observations from Stata file, converting them into a dataframe

.. deprecated::
    This is a legacy method.  Use `read` in new code.

Parameters
----------
%s
%s

Returns
-------
DataFrame
""" % (_statafile_processing_params1, _statafile_processing_params2)

_read_method_doc = """\
Reads observations from Stata file, converting them into a dataframe

Parameters
----------
nrows : int
    Number of lines to read from data file, if None read whole file.
%s
%s

Returns
-------
DataFrame
""" % (_statafile_processing_params1, _statafile_processing_params2)


_stata_reader_doc = """\
Class for reading Stata dta files.

Parameters
----------
path_or_buf : path (string), buffer or path object
    string, path object (pathlib.Path or py._path.local.LocalPath) or object
    implementing a binary read() functions.

    .. versionadded:: 0.23.0 support for pathlib, py.path.
%s
%s
%s
%s
""" % (_statafile_processing_params1, _statafile_processing_params2,
       _encoding_params, _chunksize_params)


@Appender(_read_stata_doc)
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def read_stata(filepath_or_buffer, convert_dates=True,
               convert_categoricals=True, encoding=None, index_col=None,
               convert_missing=False, preserve_dtypes=True, columns=None,
               order_categoricals=True, chunksize=None, iterator=False):

    reader = StataReader(filepath_or_buffer,
                         convert_dates=convert_dates,
                         convert_categoricals=convert_categoricals,
                         index_col=index_col, convert_missing=convert_missing,
                         preserve_dtypes=preserve_dtypes,
                         columns=columns,
                         order_categoricals=order_categoricals,
                         chunksize=chunksize)

    if iterator or chunksize:
        data = reader
    else:
        try:
            data = reader.read()
        finally:
            reader.close()
    return data


_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]


stata_epoch = datetime.datetime(1960, 1, 1)


def _stata_elapsed_date_to_datetime_vec(dates, fmt):
    """
    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        The Stata Internal Format date to convert to datetime according to fmt
    fmt : str
        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
        Returns

    Returns
    -------
    converted : Series
        The converted dates

    Examples
    --------
    >>> dates = pd.Series([52])
    >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
    0   1961-01-01
    dtype: datetime64[ns]

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1 yearly
    date - ty
        years since 0000

    If you don't have pandas with datetime support, then you can't do
    milliseconds accurately.
    """
    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
    MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
    MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

    def convert_year_month_safe(year, month):
        """
        Convert year and month to datetimes, using pandas vectorized versions
        when the date range falls within the range supported by pandas.
        Otherwise it falls back to a slower but more robust method
        using datetime.
        """
        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
            return to_datetime(100 * year + month, format='%Y%m')
        else:
            index = getattr(year, 'index', None)
            return Series(
                [datetime.datetime(y, m, 1) for y, m in zip(year, month)],
                index=index)

    def convert_year_days_safe(year, days):
        """
        Converts year (e.g. 1999) and days since the start of the year to a
        datetime or datetime64 Series
        """
        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
            return (to_datetime(year, format='%Y') +
                    to_timedelta(days, unit='d'))
        else:
            index = getattr(year, 'index', None)
            value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
                     for y, d in zip(year, days)]
            return Series(value, index=index)

    def convert_delta_safe(base, deltas, unit):
        """
        Convert base dates and deltas to datetimes, using pandas vectorized
        versions if the deltas satisfy restrictions required to be expressed
        as dates in pandas.
        """
        index = getattr(deltas, 'index', None)
        if unit == 'd':
            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
                values = [base + relativedelta(days=int(d)) for d in deltas]
                return Series(values, index=index)
        elif unit == 'ms':
            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
                values = [base + relativedelta(microseconds=(int(d) * 1000))
                          for d in deltas]
                return Series(values, index=index)
        else:
            raise ValueError('format not understood')
        base = to_datetime(base)
        deltas = to_timedelta(deltas, unit=unit)
        return base + deltas

    # TODO: If/when pandas supports more than datetime64[ns], this should be
    # improved to use correct range, e.g. datetime[Y] for yearly
    bad_locs = np.isnan(dates)
    has_bad_values = False
    if bad_locs.any():
        has_bad_values = True
        data_col = Series(dates)
        data_col[bad_locs] = 1.0  # Replace with NaT
    dates = dates.astype(np.int64)

    if fmt.startswith(("%tc", "tc")):  # Delta ms relative to base
        base = stata_epoch
        ms = dates
        conv_dates = convert_delta_safe(base, ms, 'ms')
    elif fmt.startswith(("%tC", "tC")):

        warnings.warn("Encountered %tC format. Leaving in Stata "
                      "Internal Format.")
        conv_dates = Series(dates, dtype=np.object)
        if has_bad_values:
            conv_dates[bad_locs] = NaT
        return conv_dates
    # Delta days relative to base
    elif fmt.startswith(("%td", "td", "%d", "d")):
        base = stata_epoch
        days = dates
        conv_dates = convert_delta_safe(base, days, 'd')
    # does not count leap days - 7 days is a week.
    # 52nd week may have more than 7 days
    elif fmt.startswith(("%tw", "tw")):
        year = stata_epoch.year + dates // 52
        days = (dates % 52) * 7
        conv_dates = convert_year_days_safe(year, days)
    elif fmt.startswith(("%tm", "tm")):  # Delta months relative to base
        year = stata_epoch.year + dates // 12
        month = (dates % 12) + 1
Loading ...