"""
Module contains tools for processing Stata files into DataFrames
The StataReader below was originally written by Joe Presbrey as part of PyDTA.
It has been extended and improved by Skipper Seabold from the Statsmodels
project who also developed the StataWriter and was finally added to pandas in
a once again improved version.
You can find more information on http://presbrey.mit.edu/PyDTA and
http://www.statsmodels.org/devel/
"""
from collections import OrderedDict
import datetime
import os
import struct
import sys
import warnings
from dateutil.relativedelta import relativedelta
import numpy as np
from pandas._libs.lib import infer_dtype
from pandas._libs.tslibs import NaT, Timestamp
from pandas._libs.writers import max_len_string_array
from pandas.compat import (
BytesIO, ResourceWarning, lmap, lrange, lzip, range, string_types,
text_type, zip)
from pandas.util._decorators import Appender, deprecate_kwarg
from pandas.core.dtypes.common import (
ensure_object, is_categorical_dtype, is_datetime64_dtype)
from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
from pandas.core.arrays import Categorical
from pandas.core.base import StringMixin
from pandas.core.frame import DataFrame
from pandas.core.series import Series
from pandas.io.common import (
BaseIterator, _stringify_path, get_filepath_or_buffer)
_version_error = ("Version of given Stata file is not 104, 105, 108, "
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
_statafile_processing_params1 = """\
convert_dates : boolean, defaults to True
Convert date variables to DataFrame time values.
convert_categoricals : boolean, defaults to True
Read value labels and convert columns to Categorical/Factor variables."""
_encoding_params = """\
encoding : string, None or encoding
Encoding used to parse the files. None defaults to latin-1."""
_statafile_processing_params2 = """\
index_col : string, optional, default: None
Column to set as index.
convert_missing : boolean, defaults to False
Flag indicating whether to convert missing values to their Stata
representations. If False, missing values are replaced with nan.
If True, columns containing missing values are returned with
object data types and missing values are represented by
StataMissingValue objects.
preserve_dtypes : boolean, defaults to True
Preserve Stata datatypes. If False, numeric data are upcast to pandas
default types for foreign data (float64 or int64).
columns : list or None
Columns to retain. Columns will be returned in the given order. None
returns all columns.
order_categoricals : boolean, defaults to True
Flag indicating whether converted categorical data are ordered."""
_chunksize_params = """\
chunksize : int, default None
Return StataReader object for iterations, returns chunks with
given number of lines."""
_iterator_params = """\
iterator : boolean, default False
Return StataReader object."""
_read_stata_doc = """
Read Stata file into DataFrame.
Parameters
----------
filepath_or_buffer : string or file-like object
Path to .dta file or object implementing a binary read() functions.
%s
%s
%s
%s
%s
Returns
-------
DataFrame or StataReader
See Also
--------
pandas.io.stata.StataReader : Low-level reader for Stata data files.
pandas.DataFrame.to_stata: Export Stata data files.
Examples
--------
Read a Stata dta file:
>>> df = pd.read_stata('filename.dta')
Read a Stata dta file in 10,000 line chunks:
>>> itr = pd.read_stata('filename.dta', chunksize=10000)
>>> for chunk in itr:
... do_something(chunk)
""" % (_statafile_processing_params1, _encoding_params,
_statafile_processing_params2, _chunksize_params,
_iterator_params)
_data_method_doc = """\
Reads observations from Stata file, converting them into a dataframe
.. deprecated::
This is a legacy method. Use `read` in new code.
Parameters
----------
%s
%s
Returns
-------
DataFrame
""" % (_statafile_processing_params1, _statafile_processing_params2)
_read_method_doc = """\
Reads observations from Stata file, converting them into a dataframe
Parameters
----------
nrows : int
Number of lines to read from data file, if None read whole file.
%s
%s
Returns
-------
DataFrame
""" % (_statafile_processing_params1, _statafile_processing_params2)
_stata_reader_doc = """\
Class for reading Stata dta files.
Parameters
----------
path_or_buf : path (string), buffer or path object
string, path object (pathlib.Path or py._path.local.LocalPath) or object
implementing a binary read() functions.
.. versionadded:: 0.23.0 support for pathlib, py.path.
%s
%s
%s
%s
""" % (_statafile_processing_params1, _statafile_processing_params2,
_encoding_params, _chunksize_params)
@Appender(_read_stata_doc)
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
def read_stata(filepath_or_buffer, convert_dates=True,
convert_categoricals=True, encoding=None, index_col=None,
convert_missing=False, preserve_dtypes=True, columns=None,
order_categoricals=True, chunksize=None, iterator=False):
reader = StataReader(filepath_or_buffer,
convert_dates=convert_dates,
convert_categoricals=convert_categoricals,
index_col=index_col, convert_missing=convert_missing,
preserve_dtypes=preserve_dtypes,
columns=columns,
order_categoricals=order_categoricals,
chunksize=chunksize)
if iterator or chunksize:
data = reader
else:
try:
data = reader.read()
finally:
reader.close()
return data
_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
stata_epoch = datetime.datetime(1960, 1, 1)
def _stata_elapsed_date_to_datetime_vec(dates, fmt):
"""
Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
Parameters
----------
dates : Series
The Stata Internal Format date to convert to datetime according to fmt
fmt : str
The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
Returns
Returns
-------
converted : Series
The converted dates
Examples
--------
>>> dates = pd.Series([52])
>>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
0 1961-01-01
dtype: datetime64[ns]
Notes
-----
datetime/c - tc
milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
datetime/C - tC - NOT IMPLEMENTED
milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
date - td
days since 01jan1960 (01jan1960 = 0)
weekly date - tw
weeks since 1960w1
This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
The datetime value is the start of the week in terms of days in the
year, not ISO calendar weeks.
monthly date - tm
months since 1960m1
quarterly date - tq
quarters since 1960q1
half-yearly date - th
half-years since 1960h1 yearly
date - ty
years since 0000
If you don't have pandas with datetime support, then you can't do
milliseconds accurately.
"""
MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
def convert_year_month_safe(year, month):
"""
Convert year and month to datetimes, using pandas vectorized versions
when the date range falls within the range supported by pandas.
Otherwise it falls back to a slower but more robust method
using datetime.
"""
if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
return to_datetime(100 * year + month, format='%Y%m')
else:
index = getattr(year, 'index', None)
return Series(
[datetime.datetime(y, m, 1) for y, m in zip(year, month)],
index=index)
def convert_year_days_safe(year, days):
"""
Converts year (e.g. 1999) and days since the start of the year to a
datetime or datetime64 Series
"""
if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
return (to_datetime(year, format='%Y') +
to_timedelta(days, unit='d'))
else:
index = getattr(year, 'index', None)
value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
for y, d in zip(year, days)]
return Series(value, index=index)
def convert_delta_safe(base, deltas, unit):
"""
Convert base dates and deltas to datetimes, using pandas vectorized
versions if the deltas satisfy restrictions required to be expressed
as dates in pandas.
"""
index = getattr(deltas, 'index', None)
if unit == 'd':
if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
values = [base + relativedelta(days=int(d)) for d in deltas]
return Series(values, index=index)
elif unit == 'ms':
if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
values = [base + relativedelta(microseconds=(int(d) * 1000))
for d in deltas]
return Series(values, index=index)
else:
raise ValueError('format not understood')
base = to_datetime(base)
deltas = to_timedelta(deltas, unit=unit)
return base + deltas
# TODO: If/when pandas supports more than datetime64[ns], this should be
# improved to use correct range, e.g. datetime[Y] for yearly
bad_locs = np.isnan(dates)
has_bad_values = False
if bad_locs.any():
has_bad_values = True
data_col = Series(dates)
data_col[bad_locs] = 1.0 # Replace with NaT
dates = dates.astype(np.int64)
if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
base = stata_epoch
ms = dates
conv_dates = convert_delta_safe(base, ms, 'ms')
elif fmt.startswith(("%tC", "tC")):
warnings.warn("Encountered %tC format. Leaving in Stata "
"Internal Format.")
conv_dates = Series(dates, dtype=np.object)
if has_bad_values:
conv_dates[bad_locs] = NaT
return conv_dates
# Delta days relative to base
elif fmt.startswith(("%td", "td", "%d", "d")):
base = stata_epoch
days = dates
conv_dates = convert_delta_safe(base, days, 'd')
# does not count leap days - 7 days is a week.
# 52nd week may have more than 7 days
elif fmt.startswith(("%tw", "tw")):
year = stata_epoch.year + dates // 52
days = (dates % 52) * 7
conv_dates = convert_year_days_safe(year, days)
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
year = stata_epoch.year + dates // 12
month = (dates % 12) + 1
Loading ...