Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
pandas / tests / io / generate_legacy_storage_files.py
Size: Mime:
#!/usr/env/bin python

"""
self-contained to write legacy storage (pickle/msgpack) files

To use this script. Create an environment where you want
generate pickles, say its for 0.18.1, with your pandas clone
in ~/pandas

. activate pandas_0.18.1
cd ~/

$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
    pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle

This script generates a storage file for the current arch, system,
and python version
  pandas version: 0.18.1
  output dir    : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/
  storage format: pickle
created pickle file: 0.18.1_x86_64_darwin_3.5.2.pickle

The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of pandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.

If we have cases where we changed the signature (e.g. we renamed
offset -> freq in Timestamp). Then we have to conditionally execute
in the generate_legacy_storage_files.py to make it
run under the older AND the newer version.

"""

from __future__ import print_function
from warnings import catch_warnings
from distutils.version import LooseVersion
from pandas import (Series, DataFrame, Panel,
                    SparseSeries, SparseDataFrame,
                    Index, MultiIndex, bdate_range, to_msgpack,
                    date_range, period_range, timedelta_range,
                    Timestamp, NaT, Categorical, Period)
from pandas.tseries.offsets import (
    DateOffset, Hour, Minute, Day,
    MonthBegin, MonthEnd, YearBegin,
    YearEnd, Week, WeekOfMonth, LastWeekOfMonth,
    BusinessDay, BusinessHour, CustomBusinessDay, FY5253,
    Easter,
    SemiMonthEnd, SemiMonthBegin,
    QuarterBegin, QuarterEnd)
from pandas.compat import u
import os
import sys
import numpy as np
import pandas
import platform as pl
from datetime import timedelta

_loose_version = LooseVersion(pandas.__version__)


def _create_sp_series():
    nan = np.nan

    # nan-based
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = nan
    arr[-1:] = nan

    bseries = SparseSeries(arr, kind='block')
    bseries.name = u'bseries'
    return bseries


def _create_sp_tsseries():
    nan = np.nan

    # nan-based
    arr = np.arange(15, dtype=np.float64)
    arr[7:12] = nan
    arr[-1:] = nan

    date_index = bdate_range('1/1/2011', periods=len(arr))
    bseries = SparseSeries(arr, index=date_index, kind='block')
    bseries.name = u'btsseries'
    return bseries


def _create_sp_frame():
    nan = np.nan

    data = {u'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
            u'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
            u'C': np.arange(10).astype(np.int64),
            u'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}

    dates = bdate_range('1/1/2011', periods=10)
    return SparseDataFrame(data, index=dates)


def create_data():
    """ create the pickle/msgpack data """

    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'),
                   period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo',
                     u'foo', u'qux', u'qux'],
                    [u'one', u'two', u'one', u'two', u'one',
                     u'two', u'one', u'two']])),
        names=[u'first', u'second']))

    series = dict(float=Series(data[u'A']),
                  int=Series(data[u'B']),
                  mixed=Series(data[u'E']),
                  ts=Series(np.arange(10).astype(np.int64),
                            index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(
                                tuple(zip(*[[1, 1, 2, 2, 2],
                                            [3, 4, 3, 4, 5]])),
                                names=[u'one', u'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=[u'A', u'B', u'C', u'D', u'A']),
                  cat=Series(Categorical([u'foo', u'bar', u'baz'])),
                  dt=Series(date_range('20130101', periods=5)),
                  dt_tz=Series(date_range('20130101', periods=5,
                                          tz='US/Eastern')),
                  period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")
    frame = dict(float=DataFrame({u'A': series[u'float'],
                                  u'B': series[u'float'] + 1}),
                 int=DataFrame({u'A': series[u'int'],
                                u'B': series[u'int'] + 1}),
                 mixed=DataFrame({k: data[k]
                                  for k in [u'A', u'B', u'C', u'D']}),
                 mi=DataFrame({u'A': np.arange(5).astype(np.float64),
                               u'B': np.arange(5).astype(np.int64)},
                              index=MultiIndex.from_tuples(
                                  tuple(zip(*[[u'bar', u'bar', u'baz',
                                               u'baz', u'baz'],
                                              [u'one', u'two', u'one',
                                               u'two', u'three']])),
                                  names=[u'first', u'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=[u'A', u'B', u'A']),
                 cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
                 cat_and_float=DataFrame({
                     u'A': Categorical([u'foo', u'bar', u'baz']),
                     u'B': np.arange(3).astype(np.int64)}),
                 mixed_dup=mixed_dup_df,
                 dt_mixed_tzs=DataFrame({
                     u'A': Timestamp('20130102', tz='US/Eastern'),
                     u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
                 dt_mixed2_tzs=DataFrame({
                     u'A': Timestamp('20130102', tz='US/Eastern'),
                     u'B': Timestamp('20130603', tz='CET'),
                     u'C': Timestamp('20130603', tz='UTC')}, index=range(5))
                 )

    with catch_warnings(record=True):
        mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
                                 u'ItemB': frame[u'int']})
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        panel = dict(float=Panel({u'ItemA': frame[u'float'],
                                  u'ItemB': frame[u'float'] + 1}),
                     dup=Panel(
                         np.arange(30).reshape(3, 5, 2).astype(np.float64),
                         items=[u'A', u'B', u'A']),
                     mixed_dup=mixed_dup_panel)

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    off = {'DateOffset': DateOffset(years=1),
           'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
           'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
           'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
           'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
           'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
           'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
           'MonthBegin': MonthBegin(1),
           'MonthEnd': MonthEnd(1),
           'QuarterBegin': QuarterBegin(1),
           'QuarterEnd': QuarterEnd(1),
           'Day': Day(1),
           'YearBegin': YearBegin(1),
           'YearEnd': YearEnd(1),
           'Week': Week(1),
           'Week_Tues': Week(2, normalize=False, weekday=1),
           'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
           'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
           'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
           'Easter': Easter(),
           'Hour': Hour(1),
           'Minute': Minute(1)}

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)


def create_pickle_data():
    data = create_data()

    # Pre-0.14.1 versions generated non-unpicklable mixed-type frames and
    # panels if their columns/items were non-unique.
    if _loose_version < LooseVersion('0.14.1'):
        del data['frame']['mixed_dup']
        del data['panel']['mixed_dup']
    if _loose_version < LooseVersion('0.17.0'):
        del data['series']['period']
        del data['scalars']['period']
    return data


def _u(x):
    return {u(k): _u(x[k]) for k in x} if isinstance(x, dict) else x


def create_msgpack_data():
    data = create_data()
    if _loose_version < LooseVersion('0.17.0'):
        del data['frame']['mixed_dup']
        del data['panel']['mixed_dup']
        del data['frame']['dup']
        del data['panel']['dup']
    if _loose_version < LooseVersion('0.18.0'):
        del data['series']['dt_tz']
        del data['frame']['dt_mixed_tzs']
    # Not supported
    del data['sp_series']
    del data['sp_frame']
    del data['series']['cat']
    del data['series']['period']
    del data['frame']['cat_onecol']
    del data['frame']['cat_and_float']
    del data['scalars']['period']
    if _loose_version < LooseVersion('0.23.0'):
        del data['index']['interval']
    del data['offsets']
    return _u(data)


def platform_name():
    return '_'.join([str(pandas.__version__), str(pl.machine()),
                     str(pl.system().lower()), str(pl.python_version())])


def write_legacy_pickles(output_dir):

    # make sure we are < 0.13 compat (in py3)
    try:
        from pandas.compat import zip, cPickle as pickle  # noqa
    except:
        import pickle

    version = pandas.__version__

    print("This script generates a storage file for the current arch, system, "
          "and python version")
    print("  pandas version: {0}".format(version))
    print("  output dir    : {0}".format(output_dir))
    print("  storage format: pickle")

    pth = '{0}.pickle'.format(platform_name())

    fh = open(os.path.join(output_dir, pth), 'wb')
    pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL)
    fh.close()

    print("created pickle file: %s" % pth)


def write_legacy_msgpack(output_dir, compress):

    version = pandas.__version__

    print("This script generates a storage file for the current arch, "
          "system, and python version")
    print("  pandas version: {0}".format(version))
    print("  output dir    : {0}".format(output_dir))
    print("  storage format: msgpack")
    pth = '{0}.msgpack'.format(platform_name())
    to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(),
               compress=compress)

    print("created msgpack file: %s" % pth)


def write_legacy_file():
    # force our cwd to be the first searched
    sys.path.insert(0, '.')

    if not (3 <= len(sys.argv) <= 4):
        exit("Specify output directory and storage type: generate_legacy_"
             "storage_files.py <output_dir> <storage_type> "
             "<msgpack_compress_type>")

    output_dir = str(sys.argv[1])
    storage_type = str(sys.argv[2])
    try:
        compress_type = str(sys.argv[3])
    except IndexError:
        compress_type = None

    if storage_type == 'pickle':
        write_legacy_pickles(output_dir=output_dir)
    elif storage_type == 'msgpack':
        write_legacy_msgpack(output_dir=output_dir, compress=compress_type)
    else:
        exit("storage_type must be one of {'pickle', 'msgpack'}")


if __name__ == '__main__':
    write_legacy_file()