Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

Version: 0.24.2 

/ tests / groupby / test_timegrouper.py

""" test with the TimeGrouper / grouping with datetimes """

from datetime import datetime

import numpy as np
from numpy import nan
import pytest
import pytz

from pandas.compat import StringIO

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
from pandas.core.groupby.ops import BinGrouper
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal


class TestGroupBy(object):

    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame({
            'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ]
        })

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by='Quantity')

        for df in [df_original, df_reordered]:
            df = df.set_index(['Date'])

            expected = DataFrame(
                {'Quantity': 0},
                index=date_range('20130901',
                                 '20131205', freq='5D',
                                 name='Date', closed='left'))
            expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64')

            result1 = df.resample('5D') .sum()
            assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result2, expected)

            result3 = df.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result3, expected)

    @pytest.mark.parametrize("should_sort", [True, False])
    def test_groupby_with_timegrouper_methods(self, should_sort):
        # GH 3881
        # make sure API of timegrouper conforms

        df = pd.DataFrame({
            'Branch': 'A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 8, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        })

        if should_sort:
            df = df.sort_values(by='Quantity', ascending=False)

        df = df.set_index('Date', drop=False)
        g = df.groupby(pd.Grouper(freq='6M'))
        assert g.group_keys

        assert isinstance(g.grouper, BinGrouper)
        groups = g.groups
        assert isinstance(groups, dict)
        assert len(groups) == 3

    def test_timegrouper_with_reg_groups(self):
        """Combine a time-based ``pd.Grouper`` with a regular column grouper.

        Sums ``Quantity`` over annual, 6-month-start, daily and monthly bins
        together with ``Buyer``, addressing the datetime data implicitly
        (the index), by ``key`` (a column) and by ``level`` (name or
        position), and finally with the time grouper on its own.  Every
        scenario runs on the frame as built and on a copy sorted by
        ``Quantity``, so results must not depend on row order.
        """

        # GH 3794
        # allow combination of timegrouper/reg groups

        # first data set: purchases spread over January, October and
        # December 2013
        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        for df in [df_original, df_sorted]:
            # annual bins: every row lands in the bin labelled 2013-12-31
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            # 6-month bins labelled by their start: 2013-01-01 and 2013-07-01
            expected = DataFrame({
                'Buyer': 'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        # second data set: same buyers and quantities, but every purchase
        # falls on 2013-10-01 or 2013-10-02
        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:

            # daily bins: one group per (day, buyer) pair that has rows
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            # monthly bins: everything lands in the bin labelled 2013-10-31.
            # This `expected` is reused by the key= and level= checks below.
            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name
            # ('Date' is moved back to a regular column, so the Grouper is
            # pointed at it with key=)
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)

            # a key that names no column must raise
            with pytest.raises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

            # passing the level
            # ('Date' becomes the index again; select it by name, then by
            # position)
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum(
            )
            assert_frame_equal(result, expected)

            # a level name that does not exist must raise
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'),
                            'Buyer']).sum()

            # multi names
            # ('Date' is now both the index name and a column; the column is
            # the index shifted by two month ends, so key='Date' is expected
            # to bin on the column and yield 2013-11-30)
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', key='Date',
                                       level='Date'), 'Buyer']).sum()

            # single groupers
            # (no key: the October index is binned, giving 2013-10-31; the
            # Grouper is passed both bare and wrapped in a list)
            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 10, 31, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            # (key='Date': the shifted column is binned, giving 2013-11-30)
            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 11, 30, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)

    @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR'])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        expected = (
            df.groupby('user_id')['whole_cost']
              .resample(freq)
              .sum(min_count=1)  # XXX
              .dropna()
              .reorder_levels(['date', 'user_id'])
              .sort_index()
              .astype('int64')
        )
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)

    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0), ]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping
        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')

        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

    def test_timegrouper_apply_return_type_series(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
                           'value': [10, 13]})
        df_dt = df.copy()
        df_dt['date'] = pd.to_datetime(df_dt['date'])
Loading ...