""" test with the TimeGrouper / grouping with datetimes """
from datetime import datetime
import numpy as np
from numpy import nan
import pytest
import pytz
from pandas.compat import StringIO
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
from pandas.core.groupby.ops import BinGrouper
from pandas.util import testing as tm
from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestGroupBy(object):
def test_groupby_with_timegrouper(self):
# GH 4161
# TimeGrouper requires a sorted index
# also verifies that the resultant index has the correct name
df_original = DataFrame({
'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
'Quantity': [18, 3, 5, 1, 9, 3],
'Date': [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
]
})
# GH 6908 change target column's order
df_reordered = df_original.sort_values(by='Quantity')
for df in [df_original, df_reordered]:
df = df.set_index(['Date'])
expected = DataFrame(
{'Quantity': 0},
index=date_range('20130901',
'20131205', freq='5D',
name='Date', closed='left'))
expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64')
result1 = df.resample('5D') .sum()
assert_frame_equal(result1, expected)
df_sorted = df.sort_index()
result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum()
assert_frame_equal(result2, expected)
result3 = df.groupby(pd.Grouper(freq='5D')).sum()
assert_frame_equal(result3, expected)
@pytest.mark.parametrize("should_sort", [True, False])
def test_groupby_with_timegrouper_methods(self, should_sort):
    """Inspect the groupby object produced by a ``'6M'`` ``pd.Grouper``.

    The frame is indexed by ``Date`` (the column is kept as well) and is
    grouped either in its given row order or, when ``should_sort`` is
    true, after sorting by descending ``Quantity``.  The resulting
    groupby object must have truthy ``group_keys``, a ``BinGrouper`` as
    its ``grouper`` and a ``groups`` dict with three entries.
    """
    # GH 3881
    # make sure API of timegrouper conforms
    sale_times = [
        (2013, 1, 1, 13, 0),
        (2013, 1, 1, 13, 5),
        (2013, 10, 1, 20, 0),
        (2013, 10, 2, 10, 0),
        (2013, 12, 2, 12, 0),
        (2013, 12, 2, 14, 0),
    ]
    sales = pd.DataFrame({
        'Branch': 'A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 8, 9, 3],
        'Date': [datetime(*parts) for parts in sale_times],
    })

    ordered = (sales.sort_values(by='Quantity', ascending=False)
               if should_sort else sales)
    indexed = ordered.set_index('Date', drop=False)

    grouped = indexed.groupby(pd.Grouper(freq='6M'))
    assert grouped.group_keys
    assert isinstance(grouped.grouper, BinGrouper)

    group_map = grouped.groups
    assert isinstance(group_map, dict)
    assert len(group_map) == 3
def test_timegrouper_with_reg_groups(self):
# Purpose: a frequency-based pd.Grouper can be combined with a regular
# column key ('Buyer') in a single groupby; every result is compared with a
# hand-built expected frame.
# NOTE(review): leading indentation is missing in this view of the file, so
# the extent of each `for` / `with` body below cannot be read off the text -
# confirm against the properly indented source before editing.
# GH 3794
# allow combination of timegrouper/reg groups
# First fixture: purchases spread over January, October and December 2013.
df_original = DataFrame({
'Branch': 'A A A A A A A B'.split(),
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
'Date': [
datetime(2013, 1, 1, 13, 0),
datetime(2013, 1, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 12, 2, 14, 0),
]
}).set_index('Date')
# Same rows ordered by descending Quantity, so the Date index is not sorted.
df_sorted = df_original.sort_values(by='Quantity', ascending=False)
for df in [df_original, df_sorted]:
# freq='A': every row is expected under the single label 2013-12-31.
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10, 18, 3],
'Date': [
datetime(2013, 12, 31, 0, 0),
datetime(2013, 12, 31, 0, 0),
datetime(2013, 12, 31, 0, 0),
]
}).set_index(['Date', 'Buyer'])
result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
assert_frame_equal(result, expected)
# freq='6MS': expected labels are 2013-01-01 and 2013-07-01.
expected = DataFrame({
'Buyer': 'Carl Mark Carl Joe'.split(),
'Quantity': [1, 3, 9, 18],
'Date': [
datetime(2013, 1, 1, 0, 0),
datetime(2013, 1, 1, 0, 0),
datetime(2013, 7, 1, 0, 0),
datetime(2013, 7, 1, 0, 0),
]
}).set_index(['Date', 'Buyer'])
result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
assert_frame_equal(result, expected)
# Second fixture: same buyers and quantities, but every timestamp falls on
# 2013-10-01 or 2013-10-02.
df_original = DataFrame({
'Branch': 'A A A A A A A B'.split(),
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
'Date': [
datetime(2013, 10, 1, 13, 0),
datetime(2013, 10, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 2, 12, 0),
datetime(2013, 10, 2, 14, 0),
]
}).set_index('Date')
df_sorted = df_original.sort_values(by='Quantity', ascending=False)
for df in [df_original, df_sorted]:
# freq='1D': one expected label per calendar day.
expected = DataFrame({
'Buyer': 'Carl Joe Mark Carl Joe'.split(),
'Quantity': [6, 8, 3, 4, 10],
'Date': [
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 2, 0, 0),
datetime(2013, 10, 2, 0, 0),
]
}).set_index(['Date', 'Buyer'])
result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
assert_frame_equal(result, expected)
# freq='1M': everything is expected under the label 2013-10-31.  This
# `expected` is reused by the key= and level= checks that follow.
result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10, 18, 3],
'Date': [
datetime(2013, 10, 31, 0, 0),
datetime(2013, 10, 31, 0, 0),
datetime(2013, 10, 31, 0, 0),
]
}).set_index(['Date', 'Buyer'])
assert_frame_equal(result, expected)
# passing the name: after reset_index() 'Date' is a column, selected
# through the Grouper's key= argument
df = df.reset_index()
result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
]).sum()
assert_frame_equal(result, expected)
# a key that names no column ('foo') is expected to raise KeyError
with pytest.raises(KeyError):
df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()
# passing the level: 'Date' is the index again, selected through level=
# by name and then by position
df = df.set_index('Date')
result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer'
]).sum()
assert_frame_equal(result, expected)
result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum(
)
assert_frame_equal(result, expected)
# a level name that does not exist ('foo') is expected to raise ValueError
with pytest.raises(ValueError):
df.groupby([pd.Grouper(freq='1M', level='foo'),
'Buyer']).sum()
# multi names: 'Date' now names both the index and a column; the column
# holds the index values plus MonthEnd(2), and grouping with key='Date'
# is expected to produce the label 2013-11-30
df = df.copy()
df['Date'] = df.index + pd.offsets.MonthEnd(2)
result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
]).sum()
expected = DataFrame({
'Buyer': 'Carl Joe Mark'.split(),
'Quantity': [10, 18, 3],
'Date': [
datetime(2013, 11, 30, 0, 0),
datetime(2013, 11, 30, 0, 0),
datetime(2013, 11, 30, 0, 0),
]
}).set_index(['Date', 'Buyer'])
assert_frame_equal(result, expected)
# error as we have both a level and a name!
with pytest.raises(ValueError):
df.groupby([pd.Grouper(freq='1M', key='Date',
level='Date'), 'Buyer']).sum()
# single groupers: the Grouper passed alone and as a one-element list must
# give the same result.  Without key= the expected label is 2013-10-31.
expected = DataFrame({'Quantity': [31],
'Date': [datetime(2013, 10, 31, 0, 0)
]}).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M')).sum()
assert_frame_equal(result, expected)
result = df.groupby([pd.Grouper(freq='1M')]).sum()
assert_frame_equal(result, expected)
# With key='Date' the expected label is 2013-11-30.
expected = DataFrame({'Quantity': [31],
'Date': [datetime(2013, 11, 30, 0, 0)
]}).set_index('Date')
result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
assert_frame_equal(result, expected)
result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
assert_frame_equal(result, expected)
@pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR'])
def test_timegrouper_with_reg_groups_freq(self, freq):
    """Grouping by ``[pd.Grouper(freq), 'user_id']`` matches resampling.

    The expected series is derived independently: group by ``user_id``,
    resample ``whole_cost`` at ``freq``, sum, drop missing values, put
    the levels in ``(date, user_id)`` order, sort and cast to ``int64``.
    The combined groupby is checked on both a date-sorted frame and the
    frame as given.
    """
    # GH 6764 multiple grouping with/without sort
    day_stamps = [
        '20121002', '20121007', '20130130', '20130202', '20130305',
        '20121002', '20121207', '20130130', '20130202', '20130305',
        '20130202', '20130305'
    ]
    costs = DataFrame({
        'date': pd.to_datetime(day_stamps),
        'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
        'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                       359, 801],
        'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
    }).set_index('date')

    per_user_bins = costs.groupby('user_id')['whole_cost'].resample(freq)
    # min_count=1 makes bins without any value come out as missing rather
    # than 0, so the dropna() that follows removes them.
    bin_totals = per_user_bins.sum(min_count=1).dropna()
    by_date_then_user = bin_totals.reorder_levels(['date', 'user_id'])
    expected = by_date_then_user.sort_index().astype('int64')
    expected.name = 'whole_cost'

    date_sorted = costs.sort_index()
    from_sorted = date_sorted.groupby(
        [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
    assert_series_equal(from_sorted, expected)

    from_unsorted = costs.groupby(
        [pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum()
    assert_series_equal(from_unsorted, expected)
def test_timegrouper_get_group(self):
# GH 6914
df_original = DataFrame({
'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
'Quantity': [18, 3, 5, 1, 9, 3],
'Date': [datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0), ]
})
df_reordered = df_original.sort_values(by='Quantity')
# single grouping
expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
df_original.iloc[[4]]]
dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']
for df in [df_original, df_reordered]:
grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
for t, expected in zip(dt_list, expected_list):
dt = pd.Timestamp(t)
result = grouped.get_group(dt)
assert_frame_equal(result, expected)
# multiple grouping
expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
df_original.iloc[[4]]]
g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
('Joe', '2013-12-31')]
for df in [df_original, df_reordered]:
grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
for (b, t), expected in zip(g_list, expected_list):
dt = pd.Timestamp(t)
result = grouped.get_group((b, dt))
assert_frame_equal(result, expected)
# with index
df_original = df_original.set_index('Date')
df_reordered = df_original.sort_values(by='Quantity')
expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
df_original.iloc[[4]]]
for df in [df_original, df_reordered]:
grouped = df.groupby(pd.Grouper(freq='M'))
for t, expected in zip(dt_list, expected_list):
dt = pd.Timestamp(t)
result = grouped.get_group(dt)
assert_frame_equal(result, expected)
def test_timegrouper_apply_return_type_series(self):
# Using `apply` with the `TimeGrouper` should give the
# same return type as an `apply` with a `Grouper`.
# Issue #11742
df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
'value': [10, 13]})
df_dt = df.copy()
df_dt['date'] = pd.to_datetime(df_dt['date'])
Loading ...