# -*- coding: utf-8 -*-
from __future__ import print_function
from collections import defaultdict
from datetime import datetime
from decimal import Decimal
import numpy as np
import pytest
from pandas.compat import (
OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip)
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import (
DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range,
read_csv)
import pandas.core.common as com
import pandas.util.testing as tm
from pandas.util.testing import (
assert_almost_equal, assert_frame_equal, assert_series_equal)
def test_repr():
# GH18203
result = repr(pd.Grouper(key='A', level='B'))
expected = "Grouper(key='A', level='B', axis=0, sort=False)"
assert result == expected
@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
def test_basic(dtype):
data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
index = np.arange(9)
np.random.shuffle(index)
data = data.reindex(index)
grouped = data.groupby(lambda x: x // 3)
for k, v in grouped:
assert len(v) == 3
agged = grouped.aggregate(np.mean)
assert agged[1] == 1
assert_series_equal(agged, grouped.agg(np.mean)) # shorthand
assert_series_equal(agged, grouped.mean())
assert_series_equal(grouped.agg(np.sum), grouped.sum())
expected = grouped.apply(lambda x: x * x.sum())
transformed = grouped.transform(lambda x: x * x.sum())
assert transformed[7] == 12
assert_series_equal(transformed, expected)
value_grouped = data.groupby(data)
assert_series_equal(value_grouped.aggregate(np.mean), agged,
check_index_type=False)
# complex agg
agged = grouped.aggregate([np.mean, np.std])
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
agged = grouped.aggregate({'one': np.mean, 'two': np.std})
group_constants = {0: 10, 1: 20, 2: 30}
agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
assert agged[1] == 21
# corner cases
msg = "Must produce aggregated value"
# exception raised is type Exception
with pytest.raises(Exception, match=msg):
grouped.aggregate(lambda x: x * 2)
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
key = mframe.index.codes[0]
grouped = mframe.groupby(key)
result = grouped.sum()
expected = mframe.groupby(key.astype('O')).sum()
assert_frame_equal(result, expected)
# GH 3911, mixed frame non-conversion
df = df_mixed_floats.copy()
df['value'] = lrange(len(df))
def max_value(group):
return group.loc[group['value'].idxmax()]
applied = df.groupby('A').apply(max_value)
result = applied.get_dtype_counts().sort_values()
expected = Series({'float64': 2,
'int64': 1,
'object': 2}).sort_values()
assert_series_equal(result, expected)
def test_groupby_return_type():
# GH2893, return a reduced type
df1 = DataFrame(
[{"val1": 1, "val2": 20},
{"val1": 1, "val2": 19},
{"val1": 2, "val2": 27},
{"val1": 2, "val2": 12}
])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()
result = df1.groupby("val1", squeeze=True).apply(func)
assert isinstance(result, Series)
df2 = DataFrame(
[{"val1": 1, "val2": 20},
{"val1": 1, "val2": 19},
{"val1": 1, "val2": 27},
{"val1": 1, "val2": 12}
])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()
result = df2.groupby("val1", squeeze=True).apply(func)
assert isinstance(result, Series)
# GH3596, return a consistent type (regression in 0.11 from 0.10.1)
df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
result = df.groupby('X', squeeze=False).count()
assert isinstance(result, DataFrame)
# GH5592
# inconcistent return type
df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
'Pony', 'Pony'], B=Series(
np.arange(7), dtype='int64'), C=date_range(
'20130101', periods=7)))
def f(grp):
return grp.iloc[0]
expected = df.groupby('A').first()[['B']]
result = df.groupby('A').apply(f)[['B']]
assert_frame_equal(result, expected)
def f(grp):
if grp.name == 'Tiger':
return None
return grp.iloc[0]
result = df.groupby('A').apply(f)[['B']]
e = expected.copy()
e.loc['Tiger'] = np.nan
assert_frame_equal(result, e)
def f(grp):
if grp.name == 'Pony':
return None
return grp.iloc[0]
result = df.groupby('A').apply(f)[['B']]
e = expected.copy()
e.loc['Pony'] = np.nan
assert_frame_equal(result, e)
# 5592 revisited, with datetimes
def f(grp):
if grp.name == 'Pony':
return None
return grp.iloc[0]
result = df.groupby('A').apply(f)[['C']]
e = df.groupby('A').first()[['C']]
e.loc['Pony'] = pd.NaT
assert_frame_equal(result, e)
# scalar outputs
def f(grp):
if grp.name == 'Pony':
return None
return grp.iloc[0].loc['C']
result = df.groupby('A').apply(f)
e = df.groupby('A').first()['C'].copy()
e.loc['Pony'] = np.nan
e.name = None
assert_series_equal(result, e)
def test_pass_args_kwargs(ts, tsframe):
def f(x, q=None, axis=0):
return np.percentile(x, q, axis=axis)
g = lambda x: np.percentile(x, 80, axis=0)
# Series
ts_grouped = ts.groupby(lambda x: x.month)
agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
agg_expected = ts_grouped.quantile(.8)
trans_expected = ts_grouped.transform(g)
assert_series_equal(apply_result, agg_expected)
assert_series_equal(agg_result, agg_expected, check_names=False)
assert_series_equal(trans_result, trans_expected)
agg_result = ts_grouped.agg(f, q=80)
apply_result = ts_grouped.apply(f, q=80)
trans_result = ts_grouped.transform(f, q=80)
assert_series_equal(agg_result, agg_expected)
assert_series_equal(apply_result, agg_expected)
assert_series_equal(trans_result, trans_expected)
# DataFrame
df_grouped = tsframe.groupby(lambda x: x.month)
agg_result = df_grouped.agg(np.percentile, 80, axis=0)
apply_result = df_grouped.apply(DataFrame.quantile, .8)
expected = df_grouped.quantile(.8)
assert_frame_equal(apply_result, expected)
assert_frame_equal(agg_result, expected, check_names=False)
agg_result = df_grouped.agg(f, q=80)
apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
assert_frame_equal(agg_result, expected, check_names=False)
assert_frame_equal(apply_result, expected)
def test_len():
df = tm.makeTimeDataFrame()
grouped = df.groupby([lambda x: x.year, lambda x: x.month,
lambda x: x.day])
assert len(grouped) == len(df)
grouped = df.groupby([lambda x: x.year, lambda x: x.month])
expected = len({(x.year, x.month) for x in df.index})
assert len(grouped) == expected
# issue 11016
df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
assert len(df.groupby(('a'))) == 0
assert len(df.groupby(('b'))) == 3
assert len(df.groupby(['a', 'b'])) == 3
def test_basic_regression():
# regression
T = [1.0 * x for x in lrange(1, 10) * 10][:1095]
result = Series(T, lrange(0, len(T)))
groupings = np.random.random((1100, ))
groupings = Series(groupings, lrange(0, len(groupings))) * 10.
grouped = result.groupby(groupings)
grouped.mean()
@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
'int32', 'int16', 'int8'])
def test_with_na_groups(dtype):
index = Index(np.arange(10))
values = Series(np.ones(10), index, dtype=dtype)
labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
'bar', 'bar', np.nan, 'foo'], index=index)
# this SHOULD be an int
grouped = values.groupby(labels)
agged = grouped.agg(len)
expected = Series([4, 2], index=['bar', 'foo'])
assert_series_equal(agged, expected, check_dtype=False)
# assert issubclass(agged.dtype.type, np.integer)
# explicitly return a float from my function
def f(x):
return float(len(x))
agged = grouped.agg(f)
expected = Series([4, 2], index=['bar', 'foo'])
assert_series_equal(agged, expected, check_dtype=False)
assert issubclass(agged.dtype.type, np.dtype(dtype).type)
def test_indices_concatenation_order():
# GH 2808
def f1(x):
y = x[(x.b % 2) == 1] ** 2
if y.empty:
multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
names=['b', 'c'])
res = DataFrame(None, columns=['a'], index=multiindex)
return res
else:
y = y.set_index(['b', 'c'])
return y
def f2(x):
y = x[(x.b % 2) == 1] ** 2
if y.empty:
return DataFrame()
else:
y = y.set_index(['b', 'c'])
return y
def f3(x):
y = x[(x.b % 2) == 1] ** 2
if y.empty:
multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
names=['foo', 'bar'])
res = DataFrame(None, columns=['a', 'b'], index=multiindex)
return res
else:
return y
df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
# correct result
result1 = df.groupby('a').apply(f1)
result2 = df2.groupby('a').apply(f1)
assert_frame_equal(result1, result2)
# should fail (not the same number of levels)
msg = "Cannot concat indices that do not have the same number of levels"
with pytest.raises(AssertionError, match=msg):
df.groupby('a').apply(f2)
with pytest.raises(AssertionError, match=msg):
df2.groupby('a').apply(f2)
# should fail (incorrect shape)
with pytest.raises(AssertionError, match=msg):
df.groupby('a').apply(f3)
with pytest.raises(AssertionError, match=msg):
Loading ...