# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime
import numpy as np
import pytest
from pandas.compat import PY37
import pandas as pd
from pandas import (
Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut)
import pandas.util.testing as tm
from pandas.util.testing import (
assert_equal, assert_frame_equal, assert_series_equal)
def cartesian_product_for_groupers(result, args, names):
    """ Reindex ``result`` onto the cartesian product of the groupers.

    Every Categorical / CategoricalIndex in ``args`` is swapped for a
    Categorical that lists each of its categories exactly once (keeping
    its ``ordered`` flag), so categorical groupers stay categorical in
    the new index. Any other grouper is used unchanged. The reindexed
    object is returned sorted by its index. """

    def expand(grouper):
        # non-categorical groupers go into the product as they are
        if not isinstance(grouper, (CategoricalIndex, Categorical)):
            return grouper
        all_categories = grouper.categories
        return Categorical.from_codes(
            np.arange(len(all_categories)),
            categories=all_categories,
            ordered=grouper.ordered)

    full_index = pd.MultiIndex.from_product(
        [expand(grouper) for grouper in args], names=names)
    return result.reindex(full_index).sort_index()
def test_apply_use_categorical_name(df):
    """The first index level of the apply result is named after the grouper.

    ``df`` is not defined in this file (presumably a pytest fixture --
    confirm against conftest); columns ``C`` and ``D`` are read from it.
    """
    # bin column C into four quantile buckets and group by those buckets
    quartile_bins = qcut(df.C, 4)

    def get_stats(group):
        summary = {'min': group.min(),
                   'max': group.max(),
                   'count': group.count(),
                   'mean': group.mean()}
        return summary

    grouped_d = df.groupby(quartile_bins, observed=False).D
    result = grouped_d.apply(get_stats)

    leading_name = result.index.names[0]
    assert leading_name == 'C'
def test_basic():
    """Check groupby results when the grouping key is categorical.

    Every groupby here passes ``observed=False``. In order, the sections
    cover: mean/sum over categories that have no rows, transform and
    apply on a frame holding a categorical column (GH 8623),
    transform/filter keyed by ``pd.cut`` bins (GH 9921), ``apply(len)``
    over labelled bins (GH 9603), and mean/describe on random data,
    including the index obtained by stacking the describe output
    (GH 10460).
    """

    def grouped(obj, key):
        # build a fresh groupby object for every individual check
        return obj.groupby(key, observed=False)

    # category 'd' has no rows; its mean is expected to be NaN
    b_values = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c", "d"], ordered=True)
    frame = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": b_values})
    b_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    want = DataFrame({'a': [1, 2, 4, np.nan]}, index=b_index)
    got = grouped(frame, "b").mean()
    tm.assert_frame_equal(got, want)

    first = Categorical(["a", "a", "b", "b"],
                        categories=["a", "b", "z"], ordered=True)
    second = Categorical(["c", "d", "c", "d"],
                         categories=["c", "d", "y"], ordered=True)
    frame = DataFrame({"A": first, "B": second, "values": [1, 2, 3, 4]})

    # single grouper: category 'z' has no rows and is expected to sum to 0
    by_a = grouped(frame, "A")
    a_index = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    want = DataFrame({'values': Series([3, 7, 0], index=a_index)})
    got = by_a.sum()
    tm.assert_frame_equal(got, want)

    # GH 8623
    people = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                        [1, 'John P. Doe']],
                       columns=['person_id', 'person_name'])
    people['person_name'] = Categorical(people.person_name)
    by_person = grouped(people, ['person_id'])

    got = by_person.transform(lambda grp: grp)
    tm.assert_frame_equal(got, people[['person_name']])

    got = people.drop_duplicates('person_name')
    want = people.iloc[[0, 1]]
    tm.assert_frame_equal(got, want)

    def first_unique_person(grp):
        return grp.drop_duplicates('person_name').iloc[0]

    got = by_person.apply(first_unique_person)
    want = people.iloc[[0, 1]].copy()
    want.index = Index([1, 2], name='person_id')
    # the expected frame carries person_name as plain object dtype
    want['person_name'] = want['person_name'].astype('object')
    tm.assert_frame_equal(got, want)

    # GH 9921
    # Monotonic
    frame = DataFrame({"a": [5, 15, 25]})
    bins = pd.cut(frame.a, bins=[0, 10, 20, 30, 40])

    got = grouped(frame.a, bins).transform(sum)
    tm.assert_series_equal(got, frame['a'])

    got = grouped(frame.a, bins).transform(lambda xs: np.sum(xs))
    tm.assert_series_equal(got, frame['a'])

    got = grouped(frame, bins).transform(sum)
    tm.assert_frame_equal(got, frame[['a']])

    got = grouped(frame, bins).transform(lambda xs: np.max(xs))
    tm.assert_frame_equal(got, frame[['a']])

    # Filter
    got = grouped(frame.a, bins).filter(np.all)
    tm.assert_series_equal(got, frame['a'])

    got = grouped(frame, bins).filter(np.all)
    tm.assert_frame_equal(got, frame)

    # Non-monotonic
    frame = DataFrame({"a": [5, 15, 25, -5]})
    bins = pd.cut(frame.a, bins=[-10, 0, 10, 20, 30, 40])

    got = grouped(frame.a, bins).transform(sum)
    tm.assert_series_equal(got, frame['a'])

    got = grouped(frame.a, bins).transform(lambda xs: np.sum(xs))
    tm.assert_series_equal(got, frame['a'])

    got = grouped(frame, bins).transform(sum)
    tm.assert_frame_equal(got, frame[['a']])

    got = grouped(frame, bins).transform(lambda xs: np.sum(xs))
    tm.assert_frame_equal(got, frame[['a']])

    # GH 9603
    frame = DataFrame({'a': [1, 0, 0, 0]})
    bins = pd.cut(frame.a, [0, 1, 2, 3, 4],
                  labels=Categorical(list('abcd')))
    got = grouped(frame, bins).apply(len)

    bin_index = CategoricalIndex(
        bins.values.categories, ordered=bins.values.ordered)
    want = Series([1, 0, 0, 0], index=bin_index)
    want.index.name = 'a'
    tm.assert_series_equal(got, want)

    # more basic
    # NOTE: the two np.random draws below must stay in this order
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)
    labels = Categorical.from_codes(codes, levels, ordered=True)
    values = DataFrame(np.random.randn(100, 4))

    got = grouped(values, labels).mean()

    # same means keyed by the plain label array, then put in category order
    want = grouped(values, np.asarray(labels)).mean()
    level_index = CategoricalIndex(levels, categories=labels.categories,
                                   ordered=True)
    want = want.reindex(level_index)
    assert_frame_equal(got, want)

    described = grouped(values, labels).describe()

    # reorder rows by category code and describe again without sorting
    order = labels.codes.argsort()
    sorted_labels = np.asarray(labels).take(order)
    sorted_values = values.take(order)
    sorted_cats = Categorical(sorted_labels, ordered=True,
                              categories=['foo', 'bar', 'baz', 'qux'])
    want = sorted_values.groupby(
        sorted_cats, sort=False, observed=False).describe()
    assert_frame_equal(described, want)

    # GH 10460
    stacked_index = described.stack().index

    outer_cats = Categorical.from_codes(np.arange(4).repeat(8),
                                        levels, ordered=True)
    want_outer = CategoricalIndex(outer_cats)
    tm.assert_index_equal(stacked_index.get_level_values(0), want_outer)

    want_inner = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                        '75%', 'max'] * 4)
    tm.assert_index_equal(stacked_index.get_level_values(1), want_inner)
def test_level_get_group(observed):
    """``get_group`` on a groupby over a categorical index level (GH15155).

    ``observed`` is forwarded to ``groupby``; it is not defined in this
    file (presumably a parametrized pytest fixture -- confirm).
    """
    # GH15155
    full_index = MultiIndex(
        levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
        codes=[[0] * 5 + [1] * 5, range(10)],
        names=["Index1", "Index2"])
    frame = DataFrame(data=np.arange(2, 22, 2), index=full_index)
    grouped = frame.groupby(level=["Index1"], observed=observed)

    # GH15166
    # the expected frame holds the five rows stored under "a"; per the
    # original note it should equal frame.loc[["a"]]
    a_index = pd.MultiIndex(
        levels=[pd.CategoricalIndex(["a", "b"]), range(5)],
        codes=[[0] * 5, range(5)],
        names=["Index1", "Index2"])
    want = DataFrame(data=np.arange(2, 12, 2), index=a_index)

    got = grouped.get_group('a')
    assert_frame_equal(got, want)
@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636", strict=False)
@pytest.mark.parametrize('ordered', [True, False])
def test_apply(ordered):
    """apply / mean / agg over two categorical groupers with observed=True.

    Runs once per ``ordered`` value; the flag is passed to both
    Categoricals used as grouping keys (GH 10138).
    """
    # GH 10138

    dense = Categorical(list('abc'), ordered=ordered)

    # 'b' is in the categories but not in the list
    missing = Categorical(
        list('aaa'), categories=['a', 'b'], ordered=ordered)

    frame = DataFrame({'missing': missing,
                       'dense': dense,
                       'values': np.arange(len(dense))})
    grouped = frame.groupby(['missing', 'dense'], observed=True)

    def grouper_index():
        # index built straight from the two categoricals, so the unused
        # category 'b' is still part of the 'missing' level
        return MultiIndex.from_arrays(
            [missing, dense], names=['missing', 'dense'])

    # missing category 'b' should still exist in the output index
    want = DataFrame([0, 1, 2.],
                     index=grouper_index(),
                     columns=['values'])
    got = grouped.apply(lambda x: np.mean(x))
    assert_frame_equal(got, want)

    # we coerce back to ints
    want = want.astype('int')
    for got in (grouped.mean(), grouped.agg(np.mean)):
        assert_frame_equal(got, want)

    # apply with a function returning a scalar: the result is expected
    # to be a Series on the same (missing, dense) index
    want = Series(1, index=grouper_index())
    got = grouped.apply(lambda x: 1)
    assert_series_equal(got, want)
def test_observed(observed):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
# gh-10132 (back-compat)
# gh-8138 (back-compat)
# gh-8869
cat1 = Categorical(["a", "a", "b", "b"],
categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"],
categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
df['C'] = ['foo', 'bar'] * 2
# multiple groupers with a non-cat
gb = df.groupby(['A', 'B', 'C'], observed=observed)
exp_index = pd.MultiIndex.from_arrays(
[cat1, cat2, ['foo', 'bar'] * 2],
names=['A', 'B', 'C'])
expected = DataFrame({'values': Series(
[1, 2, 3, 4], index=exp_index)}).sort_index()
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected,
[cat1, cat2, ['foo', 'bar']],
list('ABC'))
tm.assert_frame_equal(result, expected)
gb = df.groupby(['A', 'B'], observed=observed)
exp_index = pd.MultiIndex.from_arrays(
[cat1, cat2],
names=['A', 'B'])
expected = DataFrame({'values': [1, 2, 3, 4]},
index=exp_index)
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected,
[cat1, cat2],
list('AB'))
tm.assert_frame_equal(result, expected)
# https://github.com/pandas-dev/pandas/issues/8138
d = {'cat':
pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
ordered=True),
'ints': [1, 1, 2, 2],
'val': [10, 20, 30, 40]}
df = pd.DataFrame(d)
# Grouping on a single column
groups_single_key = df.groupby("cat", observed=observed)
result = groups_single_key.mean()
exp_index = pd.CategoricalIndex(list('ab'), name="cat",
categories=list('abc'),
ordered=True)
expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
index=exp_index)
if not observed:
index = pd.CategoricalIndex(list('abc'), name="cat",
categories=list('abc'),
ordered=True)
expected = expected.reindex(index)
tm.assert_frame_equal(result, expected)
# Grouping on two columns
groups_double_key = df.groupby(["cat", "ints"], observed=observed)
result = groups_double_key.agg('mean')
expected = DataFrame(
{"val": [10, 30, 20, 40],
"cat": pd.Categorical(['a', 'a', 'b', 'b'],
categories=['a', 'b', 'c'],
ordered=True),
"ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
if not observed:
expected = cartesian_product_for_groupers(
expected,
[df.cat.values, [1, 2]],
['cat', 'ints'])
tm.assert_frame_equal(result, expected)
# GH 10132
for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
c, i = key
result = groups_double_key.get_group(key)
Loading ...