Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

Version: 0.24.2 

/ tests / groupby / test_categorical.py

# -*- coding: utf-8 -*-
from __future__ import print_function

from datetime import datetime

import numpy as np
import pytest

from pandas.compat import PY37

import pandas as pd
from pandas import (
    Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut)
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_equal, assert_frame_equal, assert_series_equal)


def cartesian_product_for_groupers(result, args, names):
    """Reindex ``result`` onto the cartesian product of the groupers.

    Categorical groupers (``Categorical`` / ``CategoricalIndex``) are
    replaced by a Categorical listing each of their categories exactly
    once, with the same ``ordered`` flag; every other grouper is used as
    given.  The reindexed result is returned sorted by index.
    """

    def _one_value_per_category(grouper):
        # Non-categorical groupers pass through untouched.
        if not isinstance(grouper, (CategoricalIndex, Categorical)):
            return grouper
        cats = grouper.categories
        # Codes 0..n-1 select each category once, in category order.
        return Categorical.from_codes(np.arange(len(cats)),
                                      categories=cats,
                                      ordered=grouper.ordered)

    expanded = [_one_value_per_category(grouper) for grouper in args]
    full_index = pd.MultiIndex.from_product(expanded, names=names)
    reindexed = result.reindex(full_index)
    return reindexed.sort_index()


def test_apply_use_categorical_name(df):
    """The name of the categorical grouper labels the first index level."""
    quartiles = qcut(df.C, 4)

    def summarize(group):
        # One entry per statistic, each computed by the same-named method.
        return {stat: getattr(group, stat)()
                for stat in ('min', 'max', 'count', 'mean')}

    grouped = df.groupby(quartiles, observed=False)
    result = grouped.D.apply(summarize)

    first_level_name = result.index.names[0]
    assert first_level_name == 'C'


def test_basic():
    """Exercise groupby on categorical keys with ``observed=False``.

    Covers, in order: mean/sum aggregation with declared-but-unused
    categories, transform/apply on a frame holding a categorical column
    (GH 8623), transform and filter on ``pd.cut`` bins (GH 9921), apply
    with categorical bin labels (GH 9603), and mean/describe on randomly
    generated codes (including GH 10460).
    """

    # 'd' is a declared category that never occurs in the data
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # all four categories are expected in the result; 'd' has a NaN mean
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    # 'z' and 'y' are declared categories that never occur in the data
    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    # the unused category 'z' is expected with a sum of 0
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    # group by a plain column while the frame holds a categorical column
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    # an identity transform is expected to return the categorical column
    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        # first row of the group after dropping duplicate names
        return x.drop_duplicates('person_name').iloc[0]

    # the applied result is indexed by person_id, and person_name is
    # expected as object dtype rather than categorical
    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    # every transform below is expected to hand back the input unchanged
    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    # filtering with np.all is expected to keep every row
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).filter(np.all),
        df)

    # Non-monotonic
    # the trailing -5 means the values are no longer in ascending order
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    # the bin labels are themselves passed as a Categorical
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    # expected group sizes: 1 for the first label, 0 for the other three
    exp_index = CategoricalIndex(
        c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    # random (unseeded) codes and data; expectations are derived from them
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    # reference: group by the plain array of labels, then reindex the
    # result into category order
    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # reference: order the rows by category code first, then group the
    # reordered data with sort=False
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(
        exp_cats, sort=False, observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    # stacked describe: level 0 is each category repeated 8 times ...
    expc = Categorical.from_codes(np.arange(4).repeat(8),
                                  levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    # ... and level 1 is the 8 statistic names repeated once per category
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)


def test_level_get_group(observed):
    """``get_group`` on a categorical index level returns that group's rows.

    GH15155
    """
    # Ten rows: the first five labelled 'a', the last five labelled 'b'.
    full_index = MultiIndex(
        levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
        codes=[[0] * 5 + [1] * 5, range(10)],
        names=["Index1", "Index2"])
    df = DataFrame(data=np.arange(2, 22, 2), index=full_index)
    g = df.groupby(level=["Index1"], observed=observed)

    # GH15166
    # Expected: only the five rows labelled 'a'; the first level is still
    # declared with both "a" and "b".
    group_a_index = pd.MultiIndex(
        levels=[pd.CategoricalIndex(["a", "b"]), range(5)],
        codes=[[0] * 5, range(5)],
        names=["Index1", "Index2"])
    expected = DataFrame(data=np.arange(2, 12, 2), index=group_a_index)

    result = g.get_group('a')

    assert_frame_equal(result, expected)


@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636", strict=False)
@pytest.mark.parametrize('ordered', [True, False])
def test_apply(ordered):
    """apply/mean/agg on two categorical groupers with ``observed=True``.

    GH 10138
    """
    # every category of 'dense' occurs in the data ...
    dense = Categorical(list('abc'), ordered=ordered)
    # ... whereas 'b' is a category of 'missing' that is never used
    missing = Categorical(list('aaa'),
                          categories=['a', 'b'],
                          ordered=ordered)
    values = np.arange(len(dense))

    df = DataFrame({'missing': missing, 'dense': dense, 'values': values})
    grouped = df.groupby(['missing', 'dense'], observed=True)

    def observed_index():
        # Index built from the grouper values themselves, i.e. only the
        # combinations that actually occur in the data.
        return MultiIndex.from_arrays([missing, dense],
                                      names=['missing', 'dense'])

    # a reducing apply gives one float row per observed group
    expected_float = DataFrame([0, 1, 2.],
                               index=observed_index(),
                               columns=['values'])
    applied = grouped.apply(lambda x: np.mean(x))
    assert_frame_equal(applied, expected_float)

    # mean/agg are expected back as ints
    expected_int = expected_float.astype('int')
    averaged = grouped.mean()
    assert_frame_equal(averaged, expected_int)

    aggregated = grouped.agg(np.mean)
    assert_frame_equal(aggregated, expected_int)

    # an apply returning a constant gives a Series over the same index
    expected_const = Series(1, index=observed_index())
    constant = grouped.apply(lambda x: 1)
    assert_series_equal(constant, expected_const)


def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays(
        [cat1, cat2, ['foo', 'bar'] * 2],
        names=['A', 'B', 'C'])
    expected = DataFrame({'values': Series(
        [1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2, ['foo', 'bar']],
            list('ABC'))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays(
        [cat1, cat2],
        names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]},
                         index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2],
            list('AB'))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {'cat':
         pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                        ordered=True),
         'ints': [1, 1, 2, 2],
         'val': [10, 20, 30, 40]}
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = pd.CategoricalIndex(list('ab'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
                         index=exp_index)
    if not observed:
        index = pd.CategoricalIndex(list('abc'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame(
        {"val": [10, 30, 20, 40],
         "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                               categories=['a', 'b', 'c'],
                               ordered=True),
         "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [df.cat.values, [1, 2]],
            ['cat', 'ints'])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
Loading ...