Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

Version: 0.24.2 

/ tests / groupby / test_groupby.py

# -*- coding: utf-8 -*-
from __future__ import print_function

from collections import defaultdict
from datetime import datetime
from decimal import Decimal

import numpy as np
import pytest

from pandas.compat import (
    OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip)
from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
    DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range,
    read_csv)
import pandas.core.common as com
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_almost_equal, assert_frame_equal, assert_series_equal)


def test_repr():
    # GH18203
    result = repr(pd.Grouper(key='A', level='B'))
    expected = "Grouper(key='A', level='B', axis=0, sort=False)"
    assert result == expected


@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
def test_basic(dtype):
    """Exercise the core Series groupby operations for several dtypes.

    Nine values (0, 0, 0, 1, 1, 1, 2, 2, 2) on labels 0..8 are reindexed
    into a random order and grouped by ``label // 3``.  Iteration,
    aggregation, apply, transform and a non-reducing aggregation are then
    checked.
    """
    series = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    # put the rows in a random order before grouping
    shuffled_labels = np.arange(9)
    np.random.shuffle(shuffled_labels)
    series = series.reindex(shuffled_labels)

    by_third = series.groupby(lambda x: x // 3)

    # every group holds exactly three rows
    for _, chunk in by_third:
        assert len(chunk) == 3

    means = by_third.aggregate(np.mean)
    assert means[1] == 1

    # ``agg`` is shorthand for ``aggregate``; named reductions must agree
    assert_series_equal(means, by_third.agg(np.mean))
    assert_series_equal(means, by_third.mean())
    assert_series_equal(by_third.agg(np.sum), by_third.sum())

    # apply and transform must agree for a same-shape function
    scale_by_sum = lambda x: x * x.sum()
    via_apply = by_third.apply(scale_by_sum)
    via_transform = by_third.transform(scale_by_sum)
    assert via_transform[7] == 12
    assert_series_equal(via_transform, via_apply)

    # grouping by the values themselves gives the same means
    means_by_value = series.groupby(series).aggregate(np.mean)
    assert_series_equal(means_by_value, means, check_index_type=False)

    # complex agg
    by_third.aggregate([np.mean, np.std])

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        by_third.aggregate({'one': np.mean, 'two': np.std})

    # the aggregating function can look the group up through ``x.name``
    offsets = {0: 10, 1: 20, 2: 30}
    shifted = by_third.agg(lambda x: offsets[x.name] + x.mean())
    assert shifted[1] == 21

    # corner cases
    # exception raised is type Exception
    with pytest.raises(Exception, match="Must produce aggregated value"):
        by_third.aggregate(lambda x: x * 2)


def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Group by non-object keys and check dtypes after a row-picking apply.

    First, grouping ``mframe`` by ``mframe.index.codes[0]`` must give the
    same sums as grouping by those codes cast to object dtype.  Second
    (GH 3911), after applying a function that returns one row per group,
    the result's ``get_dtype_counts()`` must be float64: 2, int64: 1,
    object: 2.
    """
    codes = mframe.index.codes[0]
    summed_by_codes = mframe.groupby(codes).sum()
    summed_by_objects = mframe.groupby(codes.astype('O')).sum()
    assert_frame_equal(summed_by_codes, summed_by_objects)

    # GH 3911, mixed frame non-conversion
    frame = df_mixed_floats.copy()
    frame['value'] = lrange(len(frame))

    def max_value(group):
        # the row of the group whose 'value' entry is largest
        return group.loc[group['value'].idxmax()]

    dtype_counts = frame.groupby('A').apply(max_value).get_dtype_counts()
    wanted_counts = Series({'float64': 2, 'int64': 1, 'object': 2})
    assert_series_equal(dtype_counts.sort_values(),
                        wanted_counts.sort_values())


def test_groupby_return_type():

    # GH2893, return a reduced type
    df1 = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 2, "val2": 27},
         {"val1": 2, "val2": 12}
         ])

    def func(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    result = df1.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    df2 = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 1, "val2": 27},
         {"val1": 1, "val2": 12}
         ])

    def func(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    result = df2.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
    result = df.groupby('X', squeeze=False).count()
    assert isinstance(result, DataFrame)

    # GH5592
    # inconcistent return type
    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                           'Pony', 'Pony'], B=Series(
                               np.arange(7), dtype='int64'), C=date_range(
                                   '20130101', periods=7)))

    def f(grp):
        return grp.iloc[0]

    expected = df.groupby('A').first()[['B']]
    result = df.groupby('A').apply(f)[['B']]
    assert_frame_equal(result, expected)

    def f(grp):
        if grp.name == 'Tiger':
            return None
        return grp.iloc[0]

    result = df.groupby('A').apply(f)[['B']]
    e = expected.copy()
    e.loc['Tiger'] = np.nan
    assert_frame_equal(result, e)

    def f(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0]

    result = df.groupby('A').apply(f)[['B']]
    e = expected.copy()
    e.loc['Pony'] = np.nan
    assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    def f(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0]

    result = df.groupby('A').apply(f)[['C']]
    e = df.groupby('A').first()[['C']]
    e.loc['Pony'] = pd.NaT
    assert_frame_equal(result, e)

    # scalar outputs
    def f(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0].loc['C']

    result = df.groupby('A').apply(f)
    e = df.groupby('A').first()['C'].copy()
    e.loc['Pony'] = np.nan
    e.name = None
    assert_series_equal(result, e)


def test_pass_args_kwargs(ts, tsframe):
    """Extra arguments given to agg/apply/transform reach the function.

    ``np.percentile`` is called with extra arguments forwarded by the
    groupby method, and a local wrapper is called with ``q`` passed by
    keyword.  Results are compared with ``quantile(.8)`` on groups keyed
    by the month of each index label.
    """

    def percentile_kw(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    pct80 = lambda x: np.percentile(x, 80, axis=0)
    month_of = lambda x: x.month

    # Series
    monthly = ts.groupby(month_of)
    want_agg = monthly.quantile(.8)
    want_trans = monthly.transform(pct80)

    # np.percentile with the extra arguments forwarded by the groupby call
    assert_series_equal(monthly.apply(np.percentile, 80, axis=0), want_agg)
    assert_series_equal(monthly.agg(np.percentile, 80, axis=0), want_agg,
                        check_names=False)
    assert_series_equal(monthly.transform(np.percentile, 80, axis=0),
                        want_trans)

    # the wrapper with ``q`` supplied by keyword
    assert_series_equal(monthly.agg(percentile_kw, q=80), want_agg)
    assert_series_equal(monthly.apply(percentile_kw, q=80), want_agg)
    assert_series_equal(monthly.transform(percentile_kw, q=80), want_trans)

    # DataFrame
    monthly = tsframe.groupby(month_of)
    want = monthly.quantile(.8)

    assert_frame_equal(monthly.apply(DataFrame.quantile, .8), want)
    assert_frame_equal(monthly.agg(np.percentile, 80, axis=0), want,
                       check_names=False)

    assert_frame_equal(monthly.agg(percentile_kw, q=80), want,
                       check_names=False)
    assert_frame_equal(monthly.apply(DataFrame.quantile, q=.8), want)


def test_len():
    """``len`` of a groupby object must equal its number of groups."""
    frame = tm.makeTimeDataFrame()
    year_of = lambda x: x.year
    month_of = lambda x: x.month
    day_of = lambda x: x.day

    # grouping down to the day is expected to give one group per row
    by_day = frame.groupby([year_of, month_of, day_of])
    assert len(by_day) == len(frame)

    # grouping by (year, month) gives one group per distinct pair
    by_month = frame.groupby([year_of, month_of])
    distinct_months = {(stamp.year, stamp.month) for stamp in frame.index}
    assert len(by_month) == len(distinct_months)

    # issue 11016: column 'a' holds only NaN
    frame = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
    for keys, n_groups in [('a', 0), ('b', 3), (['a', 'b'], 3)]:
        assert len(frame.groupby(keys)) == n_groups


def test_basic_regression():
    """Smoke test: averaging a Series grouped by a longer float Series.

    Nothing is asserted; the test passes as long as ``mean`` does not
    raise.
    """
    # regression
    # assuming lrange returns a list, the repeated list has 90 items, so
    # the [:1095] slice keeps all of them
    values = [1.0 * x for x in lrange(1, 10) * 10][:1095]
    series = Series(values, lrange(0, len(values)))

    # 1100 random float keys scaled into [0, 10)
    random_keys = np.random.random((1100, ))
    keys = Series(random_keys, lrange(0, len(random_keys))) * 10.

    series.groupby(keys).mean()


@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
                                   'int32', 'int16', 'int8'])
def test_with_na_groups(dtype):
    """Rows whose group label is NaN are left out of the aggregation."""
    positions = Index(np.arange(10))
    ones = Series(np.ones(10), positions, dtype=dtype)
    labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
                     'bar', 'bar', np.nan, 'foo'], index=positions)
    grouped = ones.groupby(labels)

    # four 'bar' rows and two 'foo' rows; the NaN-labelled rows drop out
    group_sizes = Series([4, 2], index=['bar', 'foo'])

    # this SHOULD be an int
    # NOTE: the integer-dtype assertion on this result is disabled
    counted = grouped.agg(len)
    assert_series_equal(counted, group_sizes, check_dtype=False)

    # explicitly return a float from my function
    def float_len(x):
        return float(len(x))

    counted = grouped.agg(float_len)
    assert_series_equal(counted, group_sizes, check_dtype=False)
    # the result is expected to come back in the dtype of the input values
    assert issubclass(counted.dtype.type, np.dtype(dtype).type)


def test_indices_concatenation_order():

    # GH 2808

    def f1(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
                                    names=['b', 'c'])
            res = DataFrame(None, columns=['a'], index=multiindex)
            return res
        else:
            y = y.set_index(['b', 'c'])
            return y

    def f2(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(['b', 'c'])
            return y

    def f3(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
                                    names=['foo', 'bar'])
            res = DataFrame(None, columns=['a', 'b'], index=multiindex)
            return res
        else:
            return y

    df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    # correct result
    result1 = df.groupby('a').apply(f1)
    result2 = df2.groupby('a').apply(f1)
    assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby('a').apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby('a').apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby('a').apply(f3)
    with pytest.raises(AssertionError, match=msg):
Loading ...