Gemfury

alkaline-ml / pandas python

Repository URL to install this package:
Details
pandas / tests / groupby / test_function.py
import pytest

import numpy as np
import pandas as pd
from pandas import (DataFrame, Index, compat, isna,
                    Series, MultiIndex, Timestamp, date_range)
from pandas.errors import UnsupportedFunctionCall
from pandas.util import testing as tm
import pandas.core.nanops as nanops
from string import ascii_lowercase
from pandas.compat import product as cart_product


@pytest.mark.parametrize("agg_func", ['any', 'all'])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("vals", [
    ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
    [1, 2, 3], [1, 0, 0], [0, 0, 0],
    [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
    [True, True, True], [True, False, False], [False, False, False],
    [np.nan, np.nan, np.nan]
])
def test_groupby_bool_aggs(agg_func, skipna, vals):
    df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})

    # Figure out expectation using Python builtin
    exp = getattr(compat.builtins, agg_func)(vals)

    # edge case for missing data with skipna and 'any'
    if skipna and all(isna(vals)) and agg_func == 'any':
        exp = False

    exp_df = DataFrame([exp] * 2, columns=['val'], index=Index(
        ['a', 'b'], name='key'))
    result = getattr(df.groupby('key'), agg_func)(skipna=skipna)
    tm.assert_frame_equal(result, exp_df)


def test_max_min_non_numeric():
    """The string column 'ss' must be present after groupby max/min (#2700).

    Each reduction is checked with default arguments and with
    ``numeric_only=False``.
    """
    frame = DataFrame({'nn': [11, 11, 22, 22],
                       'ii': [1, 2, 3, 4],
                       'ss': 4 * ['mama']})

    for reducer in ('max', 'min'):
        # default call first, then with numeric_only explicitly disabled
        for kwargs in ({}, {'numeric_only': False}):
            reduced = getattr(frame.groupby('nn'), reducer)(**kwargs)
            assert 'ss' in reduced


def test_intercept_builtin_sum():
    s = Series([1., 2., np.nan, 3.])
    grouped = s.groupby([0, 1, 2, 2])

    result = grouped.agg(compat.builtins.sum)
    result2 = grouped.apply(compat.builtins.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result2, expected)


def test_builtins_apply():  # GH8155
    df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
                      columns=['jim', 'joe'])
    df['jolie'] = np.random.randn(1000)

    for keys in ['jim', ['jim', 'joe']]:  # single key & multi-key
        if keys == 'jim':
            continue
        for f in [max, min, sum]:
            fname = f.__name__
            result = df.groupby(keys).apply(f)
            result.shape
            ngroups = len(df.drop_duplicates(subset=keys))
            assert result.shape == (ngroups, 3), 'invalid frame shape: '\
                '{} (expected ({}, 3))'.format(result.shape, ngroups)

            tm.assert_frame_equal(result,  # numpy's equivalent function
                                  df.groupby(keys).apply(getattr(np, fname)))

            if f != sum:
                expected = df.groupby(keys).agg(fname).reset_index()
                expected.set_index(keys, inplace=True, drop=False)
                tm.assert_frame_equal(result, expected, check_dtype=False)

            tm.assert_series_equal(getattr(result, fname)(),
                                   getattr(df, fname)())


def test_arg_passthru():
    # make sure that we are passing thru kwargs
    # to our agg functions

    # GH3668
    # GH5724
    df = pd.DataFrame(
        {'group': [1, 1, 2],
         'int': [1, 2, 3],
         'float': [4., 5., 6.],
         'string': list('abc'),
         'category_string': pd.Series(list('abc')).astype('category'),
         'category_int': [7, 8, 9],
         'datetime': pd.date_range('20130101', periods=3),
         'datetimetz': pd.date_range('20130101',
                                     periods=3,
                                     tz='US/Eastern'),
         'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
        columns=['group', 'int', 'float', 'string',
                 'category_string', 'category_int',
                 'datetime', 'datetimetz',
                 'timedelta'])

    expected_columns_numeric = Index(['int', 'float', 'category_int'])

    # mean / median
    expected = pd.DataFrame(
        {'category_int': [7.5, 9],
         'float': [4.5, 6.],
         'timedelta': [pd.Timedelta('1.5s'),
                       pd.Timedelta('3s')],
         'int': [1.5, 3],
         'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
                      pd.Timestamp('2013-01-03 00:00:00')],
         'datetimetz': [
             pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
             pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
        index=Index([1, 2], name='group'),
        columns=['int', 'float', 'category_int',
                 'datetime', 'datetimetz', 'timedelta'])
    for attr in ['mean', 'median']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

    # TODO: min, max *should* handle
    # categorical (ordered) dtype
    expected_columns = Index(['int', 'float', 'string',
                              'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    for attr in ['min', 'max']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(['int', 'float', 'string',
                              'category_string', 'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    for attr in ['first', 'last']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(['int', 'float', 'string',
                              'category_int', 'timedelta'])
    for attr in ['sum']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(['int', 'float', 'category_int'])
    for attr in ['prod', 'cumprod']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    # like min, max, but don't include strings
    expected_columns = Index(['int', 'float',
                              'category_int',
                              'datetime', 'datetimetz',
                              'timedelta'])
    for attr in ['cummin', 'cummax']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        # GH 15561: numeric_only=False set by default like min/max
        tm.assert_index_equal(result.columns, expected_columns)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)

    expected_columns = Index(['int', 'float', 'category_int',
                              'timedelta'])
    for attr in ['cumsum']:
        f = getattr(df.groupby('group'), attr)
        result = f()
        tm.assert_index_equal(result.columns, expected_columns_numeric)

        result = f(numeric_only=False)
        tm.assert_index_equal(result.columns, expected_columns)


def test_non_cython_api():

    # GH5610
    # non-cython calls should not include the grouper

    df = DataFrame(
        [[1, 2, 'foo'],
         [1, np.nan, 'bar'],
         [3, np.nan, 'baz']],
        columns=['A', 'B', 'C'])
    g = df.groupby('A')
    gni = df.groupby('A', as_index=False)

    # mad
    expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3])
    expected.index.name = 'A'
    result = g.mad()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'],
                         index=[0, 1])
    result = gni.mad()
    tm.assert_frame_equal(result, expected)

    # describe
    expected_index = pd.Index([1, 3], name='A')
    expected_col = pd.MultiIndex(levels=[['B'],
                                         ['count', 'mean', 'std', 'min',
                                          '25%', '50%', '75%', 'max']],
                                 labels=[[0] * 8, list(range(8))])
    expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                             [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
                              np.nan, np.nan]],
                            index=expected_index,
                            columns=expected_col)
    result = g.describe()
    tm.assert_frame_equal(result, expected)

    expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
                          df[df.A == 3].describe().unstack().to_frame().T])
    expected.index = pd.Index([0, 1])
    result = gni.describe()
    tm.assert_frame_equal(result, expected)

    # any
    expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
                         index=[1, 3])
    expected.index.name = 'A'
    result = g.any()
    tm.assert_frame_equal(result, expected)

    # idxmax
    expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3])
    expected.index.name = 'A'
    result = g.idxmax()
    tm.assert_frame_equal(result, expected)


def test_cython_api2():
    """Check grouped cumsum/cumprod; this takes the fast apply path."""
    frame = DataFrame(
        [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]],
        columns=['A', 'B', 'C'])

    # cumsum (GH5614)
    cumulative = DataFrame(
        [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
    tm.assert_frame_equal(frame.groupby('A').cumsum(), cumulative)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    tm.assert_frame_equal(frame.groupby('A', as_index=False).cumsum(),
                          cumulative)

    # GH 13994
    for op in ('cumsum', 'cumprod'):
        grouped_rowwise = getattr(frame.groupby('A'), op)(axis=1)
        plain_rowwise = getattr(frame, op)(axis=1)
        tm.assert_frame_equal(grouped_rowwise, plain_rowwise)


def test_cython_median():
    """Grouped ``median`` must agree with aggregating via nanmedian/np.median.

    The group labels contain NaN entries (every 17th label).
    """
    sparse = DataFrame(np.random.randn(1000))
    # write NaN into every other row of the array returned by ``.values``
    sparse.values[::2] = np.nan

    keys = np.random.randint(0, 50, size=1000).astype(float)
    keys[::17] = np.nan

    tm.assert_frame_equal(sparse.groupby(keys).median(),
                          sparse.groupby(keys).agg(nanops.nanmedian))

    dense = DataFrame(np.random.randn(1000, 5))
    via_numpy = dense.groupby(keys).agg(np.median)
    via_groupby = dense.groupby(keys).median()
    tm.assert_frame_equal(via_numpy, via_groupby)


def test_median_empty_bins(observed):
    df = pd.DataFrame(np.random.randint(0, 44, 500))

    grps = range(0, 55, 5)
    bins = pd.cut(df[0], grps)

    result = df.groupby(bins, observed=observed).median()
    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", [
    'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
@pytest.mark.parametrize("method,data", [
    ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
    ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
    ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
    ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
    ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
             'args': [1]}),
    ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
               'out_type': 'int64'})
])
def test_groupby_non_arithmetic_agg_types(dtype, method, data):
    # GH9311, GH6620
    df = pd.DataFrame(
        [{'a': 1, 'b': 1},
         {'a': 1, 'b': 2},
         {'a': 2, 'b': 3},
         {'a': 2, 'b': 4}])

    df['b'] = df.b.astype(dtype)

    if 'args' not in data:
        data['args'] = []

    if 'out_type' in data:
        out_type = data['out_type']
    else:
        out_type = dtype

    exp = data['df']
    df_out = pd.DataFrame(exp)

    df_out['b'] = df_out.b.astype(out_type)
    df_out.set_index('a', inplace=True)

    grpd = df.groupby('a')
    t = getattr(grpd, method)(*data['args'])
    tm.assert_frame_equal(t, df_out)


def test_groupby_non_arithmetic_agg_intlike_precision():
    # GH9311, GH6620
    c = 24650000000000000

    inputs = ((Timestamp('2011-01-15 12:50:28.502376'),
               Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c))

    for i in inputs:
        df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}])

        grp_exp = {'first': {'expected': i[0]},
                   'last': {'expected': i[1]},
                   'min': {'expected': i[0]},
                   'max': {'expected': i[1]},
                   'nth': {'expected': i[1],
                           'args': [1]},
                   'count': {'expected': 2}}

        for method, data in compat.iteritems(grp_exp):
            if 'args' not in data:
                data['args'] = []

            grpd = df.groupby('a')
            res = getattr(grpd, method)(*data['args'])
            assert res.iloc[0].b == data['expected']


def test_fill_constistency():
    """Forward-fill by group must agree along axis 0 and along axis 1.

    The axis-1 variant runs on the transposed frame and is transposed back
    before the comparison.

    GH9221: keyword arguments passed through to the generated wrapper are
    set only if the passed kw is None.
    """
    row_index = pd.MultiIndex.from_product(
        [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')])
    frame = DataFrame(index=row_index,
                      columns=Index(['1', '2'], name='id'))
    frame['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
                  np.nan, 22, np.nan]
    frame['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
                  np.nan, 44, np.nan]

    along_rows = frame.groupby(level=0, axis=0).fillna(method='ffill')
    along_columns = frame.T.groupby(level=0, axis=1).fillna(method='ffill').T
    tm.assert_frame_equal(along_columns, along_rows)


def test_groupby_cumprod():
    # GH 4095
    df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})

    actual = df.groupby('key')['value'].cumprod()
    expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
    expected.name = 'value'
    tm.assert_series_equal(actual, expected)

    df = pd.DataFrame({'key': ['b'] * 100, 'value': 2})
    actual = df.groupby('key')['value'].cumprod()
    # if overflows, groupby product casts to float
    # while numpy passes back invalid values
    df['value'] = df['value'].astype(float)
    expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
    expected.name = 'value'
    tm.assert_series_equal(actual, expected)


def test_ops_general():
    ops = [('mean', np.mean),
           ('median', np.median),
           ('std', np.std),
           ('var', np.var),
           ('sum', np.sum),
           ('prod', np.prod),
           ('min', np.min),
           ('max', np.max),
           ('first', lambda x: x.iloc[0]),
           ('last', lambda x: x.iloc[-1]),
           ('count', np.size), ]
    try:
        from scipy.stats import sem
    except ImportError:
        pass
    else:
        ops.append(('sem', sem))
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    for op, targop in ops:
        result = getattr(df.groupby(labels), op)().astype(float)
        expected = df.groupby(labels).agg(targop)
        try:
            tm.assert_frame_equal(result, expected)
        except BaseException as exc:
            exc.args += ('operation: %s' % op, )
            raise


def test_max_nan_bug():
    raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""

    df = pd.read_csv(compat.StringIO(raw), parse_dates=[0])
    gb = df.groupby('Date')
    r = gb[['File']].max()
    e = gb['File'].max().to_frame()
    tm.assert_frame_equal(r, e)
    assert not r['File'].isna().any()


def test_nlargest():
    """Grouped ``nlargest(3)`` returns the top values with their positions.

    The second case has ties and uses ``keep='last'``.
    """
    keys = Series(list('a' * 5 + 'b' * 5))

    def top_three(values, positions):
        # expected result: (group label, original position) MultiIndex
        index = MultiIndex.from_arrays([list('aaabbb'), positions])
        return Series(values, index=index)

    distinct = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    tm.assert_series_equal(
        distinct.groupby(keys).nlargest(3),
        top_three([7, 5, 3, 10, 9, 6], [3, 2, 1, 9, 5, 8]))

    tied = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    tm.assert_series_equal(
        tied.groupby(keys).nlargest(3, keep='last'),
        top_three([3, 2, 1, 3, 3, 2], [2, 3, 1, 6, 5, 7]))


def test_nsmallest():
    """Grouped ``nsmallest(3)`` returns the bottom values with positions.

    The second case has ties and uses ``keep='last'``.
    """
    keys = Series(list('a' * 5 + 'b' * 5))

    def bottom_three(values, positions):
        # expected result: (group label, original position) MultiIndex
        index = MultiIndex.from_arrays([list('aaabbb'), positions])
        return Series(values, index=index)

    distinct = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    tm.assert_series_equal(
        distinct.groupby(keys).nsmallest(3),
        bottom_three([1, 2, 3, 0, 4, 6], [0, 4, 1, 6, 7, 8]))

    tied = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    tm.assert_series_equal(
        tied.groupby(keys).nsmallest(3, keep='last'),
        bottom_three([0, 1, 1, 0, 1, 2], [4, 1, 0, 9, 8, 7]))


def test_numpy_compat():
    # see gh-12811
    df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]})
    g = df.groupby('A')

    msg = "numpy operations are not valid with groupby"

    for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'):
        tm.assert_raises_regex(UnsupportedFunctionCall, msg,
                               getattr(g, func), 1, 2, 3)
        tm.assert_raises_regex(UnsupportedFunctionCall, msg,
                               getattr(g, func), foo=1)


def test_cummin_cummax():
    """Check grouped ``cummin``/``cummax`` against hand-written expectations.

    Sections, in order: every numeric dtype (including its extreme value),
    NaN in some rows, NaN in the whole column, a datetime column
    (GH 15561) and interleaved groups (GH 15635). ``df`` and ``expected``
    are modified in place between assertions, so statement order matters.
    """
    # GH 15048
    num_types = [np.int32, np.int64, np.float32, np.float64]
    # smallest / largest value of each dtype in ``num_types``, same order
    num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min,
                np.finfo(np.float32).min, np.finfo(np.float64).min]
    num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max,
               np.finfo(np.float32).max, np.finfo(np.float64).max]
    base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
                            'B': [3, 4, 3, 2, 2, 3, 2, 1]})
    # running min / max of 'B' within each group of 'A'
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
        df = base_df.astype(dtype)

        # cummin
        expected = pd.DataFrame({'B': expected_mins}).astype(dtype)
        result = df.groupby('A').cummin()
        tm.assert_frame_equal(result, expected)
        result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
        tm.assert_frame_equal(result, expected)

        # Test cummin w/ min value for dtype
        # rows 2 and 6 are the third row of each group, so the running
        # minimum stays at min_val for rows 2-3 and 6-7
        df.loc[[2, 6], 'B'] = min_val
        expected.loc[[2, 3, 6, 7], 'B'] = min_val
        result = df.groupby('A').cummin()
        tm.assert_frame_equal(result, expected)
        # here ``expected`` becomes the apply-based result and is compared
        # with the ``result`` computed just above
        expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
        tm.assert_frame_equal(result, expected)

        # cummax
        # ``df`` still holds min_val in rows 2 and 6 from the step above;
        # the expected running maximum is still ``expected_maxs``
        expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
        result = df.groupby('A').cummax()
        tm.assert_frame_equal(result, expected)
        result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
        tm.assert_frame_equal(result, expected)

        # Test cummax w/ max value for dtype
        df.loc[[2, 6], 'B'] = max_val
        expected.loc[[2, 3, 6, 7], 'B'] = max_val
        result = df.groupby('A').cummax()
        tm.assert_frame_equal(result, expected)
        expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
        tm.assert_frame_equal(result, expected)

    # Test nan in some values
    base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
    expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2,
                                   np.nan, 3, np.nan, 1]})
    result = base_df.groupby('A').cummin()
    tm.assert_frame_equal(result, expected)
    expected = (base_df.groupby('A')
                       .B
                       .apply(lambda x: x.cummin())
                       .to_frame())
    tm.assert_frame_equal(result, expected)

    expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4,
                                   np.nan, 3, np.nan, 3]})
    result = base_df.groupby('A').cummax()
    tm.assert_frame_equal(result, expected)
    expected = (base_df.groupby('A')
                       .B
                       .apply(lambda x: x.cummax())
                       .to_frame())
    tm.assert_frame_equal(result, expected)

    # Test nan in entire column
    base_df['B'] = np.nan
    expected = pd.DataFrame({'B': [np.nan] * 8})
    result = base_df.groupby('A').cummin()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').cummax()
    tm.assert_frame_equal(expected, result)
    result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(expected, result)

    # GH 15561
    # single-row datetime column: both methods must return it unchanged
    df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001'])))
    expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b')
    for method in ['cummax', 'cummin']:
        result = getattr(df.groupby('a')['b'], method)()
        tm.assert_series_equal(expected, result)

    # GH 15635
    # groups are interleaved (a = 1, 2, 1): the value of group 2 must not
    # affect the running result of group 1
    df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
    result = df.groupby('a').b.cummax()
    expected = pd.Series([2, 1, 2], name='b')
    tm.assert_series_equal(result, expected)

    df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
    result = df.groupby('a').b.cummin()
    expected = pd.Series([1, 2, 1], name='b')
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize('in_vals, out_vals', [

    # Basics: strictly increasing (T), strictly decreasing (F),
    # abs val increasing (F), non-strictly increasing (T)
    ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1],
     [True, False, False, True]),

    # Test with inf vals
    ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
     [True, False, True, False]),

    # Test with nan vals; should always be False
    ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
     [False, False, False, False]),
])
def test_is_monotonic_increasing(in_vals, out_vals):
    """Per-group ``is_monotonic_increasing`` on column 'C' (GH 17015).

    ``in_vals`` holds eleven values split into groups 'a'..'d' of sizes
    3, 3, 3 and 2; ``out_vals`` holds the expected flag per group.
    """
    frame = pd.DataFrame({
        'A': [str(number) for number in range(1, 12)],
        'B': list('aaabbbcccdd'),
        'C': in_vals})
    flags = frame.groupby('B').C.is_monotonic_increasing

    stated = pd.Series(index=Index(list('abcd'), name='B'),
                       data=out_vals, name='C')
    tm.assert_series_equal(flags, stated)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    manual = frame.groupby(['B']).C.apply(
        lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(flags, manual)


@pytest.mark.parametrize('in_vals, out_vals', [
    # Basics: strictly decreasing (T), strictly increasing (F),
    # abs val decreasing (F), non-strictly decreasing (T)
    ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1],
     [True, False, False, True]),

    # Test with inf vals
    ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
     [True, True, False, True]),

    # Test with nan vals; should always be False
    ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
     [False, False, False, False]),
])
def test_is_monotonic_decreasing(in_vals, out_vals):
    """Per-group ``is_monotonic_decreasing`` on column 'C' (GH 17015).

    ``in_vals`` holds eleven values split into groups 'a'..'d' of sizes
    3, 3, 3 and 2; ``out_vals`` holds the expected flag per group.
    """
    source_dict = {
        'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'],
        'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'],
        'C': in_vals}

    df = pd.DataFrame(source_dict)
    result = df.groupby('B').C.is_monotonic_decreasing
    index = Index(list('abcd'), name='B')
    expected = pd.Series(index=index, data=out_vals, name='C')
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the cross-check in test_is_monotonic_increasing.
    expected = (
        df.groupby(['B']).C.apply(lambda x: x.is_monotonic_decreasing))
    tm.assert_series_equal(result, expected)


# describe
# --------------------------------

def test_apply_describe_bug(mframe):
    """Smoke test: describing a frame grouped by level 'first' must run."""
    by_first_level = mframe.groupby(level='first')
    # no assertion on the output: completing without an error is the check
    by_first_level.describe()


def test_series_describe_multikey():
    """``describe`` on a series grouped by (year, month) must match the
    individual ``mean``, ``std`` and ``min`` reductions (names ignored)."""
    series = tm.makeTimeSeries()
    by_year_month = series.groupby([lambda x: x.year, lambda x: x.month])
    summary = by_year_month.describe()

    for stat in ('mean', 'std', 'min'):
        tm.assert_series_equal(summary[stat],
                               getattr(by_year_month, stat)(),
                               check_names=False)


def test_series_describe_single():
    """Applying ``describe`` per month must equal the stacked grouped
    ``describe``."""
    by_month = tm.makeTimeSeries().groupby(lambda x: x.month)
    applied = by_month.apply(lambda x: x.describe())
    stacked = by_month.describe().stack()
    tm.assert_series_equal(applied, stacked)


def test_series_index_name(df):
    """Grouping column 'C' by the series ``df['A']`` must name the result
    index 'A'."""
    only_c = df.loc[:, ['C']]
    averaged = only_c.groupby(df['A']).agg(lambda x: x.mean())
    assert averaged.index.name == 'A'


def test_frame_describe_multikey(tsframe):
    """Grouped frame ``describe`` must equal per-column describes side by
    side, and grouping along axis 1 must equal the transposed plain
    ``describe`` with the group number as an extra index level."""
    by_year_month = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = by_year_month.describe()

    def describe_column(col):
        stats = by_year_month[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        n_stats = len(stats.columns)
        columns = pd.MultiIndex(
            levels=[[col], stats.columns],
            labels=[[0] * n_stats, range(n_stats)])
        return pd.DataFrame(stats.values,
                            columns=columns,
                            index=stats.index)

    expected = pd.concat([describe_column(col) for col in tsframe], axis=1)
    tm.assert_frame_equal(result, expected)

    by_mapping = tsframe.groupby({'A': 0, 'B': 0,
                                  'C': 1, 'D': 1}, axis=1)
    transposed = by_mapping.describe()
    reference = tsframe.describe().T
    reference.index = pd.MultiIndex(
        levels=[[0, 1], reference.index],
        labels=[[0, 0, 1, 1], range(len(reference.index))])
    tm.assert_frame_equal(transposed, reference)


def test_frame_describe_tupleindex():
    """``describe`` after grouping by a column of tuples must raise
    ``ValueError``, whether the column is called 'k' or 'key'.

    GH 14848 - regression from 0.19.0 to 0.19.1
    """
    frame = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                       'y': [10, 20, 30, 40, 50] * 3,
                       'z': [100, 200, 300, 400, 500] * 3})
    frame['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    renamed = frame.rename(columns={'k': 'key'})

    for data, key in ((frame, 'k'), (renamed, 'key')):
        with pytest.raises(ValueError):
            data.groupby(key).describe()


def test_frame_describe_unstacked_format():
    """Grouped series ``describe`` returns one row per group with the
    statistics as columns (GH 4792)."""
    stamps = [pd.Timestamp('2011-01-06 10:59:05', tz=None),
              pd.Timestamp('2011-01-06 12:43:33', tz=None),
              pd.Timestamp('2011-01-06 12:54:09', tz=None)]
    prices = dict(zip(stamps, [24990, 25499, 25499]))
    volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
    frame = pd.DataFrame({'PRICE': prices,
                          'VOLUME': volumes})

    result = frame.groupby('PRICE').VOLUME.describe()

    price_levels = [24990, 25499]
    # reference rows: describe() of VOLUME filtered to each price
    rows = [frame[frame.PRICE == price].VOLUME.describe().values.tolist()
            for price in price_levels]
    expected = pd.DataFrame(rows,
                            index=pd.Index(price_levels, name='PRICE'),
                            columns=['count', 'mean', 'std', 'min',
                                     '25%', '50%', '75%', 'max'])
    tm.assert_frame_equal(result, expected)


# nunique
# --------------------------------

@pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6),
                                              (10, 100, 1000)))
@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2))
def test_series_groupby_nunique(n, m, sort, dropna):

    def check_nunique(df, keys, as_index=True):
        gr = df.groupby(keys, as_index=as_index, sort=sort)
        left = gr['julie'].nunique(dropna=dropna)

        gr = df.groupby(keys, as_index=as_index, sort=sort)
        right = gr['julie'].apply(Series.nunique, dropna=dropna)
        if not as_index:
            right = right.reset_index(drop=True)

        tm.assert_series_equal(left, right, check_names=False)

    days = date_range('2015-08-23', periods=10)

    frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n),
                       'joe': np.random.choice(days, n),
                       'julie': np.random.randint(0, m, n)})

    check_nunique(frame, ['jim'])
    check_nunique(frame, ['jim', 'joe'])

    frame.loc[1::17, 'jim'] = None
    frame.loc[3::37, 'joe'] = None
    frame.loc[7::19, 'julie'] = None
    frame.loc[8::19, 'julie'] = None
    frame.loc[9::19, 'julie'] = None

    check_nunique(frame, ['jim'])
    check_nunique(frame, ['jim', 'joe'])
    check_nunique(frame, ['jim'], as_index=False)
    check_nunique(frame, ['jim', 'joe'], as_index=False)


def test_nunique():
    df = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })

    expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    result = df.groupby('A', as_index=False).nunique()
    tm.assert_frame_equal(result, expected)

    # as_index
    expected.index = list('abc')
    expected.index.name = 'A'
    result = df.groupby('A').nunique()
    tm.assert_frame_equal(result, expected)

    # with na
    result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
    tm.assert_frame_equal(result, expected)

    # dropna
    expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                         index=list('abc'))
    expected.index.name = 'A'
    result = df.replace({'x': None}).groupby('A').nunique()
    tm.assert_frame_equal(result, expected)


def test_nunique_with_object():
    """``nunique`` on the object column 'name', grouped by two keys, gives
    1 for each of the five (id, amount) pairs (GH 11077)."""
    records = pd.DataFrame(
        [[100, 1, 'Alice'],
         [200, 2, 'Bob'],
         [300, 3, 'Charlie'],
         [-400, 4, 'Dan'],
         [500, 5, 'Edith']],
        columns=['amount', 'id', 'name']
    )

    counted = records.groupby(['id', 'amount'])['name'].nunique()

    pair_index = MultiIndex.from_arrays([records.id, records.amount])
    one_each = pd.Series([1] * 5, name='name', index=pair_index)
    tm.assert_series_equal(counted, one_each)


def test_nunique_with_empty_series():
    """``nunique`` on an empty grouped series yields an empty int64 series
    with the same name (GH 12553)."""
    empty = pd.Series(name='name')
    counted = empty.groupby(level=0).nunique()
    tm.assert_series_equal(counted, pd.Series(name='name', dtype='int64'))


def test_nunique_with_timegrouper():
    """``nunique`` with an hourly ``Grouper`` must equal applying
    ``pd.Series.nunique`` with the same grouper (GH 13453)."""
    stamped = pd.DataFrame({
        'time': [Timestamp('2016-06-28 09:35:35'),
                 Timestamp('2016-06-28 16:09:30'),
                 Timestamp('2016-06-28 16:46:28')],
        'data': ['1', '2', '3']}).set_index('time')

    def hourly_data():
        return stamped.groupby(pd.Grouper(freq='h'))['data']

    direct = hourly_data().nunique()
    applied = hourly_data().apply(pd.Series.nunique)
    tm.assert_series_equal(direct, applied)


# count
# --------------------------------

def test_groupby_timedelta_cython_count():
    """Grouped ``count`` on a timedelta64 column returns 2 per group."""
    frame = DataFrame({'g': list('ab' * 2),
                       'delt': np.arange(4).astype('timedelta64[ns]')})
    group_index = pd.Index(['a', 'b'], name='g')
    two_each = Series([2, 2], index=group_index, name='delt')

    counted = frame.groupby('g').delt.count()
    tm.assert_series_equal(two_each, counted)


def test_count():
    """Compare ``GroupBy.count`` with ``DataFrame.count`` applied per group.

    Part one builds a large frame mixing string, integer, float, datetime,
    timedelta and categorical columns, sets randomly chosen rows to NaN in
    every column except '1st', '2nd' and '4th', and checks that both code
    paths agree for single and multiple grouping keys.

    Part two (GH5610) pins that ``count`` only counts non-null values, with
    and without ``as_index``, and for a single selected column.
    """
    # A local, seeded generator keeps the data reproducible from run to run
    # and leaves numpy's global random state untouched.
    random_state = np.random.RandomState(1234567890)

    n = 1 << 15
    # 'min' is the spelled-out alias of the minutely 'T' frequency
    dr = date_range('2015-08-30', periods=n // 10, freq='min')

    df = DataFrame({
        '1st': random_state.choice(
            list(ascii_lowercase), n),
        '2nd': random_state.randint(0, 5, n),
        '3rd': random_state.randn(n).round(3),
        '4th': random_state.randint(-10, 10, n),
        '5th': random_state.choice(dr, n),
        '6th': random_state.randn(n).round(3),
        '7th': random_state.randn(n).round(3),
        '8th': random_state.choice(dr, n) - random_state.choice(dr, 1),
        '9th': random_state.choice(
            list(ascii_lowercase), n)
    })

    # knock holes into every column that is not used as a grouping key
    # ('4th' is left complete as well)
    for col in df.columns.drop(['1st', '2nd', '4th']):
        df.loc[random_state.choice(n, n // 10), col] = np.nan

    # converted after the NaN insertion so the categorical carries missing
    # values too
    df['9th'] = df['9th'].astype('category')

    for key in '1st', '2nd', ['1st', '2nd']:
        left = df.groupby(key).count()
        right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
        tm.assert_frame_equal(left, right)

    # GH5610
    # count counts non-nulls
    df = pd.DataFrame([[1, 2, 'foo'],
                       [1, np.nan, 'bar'],
                       [3, np.nan, np.nan]],
                      columns=['A', 'B', 'C'])

    count_as = df.groupby('A').count()
    count_not_as = df.groupby('A', as_index=False).count()

    expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'],
                         index=[1, 3])
    expected.index.name = 'A'
    tm.assert_frame_equal(count_not_as, expected.reset_index())
    tm.assert_frame_equal(count_as, expected)

    count_B = df.groupby('A')['B'].count()
    tm.assert_series_equal(count_B, expected['B'])


def test_count_object():
    """Count object-dtype values per group, without and with NaN present."""
    group_keys = [2] * 3 + [3] * 3
    expected_index = pd.Index([2, 3], name='c')

    # no missing values: all three rows of each group are counted
    complete = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': group_keys})
    result = complete.groupby('c')['a'].count()
    expected = pd.Series([3, 3], index=expected_index, name='a')
    tm.assert_series_equal(result, expected)

    # two NaN entries in the first group are left out of its count
    with_gaps = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3,
                              'c': group_keys})
    result = with_gaps.groupby('c')['a'].count()
    expected = pd.Series([1, 3], index=expected_index, name='a')
    tm.assert_series_equal(result, expected)


def test_count_cross_type():
    """``count`` gives the same result whatever dtype the columns have.

    GH8169: columns 'a' and 'b' are counted as float64 first, then again
    after being cast to float32 and to object; every result must equal the
    float64 baseline.
    """
    # A local, seeded generator keeps the data reproducible from run to run
    # and leaves numpy's global random state untouched.
    random_state = np.random.RandomState(1234567890)
    vals = np.hstack((random_state.randint(0, 5, (100, 2)),
                      random_state.randint(0, 2, (100, 2))))
    # Integers cannot hold NaN, so switch to float before masking instead of
    # relying on an implicit upcast during the assignment below.
    vals = vals.astype('float64')

    df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd'])
    # only 'a' and 'b' can contain the value 2; 'c' and 'd' hold 0 or 1
    df[df == 2] = np.nan
    expected = df.groupby(['c', 'd']).count()

    # the casts accumulate: the object pass starts from the float32 columns
    for t in ['float32', 'object']:
        df['a'] = df['a'].astype(t)
        df['b'] = df['b'].astype(t)
        result = df.groupby(['c', 'd']).count()
        tm.assert_frame_equal(result, expected)


def test_lower_int_prec_count():
    """Count int8, uint32 and int16 columns per group; two rows each."""
    columns = {
        'a': np.array([0, 1, 2, 100], np.int8),
        'b': np.array([1, 2, 3, 6], np.uint32),
        'c': np.array([4, 5, 6, 8], np.int16),
        'grp': ['a', 'b', 'a', 'b'],
    }
    frame = DataFrame(columns)

    result = frame.groupby('grp').count()

    group_labels = pd.Index(['a', 'b'], name='grp')
    expected = DataFrame({'a': [2, 2], 'b': [2, 2], 'c': [2, 2]},
                         index=group_labels)
    tm.assert_frame_equal(result, expected)


def test_count_uses_size_on_exception():
    """Group counts are still produced for objects whose ``__eq__`` raises."""

    class RaisingObjectException(Exception):
        """Raised by ``RaisingObject.__eq__``."""

    class RaisingObject(object):
        """Object that raises on every equality comparison."""

        def __init__(self, msg='I will raise inside Cython'):
            super(RaisingObject, self).__init__()
            self.msg = msg

        def __eq__(self, other):
            # gets called in Cython to check that raising calls the method
            raise RaisingObjectException(self.msg)

    # four distinct instances, two per group
    raisers = [RaisingObject() for _ in range(4)]
    frame = DataFrame({'a': raisers, 'grp': ['a', 'b', 'a', 'b']})

    counts = frame.groupby('grp').count()

    group_labels = pd.Index(['a', 'b'], name='grp')
    expected = DataFrame({'a': [2, 2]}, index=group_labels)
    tm.assert_frame_equal(counts, expected)


# size
# --------------------------------

def test_size(df):
    """``GroupBy.size`` reports the number of rows of every group.

    Parameters
    ----------
    df : DataFrame
        Fixture frame; it is grouped by its 'A' and 'B' columns.
    """
    # size must agree with the length of each group, for one and two keys
    for keys in (['A', 'B'], 'A', 'B'):
        grouped = df.groupby(keys)
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

    # A local, seeded generator keeps the data reproducible from run to run
    # and leaves numpy's global random state untouched.
    random_state = np.random.RandomState(1234567890)
    frame = DataFrame(random_state.choice(20, (1000, 3)),
                      columns=list('abc'))
    for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
        left = frame.groupby(key, sort=sort).size()
        right = frame.groupby(key, sort=sort)['c'].apply(
            lambda a: a.shape[0])
        # names are not compared: only ``right`` is built from column 'c'
        tm.assert_series_equal(left, right, check_names=False)

    # GH11699
    empty = DataFrame([], columns=['A', 'B'])
    out = Series([], dtype='int64', index=Index([], name='A'))
    tm.assert_series_equal(empty.groupby('A').size(), out)


# pipe
# --------------------------------

def test_pipe():
    """Chain ``GroupBy.pipe`` into ``Series.pipe``.

    Issue #17871.  The first piped callable maps the GroupBy to a Series,
    the second maps that Series to another Series, so the chain goes
    through the GroupBy ``pipe`` and then the NDFrame ``pipe``.
    """
    rng = np.random.RandomState(1234567890)

    labels = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
    # the expected numbers depend on the draw order: 'B' first, then 'C'
    b_values = rng.randn(8)
    c_values = rng.randn(8)
    frame = DataFrame({'A': labels, 'B': b_values, 'C': c_values})

    def max_b_minus_lowest_c(gb):
        # GroupBy -> Series
        lowest_c = gb.C.min().min()
        return gb.B.max() - lowest_c

    def squared(values):
        # Series -> Series
        return values ** 2

    piped_once = frame.groupby('A').pipe(max_b_minus_lowest_c)
    result = piped_once.pipe(squared)

    expected_index = Index([u'bar', u'foo'], dtype='object', name=u'A')
    expected = pd.Series([8.99110003361, 8.17516964785],
                         index=expected_index, name='B')

    tm.assert_series_equal(expected, result)


def test_pipe_args():
    # Test passing args to the pipe method of DataFrameGroupBy.
    # Issue #17871

    df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'],
                       'x': [1.0, 2.0, 3.0, 2.0, 5.0],
                       'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]})

    def f(dfgb, arg1):
        return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
                    .groupby(dfgb.grouper))

    def g(dfgb, arg2):
        return dfgb.sum() / dfgb.sum().sum() + arg2

    def h(df, arg3):
        return df.x + df.y - arg3

    result = (df
              .groupby('group')
              .pipe(f, 0)
              .pipe(g, 10)
              .pipe(h, 100))

    # Assert the results here
    index = pd.Index(['A', 'B', 'C'], name='group')
    expected = pd.Series([-79.5160891089, -78.4839108911, -80],
                         index=index)

    tm.assert_series_equal(expected, result)

    # test SeriesGroupby.pipe
    ser = pd.Series([1, 1, 2, 2, 3, 3])
    result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())

    expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))

    tm.assert_series_equal(result, expected)
alkaline-ml / pandas python

Products

About

Resources

Contact Gemfury