"""
Provide the groupby split-apply-combine paradigm. Define the GroupBy
class providing the base-class of operations.
The SeriesGroupBy and DataFrameGroupBy sub-class
(defined in pandas.core.groupby.generic)
expose these user-facing objects to provide specific functionailty.
"""
import collections
from contextlib import contextmanager
import datetime
from functools import partial, wraps
import types
import warnings
import numpy as np
from pandas._libs import Timestamp, groupby as libgroupby
import pandas.compat as compat
from pandas.compat import callable, range, set_function_name, zip
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._validators import validate_kwargs
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
from pandas.core.dtypes.missing import isna, notna
import pandas.core.algorithms as algorithms
from pandas.core.base import (
DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
import pandas.core.common as com
from pandas.core.config import option_context
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base
from pandas.core.index import Index, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
_common_see_also = """
See Also
--------
pandas.Series.%(name)s
pandas.DataFrame.%(name)s
pandas.Panel.%(name)s
"""
_apply_docs = dict(
template="""
Apply function `func` group-wise and combine the results together.
The function passed to `apply` must take a {input} as its first
argument and return a DataFrame, Series or scalar. `apply` will
then take care of combining the results back together into a single
dataframe or series. `apply` is therefore a highly flexible
grouping method.
While `apply` is a very flexible method, its downside is that
using it can be quite a bit slower than using more specific methods
like `agg` or `transform`. Pandas offers a wide range of method that will
be much faster than using `apply` for their specific purposes, so try to
use them before reaching for `apply`.
Parameters
----------
func : callable
A callable that takes a {input} as its first argument, and
returns a dataframe, a series or a scalar. In addition the
callable may take positional and keyword arguments.
args, kwargs : tuple and dict
Optional positional and keyword arguments to pass to `func`.
Returns
-------
applied : Series or DataFrame
See Also
--------
pipe : Apply function to the full GroupBy object instead of to each
group.
aggregate : Apply aggregate function to the GroupBy object.
transform : Apply function column-by-column to the GroupBy object.
Series.apply : Apply a function to a Series.
DataFrame.apply : Apply a function to each row or column of a DataFrame.
""",
dataframe_examples="""
>>> df = pd.DataFrame({'A': 'a a b'.split(),
'B': [1,2,3],
'C': [4,6, 5]})
>>> g = df.groupby('A')
Notice that ``g`` has two groups, ``a`` and ``b``.
Calling `apply` in various ways, we can get different grouping results:
Example 1: below the function passed to `apply` takes a DataFrame as
its argument and returns a DataFrame. `apply` combines the result for
each group together into a new DataFrame:
>>> g[['B', 'C']].apply(lambda x: x / x.sum())
B C
0 0.333333 0.4
1 0.666667 0.6
2 1.000000 1.0
Example 2: The function passed to `apply` takes a DataFrame as
its argument and returns a Series. `apply` combines the result for
each group together into a new DataFrame:
>>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
B C
A
a 1 2
b 0 0
Example 3: The function passed to `apply` takes a DataFrame as
its argument and returns a scalar. `apply` combines the result for
each group together into a Series, including setting the index as
appropriate:
>>> g.apply(lambda x: x.C.max() - x.B.min())
A
a 5
b 2
dtype: int64
""",
series_examples="""
>>> s = pd.Series([0, 1, 2], index='a a b'.split())
>>> g = s.groupby(s.index)
From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
Calling `apply` in various ways, we can get different grouping results:
Example 1: The function passed to `apply` takes a Series as
its argument and returns a Series. `apply` combines the result for
each group together into a new Series:
>>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
0 0.0
1 0.5
2 4.0
dtype: float64
Example 2: The function passed to `apply` takes a Series as
its argument and returns a scalar. `apply` combines the result for
each group together into a Series, including setting the index as
appropriate:
>>> g.apply(lambda x: x.max() - x.min())
a 1
b 0
dtype: int64
Notes
-----
In the current implementation `apply` calls `func` twice on the
first group to decide whether it can take a fast or slow code
path. This can lead to unexpected behavior if `func` has
side-effects, as they will take effect twice for the first
group.
Examples
--------
{examples}
""")
_pipe_template = """\
Apply a function `func` with arguments to this %(klass)s object and return
the function's result.
%(versionadded)s
Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing
>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)
You can write
>>> (df.groupby('group')
... .pipe(f)
... .pipe(g, arg1=a)
... .pipe(h, arg2=b, arg3=c))
which is much more readable.
Parameters
----------
func : callable or tuple of (callable, string)
Function to apply to this %(klass)s object or, alternatively,
a `(callable, data_keyword)` tuple where `data_keyword` is a
string indicating the keyword of `callable` that expects the
%(klass)s object.
args : iterable, optional
positional arguments passed into `func`.
kwargs : dict, optional
a dictionary of keyword arguments passed into `func`.
Returns
-------
object : the return type of `func`.
See Also
--------
pandas.Series.pipe : Apply a function with arguments to a series.
pandas.DataFrame.pipe: Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
full %(klass)s object.
Notes
-----
See more `here
<http://pandas.pydata.org/pandas-docs/stable/groupby.html#piping-function-calls>`_
Examples
--------
%(examples)s
"""
_transform_template = """
Call function producing a like-indexed %(klass)s on each group and
return a %(klass)s having the same indexes as the original object
filled with the transformed values
Parameters
----------
f : function
Function to apply to each group
Returns
-------
%(klass)s
See Also
--------
aggregate, transform
Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.
The current implementation imposes three requirements on f:
* f must return a value that either has the same shape as the input
subframe or can be broadcast to the shape of the input subframe.
For example, f returns a scalar it will be broadcast to have the
same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
in the subframe. If f also supports application to the entire subframe,
then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
produce unexpected results.
Examples
--------
# Same shape
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
... 'foo', 'bar'],
... 'B' : ['one', 'one', 'two', 'three',
... 'two', 'two'],
... 'C' : [1, 5, 5, 2, 5, 5],
... 'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
C D
0 -1.154701 -0.577350
1 0.577350 0.000000
2 0.577350 1.154701
3 -1.154701 -1.000000
4 0.577350 -0.577350
5 0.577350 1.000000
# Broadcastable
>>> grouped.transform(lambda x: x.max() - x.min())
C D
0 4 6.0
1 3 8.0
2 4 6.0
3 3 8.0
4 4 6.0
5 3 8.0
"""
class GroupByPlot(PandasObject):
"""
Class implementing the .plot attribute for groupby objects.
"""
def __init__(self, groupby):
self._groupby = groupby
def __call__(self, *args, **kwargs):
def f(self):
return self.plot(*args, **kwargs)
f.__name__ = 'plot'
return self._groupby.apply(f)
def __getattr__(self, name):
def attr(*args, **kwargs):
def f(self):
return getattr(self.plot, name)(*args, **kwargs)
return self._groupby.apply(f)
return attr
@contextmanager
def _group_selection_context(groupby):
"""
Set / reset the _group_selection_context.
"""
groupby._set_group_selection()
yield groupby
groupby._reset_group_selection()
class _GroupBy(PandasObject, SelectionMixin):
_group_selection = None
_apply_whitelist = frozenset()
def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True, squeeze=False,
observed=False, **kwargs):
self._selection = selection
if isinstance(obj, NDFrame):
obj._consolidate_inplace()
self.level = level
if not as_index:
if not isinstance(obj, DataFrame):
raise TypeError('as_index=False only valid with DataFrame')
if axis != 0:
raise ValueError('as_index=False only valid for axis=0')
self.as_index = as_index
Loading ...