"""
Base and utility classes for pandas objects.
"""
import builtins
from collections import OrderedDict
import textwrap
import warnings
import numpy as np
import pandas._libs.lib as lib
from pandas.compat import PYPY
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64_ns_dtype,
is_datetime64tz_dtype,
is_datetimelike,
is_extension_array_dtype,
is_extension_type,
is_list_like,
is_object_dtype,
is_scalar,
is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
from pandas.core import algorithms, common as com
from pandas.core.accessor import DirNamesMixin
from pandas.core.arrays import ExtensionArray
import pandas.core.nanops as nanops
_shared_docs = dict()
_indexops_doc_kwargs = dict(
klass="IndexOpsMixin",
inplace="",
unique="IndexOpsMixin",
duplicated="IndexOpsMixin",
)
class StringMixin:
"""
Implements string methods so long as the object defines a `__str__` method.
"""
# side note - this could be made into a metaclass if more than one
# object needs it
# ----------------------------------------------------------------------
# Formatting
def __str__(self):
"""
Return a string representation for a particular object.
"""
raise AbstractMethodError(self)
def __repr__(self):
"""
Return a string representation for a particular object.
"""
return str(self)
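# A minimal usage sketch (``Thing`` is a hypothetical subclass, not part of
# this module): implement ``__str__`` and ``__repr__`` comes for free.
#
#     class Thing(StringMixin):
#         def __str__(self):
#             return "Thing(...)"
#
#     repr(Thing())  # -> "Thing(...)"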
class PandasObject(DirNamesMixin):
"""baseclass for various pandas objects"""
@property
def _constructor(self):
"""class constructor (for this class it's just `__class__`"""
return self.__class__
def __repr__(self):
"""
Return a string representation for a particular object.
"""
# Should be overwritten by base classes
return object.__repr__(self)
def _reset_cache(self, key=None):
"""
Reset cached properties. If ``key`` is passed, only clears that key.
"""
if getattr(self, "_cache", None) is None:
return
if key is None:
self._cache.clear()
else:
self._cache.pop(key, None)
def __sizeof__(self):
"""
Generate the total memory usage for an object that returns
either a single value or a Series of values.
"""
if hasattr(self, "memory_usage"):
mem = self.memory_usage(deep=True)
if not is_scalar(mem):
mem = mem.sum()
return int(mem)
# no memory_usage attribute, so fall back to
# object's 'sizeof'
return super().__sizeof__()
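# For example, ``sys.getsizeof(pd.Series(range(3)))`` dispatches to the
# ``__sizeof__`` above, so it reports roughly ``memory_usage(deep=True)``
# plus the garbage-collector header overhead that ``sys.getsizeof`` adds.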
class NoNewAttributesMixin:
"""Mixin which prevents adding new attributes.
Prevents additional attributes via xxx.attribute = "something" after a
call to `self._freeze()`. Mainly used to prevent the user from using
wrong attributes on an accessor (`Series.cat/.str/.dt`).
If you really want to add a new attribute at a later time, you need to use
`object.__setattr__(self, key, value)`.
"""
def _freeze(self):
"""Prevents setting additional attributes"""
object.__setattr__(self, "__frozen", True)
# prevent adding any attribute via s.xxx.new_attribute = ...
def __setattr__(self, key, value):
# _cache is used by a decorator
# We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
# because
# 1.) getattr is false for attributes that raise errors
# 2.) cls.__dict__ doesn't traverse into base classes
if getattr(self, "__frozen", False) and not (
key == "_cache"
or key in type(self).__dict__
or getattr(self, key, None) is not None
):
raise AttributeError(
"You cannot add any new attribute '{key}'".format(key=key)
)
object.__setattr__(self, key, value)
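# A minimal sketch of the intended usage (``MyAccessor`` is hypothetical):
#
#     class MyAccessor(NoNewAttributesMixin):
#         def __init__(self):
#             self.existing = 1
#             self._freeze()
#
#     acc = MyAccessor()
#     acc.existing = 2   # allowed: the attribute already exists
#     acc.new_attr = 3   # raises AttributeError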
class GroupByError(Exception):
pass
class DataError(GroupByError):
pass
class SpecificationError(GroupByError):
pass
class SelectionMixin:
"""
Mixin implementing the selection & aggregation interface on a group-like
object; sub-classes need to define: obj, exclusions.
"""
_selection = None
_internal_names = ["_cache", "__setstate__"]
_internal_names_set = set(_internal_names)
_builtin_table = OrderedDict(
((builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min))
)
_cython_table = OrderedDict(
(
(builtins.sum, "sum"),
(builtins.max, "max"),
(builtins.min, "min"),
(np.all, "all"),
(np.any, "any"),
(np.sum, "sum"),
(np.nansum, "sum"),
(np.mean, "mean"),
(np.nanmean, "mean"),
(np.prod, "prod"),
(np.nanprod, "prod"),
(np.std, "std"),
(np.nanstd, "std"),
(np.var, "var"),
(np.nanvar, "var"),
(np.median, "median"),
(np.nanmedian, "median"),
(np.max, "max"),
(np.nanmax, "max"),
(np.min, "min"),
(np.nanmin, "min"),
(np.cumprod, "cumprod"),
(np.nancumprod, "cumprod"),
(np.cumsum, "cumsum"),
(np.nancumsum, "cumsum"),
)
)
@property
def _selection_name(self):
"""
return a name for myself; this would ideally be called
the 'name' property, but we cannot conflict with the
Series.name property which can be set
"""
if self._selection is None:
return None # 'result'
else:
return self._selection
@property
def _selection_list(self):
if not isinstance(
self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)
):
return [self._selection]
return self._selection
@cache_readonly
def _selected_obj(self):
if self._selection is None or isinstance(self.obj, ABCSeries):
return self.obj
else:
return self.obj[self._selection]
@cache_readonly
def ndim(self):
return self._selected_obj.ndim
@cache_readonly
def _obj_with_exclusions(self):
if self._selection is not None and isinstance(self.obj, ABCDataFrame):
return self.obj.reindex(columns=self._selection_list)
if len(self.exclusions) > 0:
return self.obj.drop(self.exclusions, axis=1)
else:
return self.obj
def __getitem__(self, key):
if self._selection is not None:
raise IndexError(
"Column(s) {selection} already selected".format(
selection=self._selection
)
)
if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
if len(self.obj.columns.intersection(key)) != len(key):
bad_keys = list(set(key).difference(self.obj.columns))
raise KeyError(
"Columns not found: {missing}".format(missing=str(bad_keys)[1:-1])
)
return self._gotitem(list(key), ndim=2)
elif not getattr(self, "as_index", False):
if key not in self.obj.columns:
raise KeyError("Column not found: {key}".format(key=key))
return self._gotitem(key, ndim=2)
else:
if key not in self.obj:
raise KeyError("Column not found: {key}".format(key=key))
return self._gotitem(key, ndim=1)
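# Illustrative sketch of the selection interface (assuming the default
# ``as_index=True`` groupby): ``df.groupby("A")["B"]`` goes through
# ``__getitem__`` above and calls ``self._gotitem("B", ndim=1)``, while
# ``df.groupby("A")[["B", "C"]]`` calls ``self._gotitem(["B", "C"], ndim=2)``.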
def _gotitem(self, key, ndim, subset=None):
"""
sub-classes to define; return a sliced object.
Parameters
----------
key : string / list of selections
ndim : 1,2
requested ndim of result
subset : object, default None
subset to act on
"""
raise AbstractMethodError(self)
def aggregate(self, func, *args, **kwargs):
raise AbstractMethodError(self)
agg = aggregate
def _try_aggregate_string_function(self, arg, *args, **kwargs):
"""
if arg is a string, then try to operate on it:
- try to find a function (or attribute) on ourselves
- try to find a numpy function
- raise
"""
assert isinstance(arg, str)
f = getattr(self, arg, None)
if f is not None:
if callable(f):
return f(*args, **kwargs)
# people may try to aggregate on a non-callable attribute
# but don't let them think they can pass args to it
assert len(args) == 0
assert (
len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0
)
return f
f = getattr(np, arg, None)
if f is not None:
try:
return f(self, *args, **kwargs)
except (AttributeError, TypeError):
pass
raise AttributeError(
"'{arg}' is not a valid function for "
"'{cls}' object".format(arg=arg, cls=type(self).__name__)
)
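# Resolution-order sketch: ``grouped.agg("mean")`` reaches this helper with
# ``arg="mean"``, finds ``mean`` as an attribute of the object and calls it;
# only if no such attribute exists is ``getattr(np, arg)`` tried, and failing
# that an AttributeError is raised.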
def _aggregate(self, arg, *args, **kwargs):
"""
provide an implementation for the aggregators
Parameters
----------
arg : string, dict, function
*args : args to pass on to the function
**kwargs : kwargs to pass on to the function
Returns
-------
tuple of result, how
Notes
-----
how can be a string describing the required post-processing, or
None if not required
"""
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False
_axis = kwargs.pop("_axis", None)
if _axis is None:
_axis = getattr(self, "axis", 0)
_level = kwargs.pop("_level", None)
if isinstance(arg, str):
return self._try_aggregate_string_function(arg, *args, **kwargs), None
if isinstance(arg, dict):
# aggregate based on the passed dict
if _axis != 0: # pragma: no cover
raise ValueError("Can only pass dict with axis=0")
obj = self._selected_obj
def nested_renaming_depr(level=4):
# deprecation of nested renaming
# GH 15931
msg = textwrap.dedent(
"""\
using a dict with renaming is deprecated and will be removed
in a future version.
For column-specific groupby renaming, use named aggregation
>>> df.groupby(...).agg(name=('column', aggfunc))
"""
)
warnings.warn(msg, FutureWarning, stacklevel=level)
# if we have a dict of any non-scalars
# eg. {'A' : ['mean']}, normalize all to
# be list-likes
if any(is_aggregator(x) for x in arg.values()):
new_arg = OrderedDict()
for k, v in arg.items():
if not isinstance(v, (tuple, list, dict)):
new_arg[k] = [v]
else:
new_arg[k] = v
# the keys must be in the columns
# for ndim=2, or renamers for ndim=1
# ok for now, but deprecated
# {'A': { 'ra': 'mean' }}
# {'A': { 'ra': ['mean'] }}
# {'ra': ['mean']}
# not ok
# {'ra' : { 'A' : 'mean' }}
if isinstance(v, dict):
is_nested_renamer = True
if k not in obj.columns:
msg = (
"cannot perform renaming for {key} with a "
"nested dictionary"
).format(key=k)
raise SpecificationError(msg)
nested_renaming_depr(4 + (_level or 0))
elif isinstance(obj, ABCSeries):
nested_renaming_depr()
elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
raise KeyError("Column '{col}' does not exist!".format(col=k))
arg = new_arg
else:
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
if isinstance(obj, ABCDataFrame) and len(
obj.columns.intersection(keys)
) != len(keys):
nested_renaming_depr()
from pandas.core.reshape.concat import concat
def _agg_1dim(name, how, subset=None):
"""
aggregate a 1-dim with how
"""
colg = self._gotitem(name, ndim=1, subset=subset)
if colg.ndim != 1:
raise SpecificationError(
"nested dictionary is ambiguous " "in aggregation"
)
return colg.aggregate(how, _level=(_level or 0) + 1)
def _agg_2dim(name, how):
"""
aggregate a 2-dim with how
"""
colg = self._gotitem(self._selection, ndim=2, subset=obj)
return colg.aggregate(how, _level=None)
def _agg(arg, func):
"""
run the aggregations over the arg with func
return an OrderedDict
"""
result = OrderedDict()
for fname, agg_how in arg.items():
result[fname] = func(fname, agg_how)
return result
# set the final keys
keys = list(arg.keys())
result = OrderedDict()
# nested renamer
if is_nested_renamer:
result = list(_agg(arg, _agg_1dim).values())
if all(isinstance(r, dict) for r in result):
result, results = OrderedDict(), result
for r in results:
result.update(r)
keys = list(result.keys())
else:
if self._selection is not None:
keys = None
# some selection on the object
elif self._selection is not None:
sl = set(self._selection_list)
# we are a Series like object,
# but may have multiple aggregations
if len(sl) == 1:
result = _agg(
arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)
)
# we are selecting the same set as we are aggregating
elif not len(sl - set(keys)):
result = _agg(arg, _agg_1dim)
# we are a DataFrame, with possibly multiple aggregations
else:
result = _agg(arg, _agg_2dim)
# no selection
else:
try:
result = _agg(arg, _agg_1dim)
except SpecificationError:
# we are aggregating expecting all 1d-returns
# but we have 2d
result = _agg(arg, _agg_2dim)
# combine results
def is_any_series():
# return a boolean if we have *any* nested series
return any(isinstance(r, ABCSeries) for r in result.values())
def is_any_frame():
# return a boolean if we have *any* nested frames
return any(isinstance(r, ABCDataFrame) for r in result.values())
if isinstance(result, list):
return concat(result, keys=keys, axis=1, sort=True), True
elif is_any_frame():
# we have a dict of DataFrames
# return a MI DataFrame
return concat([result[k] for k in keys], keys=keys, axis=1), True
elif isinstance(self, ABCSeries) and is_any_series():
# we have a dict of Series
# return a MI Series
try:
result = concat(result)
except TypeError:
# we want to give a nice error here if
# we have non-same sized objects, so
# we don't automatically broadcast
raise ValueError(
"cannot perform both aggregation "
"and transformation operations "
"simultaneously"
)
return result, True
# fall thru
from pandas import DataFrame, Series
try:
result = DataFrame(result)
except ValueError:
# we have a dict of scalars
result = Series(result, name=getattr(self, "name", None))
return result, True
elif is_list_like(arg):
# we require a list, but not a 'str'
return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None
else:
result = None
f = self._is_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None
# caller can react
return result, True
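# Return-value sketch for ``_aggregate`` (mirrors the branches above):
#   "mean"            -> (result, None)   string dispatch
#   {"A": "mean"}     -> (result, True)   dict of column -> func
#   ["min", "max"]    -> (result, None)   list-like of funcs
#   anything else     -> (None, True), unless it maps to a cython func,
#                        in which case (result, None)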
def _aggregate_multiple_funcs(self, arg, _level, _axis):
from pandas.core.reshape.concat import concat
if _axis != 0:
raise NotImplementedError("axis other than 0 is not supported")
if self._selected_obj.ndim == 1:
obj = self._selected_obj
else:
obj = self._obj_with_exclusions
results = []
keys = []
# degenerate case
if obj.ndim == 1:
for a in arg:
try:
colg = self._gotitem(obj.name, ndim=1, subset=obj)
results.append(colg.aggregate(a))
# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)
except (TypeError, DataError):
pass
except SpecificationError:
raise
# multiples
else:
for index, col in enumerate(obj):
try:
colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
results.append(colg.aggregate(arg))
keys.append(col)
except (TypeError, DataError):
pass
except ValueError:
# cannot aggregate
continue
except SpecificationError:
raise
# if we are empty
if not len(results):
raise ValueError("no results")
try:
return concat(results, keys=keys, axis=1, sort=False)
except TypeError:
# we are concatting non-NDFrame objects,
# e.g. a list of scalars
from pandas.core.dtypes.cast import is_nested_object
from pandas import Series
result = Series(results, index=keys, name=self.name)
if is_nested_object(result):
raise ValueError(
"cannot combine transform and " "aggregation operations"
)
return result
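# For example (a sketch of the 1-dim path above):
# ``pd.Series([1, 2, 3]).agg(["min", "max"])`` aggregates each function
# separately and combines the pieces into a result keyed by the function
# names "min" and "max".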
def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
"""
return a new object with the replacement attributes
"""
if obj is None:
obj = self._selected_obj.copy()
if obj_type is None:
obj_type = self._constructor
if isinstance(obj, obj_type):
obj = obj.obj
for attr in self._attributes:
if attr not in kwargs:
kwargs[attr] = getattr(self, attr)
return obj_type(obj, **kwargs)
def _is_cython_func(self, arg):
"""
if we define an internal function for this argument, return it
"""
return self._cython_table.get(arg)
def _is_builtin_func(self, arg):
"""
if we define a builtin function for this argument, return it,
otherwise return the arg
"""
return self._builtin_table.get(arg, arg)
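# For example, ``self._is_cython_func(np.sum)`` returns "sum" (via
# ``_cython_table``), while ``self._is_builtin_func(builtins.sum)`` returns
# ``np.sum``; arguments with no mapping come back as ``None`` from
# ``_is_cython_func`` and unchanged from ``_is_builtin_func``.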
class IndexOpsMixin:
"""
Common ops mixin to support a unified interface / docs for Series / Index
"""
# ndarray compatibility
__array_priority__ = 1000
def transpose(self, *args, **kwargs):
"""
Return the transpose, which is by definition self.
Returns
-------
%(klass)s
"""
nv.validate_transpose(args, kwargs)
return self
T = property(
transpose,
doc="""\nReturn the transpose, which is by
definition self.\n""",
)
@property
def _is_homogeneous_type(self):
"""
Whether the object has a single dtype.
By definition, Series and Index are always considered homogeneous.
A MultiIndex may or may not be homogeneous, depending on the
dtypes of the levels.
See Also
--------
DataFrame._is_homogeneous_type
MultiIndex._is_homogeneous_type
"""
return True
@property
def shape(self):
"""
Return a tuple of the shape of the underlying data.
"""
return self._values.shape
@property
def ndim(self):
"""
Number of dimensions of the underlying data, by definition 1.
"""
return 1
def item(self):
"""
Return the first element of the underlying data as a python scalar.
.. deprecated:: 0.25.0
Returns
-------
scalar
The first element of %(klass)s.
"""
warnings.warn(
"`item` has been deprecated and will be removed in a " "future version",
FutureWarning,
stacklevel=2,
)
return self.values.item()
@property
def data(self):
"""
Return the data pointer of the underlying data.
.. deprecated:: 0.23.0
"""
warnings.warn(
"{obj}.data is deprecated and will be removed "
"in a future version".format(obj=type(self).__name__),
FutureWarning,
stacklevel=2,
)
return self.values.data
@property
def itemsize(self):
"""
Return the size of the dtype of the item of the underlying data.
.. deprecated:: 0.23.0
"""
warnings.warn(
"{obj}.itemsize is deprecated and will be removed "
"in a future version".format(obj=type(self).__name__),
FutureWarning,
stacklevel=2,
)
return self._ndarray_values.itemsize
@property
def nbytes(self):
"""
Return the number of bytes in the underlying data.
"""
return self._values.nbytes
@property
def strides(self):
"""
Return the strides of the underlying data.
.. deprecated:: 0.23.0
"""
warnings.warn(
"{obj}.strides is deprecated and will be removed "
"in a future version".format(obj=type(self).__name__),
FutureWarning,
stacklevel=2,
)
return self._ndarray_values.strides
@property
def size(self):
"""
Return the number of elements in the underlying data.
"""
return len(self._values)
@property
def flags(self):
"""
Return the ndarray.flags for the underlying data.
.. deprecated:: 0.23.0
"""
warnings.warn(
"{obj}.flags is deprecated and will be removed "
"in a future version".format(obj=type(self).__name__),
FutureWarning,
stacklevel=2,
)
return self.values.flags
@property
def base(self):
"""
Return the base object if the memory of the underlying data is shared.
.. deprecated:: 0.23.0
"""
warnings.warn(
"{obj}.base is deprecated and will be removed "
"in a future version".format(obj=type(self).__name__),
FutureWarning,
stacklevel=2,
)
return self.values.base
@property
def array(self) -> ExtensionArray:
"""
The ExtensionArray of the data backing this Series or Index.
.. versionadded:: 0.24.0
Returns
-------
ExtensionArray
An ExtensionArray of the values stored within. For extension
types, this is the actual array. For NumPy native types, this
is a thin (no copy) wrapper around :class:`numpy.ndarray`.
``.array`` differs from ``.values``, which may require converting
the data to a different form.
See Also
--------
Index.to_numpy : Similar method that always returns a NumPy array.
Series.to_numpy : Similar method that always returns a NumPy array.
Notes
-----
This table lays out the different array types for each extension
dtype within pandas.
================== =============================
dtype array type
================== =============================
category Categorical
period PeriodArray
interval IntervalArray
IntegerNA IntegerArray
datetime64[ns, tz] DatetimeArray
================== =============================
For any 3rd-party extension types, the array type will be an
ExtensionArray.
For all remaining dtypes ``.array`` will be a
:class:`arrays.PandasArray` wrapping the actual ndarray
stored within. If you absolutely need a NumPy array (possibly with
copying / coercing data), then use :meth:`Series.to_numpy` instead.
Examples
--------
For regular NumPy types like int and float, a PandasArray
is returned.
>>> pd.Series([1, 2, 3]).array
<PandasArray>
[1, 2, 3]
Length: 3, dtype: int64
For extension types, like Categorical, the actual ExtensionArray
is returned.
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
>>> ser.array
[a, b, a]
Categories (2, object): [a, b]
"""
# As a mixin, we depend on the mixing class having _values.
# Special mixin syntax may be developed in the future:
# https://github.com/python/typing/issues/246
result = self._values # type: ignore
if is_datetime64_ns_dtype(result.dtype):
from pandas.arrays import DatetimeArray
result = DatetimeArray(result)
elif is_timedelta64_ns_dtype(result.dtype):
from pandas.arrays import TimedeltaArray
result = TimedeltaArray(result)
elif not is_extension_array_dtype(result.dtype):
from pandas.core.arrays.numpy_ import PandasArray
result = PandasArray(result)
return result
def to_numpy(self, dtype=None, copy=False):
"""
A NumPy ndarray representing the values in this Series or Index.
.. versionadded:: 0.24.0
Parameters
----------
dtype : str or numpy.dtype, optional
The dtype to pass to :meth:`numpy.asarray`.
copy : bool, default False
Whether to ensure that the returned value is not a view on
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
a copy is made, even if not strictly necessary.
Returns
-------
numpy.ndarray
See Also
--------
Series.array : Get the actual data stored within.
Index.array : Get the actual data stored within.
DataFrame.to_numpy : Similar method for DataFrame.
Notes
-----
The returned array will be the same up to equality (values equal
in `self` will be equal in the returned array; likewise for values
that are not equal). When `self` contains an ExtensionArray, the
dtype may be different. For example, for a category-dtype Series,
``to_numpy()`` will return a NumPy array and the categorical dtype
will be lost.
For NumPy dtypes, this will be a reference to the actual data stored
in this Series or Index (assuming ``copy=False``). Modifying the result
in place will modify the data stored in the Series or Index (not that
we recommend doing that).
For extension types, ``to_numpy()`` *may* require copying data and
coercing the result to a NumPy type (possibly object), which may be
expensive. When you need a no-copy reference to the underlying data,
:attr:`Series.array` should be used instead.
This table lays out the different dtypes and default return types of
``to_numpy()`` for various dtypes within pandas.
================== ================================
dtype array type
================== ================================
category[T] ndarray[T] (same dtype as input)
period ndarray[object] (Periods)
interval ndarray[object] (Intervals)
IntegerNA ndarray[object]
datetime64[ns] datetime64[ns]
datetime64[ns, tz] ndarray[object] (Timestamps)
================== ================================
Examples
--------
>>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
>>> ser.to_numpy()
array(['a', 'b', 'a'], dtype=object)
Specify the `dtype` to control how datetime-aware data is represented.
Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
objects, each with the correct ``tz``.
>>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
>>> ser.to_numpy(dtype=object)
array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
dtype=object)
Or ``dtype='datetime64[ns]'`` to return an ndarray of native
datetime64 values. The values are converted to UTC and the timezone
info is dropped.
>>> ser.to_numpy(dtype="datetime64[ns]")
... # doctest: +ELLIPSIS
array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
dtype='datetime64[ns]')
"""
if is_datetime64tz_dtype(self.dtype) and dtype is None:
# note: this is going to change very soon.
# I have a WIP PR making this unnecessary, but it's
# a bit out of scope for the DatetimeArray PR.
dtype = "object"
result = np.asarray(self._values, dtype=dtype)
# TODO(GH-24345): Avoid potential double copy
if copy:
result = result.copy()
return result
@property
def _ndarray_values(self) -> np.ndarray:
"""
The data as an ndarray, possibly losing information.
The expectation is that this is cheap to compute, and is primarily
used for interacting with our indexers.
- categorical -> codes
"""
if is_extension_array_dtype(self):
return self.array._ndarray_values
# As a mixin, we depend on the mixing class having values.
# Special mixin syntax may be developed in the future:
# https://github.com/python/typing/issues/246
return self.values # type: ignore
@property
def empty(self):
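"""
Return True if there are no elements in the underlying data.
"""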
return not self.size
def max(self, axis=None, skipna=True, *args, **kwargs):
"""
Return the maximum value of the Index.
Parameters
----------
axis : int, optional
For compatibility with NumPy. Only 0 or None are allowed.
skipna : bool, default True
Returns
-------
scalar
Maximum value.
See Also
--------
Index.min : Return the minimum value in an Index.
Series.max : Return the maximum value in a Series.
DataFrame.max : Return the maximum values in a DataFrame.
Examples
--------
>>> idx = pd.Index([3, 2, 1])
>>> idx.max()
3
>>> idx = pd.Index(['c', 'b', 'a'])
>>> idx.max()
'c'
For a MultiIndex, the maximum is determined lexicographically.
>>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
>>> idx.max()
('b', 2)
"""
nv.validate_minmax_axis(axis)
nv.validate_max(args, kwargs)
return nanops.nanmax(self._values, skipna=skipna)
def argmax(self, axis=None, skipna=True, *args, **kwargs):
"""
Return an ndarray of the maximum argument indexer.
Parameters
----------
axis : {None}
Dummy argument for consistency with Series
skipna : bool, default True
Returns
-------
numpy.ndarray
Indices of the maximum values.
See Also
--------
numpy.ndarray.argmax
"""
nv.validate_minmax_axis(axis)
nv.validate_argmax_with_skipna(skipna, args, kwargs)
return nanops.nanargmax(self._values, skipna=skipna)
def min(self, axis=None, skipna=True, *args, **kwargs):
"""
Return the minimum value of the Index.
Parameters
----------
axis : {None}
Dummy argument for consistency with Series
skipna : bool, default True
Returns
-------
scalar
Minimum value.
See Also
--------
Index.max : Return the maximum value of the object.
Series.min : Return the minimum value in a Series.
DataFrame.min : Return the minimum values in a DataFrame.
Examples
--------
>>> idx = pd.Index([3, 2, 1])
>>> idx.min()
1
>>> idx = pd.Index(['c', 'b', 'a'])
>>> idx.min()
'a'
For a MultiIndex, the minimum is determined lexicographically.
>>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
>>> idx.min()
('a', 1)
"""
nv.validate_minmax_axis(axis)
nv.validate_min(args, kwargs)
return nanops.nanmin(self._values, skipna=skipna)
def argmin(self, axis=None, skipna=True, *args, **kwargs):
"""
Return an ndarray of the minimum argument indexer.
Parameters
----------
axis : {None}
Dummy argument for consistency with Series
skipna : bool, default True
Returns
-------
numpy.ndarray
See Also
--------
numpy.ndarray.argmin
"""
nv.validate_minmax_axis(axis)
nv.validate_argmin_with_skipna(skipna, args, kwargs)
return nanops.nanargmin(self._values, skipna=skipna)
def tolist(self):
"""
Return a list of the values.
These are each a scalar type, which is a Python scalar
(for str, int, float) or a pandas scalar
(for Timestamp/Timedelta/Interval/Period)
Returns
-------
list
See Also
--------
numpy.ndarray.tolist
"""
if is_datetimelike(self._values):
return [com.maybe_box_datetimelike(x) for x in self._values]
elif is_extension_array_dtype(self._values):
return list(self._values)
else:
return self._values.tolist()
to_list = tolist
def __iter__(self):
"""
Return an iterator of the values.
These are each a scalar type, which is a Python scalar
(for str, int, float) or a pandas scalar
(for Timestamp/Timedelta/Interval/Period)
Returns
-------
iterator
"""
# We are explicitly making element iterators.
if is_datetimelike(self._values):
return map(com.maybe_box_datetimelike, self._values)
elif is_extension_array_dtype(self._values):
return iter(self._values)
else:
return map(self._values.item, range(self._values.size))
@cache_readonly
def hasnans(self):
"""
Return True if there are any NaNs; enables various performance speedups.
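Examples
--------
>>> pd.Series([1, None]).hasnans
True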
"""
return bool(isna(self).any())
def _reduce(
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
):
""" perform the reduction type operation if we can """
func = getattr(self, name, None)
if func is None:
raise TypeError(
"{klass} cannot perform the operation {op}".format(
klass=self.__class__.__name__, op=name
)
)
return func(skipna=skipna, **kwds)
def _map_values(self, mapper, na_action=None):
"""
An internal function that maps values using the input
correspondence (which can be a dict, Series, or function).
Parameters
----------
mapper : function, dict, or Series
The input correspondence object
na_action : {None, 'ignore'}
If 'ignore', propagate NA values, without passing them to the
mapping function
Returns
-------
Union[Index, MultiIndex], inferred
The output of the mapping function applied to the index.
If the function returns a tuple with more than one element
a MultiIndex will be returned.
"""
# we can fastpath dict/Series to an efficient map
# as we know that we are not going to have to yield
# python types
if isinstance(mapper, dict):
if hasattr(mapper, "__missing__"):
# If a dictionary subclass defines a default value method,
# convert mapper to a lookup function (GH #15999).
dict_with_default = mapper
mapper = lambda x: dict_with_default[x]
else:
# Dictionary does not have a default. Thus it's safe to
# convert to a Series for efficiency.
# we specify the keys here to handle the
# possibility that they are tuples
from pandas import Series
mapper = Series(mapper)
if isinstance(mapper, ABCSeries):
# Since values were input this means we came from either
# a dict or a series and mapper should be an index
if is_categorical_dtype(self._values):
# use the built in categorical series mapper which saves
# time by mapping the categories instead of all values
return self._values.map(mapper)
if is_extension_type(self.dtype):
values = self._values
else:
values = self.values
indexer = mapper.index.get_indexer(values)
new_values = algorithms.take_1d(mapper._values, indexer)
return new_values
# we must convert to python types
if is_extension_type(self.dtype):
values = self._values
if na_action is not None:
raise NotImplementedError
map_f = lambda values, f: values.map(f)
else:
values = self.astype(object)
values = getattr(values, "values", values)
if na_action == "ignore":
def map_f(values, f):
return lib.map_infer_mask(values, f, isna(values).view(np.uint8))
else:
map_f = lib.map_infer
# mapper is a function
new_values = map_f(values, mapper)
return new_values
def value_counts(
self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
):
"""
Return a Series containing counts of unique values.
The resulting object will be in descending order so that the
first element is the most frequently-occurring element.
Excludes NA values by default.
Parameters
----------
normalize : boolean, default False
If True then the object returned will contain the relative
frequencies of the unique values.
sort : boolean, default True
Sort by frequencies.
ascending : boolean, default False
Sort in ascending order.
bins : integer, optional
Rather than count values, group them into half-open bins,
a convenience for ``pd.cut``, only works with numeric data.
dropna : boolean, default True
Don't include counts of NaN.
Returns
-------
Series
See Also
--------
Series.count: Number of non-NA elements in a Series.
DataFrame.count: Number of non-NA elements in a DataFrame.
Examples
--------
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
>>> index.value_counts()
3.0 2
4.0 1
2.0 1
1.0 1
dtype: int64
With `normalize` set to `True`, returns the relative frequency by
dividing all values by the sum of values.
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.value_counts(normalize=True)
3.0 0.4
4.0 0.2
2.0 0.2
1.0 0.2
dtype: float64
**bins**
Bins can be useful for going from a continuous variable to a
categorical variable; instead of counting unique
occurrences of values, divide the index into the specified
number of half-open bins.
>>> s.value_counts(bins=3)
(2.0, 3.0] 2
(0.996, 2.0] 2
(3.0, 4.0] 1
dtype: int64
**dropna**
With `dropna` set to `False` we can also see NaN index values.
>>> s.value_counts(dropna=False)
3.0 2
NaN 1
4.0 1
2.0 1
1.0 1
dtype: int64
"""
from pandas.core.algorithms import value_counts
result = value_counts(
self,
sort=sort,
ascending=ascending,
normalize=normalize,
bins=bins,
dropna=dropna,
)
return result
def unique(self):
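"""
Return unique values in the underlying data.
"""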
values = self._values
if hasattr(values, "unique"):
result = values.unique()
else:
from pandas.core.algorithms import unique1d
result = unique1d(values)
return result
def nunique(self, dropna=True):
"""
Return number of unique elements in the object.
Excludes NA values by default.
Parameters
----------
dropna : bool, default True
Don't include NaN in the count.
Returns
-------
int
See Also
--------
DataFrame.nunique: Method nunique for DataFrame.
Series.count: Count non-NA/null observations in the Series.
Examples
--------
>>> s = pd.Series([1, 3, 5, 7, 7])
>>> s
0 1
1 3
2 5
3 7
4 7
dtype: int64
>>> s.nunique()
4
"""
uniqs = self.unique()
n = len(uniqs)
if dropna and isna(uniqs).any():
n -= 1
return n
@property
def is_unique(self):
"""
Return boolean if values in the object are unique.
Returns
-------
bool
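Examples
--------
>>> pd.Series([1, 2, 3]).is_unique
True
>>> pd.Series([1, 2, 2]).is_unique
False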
"""
return self.nunique(dropna=False) == len(self)
@property
def is_monotonic(self):
"""
Return boolean if values in the object are
monotonic_increasing.
.. versionadded:: 0.19.0
Returns
-------
bool
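Examples
--------
>>> pd.Index([1, 2, 2, 3]).is_monotonic
True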
"""
from pandas import Index
return Index(self).is_monotonic
is_monotonic_increasing = is_monotonic
@property
def is_monotonic_decreasing(self):
"""
Return boolean if values in the object are
monotonic_decreasing.
.. versionadded:: 0.19.0
Returns
-------
bool
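Examples
--------
>>> pd.Index([3, 2, 2, 1]).is_monotonic_decreasing
True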
"""
from pandas import Index
return Index(self).is_monotonic_decreasing
def memory_usage(self, deep=False):
"""
Memory usage of the values.
Parameters
----------
deep : bool
Introspect the data deeply, interrogate
`object` dtypes for system-level memory consumption
Returns
-------
bytes used
See Also
--------
numpy.ndarray.nbytes
Notes
-----
Memory usage does not include memory consumed by elements that
are not components of the array if deep=False or if used on PyPy
"""
if hasattr(self.array, "memory_usage"):
return self.array.memory_usage(deep=deep)
v = self.array.nbytes
if deep and is_object_dtype(self) and not PYPY:
v += lib.memory_usage_of_objects(self.array)
return v
@Substitution(
values="",
order="",
size_hint="",
sort=textwrap.dedent(
"""\
sort : boolean, default False
Sort `uniques` and shuffle `labels` to maintain the
relationship.
"""
),
)
@Appender(algorithms._shared_docs["factorize"])
def factorize(self, sort=False, na_sentinel=-1):
return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
_shared_docs[
"searchsorted"
] = """
Find indices where elements should be inserted to maintain order.
Find the indices into a sorted %(klass)s `self` such that, if the
corresponding elements in `value` were inserted before the indices,
the order of `self` would be preserved.
Parameters
----------
value : array_like
Values to insert into `self`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
If 'right', return the last such index. If there is no suitable
index, return either 0 or N (where N is the length of `self`).
sorter : 1-D array_like, optional
Optional array of integer indices that sort `self` into ascending
order. They are typically the result of ``np.argsort``.
Returns
-------
int or array of int
A scalar or array of insertion points with the
same shape as `value`.
.. versionchanged:: 0.24.0
If `value` is a scalar, an int is now always returned.
Previously, scalar inputs returned an 1-item array for
:class:`Series` and :class:`Categorical`.
See Also
--------
numpy.searchsorted
Notes
-----
Binary search is used to find the required insertion points.
Examples
--------
>>> x = pd.Series([1, 2, 3])
>>> x
0 1
1 2
2 3
dtype: int64
>>> x.searchsorted(4)
3
>>> x.searchsorted([0, 4])
array([0, 3])
>>> x.searchsorted([1, 3], side='left')
array([0, 2])
>>> x.searchsorted([1, 3], side='right')
array([1, 3])
>>> x = pd.Categorical(['apple', 'bread', 'bread',
...                     'cheese', 'milk'], ordered=True)
>>> x
[apple, bread, bread, cheese, milk]
Categories (4, object): [apple < bread < cheese < milk]
>>> x.searchsorted('bread')
1
>>> x.searchsorted(['bread'], side='right')
array([3])
"""
@Substitution(klass="Index")
@Appender(_shared_docs["searchsorted"])
def searchsorted(self, value, side="left", sorter=None):
return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)
def drop_duplicates(self, keep="first", inplace=False):
inplace = validate_bool_kwarg(inplace, "inplace")
if isinstance(self, ABCIndexClass):
if self.is_unique:
return self._shallow_copy()
duplicated = self.duplicated(keep=keep)
result = self[np.logical_not(duplicated)]
if inplace:
return self._update_inplace(result)
else:
return result
def duplicated(self, keep="first"):
from pandas.core.algorithms import duplicated
if isinstance(self, ABCIndexClass):
if self.is_unique:
return np.zeros(len(self), dtype=np.bool)
return duplicated(self, keep=keep)
else:
return self._constructor(
duplicated(self, keep=keep), index=self.index
).__finalize__(self)
# ----------------------------------------------------------------------
# abstracts
def _update_inplace(self, result, verify_is_copy=True, **kwargs):
raise AbstractMethodError(self)