# pylint: disable=E1101,W0232
import textwrap
from warnings import warn
import numpy as np
from pandas._libs import algos as libalgos, lib
import pandas.compat as compat
from pandas.compat import lzip, u
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
Appender, Substitution, cache_readonly, deprecate_kwarg)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
from pandas.core.dtypes.cast import (
coerce_indexer_dtype, maybe_infer_to_datetimelike)
from pandas.core.dtypes.common import (
ensure_int64, ensure_object, ensure_platform_int, is_categorical,
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype,
is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence,
is_timedelta64_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
ABCCategoricalIndex, ABCDataFrame, ABCIndexClass, ABCSeries)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna
from pandas.core.accessor import PandasDelegate, delegate_names
import pandas.core.algorithms as algorithms
from pandas.core.algorithms import factorize, take, take_1d, unique1d
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.config import get_option
from pandas.core.missing import interpolate_2d
from pandas.core.sorting import nargsort
from pandas.io.formats import console
from pandas.io.formats.terminal import get_terminal_size
from .base import ExtensionArray, _extension_array_shared_docs
_take_msg = textwrap.dedent("""\
Interpreting negative values in 'indexer' as missing values.
In the future, this will change to meaning positional indices
from the right.
Use 'allow_fill=True' to retain the previous behavior and silence this
warning.
Use 'allow_fill=False' to accept the new behavior.""")
def _cat_compare_op(op):
def f(self, other):
# On python2, you can usually compare any type to any type, and
# Categoricals can be seen as a custom type, but having different
# results depending whether categories are the same or not is kind of
# insane, so be a bit stricter here and use the python3 idea of
# comparing only things of equal type.
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
return NotImplemented
other = lib.item_from_zerodim(other)
if not self.ordered:
if op in ['__lt__', '__gt__', '__le__', '__ge__']:
raise TypeError("Unordered Categoricals can only compare "
"equality or not")
if isinstance(other, Categorical):
# Two Categoricals can only be be compared if the categories are
# the same (maybe up to ordering, depending on ordered)
msg = ("Categoricals can only be compared if "
"'categories' are the same.")
if len(self.categories) != len(other.categories):
raise TypeError(msg + " Categories are different lengths")
elif (self.ordered and not (self.categories ==
other.categories).all()):
raise TypeError(msg)
elif not set(self.categories) == set(other.categories):
raise TypeError(msg)
if not (self.ordered == other.ordered):
raise TypeError("Categoricals can only be compared if "
"'ordered' is the same")
if not self.ordered and not self.categories.equals(
other.categories):
# both unordered and different order
other_codes = _get_codes_for_values(other, self.categories)
else:
other_codes = other._codes
na_mask = (self._codes == -1) | (other_codes == -1)
f = getattr(self._codes, op)
ret = f(other_codes)
if na_mask.any():
# In other series, the leads to False, so do that here too
ret[na_mask] = False
return ret
# Numpy < 1.13 may convert a scalar to a zerodim array during
# comparison operation when second arg has higher priority, e.g.
#
# cat[0] < cat
#
# With cat[0], for example, being ``np.int64(1)`` by the time it gets
# into this function would become ``np.array(1)``.
if is_scalar(other):
if other in self.categories:
i = self.categories.get_loc(other)
return getattr(self._codes, op)(i)
else:
if op == '__eq__':
return np.repeat(False, len(self))
elif op == '__ne__':
return np.repeat(True, len(self))
else:
msg = ("Cannot compare a Categorical for op {op} with a "
"scalar, which is not a category.")
raise TypeError(msg.format(op=op))
else:
# allow categorical vs object dtype array comparisons for equality
# these are only positional comparisons
if op in ['__eq__', '__ne__']:
return getattr(np.array(self), op)(np.array(other))
msg = ("Cannot compare a Categorical for op {op} with type {typ}."
"\nIf you want to compare values, use 'np.asarray(cat) "
"<op> other'.")
raise TypeError(msg.format(op=op, typ=type(other)))
f.__name__ = op
return f
def _maybe_to_categorical(array):
"""
Coerce to a categorical if a series is given.
Internal use ONLY.
"""
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array._values
elif isinstance(array, np.ndarray):
return Categorical(array)
return array
def contains(cat, key, container):
"""
Helper for membership check for ``key`` in ``cat``.
This is a helper method for :method:`__contains__`
and :class:`CategoricalIndex.__contains__`.
Returns True if ``key`` is in ``cat.categories`` and the
location of ``key`` in ``categories`` is in ``container``.
Parameters
----------
cat : :class:`Categorical`or :class:`categoricalIndex`
key : a hashable object
The key to check membership for.
container : Container (e.g. list-like or mapping)
The container to check for membership in.
Returns
-------
is_in : bool
True if ``key`` is in ``self.categories`` and location of
``key`` in ``categories`` is in ``container``, else False.
Notes
-----
This method does not check for NaN values. Do that separately
before calling this method.
"""
hash(key)
# get location of key in categories.
# If a KeyError, the key isn't in categories, so logically
# can't be in container either.
try:
loc = cat.categories.get_loc(key)
except KeyError:
return False
# loc is the location of key in categories, but also the *value*
# for key in container. So, `key` may be in categories,
# but still not in `container`. Example ('b' in categories,
# but not in values):
# 'b' in Categorical(['a'], categories=['a', 'b']) # False
if is_scalar(loc):
return loc in container
else:
# if categories is an IntervalIndex, loc is an array.
return any(loc_ in container for loc_ in loc)
_codes_doc = """\
The category codes of this categorical.
Level codes are an array if integer which are the positions of the real
values in the categories array.
There is not setter, use the other categorical methods and the normal item
setter to change values in the categorical.
"""
class Categorical(ExtensionArray, PandasObject):
"""
Represents a categorical variable in classic R / S-plus fashion
`Categoricals` can only take on only a limited, and usually fixed, number
of possible values (`categories`). In contrast to statistical categorical
variables, a `Categorical` might have an order, but numerical operations
(additions, divisions, ...) are not possible.
All values of the `Categorical` are either in `categories` or `np.nan`.
Assigning values outside of `categories` will raise a `ValueError`. Order
is defined by the order of the `categories`, not lexical order of the
values.
Parameters
----------
values : list-like
The values of the categorical. If categories are given, values not in
categories will be replaced with NaN.
categories : Index-like (unique), optional
The unique categories for this categorical. If not given, the
categories are assumed to be the unique values of `values` (sorted, if
possible, otherwise in the order in which they appear).
ordered : boolean, (default False)
Whether or not this categorical is treated as a ordered categorical.
If True, the resulting categorical will be ordered.
An ordered categorical respects, when sorted, the order of its
`categories` attribute (which in turn is the `categories` argument, if
provided).
dtype : CategoricalDtype
An instance of ``CategoricalDtype`` to use for this categorical
.. versionadded:: 0.21.0
Attributes
----------
categories : Index
The categories of this categorical
codes : ndarray
The codes (integer positions, which point to the categories) of this
categorical, read only.
ordered : boolean
Whether or not this Categorical is ordered.
dtype : CategoricalDtype
The instance of ``CategoricalDtype`` storing the ``categories``
and ``ordered``.
.. versionadded:: 0.21.0
Methods
-------
from_codes
__array__
Raises
------
ValueError
If the categories do not validate.
TypeError
If an explicit ``ordered=True`` is given but no `categories` and the
`values` are not sortable.
See Also
--------
pandas.api.types.CategoricalDtype : Type for categorical data.
CategoricalIndex : An Index with an underlying ``Categorical``.
Notes
-----
See the `user guide
<http://pandas.pydata.org/pandas-docs/stable/categorical.html>`_ for more.
Examples
--------
>>> pd.Categorical([1, 2, 3, 1, 2, 3])
[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]
>>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
[a, b, c, a, b, c]
Categories (3, object): [a, b, c]
Ordered `Categoricals` can be sorted according to the custom order
of the categories and can have a min and max value.
>>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True,
... categories=['c', 'b', 'a'])
>>> c
[a, b, c, a, b, c]
Categories (3, object): [c < b < a]
>>> c.min()
'c'
"""
# For comparisons, so that numpy uses our implementation if the compare
# ops, which raise
__array_priority__ = 1000
_dtype = CategoricalDtype(ordered=False)
# tolist is not actually deprecated, just suppressed in the __dir__
_deprecations = frozenset(['labels', 'tolist'])
_typ = 'categorical'
def __init__(self, values, categories=None, ordered=None, dtype=None,
fastpath=False):
dtype = CategoricalDtype._from_values_or_dtype(values, categories,
ordered, dtype)
# At this point, dtype is always a CategoricalDtype, but
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step futher below
if fastpath:
self._codes = coerce_indexer_dtype(values, dtype.categories)
self._dtype = self._dtype.update_dtype(dtype)
return
# null_mask indicates missing values we want to exclude from inference.
# This means: only missing values in list-likes (not arrays/ndframes).
null_mask = np.array(False)
# sanitize input
if is_categorical_dtype(values):
if dtype.categories is None:
dtype = CategoricalDtype(values.categories, dtype.ordered)
elif not isinstance(values, (ABCIndexClass, ABCSeries)):
# sanitize_array coerces np.nan to a string under certain versions
# of numpy
values = maybe_infer_to_datetimelike(values, convert_dates=True)
if not isinstance(values, np.ndarray):
values = _convert_to_list_like(values)
from pandas.core.internals.construction import sanitize_array
Loading ...