Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

/ core / arrays / categorical.py

# pylint: disable=E1101,W0232

import textwrap
from warnings import warn

import numpy as np

from pandas._libs import algos as libalgos, lib
import pandas.compat as compat
from pandas.compat import lzip, u
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
    Appender, Substitution, cache_readonly, deprecate_kwarg)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

from pandas.core.dtypes.cast import (
    coerce_indexer_dtype, maybe_infer_to_datetimelike)
from pandas.core.dtypes.common import (
    ensure_int64, ensure_object, ensure_platform_int, is_categorical,
    is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
    is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype,
    is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence,
    is_timedelta64_dtype)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
    ABCCategoricalIndex, ABCDataFrame, ABCIndexClass, ABCSeries)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna

from pandas.core.accessor import PandasDelegate, delegate_names
import pandas.core.algorithms as algorithms
from pandas.core.algorithms import factorize, take, take_1d, unique1d
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.config import get_option
from pandas.core.missing import interpolate_2d
from pandas.core.sorting import nargsort

from pandas.io.formats import console
from pandas.io.formats.terminal import get_terminal_size

from .base import ExtensionArray, _extension_array_shared_docs

# Warning text about how negative values in 'indexer' are interpreted when
# ``allow_fill`` is not given explicitly. Spelled out line by line so the
# emitted message is visible without mentally de-indenting a literal.
_take_msg = "\n".join([
    "Interpreting negative values in 'indexer' as missing values.",
    "In the future, this will change to meaning positional indices",
    "from the right.",
    "",
    "Use 'allow_fill=True' to retain the previous behavior and silence this",
    "warning.",
    "",
    "Use 'allow_fill=False' to accept the new behavior.",
])


def _cat_compare_op(op):
    """
    Create the comparison method named ``op`` for ``Categorical``.

    Parameters
    ----------
    op : str
        Name of the comparison dunder (``'__eq__'``, ``'__ne__'``,
        ``'__lt__'``, ``'__gt__'``, ``'__le__'`` or ``'__ge__'``). The
        method of that name on the integer codes array performs the
        element-wise comparison.

    Returns
    -------
    callable
        A method ``f(self, other)`` with ``f.__name__ == op``. It returns a
        boolean ndarray, or ``NotImplemented`` when ``other`` is a
        DataFrame, Series or Index so that those objects drive the
        operation.

    Raises
    ------
    TypeError
        Raised by the returned method when an ordering comparison is
        requested on an unordered categorical, when two categoricals do
        not share the same categories or ``ordered`` flag, when an ordering
        comparison is made against a scalar that is not a category, or
        when an ordering comparison is made against any other non-scalar.

    Notes
    -----
    Missing values are stored as code ``-1``. Comparing raw codes would
    treat them as smaller than every category, so positions holding a
    missing value are overwritten afterwards: ``True`` for ``'__ne__'`` and
    ``False`` for every other operator, which is how NaN compares in NumPy.
    """
    # Result to report wherever a missing value takes part in a comparison.
    na_result = (op == '__ne__')

    def f(self, other):
        # On python2, you can usually compare any type to any type, and
        # Categoricals can be seen as a custom type, but having different
        # results depending whether categories are the same or not is kind of
        # insane, so be a bit stricter here and use the python3 idea of
        # comparing only things of equal type.
        if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
            return NotImplemented

        other = lib.item_from_zerodim(other)

        if not self.ordered:
            if op in ['__lt__', '__gt__', '__le__', '__ge__']:
                raise TypeError("Unordered Categoricals can only compare "
                                "equality or not")
        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same (maybe up to ordering, depending on ordered)

            msg = ("Categoricals can only be compared if "
                   "'categories' are the same.")
            if len(self.categories) != len(other.categories):
                raise TypeError(msg + " Categories are different lengths")
            elif (self.ordered and not (self.categories ==
                                        other.categories).all()):
                raise TypeError(msg)
            elif not set(self.categories) == set(other.categories):
                raise TypeError(msg)

            if not (self.ordered == other.ordered):
                raise TypeError("Categoricals can only be compared if "
                                "'ordered' is the same")
            if not self.ordered and not self.categories.equals(
                    other.categories):
                # both unordered and different order: re-express the other
                # values in terms of our own categories before comparing
                other_codes = _get_codes_for_values(other, self.categories)
            else:
                other_codes = other._codes

            na_mask = (self._codes == -1) | (other_codes == -1)
            # Named ``code_op`` (not ``f``) so the enclosing function's own
            # name is not shadowed.
            code_op = getattr(self._codes, op)
            ret = code_op(other_codes)
            if na_mask.any():
                # A missing value on either side never compares equal or
                # ordered; only ``!=`` is True there.
                ret[na_mask] = na_result
            return ret

        # Numpy < 1.13 may convert a scalar to a zerodim array during
        # comparison operation when second arg has higher priority, e.g.
        #
        #     cat[0] < cat
        #
        # With cat[0], for example, being ``np.int64(1)`` by the time it gets
        # into this function would become ``np.array(1)``.
        if is_scalar(other):
            if other in self.categories:
                i = self.categories.get_loc(other)
                ret = getattr(self._codes, op)(i)
                if isinstance(ret, np.ndarray):
                    # Code -1 marks a missing value; without this, ``<`` and
                    # ``<=`` would report missing values as smaller than
                    # the category at position ``i``.
                    na_mask = self._codes == -1
                    if na_mask.any():
                        ret[na_mask] = na_result
                return ret
            else:
                if op == '__eq__':
                    return np.repeat(False, len(self))
                elif op == '__ne__':
                    return np.repeat(True, len(self))
                else:
                    msg = ("Cannot compare a Categorical for op {op} with a "
                           "scalar, which is not a category.")
                    raise TypeError(msg.format(op=op))
        else:

            # allow categorical vs object dtype array comparisons for equality
            # these are only positional comparisons
            if op in ['__eq__', '__ne__']:
                return getattr(np.array(self), op)(np.array(other))

            msg = ("Cannot compare a Categorical for op {op} with type {typ}."
                   "\nIf you want to compare values, use 'np.asarray(cat) "
                   "<op> other'.")
            raise TypeError(msg.format(op=op, typ=type(other)))

    f.__name__ = op

    return f


def _maybe_to_categorical(array):
    """
    Return the categorical-friendly form of ``array``.

    A Series or CategoricalIndex is unwrapped to its ``_values``, a NumPy
    ndarray is wrapped in a new ``Categorical``, and anything else is
    handed back unchanged.

    Internal use ONLY.
    """
    unwrappable = (ABCSeries, ABCCategoricalIndex)
    if isinstance(array, unwrappable):
        return array._values
    return Categorical(array) if isinstance(array, np.ndarray) else array


def contains(cat, key, container):
    """
    Helper for membership check for ``key`` in ``cat``.

    This is a helper method for :meth:`__contains__`
    and :class:`CategoricalIndex.__contains__`.

    Returns True if ``key`` is in ``cat.categories`` and the
    location of ``key`` in ``categories`` is in ``container``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container to check for membership in.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``self.categories`` and location of
        ``key`` in ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    This method does not check for NaN values. Do that separately
    before calling this method.
    """
    # Fail loudly for unhashable keys; this happens before the lookup so it
    # is not swallowed by the ``except`` clause below.
    hash(key)

    # get location of key in categories.
    # If a KeyError, the key isn't in categories, so logically
    #  can't be in container either. A TypeError means the (hashable) key
    #  cannot even be looked up in these categories, which for a membership
    #  test is the same answer: not present.
    try:
        loc = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # loc is the location of key in categories, but also the *value*
    # for key in container. So, `key` may be in categories,
    # but still not in `container`. Example ('b' in categories,
    # but not in values):
    # 'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if is_scalar(loc):
        return loc in container

    # if categories is an IntervalIndex, loc is an array.
    return any(loc_ in container for loc_ in loc)


# Docstring text describing the category codes (presumably attached to the
# ``codes`` property of ``Categorical``; the use site is outside this view).
_codes_doc = """\
The category codes of this categorical.

Level codes are an array of integers which are the positions of the real
values in the categories array.

There is no setter, use the other categorical methods and the normal item
setter to change values in the categorical.
"""


class Categorical(ExtensionArray, PandasObject):
    """
    Represents a categorical variable in classic R / S-plus fashion

    `Categoricals` can only take on only a limited, and usually fixed, number
    of possible values (`categories`). In contrast to statistical categorical
    variables, a `Categorical` might have an order, but numerical operations
    (additions, divisions, ...) are not possible.

    All values of the `Categorical` are either in `categories` or `np.nan`.
    Assigning values outside of `categories` will raise a `ValueError`. Order
    is defined by the order of the `categories`, not lexical order of the
    values.

    Parameters
    ----------
    values : list-like
        The values of the categorical. If categories are given, values not in
        categories will be replaced with NaN.
    categories : Index-like (unique), optional
        The unique categories for this categorical. If not given, the
        categories are assumed to be the unique values of `values` (sorted, if
        possible, otherwise in the order in which they appear).
    ordered : boolean, (default False)
        Whether or not this categorical is treated as a ordered categorical.
        If True, the resulting categorical will be ordered.
        An ordered categorical respects, when sorted, the order of its
        `categories` attribute (which in turn is the `categories` argument, if
        provided).
    dtype : CategoricalDtype
        An instance of ``CategoricalDtype`` to use for this categorical

        .. versionadded:: 0.21.0

    Attributes
    ----------
    categories : Index
        The categories of this categorical
    codes : ndarray
        The codes (integer positions, which point to the categories) of this
        categorical, read only.
    ordered : boolean
        Whether or not this Categorical is ordered.
    dtype : CategoricalDtype
        The instance of ``CategoricalDtype`` storing the ``categories``
        and ``ordered``.

        .. versionadded:: 0.21.0

    Methods
    -------
    from_codes
    __array__

    Raises
    ------
    ValueError
        If the categories do not validate.
    TypeError
        If an explicit ``ordered=True`` is given but no `categories` and the
        `values` are not sortable.

    See Also
    --------
    pandas.api.types.CategoricalDtype : Type for categorical data.
    CategoricalIndex : An Index with an underlying ``Categorical``.

    Notes
    -----
    See the `user guide
    <http://pandas.pydata.org/pandas-docs/stable/categorical.html>`_ for more.

    Examples
    --------
    >>> pd.Categorical([1, 2, 3, 1, 2, 3])
    [1, 2, 3, 1, 2, 3]
    Categories (3, int64): [1, 2, 3]

    >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
    [a, b, c, a, b, c]
    Categories (3, object): [a, b, c]

    Ordered `Categoricals` can be sorted according to the custom order
    of the categories and can have a min and max value.

    >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True,
    ...                    categories=['c', 'b', 'a'])
    >>> c
    [a, b, c, a, b, c]
    Categories (3, object): [c < b < a]
    >>> c.min()
    'c'
    """

    # For comparisons, so that numpy uses our implementation if the compare
    # ops, which raise
    __array_priority__ = 1000
    _dtype = CategoricalDtype(ordered=False)
    # tolist is not actually deprecated, just suppressed in the __dir__
    _deprecations = frozenset(['labels', 'tolist'])
    _typ = 'categorical'

    def __init__(self, values, categories=None, ordered=None, dtype=None,
                 fastpath=False):

        dtype = CategoricalDtype._from_values_or_dtype(values, categories,
                                                       ordered, dtype)
        # At this point, dtype is always a CategoricalDtype, but
        # we may have dtype.categories be None, and we need to
        # infer categories in a factorization step futher below

        if fastpath:
            self._codes = coerce_indexer_dtype(values, dtype.categories)
            self._dtype = self._dtype.update_dtype(dtype)
            return

        # null_mask indicates missing values we want to exclude from inference.
        # This means: only missing values in list-likes (not arrays/ndframes).
        null_mask = np.array(False)

        # sanitize input
        if is_categorical_dtype(values):
            if dtype.categories is None:
                dtype = CategoricalDtype(values.categories, dtype.ordered)
        elif not isinstance(values, (ABCIndexClass, ABCSeries)):
            # sanitize_array coerces np.nan to a string under certain versions
            # of numpy
            values = maybe_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.internals.construction import sanitize_array
Loading ...