
"""
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption.
"""
from __future__ import division

from textwrap import dedent
from warnings import catch_warnings, simplefilter, warn

import numpy as np

from pandas._libs import algos, hashtable as htable, lib
from pandas._libs.tslib import iNaT
from pandas.util._decorators import Appender, Substitution, deprecate_kwarg

from pandas.core.dtypes.cast import (
    construct_1d_object_array_from_listlike, maybe_promote)
from pandas.core.dtypes.common import (
    ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
    ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
    is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
    is_datetimelike, is_extension_array_dtype, is_float_dtype,
    is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
    is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
    is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
    needs_i8_conversion)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, na_value_for_dtype

from pandas.core import common as com

_shared_docs = {}


# --------------- #
# dtype access    #
# --------------- #
def _ensure_data(values, dtype=None):
    """
    Routine to ensure that our data is of the correct
    input dtype for lower-level routines.

    This will coerce:
    - ints -> int64
    - uints -> uint64
    - bools -> uint64 (TODO: this should be uint8)
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : array-like
    dtype : pandas_dtype, optional
        coerce to this dtype

    Returns
    -------
    (ndarray, pandas_dtype, algo dtype as a string)

    """

    # we check some simple dtypes first
    try:
        if is_object_dtype(dtype):
            return ensure_object(np.asarray(values)), 'object', 'object'
        if is_bool_dtype(values) or is_bool_dtype(dtype):
            # we are actually coercing to uint64
            # until our algos support uint8 directly (see TODO)
            return np.asarray(values).astype('uint64'), 'bool', 'uint64'
        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
            return ensure_int64(values), 'int64', 'int64'
        elif (is_unsigned_integer_dtype(values) or
              is_unsigned_integer_dtype(dtype)):
            return ensure_uint64(values), 'uint64', 'uint64'
        elif is_float_dtype(values) or is_float_dtype(dtype):
            return ensure_float64(values), 'float64', 'float64'
        elif is_object_dtype(values) and dtype is None:
            return ensure_object(np.asarray(values)), 'object', 'object'
        elif is_complex_dtype(values) or is_complex_dtype(dtype):

            # ignore the fact that we are casting to float
            # which discards complex parts
            with catch_warnings():
                simplefilter("ignore", np.ComplexWarning)
                values = ensure_float64(values)
            return values, 'float64', 'float64'

    except (TypeError, ValueError, OverflowError):
        # if we are trying to coerce to a dtype
        # and it is incompatible, this will fall through to here
        return ensure_object(values), 'object', 'object'

    # datetimelike
    if (needs_i8_conversion(values) or
            is_period_dtype(dtype) or
            is_datetime64_any_dtype(dtype) or
            is_timedelta64_dtype(dtype)):
        if is_period_dtype(values) or is_period_dtype(dtype):
            from pandas import PeriodIndex
            values = PeriodIndex(values)
            dtype = values.dtype
        elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex
            values = TimedeltaIndex(values)
            dtype = values.dtype
        else:
            # Datetime
            from pandas import DatetimeIndex
            values = DatetimeIndex(values)
            dtype = values.dtype

        return values.asi8, dtype, 'int64'

    elif (is_categorical_dtype(values) and
          (is_categorical_dtype(dtype) or dtype is None)):
        values = getattr(values, 'values', values)
        values = values.codes
        dtype = 'category'

        # we are actually coercing to int64
        # until our algos support int* directly (not all do)
        values = ensure_int64(values)

        return values, dtype, 'int64'

    # we have failed, return object
    values = np.asarray(values, dtype=np.object)
    return ensure_object(values), 'object', 'object'
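

# Illustrative sketch (not part of the original module): how the coercions
# in _ensure_data play out for a few common inputs. The exact reprs below
# are assumptions for demonstration, not captured output:
#
#   >>> _ensure_data(np.array([1, 2, 3]))
#   (array([1, 2, 3]), 'int64', 'int64')             # ints -> int64
#   >>> _ensure_data(np.array([True, False]))
#   (array([1, 0], dtype=uint64), 'bool', 'uint64')  # bool -> uint64
#   >>> _ensure_data(np.array(['2016-01-01'], dtype='datetime64[ns]'))[2]
#   'int64'                                          # datetimelike -> i8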


def _reconstruct_data(values, dtype, original):
    """
    Reverse of _ensure_data.

    Parameters
    ----------
    values : ndarray
    dtype : pandas_dtype
    original : ndarray-like

    Returns
    -------
    Index for extension types, otherwise ndarray cast to dtype
    """
    from pandas import Index
    if is_extension_array_dtype(dtype):
        values = dtype.construct_array_type()._from_sequence(values)
    elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
        values = Index(original)._shallow_copy(values, name=None)
    elif is_bool_dtype(dtype):
        values = values.astype(dtype)

        # we only support an object-dtype bool Index
        if isinstance(original, Index):
            values = values.astype(object)
    elif dtype is not None:
        values = values.astype(dtype)

    return values
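

# Illustrative round trip (a minimal sketch, not part of the original
# module): _ensure_data and _reconstruct_data are designed to compose,
# which is how unique() hands codes to the hashtable and then restores
# the original dtype. For extension and tz-aware/period inputs the
# original container is rebuilt around the codes; for plain numpy dtypes
# it is just a cast back:
#
#   >>> arr = np.array([1, 2, 3], dtype='int64')
#   >>> codes, dtype, _ = _ensure_data(arr)
#   >>> _reconstruct_data(codes, dtype, arr)
#   array([1, 2, 3])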


def _ensure_arraylike(values):
    """
    Ensure that we are array-like if not already.
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ['mixed', 'string', 'unicode']:
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values
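

# Illustrative sketch (not part of the original module): the 'mixed'/
# 'string' inference above routes list-likes through
# construct_1d_object_array_from_listlike, so tuples stay as scalar
# elements of a 1d object array instead of np.asarray turning them into
# rows of a 2d array:
#
#   >>> _ensure_arraylike(['a', 'b'])
#   array(['a', 'b'], dtype=object)
#   >>> _ensure_arraylike([(1, 2), (3, 4)]).shape   # stays 1d
#   (2,)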


_hashtables = {
    'float64': (htable.Float64HashTable, htable.Float64Vector),
    'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
    'int64': (htable.Int64HashTable, htable.Int64Vector),
    'string': (htable.StringHashTable, htable.ObjectVector),
    'object': (htable.PyObjectHashTable, htable.ObjectVector)
}


def _get_hashtable_algo(values):
    """
    Parameters
    ----------
    values : array-like

    Returns
    -------
    tuple of (hashtable class,
              vector class,
              values,
              dtype,
              ndtype)
    """
    values, dtype, ndtype = _ensure_data(values)

    if ndtype == 'object':

        # it's cheaper to use a StringHashTable than an ObjectHashTable;
        # we infer including nulls because null handling is the only
        # difference between StringHashTable and PyObjectHashTable
        if lib.infer_dtype(values, skipna=False) in ['string']:
            ndtype = 'string'
        else:
            ndtype = 'object'

    htable, table = _hashtables[ndtype]
    return (htable, table, values, dtype, ndtype)
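

# Illustrative sketch (not part of the original module): the string
# specialization above means an all-string object array is routed to the
# cheaper StringHashTable, while mixed objects fall back to
# PyObjectHashTable:
#
#   >>> _get_hashtable_algo(np.array(['a', 'b'], dtype=object))[0]
#   <class 'pandas._libs.hashtable.StringHashTable'>
#   >>> _get_hashtable_algo(np.array(['a', 1], dtype=object))[0]
#   <class 'pandas._libs.hashtable.PyObjectHashTable'>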


def _get_data_algo(values, func_map):

    if is_categorical_dtype(values):
        values = values._values_for_rank()

    values, dtype, ndtype = _ensure_data(values)
    if ndtype == 'object':

        # it's cheaper to use a StringHashTable than an ObjectHashTable;
        # we infer including nulls because null handling is the only
        # difference between StringHashTable and PyObjectHashTable
        if lib.infer_dtype(values, skipna=False) in ['string']:
            ndtype = 'string'

    f = func_map.get(ndtype, func_map['object'])

    return f, values
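

# Illustrative sketch (not part of the original module): callers pass a
# map keyed by the algo dtype string, and _get_data_algo falls back to
# the 'object' implementation when no specialized one exists. The map
# below is a hypothetical subset mirroring the module's rank helpers:
#
#   >>> funcs = {'int64': algos.rank_1d_int64,
#   ...          'object': algos.rank_1d_object}
#   >>> f, vals = _get_data_algo(np.array([3, 1, 2]), funcs)
#   >>> f is algos.rank_1d_int64
#   True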


# --------------- #
# top-level algos #
# --------------- #

def match(to_match, values, na_sentinel=-1):
    """
    Compute the locations of to_match within values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com.asarray_tuplesafe(values)
    htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
    to_match, _, _ = _ensure_data(to_match, dtype)
    table = htable(min(len(to_match), 1000000))
    table.map_locations(values)
    result = table.lookup(to_match)

    if na_sentinel != -1:

        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas import Series
        result = Series(result.ravel()).replace(-1, na_sentinel)
        result = result.values.reshape(result.shape)

    return result
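

# Illustrative sketch (not part of the original module): match returns
# the position of each element of to_match within values, with the
# sentinel marking misses. Reprs are assumed for demonstration:
#
#   >>> match(['b', 'x', 'a'], ['a', 'b', 'c'])
#   array([ 1, -1,  0])
#   >>> match(['b', 'x'], ['a', 'b', 'c'], na_sentinel=-99)
#   array([  1, -99])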


def unique(values):
    """
    Hash table-based unique. Uniques are returned in order
    of appearance. This does NOT sort.

    Significantly faster than numpy.unique. Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    unique values.
      - If the input is an Index, the return is an Index
      - If the input is a Categorical dtype, the return is a Categorical
      - If the input is a Series/ndarray, the return will be an ndarray

    See Also
    --------
    pandas.Index.unique
    pandas.Series.unique

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
    ...                     pd.Timestamp('20160101')]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                      pd.Timestamp('20160101', tz='US/Eastern')]))
    array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
          dtype=object)

    >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                     pd.Timestamp('20160101', tz='US/Eastern')]))
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
    ...           dtype='datetime64[ns, US/Eastern]', freq=None)

    >>> pd.unique(list('baabc'))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'),
    ...                                    ordered=True)))
    [b, a, c]
    Categories (3, object): [a < b < c]