"""
Generic data algorithms. This module is experimental at the moment and is not
intended for public consumption.
"""
from __future__ import division
from textwrap import dedent
from warnings import catch_warnings, simplefilter, warn
import numpy as np
from pandas._libs import algos, hashtable as htable, lib
from pandas._libs.tslib import iNaT
from pandas.util._decorators import Appender, Substitution, deprecate_kwarg
from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike, maybe_promote)
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
is_datetimelike, is_extension_array_dtype, is_float_dtype,
is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype,
needs_i8_conversion)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, na_value_for_dtype
from pandas.core import common as com
# Module-level dict that starts out empty; nothing in this part of the file
# adds entries to it.
# NOTE(review): presumably holds docstring templates shared between
# functions and is filled in further down the module - confirm.
_shared_docs = {}
# --------------- #
# dtype access #
# --------------- #
def _ensure_data(values, dtype=None):
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint64 (TODO this should be uint8)
    - floats -> float64
    - complex -> float64 (the imaginary part is discarded)
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes (as int64)

    Anything that matches none of the above, or that cannot be coerced to
    the requested dtype, is returned as an object array.

    Parameters
    ----------
    values : array-like
    dtype : pandas_dtype, optional
        coerce to this dtype

    Returns
    -------
    (ndarray, pandas_dtype, algo dtype as a string)
    """

    # we check some simple dtypes first
    try:
        if is_object_dtype(dtype):
            return ensure_object(np.asarray(values)), 'object', 'object'
        if is_bool_dtype(values) or is_bool_dtype(dtype):
            # we are actually coercing to uint64
            # until our algos support uint8 directly (see TODO)
            return np.asarray(values).astype('uint64'), 'bool', 'uint64'
        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
            return ensure_int64(values), 'int64', 'int64'
        elif (is_unsigned_integer_dtype(values) or
              is_unsigned_integer_dtype(dtype)):
            return ensure_uint64(values), 'uint64', 'uint64'
        elif is_float_dtype(values) or is_float_dtype(dtype):
            return ensure_float64(values), 'float64', 'float64'
        elif is_object_dtype(values) and dtype is None:
            return ensure_object(np.asarray(values)), 'object', 'object'
        elif is_complex_dtype(values) or is_complex_dtype(dtype):

            # NumPy >= 2.0 no longer exposes ComplexWarning at top level;
            # it is only available as np.exceptions.ComplexWarning there.
            complex_warning = getattr(np, 'ComplexWarning', None)
            if complex_warning is None:
                complex_warning = np.exceptions.ComplexWarning

            # ignore the fact that we are casting to float
            # which discards complex parts
            with catch_warnings():
                simplefilter("ignore", complex_warning)
                values = ensure_float64(values)
            return values, 'float64', 'float64'

    except (TypeError, ValueError, OverflowError):
        # if we are trying to coerce to a dtype
        # and it is incompat this will fall thru to here
        return ensure_object(values), 'object', 'object'

    # datetimelike
    if (needs_i8_conversion(values) or
            is_period_dtype(dtype) or
            is_datetime64_any_dtype(dtype) or
            is_timedelta64_dtype(dtype)):
        if is_period_dtype(values) or is_period_dtype(dtype):
            from pandas import PeriodIndex
            values = PeriodIndex(values)
            dtype = values.dtype
        elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex
            values = TimedeltaIndex(values)
            dtype = values.dtype
        else:
            # Datetime
            from pandas import DatetimeIndex
            values = DatetimeIndex(values)
            dtype = values.dtype

        return values.asi8, dtype, 'int64'

    elif (is_categorical_dtype(values) and
          (is_categorical_dtype(dtype) or dtype is None)):
        values = getattr(values, 'values', values)
        values = values.codes
        dtype = 'category'

        # we are actually coercing to int64
        # until our algos support int* directly (not all do)
        values = ensure_int64(values)

        return values, dtype, 'int64'

    # we have failed, return object
    # (builtin ``object`` rather than the ``np.object`` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24)
    values = np.asarray(values, dtype=object)
    return ensure_object(values), 'object', 'object'
def _reconstruct_data(values, dtype, original):
    """
    Undo the coercion applied by _ensure_data.

    Parameters
    ----------
    values : ndarray
        data as produced by the low-level routines
    dtype : pandas_dtype
        dtype to restore; ``None`` leaves ``values`` untouched
    original : ndarray-like
        the object the data originally came from

    Returns
    -------
    Index for extension types, otherwise ndarray casted to dtype
    """
    from pandas import Index

    if is_extension_array_dtype(dtype):
        array_type = dtype.construct_array_type()
        return array_type._from_sequence(values)

    if is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
        return Index(original)._shallow_copy(values, name=None)

    if is_bool_dtype(dtype):
        restored = values.astype(dtype)
        if isinstance(original, Index):
            # a bool Index is only supported with object dtype
            restored = restored.astype(object)
        return restored

    if dtype is None:
        return values
    return values.astype(dtype)
def _ensure_arraylike(values):
    """
    Return ``values`` unchanged when it is already array-like, otherwise
    convert it to an ndarray.

    Inputs inferred as 'mixed', 'string' or 'unicode' are built as a 1-d
    object array; everything else goes through ``np.asarray``.
    """
    if is_array_like(values):
        return values

    kind = lib.infer_dtype(values, skipna=False)
    if kind not in ('mixed', 'string', 'unicode'):
        return np.asarray(values)

    # tuples are turned into lists before the object array is constructed
    if isinstance(values, tuple):
        values = list(values)
    return construct_1d_object_array_from_listlike(values)
# Lookup table keyed by the algo dtype string ("ndtype") computed in
# _get_hashtable_algo. Each value is a (hashtable class, vector class)
# pair taken from the htable module. Note that 'string' and 'object'
# share the same vector class and differ only in the hashtable class.
_hashtables = {
    'float64': (htable.Float64HashTable, htable.Float64Vector),
    'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
    'int64': (htable.Int64HashTable, htable.Int64Vector),
    'string': (htable.StringHashTable, htable.ObjectVector),
    'object': (htable.PyObjectHashTable, htable.ObjectVector)
}
def _get_hashtable_algo(values):
    """
    Pick the hashtable and vector classes matching the data in ``values``.

    Parameters
    ----------
    values : arraylike

    Returns
    -------
    tuples(hashtable class,
           vector class,
           values,
           dtype,
           ndtype)
        ``values``, ``dtype`` and ``ndtype`` are the results of
        ``_ensure_data``; ``ndtype`` is replaced by 'string' when object
        data is inferred to be all strings.
    """
    values, dtype, ndtype = _ensure_data(values)

    # it's cheaper to use a String Hash Table than Object; we infer
    # including nulls because that is the only difference between
    # StringHashTable and ObjectHashtable
    all_strings = (ndtype == 'object' and
                   lib.infer_dtype(values, skipna=False) == 'string')
    if all_strings:
        ndtype = 'string'

    hash_klass, vector_klass = _hashtables[ndtype]
    return (hash_klass, vector_klass, values, dtype, ndtype)
def _get_data_algo(values, func_map):
    """
    Choose the function from ``func_map`` that matches the data in
    ``values``.

    Parameters
    ----------
    values : arraylike
        categorical input is first replaced by its ``_values_for_rank()``
    func_map : dict
        maps algo dtype strings to functions; the 'object' entry is always
        looked up and serves as the fallback, so it must be present

    Returns
    -------
    (function, values)
        the selected function and the data as coerced by ``_ensure_data``
    """
    if is_categorical_dtype(values):
        values = values._values_for_rank()

    values, _, ndtype = _ensure_data(values)

    # it's cheaper to use a String Hash Table than Object; we infer
    # including nulls because that is the only difference between
    # StringHashTable and ObjectHashtable
    if (ndtype == 'object' and
            lib.infer_dtype(values, skipna=False) == 'string'):
        ndtype = 'string'

    fallback = func_map['object']
    return func_map.get(ndtype, fallback), values
# --------------- #
# top-level algos #
# --------------- #
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
        One entry per element of ``to_match``; entries that were not found
        hold ``na_sentinel``.
    """
    values = com.asarray_tuplesafe(values)
    hash_klass, _, values, dtype, _ = _get_hashtable_algo(values)

    # coerce to_match the same way values was coerced so both sides of the
    # lookup use the same representation
    to_match, _, _ = _ensure_data(to_match, dtype)

    # the hashtable size hint is capped at one million entries
    table = hash_klass(min(len(to_match), 1000000))
    table.map_locations(values)
    result = table.lookup(to_match)

    if na_sentinel != -1:
        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas import Series

        # remember the shape of the lookup result *before* going through
        # a Series, so the reshape restores that shape rather than the
        # shape of the intermediate Series
        shape = result.shape
        replaced = Series(result.ravel()).replace(-1, na_sentinel)
        result = replaced.values.reshape(shape)

    return result
def unique(values):
"""
Hash table-based unique. Uniques are returned in order
of appearance. This does NOT sort.
Significantly faster than numpy.unique. Includes NA values.
Parameters
----------
values : 1d array-like
Returns
-------
unique values.
- If the input is an Index, the return is an Index
- If the input is a Categorical dtype, the return is a Categorical
- If the input is a Series/ndarray, the return will be an ndarray
See Also
--------
pandas.Index.unique
pandas.Series.unique
Examples
--------
>>> pd.unique(pd.Series([2, 1, 3, 3]))
array([2, 1, 3])
>>> pd.unique(pd.Series([2] + [1] * 5))
array([2, 1])
>>> pd.unique(pd.Series([pd.Timestamp('20160101'),
... pd.Timestamp('20160101')]))
array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
>>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
... pd.Timestamp('20160101', tz='US/Eastern')]))
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
dtype=object)
>>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
... pd.Timestamp('20160101', tz='US/Eastern')]))
DatetimeIndex(['2016-01-01 00:00:00-05:00'],
... dtype='datetime64[ns, US/Eastern]', freq=None)
>>> pd.unique(list('baabc'))
array(['b', 'a', 'c'], dtype=object)
An unordered Categorical will return categories in the
order of appearance.
>>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
[b, a, c]
Categories (3, object): [b, a, c]
>>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
... categories=list('abc'))))
[b, a, c]
Categories (3, object): [b, a, c]
An ordered Categorical preserves the category ordering.
>>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
... categories=list('abc'),
... ordered=True)))
[b, a, c]
Categories (3, object): [a < b < c]
Loading ...