""" miscellaneous sorting / groupby utilities """
import warnings
import numpy as np
from pandas._libs import algos, hashtable, lib
from pandas._libs.hashtable import unique_label_indices
from pandas.compat import PY3, long, string_types
from pandas.core.dtypes.cast import infer_dtype_from_array
from pandas.core.dtypes.common import (
ensure_int64, ensure_platform_int, is_categorical_dtype, is_list_like)
from pandas.core.dtypes.missing import isna
import pandas.core.algorithms as algorithms
_INT64_MAX = np.iinfo(np.int64).max
def get_group_index(labels, shape, sort, xnull):
"""
For the particular label_list, gets the offsets into the hypothetical list
representing the totally ordered cartesian product of all possible label
combinations, *as long as* this space fits within int64 bounds;
otherwise, though group indices identify unique combinations of
labels, they cannot be deconstructed.
- If `sort`, rank of returned ids preserve lexical ranks of labels.
i.e. returned id's can be used to do lexical sort on labels;
- If `xnull` nulls (-1 labels) are passed through.
Parameters
----------
labels: sequence of arrays
Integers identifying levels at each location
shape: sequence of ints same length as labels
Number of unique levels at each location
sort: boolean
If the ranks of returned ids should match lexical ranks of labels
xnull: boolean
If true nulls are excluded. i.e. -1 values in the labels are
passed through
Returns
-------
An array of type int64 where two elements are equal if their corresponding
labels are equal at all location.
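
    Examples
    --------
    A small illustration (values chosen for this docstring): with two levels
    of sizes 2 and 2, the flat id at each location is
    ``labels[0] * 2 + labels[1]``.

    >>> import numpy as np
    >>> get_group_index([np.array([0, 1, 1]), np.array([0, 0, 1])],
    ...                 (2, 2), sort=True, xnull=True)
    array([0, 2, 3])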
"""
def _int64_cut_off(shape):
acc = long(1)
for i, mul in enumerate(shape):
acc *= long(mul)
            if acc >= _INT64_MAX:
return i
return len(shape)
def maybe_lift(lab, size):
# promote nan values (assigned -1 label in lab array)
# so that all output values are non-negative
return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
labels = map(ensure_int64, labels)
if not xnull:
labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
labels = list(labels)
shape = list(shape)
    # Iteratively process all the labels in chunks sized so that fewer
    # than _INT64_MAX unique int ids will be required for each chunk
while True:
# how many levels can be done without overflow:
nlev = _int64_cut_off(shape)
# compute flat ids for the first `nlev` levels
stride = np.prod(shape[1:nlev], dtype='i8')
out = stride * labels[0].astype('i8', subok=False, copy=False)
for i in range(1, nlev):
if shape[i] == 0:
stride = 0
else:
stride //= shape[i]
out += labels[i] * stride
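        # for illustration (hypothetical shape): with shape (3, 4, 5) the
        # strides come out as 20, 5 and 1, i.e.
        # out = lab0 * 20 + lab1 * 5 + lab2 * 1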
if xnull: # exclude nulls
mask = labels[0] == -1
for lab in labels[1:nlev]:
mask |= lab == -1
out[mask] = -1
if nlev == len(shape): # all levels done!
break
# compress what has been done so far in order to avoid overflow
# to retain lexical ranks, obs_ids should be sorted
comp_ids, obs_ids = compress_group_index(out, sort=sort)
labels = [comp_ids] + labels[nlev:]
shape = [len(obs_ids)] + shape[nlev:]
return out
def get_compressed_ids(labels, sizes):
"""
    Group_index is offsets into the cartesian product of all possible labels.
    This space can be huge, so this function compresses it, by computing
    offsets (comp_ids) into the list of unique observed labels
    (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : list of int
        Size of the levels

    Returns
    -------
    tuple of (comp_ids, obs_group_ids)
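
    Examples
    --------
    A small illustration (values chosen for this docstring):

    >>> import numpy as np
    >>> get_compressed_ids([np.array([0, 1, 1]), np.array([0, 0, 1])],
    ...                    (2, 2))
    (array([0, 1, 2]), array([0, 2, 3]))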
"""
ids = get_group_index(labels, sizes, sort=True, xnull=False)
return compress_group_index(ids, sort=True)
def is_int64_overflow_possible(shape):
    """
    Return True if the product of the level sizes may overflow int64.
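
    Examples
    --------
    >>> is_int64_overflow_possible((2, 3))
    False
    >>> is_int64_overflow_possible((2 ** 40, 2 ** 40))
    True
    """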
the_prod = long(1)
for x in shape:
the_prod *= long(x)
return the_prod >= _INT64_MAX
def decons_group_index(comp_labels, shape):
    """
    Reconstruct the label arrays for each level from flat group indices,
    inverting `get_group_index` by peeling off one level at a time with
    modular arithmetic.
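
    Examples
    --------
    A small illustration (inverting the `get_group_index` example above):

    >>> import numpy as np
    >>> decons_group_index(np.array([0, 2, 3]), (2, 2))
    [array([0, 1, 1]), array([0, 0, 1])]
    """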
if is_int64_overflow_possible(shape):
# at some point group indices are factorized,
# and may not be deconstructed here! wrong path!
raise ValueError('cannot deconstruct factorized group indices!')
label_list = []
factor = 1
y = 0
x = comp_labels
for i in reversed(range(len(shape))):
labels = (x - y) % (factor * shape[i]) // factor
np.putmask(labels, comp_labels < 0, -1)
label_list.append(labels)
y = labels * factor
factor *= shape[i]
return label_list[::-1]
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
"""
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray of int64
        Compressed group ids, one per location
    obs_ids : np.ndarray of int64
        Observed (unique) group ids
    shape : sequence of ints
        Number of unique levels at each location
    labels : sequence of arrays
        Original label arrays; used as a fallback when the observed ids
        cannot be deconstructed
    xnull : bool
        If True, nulls are excluded, i.e. -1 labels are passed through
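
    Examples
    --------
    A small illustration (values chosen for this docstring):

    >>> import numpy as np
    >>> labels = [np.array([0, 1, 1]), np.array([0, 0, 1])]
    >>> comp_ids, obs_ids = get_compressed_ids(labels, (2, 2))
    >>> decons_obs_group_ids(comp_ids, obs_ids, (2, 2), labels, xnull=True)
    [array([0, 1, 1]), array([0, 0, 1])]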
"""
if not xnull:
lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
shape = np.asarray(shape, dtype='i8') + lift
if not is_int64_overflow_possible(shape):
# obs ids are deconstructable! take the fast route!
out = decons_group_index(obs_ids, shape)
        if xnull or not lift.any():
            return out
        return [x - y for x, y in zip(out, lift)]
i = unique_label_indices(comp_ids)
i8copy = lambda a: a.astype('i8', subok=False, copy=True)
return [i8copy(lab[i]) for lab in labels]
def indexer_from_factorized(labels, shape, compress=True):
    """
    Return an indexer that lexically sorts the given factorized labels.
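
    Examples
    --------
    A small illustration (values chosen for this docstring):

    >>> import numpy as np
    >>> indexer_from_factorized([np.array([1, 0, 1])], (2,))
    array([1, 0, 2])
    """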
ids = get_group_index(labels, shape, sort=True, xnull=False)
if not compress:
ngroups = (ids.size and ids.max()) + 1
else:
ids, obs = compress_group_index(ids, sort=True)
ngroups = len(obs)
return get_group_index_sorter(ids, ngroups)
def lexsort_indexer(keys, orders=None, na_position='last'):
    """
    Return an indexer that lexically sorts `keys`, honoring per-key
    ascending/descending `orders` and the requested placement of missing
    values.
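
    Examples
    --------
    A small illustration: an ascending sort with NaN placed last.

    >>> import numpy as np
    >>> lexsort_indexer([[1, np.nan, 0]])
    array([2, 0, 1])
    """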
from pandas.core.arrays import Categorical
labels = []
shape = []
if isinstance(orders, bool):
orders = [orders] * len(keys)
elif orders is None:
orders = [True] * len(keys)
for key, order in zip(keys, orders):
# we are already a Categorical
if is_categorical_dtype(key):
c = key
# create the Categorical
else:
c = Categorical(key, ordered=True)
if na_position not in ['last', 'first']:
raise ValueError('invalid na_position: {!r}'.format(na_position))
n = len(c.categories)
codes = c.codes.copy()
mask = (c.codes == -1)
if order: # ascending
if na_position == 'last':
codes = np.where(mask, n, codes)
elif na_position == 'first':
codes += 1
else: # not order means descending
if na_position == 'last':
codes = np.where(mask, n, n - codes - 1)
elif na_position == 'first':
codes = np.where(mask, 0, n - codes)
if mask.any():
n += 1
shape.append(n)
labels.append(codes)
return indexer_from_factorized(labels, shape)
def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
"""
    Intended to be a drop-in replacement for np.argsort which handles NaNs.
    Adds `ascending` and `na_position` parameters.

    GH #6399, #5231
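
    Examples
    --------
    A small illustration: ascending sort with the NaN placed last.

    >>> import numpy as np
    >>> nargsort(np.array([1.0, np.nan, 0.0]))
    array([2, 0, 1])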
"""
# specially handle Categorical
if is_categorical_dtype(items):
if na_position not in {'first', 'last'}:
raise ValueError('invalid na_position: {!r}'.format(na_position))
mask = isna(items)
cnt_null = mask.sum()
sorted_idx = items.argsort(ascending=ascending, kind=kind)
if ascending and na_position == 'last':
# NaN is coded as -1 and is listed in front after sorting
sorted_idx = np.roll(sorted_idx, -cnt_null)
elif not ascending and na_position == 'first':
# NaN is coded as -1 and is listed in the end after sorting
sorted_idx = np.roll(sorted_idx, cnt_null)
return sorted_idx
with warnings.catch_warnings():
# https://github.com/pandas-dev/pandas/issues/25439
# can be removed once ExtensionArrays are properly handled by nargsort
warnings.filterwarnings(
"ignore", category=FutureWarning,
message="Converting timezone-aware DatetimeArray to")
items = np.asanyarray(items)
idx = np.arange(len(items))
mask = isna(items)
non_nans = items[~mask]
non_nan_idx = idx[~mask]
nan_idx = np.nonzero(mask)[0]
if not ascending:
non_nans = non_nans[::-1]
non_nan_idx = non_nan_idx[::-1]
indexer = non_nan_idx[non_nans.argsort(kind=kind)]
if not ascending:
indexer = indexer[::-1]
# Finally, place the NaNs at the end or the beginning according to
# na_position
if na_position == 'last':
indexer = np.concatenate([indexer, nan_idx])
elif na_position == 'first':
indexer = np.concatenate([nan_idx, indexer])
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
return indexer
class _KeyMapper(object):
"""
Ease my suffering. Map compressed group id -> key tuple
"""
def __init__(self, comp_ids, ngroups, levels, labels):
self.levels = levels
self.labels = labels
self.comp_ids = comp_ids.astype(np.int64)
self.k = len(labels)
self.tables = [hashtable.Int64HashTable(ngroups)
for _ in range(self.k)]
self._populate_tables()
def _populate_tables(self):
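        # map each comp_id to its label code at this level, so `get_key`
        # can later translate a compressed group id back into the original
        # key tuple one level at a time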
for labs, table in zip(self.labels, self.tables):
table.map(self.comp_ids, labs.astype(np.int64))
def get_key(self, comp_id):
return tuple(level[table.get_item(comp_id)]
for table, level in zip(self.tables, self.levels))
def get_flattened_iterator(comp_ids, ngroups, levels, labels):
    # provide "flattened" key tuples for the multi-group setting, one per
    # group id
mapper = _KeyMapper(comp_ids, ngroups, levels, labels)
return [mapper.get_key(i) for i in range(ngroups)]
def get_indexer_dict(label_list, keys):
""" return a diction of {labels} -> {indexers} """
shape = list(map(len, keys))
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    ngroups = (((group_index.size and group_index.max()) + 1)
               if is_int64_overflow_possible(shape)
               else np.prod(shape, dtype='i8'))
sorter = get_group_index_sorter(group_index, ngroups)
sorted_labels = [lab.take(sorter) for lab in label_list]
group_index = group_index.take(sorter)
return lib.indices_fast(sorter, group_index, keys, sorted_labels)
# ----------------------------------------------------------------------
# sorting levels...cleverly?
def get_group_index_sorter(group_index, ngroups):
"""
algos.groupsort_indexer implements `counting sort` and it is at least
O(ngroups), where
ngroups = prod(shape)
shape = map(len, keys)
that is, linear in the number of combinations (cartesian product) of unique