Gemfury

360core / pandas python

Repository URL to install this package:
Details
pandas / tests / test_algos.py
# -*- coding: utf-8 -*-

import numpy as np
import pytest

from numpy.random import RandomState
from numpy import nan
from datetime import datetime
from itertools import permutations
from pandas import (Series, Categorical, CategoricalIndex,
                    Timestamp, DatetimeIndex, Index, IntervalIndex)
import pandas as pd

from pandas import compat
from pandas._libs import (groupby as libgroupby, algos as libalgos,
                          hashtable as ht)
from pandas._libs.hashtable import unique_label_indices
from pandas.compat import lrange, range
import pandas.core.algorithms as algos
import pandas.core.common as com
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.util.testing import assert_almost_equal


class TestMatch(object):

    def test_ints(self):
        values = np.array([0, 2, 1])
        to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])

        result = algos.match(to_match, values)
        expected = np.array([0, 2, 1, 1, 0, 2, -1, 0], dtype=np.int64)
        tm.assert_numpy_array_equal(result, expected)

        result = Series(algos.match(to_match, values, np.nan))
        expected = Series(np.array([0, 2, 1, 1, 0, 2, np.nan, 0]))
        tm.assert_series_equal(result, expected)

        s = Series(np.arange(5), dtype=np.float32)
        result = algos.match(s, [2, 4])
        expected = np.array([-1, -1, 0, -1, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(result, expected)

        result = Series(algos.match(s, [2, 4], np.nan))
        expected = Series(np.array([np.nan, np.nan, 0, np.nan, 1]))
        tm.assert_series_equal(result, expected)

    def test_strings(self):
        values = ['foo', 'bar', 'baz']
        to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux']

        result = algos.match(to_match, values)
        expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(result, expected)

        result = Series(algos.match(to_match, values, np.nan))
        expected = Series(np.array([1, 0, np.nan, 0, 1, 2, np.nan]))
        tm.assert_series_equal(result, expected)


class TestFactorize(object):

    def test_basic(self):

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c',
                                           'c'])
        tm.assert_numpy_array_equal(
            uniques, np.array(['a', 'b', 'c'], dtype=object))

        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
                                           'a', 'c', 'c', 'c'], sort=True)
        exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array(['a', 'b', 'c'], dtype=object)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(range(5))))
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)

        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64)
        tm.assert_numpy_array_equal(uniques, exp)

        labels, uniques = algos.factorize(list(reversed(np.arange(5.))),
                                          sort=True)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64)
        tm.assert_numpy_array_equal(uniques, exp)

    def test_mixed(self):

        # doc example reshaping.rst
        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
        labels, uniques = algos.factorize(x)

        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index(['A', 'B', 3.14, np.inf])
        tm.assert_index_equal(uniques, exp)

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = Index([3.14, np.inf, 'A', 'B'])
        tm.assert_index_equal(uniques, exp)

    def test_datelike(self):

        # M8
        v1 = Timestamp('20130101 09:00:00.00004')
        v2 = Timestamp('20130101')
        x = Series([v1, v1, v1, v2, v2, v1])
        labels, uniques = algos.factorize(x)

        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = DatetimeIndex([v1, v2])
        tm.assert_index_equal(uniques, exp)

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        exp = DatetimeIndex([v2, v1])
        tm.assert_index_equal(uniques, exp)

        # period
        v1 = pd.Period('201302', freq='M')
        v2 = pd.Period('201303', freq='M')
        x = Series([v1, v1, v1, v2, v2, v1])

        # periods are not 'sorted' as they are converted back into an index
        labels, uniques = algos.factorize(x)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))

        # GH 5986
        v1 = pd.to_timedelta('1 day 1 min')
        v2 = pd.to_timedelta('1 day')
        x = Series([v1, v2, v1, v1, v2, v2, v1])
        labels, uniques = algos.factorize(x)
        exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))

        labels, uniques = algos.factorize(x, sort=True)
        exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(labels, exp)
        tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))

    def test_factorize_nan(self):
        # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
        # rizer.factorize should not raise an exception if na_sentinel indexes
        # outside of reverse_indexer
        key = np.array([1, 2, 1, np.nan], dtype='O')
        rizer = ht.Factorizer(len(key))
        for na_sentinel in (-1, 20):
            ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
            expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
            assert len(set(key)) == len(set(expected))
            tm.assert_numpy_array_equal(pd.isna(key),
                                        expected == na_sentinel)

        # nan still maps to na_sentinel when sort=False
        key = np.array([0, np.nan, 1], dtype='O')
        na_sentinel = -1

        # TODO(wesm): unused?
        ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel)  # noqa

        expected = np.array([2, -1, 0], dtype='int32')
        assert len(set(key)) == len(set(expected))
        tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)

    @pytest.mark.parametrize("data,expected_label,expected_level", [
        (
            [(1, 1), (1, 2), (0, 0), (1, 2), 'nonsense'],
            [0, 1, 2, 1, 3],
            [(1, 1), (1, 2), (0, 0), 'nonsense']
        ),
        (
            [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
            [0, 1, 2, 1, 3],
            [(1, 1), (1, 2), (0, 0), (1, 2, 3)]
        ),
        (
            [(1, 1), (1, 2), (0, 0), (1, 2)],
            [0, 1, 2, 1],
            [(1, 1), (1, 2), (0, 0)]
        )
    ])
    def test_factorize_tuple_list(self, data, expected_label, expected_level):
        # GH9454
        result = pd.factorize(data)

        tm.assert_numpy_array_equal(result[0],
                                    np.array(expected_label, dtype=np.intp))

        expected_level_array = com._asarray_tuplesafe(expected_level,
                                                      dtype=object)
        tm.assert_numpy_array_equal(result[1], expected_level_array)

    def test_complex_sorting(self):
        # gh 12666 - check no segfault
        # Test not valid numpy versions older than 1.11
        if pd._np_version_under1p11:
            pytest.skip("Test valid only for numpy 1.11+")

        x17 = np.array([complex(i) for i in range(17)], dtype=object)

        pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True)

    def test_uint64_factorize(self):
        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
        exp_labels = np.array([0, 1, 0], dtype=np.intp)
        exp_uniques = np.array([2**63, 1], dtype=np.uint64)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques)

        data = np.array([2**63, -1, 2**63], dtype=object)
        exp_labels = np.array([0, 1, 0], dtype=np.intp)
        exp_uniques = np.array([2**63, -1], dtype=object)

        labels, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques)

    def test_deprecate_order(self):
        # gh 19727 - check warning is raised for deprecated keyword, order.
        # Test not valid once order keyword is removed.
        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
        with tm.assert_produces_warning(expected_warning=FutureWarning):
            algos.factorize(data, order=True)
        with tm.assert_produces_warning(False):
            algos.factorize(data)

    @pytest.mark.parametrize('data', [
        np.array([0, 1, 0], dtype='u8'),
        np.array([-2**63, 1, -2**63], dtype='i8'),
        np.array(['__nan__', 'foo', '__nan__'], dtype='object'),
    ])
    def test_parametrized_factorize_na_value_default(self, data):
        # arrays that include the NA default for that type, but isn't used.
        l, u = algos.factorize(data)
        expected_uniques = data[[0, 1]]
        expected_labels = np.array([0, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(l, expected_labels)
        tm.assert_numpy_array_equal(u, expected_uniques)

    @pytest.mark.parametrize('data, na_value', [
        (np.array([0, 1, 0, 2], dtype='u8'), 0),
        (np.array([1, 0, 1, 2], dtype='u8'), 1),
        (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63),
        (np.array([1, -2**63, 1, 0], dtype='i8'), 1),
        (np.array(['a', '', 'a', 'b'], dtype=object), 'a'),
        (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()),
        (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object),
         ('a', 1)),
    ])
    def test_parametrized_factorize_na_value(self, data, na_value):
        l, u = algos._factorize_array(data, na_value=na_value)
        expected_uniques = data[[1, 3]]
        expected_labels = np.array([-1, 0, -1, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(l, expected_labels)
        tm.assert_numpy_array_equal(u, expected_uniques)


class TestUnique(object):

    def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        assert isinstance(result, np.ndarray)

    def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype('O')

        result = algos.unique(arr)
        assert isinstance(result, np.ndarray)

    def test_object_refcount_bug(self):
        lst = ['A', 'B', 'C', 'D', 'E']
        for i in range(1000):
            len(algos.unique(lst))

    def test_on_index_object(self):

        mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
            np.arange(5), 5)])
        expected = mindex.values
        expected.sort()

        mindex = mindex.repeat(2)

        result = pd.unique(mindex)
        result.sort()

        tm.assert_almost_equal(result, expected)

    def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np_array_datetime64_compat(
            ['2015-01-03T00:00:00.000000000+0000',
             '2015-01-01T00:00:00.000000000+0000'],
            dtype='M8[ns]')

        dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000'])
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

    def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

    def test_uint64_overflow(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(algos.unique(s), exp)

    def test_nan_in_object_array(self):
        l = ['a', np.nan, 'c', 'c']
        result = pd.unique(l)
        expected = np.array(['a', np.nan, 'c'], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

    def test_categorical(self):

        # we are expecting to return in the order
        # of appearance
        expected = Categorical(list('bac'), categories=list('bac'))

        # we are expecting to return in the order
        # of the categories
        expected_o = Categorical(
            list('bac'), categories=list('abc'), ordered=True)

        # GH 15939
        c = Categorical(list('baabc'))
        result = c.unique()
        tm.assert_categorical_equal(result, expected)

        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected)

        c = Categorical(list('baabc'), ordered=True)
        result = c.unique()
        tm.assert_categorical_equal(result, expected_o)

        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected_o)

        # Series of categorical dtype
        s = Series(Categorical(list('baabc')), name='foo')
        result = s.unique()
        tm.assert_categorical_equal(result, expected)

        result = pd.unique(s)
        tm.assert_categorical_equal(result, expected)

        # CI -> return CI
        ci = CategoricalIndex(Categorical(list('baabc'),
                                          categories=list('bac')))
        expected = CategoricalIndex(expected)
        result = ci.unique()
        tm.assert_index_equal(result, expected)

        result = pd.unique(ci)
        tm.assert_index_equal(result, expected)

    def test_datetime64tz_aware(self):
        # GH 15939

        result = Series(
            Index([Timestamp('20160101', tz='US/Eastern'),
                   Timestamp('20160101', tz='US/Eastern')])).unique()
        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
                                       tz='US/Eastern')], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        result = Index([Timestamp('20160101', tz='US/Eastern'),
                        Timestamp('20160101', tz='US/Eastern')]).unique()
        expected = DatetimeIndex(['2016-01-01 00:00:00'],
                                 dtype='datetime64[ns, US/Eastern]', freq=None)
        tm.assert_index_equal(result, expected)

        result = pd.unique(
            Series(Index([Timestamp('20160101', tz='US/Eastern'),
                          Timestamp('20160101', tz='US/Eastern')])))
        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
                                       tz='US/Eastern')], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
                                  Timestamp('20160101', tz='US/Eastern')]))
        expected = DatetimeIndex(['2016-01-01 00:00:00'],
                                 dtype='datetime64[ns, US/Eastern]', freq=None)
        tm.assert_index_equal(result, expected)

    def test_order_of_appearance(self):
        # 9346
        # light testing of guarantee of order of appearance
        # these also are the doc-examples
        result = pd.unique(Series([2, 1, 3, 3]))
        tm.assert_numpy_array_equal(result,
                                    np.array([2, 1, 3], dtype='int64'))

        result = pd.unique(Series([2] + [1] * 5))
        tm.assert_numpy_array_equal(result,
                                    np.array([2, 1], dtype='int64'))

        result = pd.unique(Series([Timestamp('20160101'),
                                   Timestamp('20160101')]))
        expected = np.array(['2016-01-01T00:00:00.000000000'],
                            dtype='datetime64[ns]')
        tm.assert_numpy_array_equal(result, expected)

        result = pd.unique(Index(
            [Timestamp('20160101', tz='US/Eastern'),
             Timestamp('20160101', tz='US/Eastern')]))
        expected = DatetimeIndex(['2016-01-01 00:00:00'],
                                 dtype='datetime64[ns, US/Eastern]',
                                 freq=None)
        tm.assert_index_equal(result, expected)

        result = pd.unique(list('aabc'))
        expected = np.array(['a', 'b', 'c'], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        result = pd.unique(Series(Categorical(list('aabc'))))
        expected = Categorical(list('abc'))
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("arg ,expected", [
        (('1', '1', '2'), np.array(['1', '2'], dtype=object)),
        (('foo',), np.array(['foo'], dtype=object))
    ])
    def test_tuple_with_strings(self, arg, expected):
        # see GH 17108
        result = pd.unique(arg)
        tm.assert_numpy_array_equal(result, expected)

    def test_obj_none_preservation(self):
        # GH 20866
        arr = np.array(['foo', None], dtype=object)
        result = pd.unique(arr)
        expected = np.array(['foo', None], dtype=object)

        tm.assert_numpy_array_equal(result, expected, strict_nan=True)


class TestIsin(object):

    def test_invalid(self):

        pytest.raises(TypeError, lambda: algos.isin(1, 1))
        pytest.raises(TypeError, lambda: algos.isin(1, [1]))
        pytest.raises(TypeError, lambda: algos.isin([1], 1))

    def test_basic(self):

        result = algos.isin([1, 2], [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(np.array([1, 2]), [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series([1, 2]), [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series([1, 2]), Series([1]))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series([1, 2]), set([1]))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(['a', 'b'], ['a'])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series(['a', 'b']), Series(['a']))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series(['a', 'b']), set(['a']))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(['a', 'b'], [1])
        expected = np.array([False, False])
        tm.assert_numpy_array_equal(result, expected)

    def test_i8(self):

        arr = pd.date_range('20130101', periods=3).values
        result = algos.isin(arr, [arr[0]])
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, arr[0:2])
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, set(arr[0:2]))
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

        arr = pd.timedelta_range('1 day', periods=3).values
        result = algos.isin(arr, [arr[0]])
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, arr[0:2])
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, set(arr[0:2]))
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

    def test_large(self):

        s = pd.date_range('20000101', periods=2000000, freq='s').values
        result = algos.isin(s, s[0:2])
        expected = np.zeros(len(s), dtype=bool)
        expected[0] = True
        expected[1] = True
        tm.assert_numpy_array_equal(result, expected)

    def test_categorical_from_codes(self):
        # GH 16639
        vals = np.array([0, 1, 2, 0])
        cats = ['a', 'b', 'c']
        Sd = Series(Categorical(1).from_codes(vals, cats))
        St = Series(Categorical(1).from_codes(np.array([0, 1]), cats))
        expected = np.array([True, True, False, True])
        result = algos.isin(Sd, St)
        tm.assert_numpy_array_equal(expected, result)

    @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
    def test_empty(self, empty):
        # see gh-16991
        vals = Index(["a", "b"])
        expected = np.array([False, False])

        result = algos.isin(vals, empty)
        tm.assert_numpy_array_equal(expected, result)


class TestValueCounts(object):

    def test_value_counts(self):
        np.random.seed(1234)
        from pandas.core.reshape.tile import cut

        arr = np.random.randn(4)
        factor = cut(arr, 4)

        # assert isinstance(factor, n)
        result = algos.value_counts(factor)
        breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
        index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
        expected = Series([1, 1, 1, 1], index=index)
        tm.assert_series_equal(result.sort_index(), expected.sort_index())

    def test_value_counts_bins(self):
        s = [1, 2, 3, 4]
        result = algos.value_counts(s, bins=1)
        expected = Series([4],
                          index=IntervalIndex.from_tuples([(0.996, 4.0)]))
        tm.assert_series_equal(result, expected)

        result = algos.value_counts(s, bins=2, sort=False)
        expected = Series([2, 2],
                          index=IntervalIndex.from_tuples([(0.996, 2.5),
                                                           (2.5, 4.0)]))
        tm.assert_series_equal(result, expected)

    def test_value_counts_dtypes(self):
        result = algos.value_counts([1, 1.])
        assert len(result) == 1

        result = algos.value_counts([1, 1.], bins=1)
        assert len(result) == 1

        result = algos.value_counts(Series([1, 1., '1']))  # object
        assert len(result) == 2

        pytest.raises(TypeError, lambda s: algos.value_counts(s, bins=1),
                      ['1', 1])

    def test_value_counts_nat(self):
        td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]')
        dt = pd.to_datetime(['NaT', '2014-01-01'])

        for s in [td, dt]:
            vc = algos.value_counts(s)
            vc_with_na = algos.value_counts(s, dropna=False)
            assert len(vc) == 1
            assert len(vc_with_na) == 2

        exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1})
        tm.assert_series_equal(algos.value_counts(dt), exp_dt)
        # TODO same for (timedelta)

    def test_value_counts_datetime_outofbounds(self):
        # GH 13663
        s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1),
                    datetime(5000, 1, 1), datetime(6000, 1, 1),
                    datetime(3000, 1, 1), datetime(3000, 1, 1)])
        res = s.value_counts()

        exp_index = Index([datetime(3000, 1, 1), datetime(5000, 1, 1),
                           datetime(6000, 1, 1)], dtype=object)
        exp = Series([3, 2, 1], index=exp_index)
        tm.assert_series_equal(res, exp)

        # GH 12424
        res = pd.to_datetime(Series(['2362-01-01', np.nan]),
                             errors='ignore')
        exp = Series(['2362-01-01', np.nan], dtype=object)
        tm.assert_series_equal(res, exp)

    def test_categorical(self):
        s = Series(Categorical(list('aaabbc')))
        result = s.value_counts()
        expected = Series([3, 2, 1], index=CategoricalIndex(['a', 'b', 'c']))

        tm.assert_series_equal(result, expected, check_index_type=True)

        # preserve order?
        s = s.cat.as_ordered()
        result = s.value_counts()
        expected.index = expected.index.as_ordered()
        tm.assert_series_equal(result, expected, check_index_type=True)

    def test_categorical_nans(self):
        s = Series(Categorical(list('aaaaabbbcc')))  # 4,3,2,1 (nan)
        s.iloc[1] = np.nan
        result = s.value_counts()
        expected = Series([4, 3, 2], index=CategoricalIndex(
            ['a', 'b', 'c'], categories=['a', 'b', 'c']))
        tm.assert_series_equal(result, expected, check_index_type=True)
        result = s.value_counts(dropna=False)
        expected = Series([
            4, 3, 2, 1
        ], index=CategoricalIndex(['a', 'b', 'c', np.nan]))
        tm.assert_series_equal(result, expected, check_index_type=True)

        # out of order
        s = Series(Categorical(
            list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c']))
        s.iloc[1] = np.nan
        result = s.value_counts()
        expected = Series([4, 3, 2], index=CategoricalIndex(
            ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True))
        tm.assert_series_equal(result, expected, check_index_type=True)

        result = s.value_counts(dropna=False)
        expected = Series([4, 3, 2, 1], index=CategoricalIndex(
            ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
        tm.assert_series_equal(result, expected, check_index_type=True)

    def test_categorical_zeroes(self):
        # keep the `d` category with 0
        s = Series(Categorical(
            list('bbbaac'), categories=list('abcd'), ordered=True))
        result = s.value_counts()
        expected = Series([3, 2, 1, 0], index=Categorical(
            ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
        tm.assert_series_equal(result, expected, check_index_type=True)

    def test_dropna(self):
        # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328

        tm.assert_series_equal(
            Series([True, True, False]).value_counts(dropna=True),
            Series([2, 1], index=[True, False]))
        tm.assert_series_equal(
            Series([True, True, False]).value_counts(dropna=False),
            Series([2, 1], index=[True, False]))

        tm.assert_series_equal(
            Series([True, True, False, None]).value_counts(dropna=True),
            Series([2, 1], index=[True, False]))
        tm.assert_series_equal(
            Series([True, True, False, None]).value_counts(dropna=False),
            Series([2, 1, 1], index=[True, False, np.nan]))
        tm.assert_series_equal(
            Series([10.3, 5., 5.]).value_counts(dropna=True),
            Series([2, 1], index=[5., 10.3]))
        tm.assert_series_equal(
            Series([10.3, 5., 5.]).value_counts(dropna=False),
            Series([2, 1], index=[5., 10.3]))

        tm.assert_series_equal(
            Series([10.3, 5., 5., None]).value_counts(dropna=True),
            Series([2, 1], index=[5., 10.3]))

        # 32-bit linux has a different ordering
        if not compat.is_platform_32bit():
            result = Series([10.3, 5., 5., None]).value_counts(dropna=False)
            expected = Series([2, 1, 1], index=[5., 10.3, np.nan])
            tm.assert_series_equal(result, expected)

    def test_value_counts_normalized(self):
        # GH12558
        s = Series([1, 2, np.nan, np.nan, np.nan])
        dtypes = (np.float64, np.object, 'M8[ns]')
        for t in dtypes:
            s_typed = s.astype(t)
            result = s_typed.value_counts(normalize=True, dropna=False)
            expected = Series([0.6, 0.2, 0.2],
                              index=Series([np.nan, 2.0, 1.0], dtype=t))
            tm.assert_series_equal(result, expected)

            result = s_typed.value_counts(normalize=True, dropna=True)
            expected = Series([0.5, 0.5],
                              index=Series([2.0, 1.0], dtype=t))
            tm.assert_series_equal(result, expected)

    def test_value_counts_uint64(self):
        arr = np.array([2**63], dtype=np.uint64)
        expected = Series([1], index=[2**63])
        result = algos.value_counts(arr)

        tm.assert_series_equal(result, expected)

        arr = np.array([-1, 2**63], dtype=object)
        expected = Series([1, 1], index=[-1, 2**63])
        result = algos.value_counts(arr)

        # 32-bit linux has a different ordering
        if not compat.is_platform_32bit():
            tm.assert_series_equal(result, expected)


class TestDuplicated(object):

    def test_duplicated_with_nas(self):
        keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)

        result = algos.duplicated(keys)
        expected = np.array([False, False, False, True, False, True])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep='first')
        expected = np.array([False, False, False, True, False, True])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep='last')
        expected = np.array([True, False, True, False, False, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep=False)
        expected = np.array([True, False, True, True, False, True])
        tm.assert_numpy_array_equal(result, expected)

        keys = np.empty(8, dtype=object)
        for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2,
                                  [0, np.nan, 0, np.nan] * 2)):
            keys[i] = t

        result = algos.duplicated(keys)
        falses = [False] * 4
        trues = [True] * 4
        expected = np.array(falses + trues)
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep='last')
        expected = np.array(trues + falses)
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep=False)
        expected = np.array(trues + trues)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize('case', [
        np.array([1, 2, 1, 5, 3,
                  2, 4, 1, 5, 6]),
        np.array([1.1, 2.2, 1.1, np.nan, 3.3,
                  2.2, 4.4, 1.1, np.nan, 6.6]),
        pytest.param(np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
                               2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
                     marks=pytest.mark.xfail(reason="Complex bug. GH 16399")
                     ),
        np.array(['a', 'b', 'a', 'e', 'c',
                  'b', 'd', 'a', 'e', 'f'], dtype=object),
        np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7],
                 dtype=np.uint64),
    ])
    def test_numeric_object_likes(self, case):
        exp_first = np.array([False, False, True, False, False,
                              True, False, True, True, False])
        exp_last = np.array([True, True, True, True, False,
                             False, False, False, False, False])
        exp_false = exp_first | exp_last

        res_first = algos.duplicated(case, keep='first')
        tm.assert_numpy_array_equal(res_first, exp_first)

        res_last = algos.duplicated(case, keep='last')
        tm.assert_numpy_array_equal(res_last, exp_last)

        res_false = algos.duplicated(case, keep=False)
        tm.assert_numpy_array_equal(res_false, exp_false)

        # index
        for idx in [Index(case), Index(case, dtype='category')]:
            res_first = idx.duplicated(keep='first')
            tm.assert_numpy_array_equal(res_first, exp_first)

            res_last = idx.duplicated(keep='last')
            tm.assert_numpy_array_equal(res_last, exp_last)

            res_false = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(res_false, exp_false)

        # series
        for s in [Series(case), Series(case, dtype='category')]:
            res_first = s.duplicated(keep='first')
            tm.assert_series_equal(res_first, Series(exp_first))

            res_last = s.duplicated(keep='last')
            tm.assert_series_equal(res_last, Series(exp_last))

            res_false = s.duplicated(keep=False)
            tm.assert_series_equal(res_false, Series(exp_false))

    def test_datetime_likes(self):

        dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03',
              '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06']
        td = ['1 days', '2 days', '1 days', 'NaT', '3 days',
              '2 days', '4 days', '1 days', 'NaT', '6 days']

        cases = [np.array([Timestamp(d) for d in dt]),
                 np.array([Timestamp(d, tz='US/Eastern') for d in dt]),
                 np.array([pd.Period(d, freq='D') for d in dt]),
                 np.array([np.datetime64(d) for d in dt]),
                 np.array([pd.Timedelta(d) for d in td])]

        exp_first = np.array([False, False, True, False, False,
                              True, False, True, True, False])
        exp_last = np.array([True, True, True, True, False,
                             False, False, False, False, False])
        exp_false = exp_first | exp_last

        for case in cases:
            res_first = algos.duplicated(case, keep='first')
            tm.assert_numpy_array_equal(res_first, exp_first)

            res_last = algos.duplicated(case, keep='last')
            tm.assert_numpy_array_equal(res_last, exp_last)

            res_false = algos.duplicated(case, keep=False)
            tm.assert_numpy_array_equal(res_false, exp_false)

            # index
            for idx in [Index(case), Index(case, dtype='category'),
                        Index(case, dtype=object)]:
                res_first = idx.duplicated(keep='first')
                tm.assert_numpy_array_equal(res_first, exp_first)

                res_last = idx.duplicated(keep='last')
                tm.assert_numpy_array_equal(res_last, exp_last)

                res_false = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(res_false, exp_false)

            # series
            for s in [Series(case), Series(case, dtype='category'),
                      Series(case, dtype=object)]:
                res_first = s.duplicated(keep='first')
                tm.assert_series_equal(res_first, Series(exp_first))

                res_last = s.duplicated(keep='last')
                tm.assert_series_equal(res_last, Series(exp_last))

                res_false = s.duplicated(keep=False)
                tm.assert_series_equal(res_false, Series(exp_false))

    def test_unique_index(self):
        cases = [Index([1, 2, 3]), pd.RangeIndex(0, 3)]
        for case in cases:
            assert case.is_unique
            tm.assert_numpy_array_equal(case.duplicated(),
                                        np.array([False, False, False]))

    @pytest.mark.parametrize('arr, unique', [
        ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
         [(0, 0), (0, 1), (1, 0), (1, 1)]),
        ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')],
         [('b', 'c'), ('a', 'b')]),
        ([('a', 1), ('b', 2), ('a', 3), ('a', 1)],
         [('a', 1), ('b', 2), ('a', 3)]),
    ])
    def test_unique_tuples(self, arr, unique):
        # https://github.com/pandas-dev/pandas/issues/16519
        expected = np.empty(len(unique), dtype=object)
        expected[:] = unique

        result = pd.unique(arr)
        tm.assert_numpy_array_equal(result, expected)


class GroupVarTestMixin(object):

    def test_group_var_generic_1d(self):
        prng = RandomState(1234)

        out = (np.nan * np.ones((5, 1))).astype(self.dtype)
        counts = np.zeros(5, dtype='int64')
        values = 10 * prng.rand(15, 1).astype(self.dtype)
        labels = np.tile(np.arange(5), (3, )).astype('int64')

        expected_out = (np.squeeze(values)
                        .reshape((5, 3), order='F')
                        .std(axis=1, ddof=1) ** 2)[:, np.newaxis]
        expected_counts = counts + 3

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self):
        prng = RandomState(1234)

        out = (np.nan * np.ones((1, 1))).astype(self.dtype)
        counts = np.zeros(1, dtype='int64')
        values = 10 * prng.rand(5, 1).astype(self.dtype)
        labels = np.zeros(5, dtype='int64')

        expected_out = np.array([[values.std(ddof=1) ** 2]])
        expected_counts = counts + 5

        self.algo(out, counts, values, labels)

        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self):
        prng = RandomState(1234)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype='int64')
        values = 10 * prng.rand(10, 2).astype(self.dtype)
        labels = np.tile(np.arange(5), (2, )).astype('int64')

        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        assert np.allclose(out, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self):
        prng = RandomState(1234)

        out = (np.nan * np.ones((5, 2))).astype(self.dtype)
        counts = np.zeros(5, dtype='int64')
        values = 10 * prng.rand(10, 2).astype(self.dtype)
        values[:, 1] = np.nan
        labels = np.tile(np.arange(5), (2, )).astype('int64')

        expected_out = np.vstack([values[:, 0]
                                  .reshape(5, 2, order='F')
                                  .std(ddof=1, axis=1) ** 2,
                                  np.nan * np.ones(5)]).T.astype(self.dtype)
        expected_counts = counts + 2

        self.algo(out, counts, values, labels)
        tm.assert_almost_equal(out, expected_out, check_less_precise=6)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self):
        # Regression test from GH 10448.

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype='int64')
        values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
        labels = np.zeros(3, dtype='int64')

        self.algo(out, counts, values, labels)

        assert counts[0] == 3
        assert out[0, 0] >= 0
        tm.assert_almost_equal(out[0, 0], 0.0)


class TestGroupVarFloat64(GroupVarTestMixin):
    __test__ = True

    algo = libgroupby.group_var_float64
    dtype = np.float64
    rtol = 1e-5

    def test_group_var_large_inputs(self):

        prng = RandomState(1234)

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype='int64')
        values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype)
        values.shape = (10 ** 6, 1)
        labels = np.zeros(10 ** 6, dtype='int64')

        self.algo(out, counts, values, labels)

        assert counts[0] == 10 ** 6
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True)


class TestGroupVarFloat32(GroupVarTestMixin):
    __test__ = True

    algo = libgroupby.group_var_float32
    dtype = np.float32
    rtol = 1e-2


class TestHashTable(object):

    def test_lookup_nan(self):
        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
        m = ht.Float64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
                                                            dtype=np.int64))

    def test_lookup_overflow(self):
        xs = np.array([1, 2, 2**63], dtype=np.uint64)
        m = ht.UInt64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
                                                            dtype=np.int64))

    def test_get_unique(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(s.unique(), exp)

    def test_vector_resize(self):
        # Test for memory errors after internal vector
        # reallocations (pull request #7157)

        def _test_vector_resize(htable, uniques, dtype, nvals, safely_resizes):
            vals = np.array(np.random.randn(1000), dtype=dtype)
            # get_labels may append to uniques
            htable.get_labels(vals[:nvals], uniques, 0, -1)
            # to_array() set an external_view_exists flag on uniques.
            tmp = uniques.to_array()
            oldshape = tmp.shape
            # subsequent get_labels() calls can no longer append to it
            # (for all but StringHashTables + ObjectVector)
            if safely_resizes:
                htable.get_labels(vals, uniques, 0, -1)
            else:
                with pytest.raises(ValueError) as excinfo:
                    htable.get_labels(vals, uniques, 0, -1)
                assert str(excinfo.value).startswith('external reference')
            uniques.to_array()   # should not raise here
            assert tmp.shape == oldshape

        test_cases = [
            (ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
            (ht.StringHashTable, ht.ObjectVector, 'object', True),
            (ht.Float64HashTable, ht.Float64Vector, 'float64', False),
            (ht.Int64HashTable, ht.Int64Vector, 'int64', False),
            (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]

        for (tbl, vect, dtype, safely_resizes) in test_cases:
            # resizing to empty is a special case
            _test_vector_resize(tbl(), vect(), dtype, 0, safely_resizes)
            _test_vector_resize(tbl(), vect(), dtype, 10, safely_resizes)


def test_quantile():
    s = Series(np.random.randn(100))

    result = algos.quantile(s, [0, .25, .5, .75, 1.])
    expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
    tm.assert_almost_equal(result, expected)


def test_unique_label_indices():

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right,
                                check_dtype=False)

    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right,
                                check_dtype=False)


class TestRank(object):

    @td.skip_if_no_scipy
    def test_scipy_compat(self):
        from scipy.stats import rankdata

        def _check(arr):
            mask = ~np.isfinite(arr)
            arr = arr.copy()
            result = libalgos.rank_1d_float64(arr)
            arr[mask] = np.inf
            exp = rankdata(arr)
            exp[mask] = nan
            assert_almost_equal(result, exp)

        _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan]))
        _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan]))

    def test_basic(self):
        exp = np.array([1, 2], dtype=np.float64)

        for dtype in np.typecodes['AllInteger']:
            s = Series([1, 100], dtype=dtype)
            tm.assert_numpy_array_equal(algos.rank(s), exp)

    def test_uint64_overflow(self):
        exp = np.array([1, 2], dtype=np.float64)

        for dtype in [np.float64, np.uint64]:
            s = Series([1, 2**63], dtype=dtype)
            tm.assert_numpy_array_equal(algos.rank(s), exp)

    def test_too_many_ndims(self):
        arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
        msg = "Array with ndim > 2 are not supported"

        with tm.assert_raises_regex(TypeError, msg):
            algos.rank(arr)


def test_pad_backfill_object_segfault():

    old = np.array([], dtype='O')
    new = np.array([datetime(2010, 12, 31)], dtype='O')

    result = libalgos.pad_object(old, new)
    expected = np.array([-1], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.pad_object(new, old)
    expected = np.array([], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.backfill_object(old, new)
    expected = np.array([-1], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.backfill_object(new, old)
    expected = np.array([], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)


def test_arrmap():
    values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O')
    result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar'])
    assert (result.dtype == np.bool_)


class TestTseriesUtil(object):

    def test_combineFunc(self):
        pass

    def test_reindex(self):
        pass

    def test_isna(self):
        pass

    def test_groupby(self):
        pass

    def test_groupby_withnull(self):
        pass

    def test_backfill(self):
        old = Index([1, 5, 10])
        new = Index(lrange(12))

        filler = libalgos.backfill_int64(old.values, new.values)

        expect_filler = np.array([0, 0, 1, 1, 1, 1,
                                  2, 2, 2, 2, 2, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case
        old = Index([1, 4])
        new = Index(lrange(5, 10))
        filler = libalgos.backfill_int64(old.values, new.values)

        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)

    def test_pad(self):
        old = Index([1, 5, 10])
        new = Index(lrange(12))

        filler = libalgos.pad_int64(old.values, new.values)

        expect_filler = np.array([-1, 0, 0, 0, 0, 1,
                                  1, 1, 1, 1, 2, 2], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case
        old = Index([5, 10])
        new = Index(lrange(5))
        filler = libalgos.pad_int64(old.values, new.values)
        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)


def test_is_lexsorted():
    failure = [
        np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                  3, 3,
                  3, 3,
                  3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
                  2, 2, 2, 2, 2, 2, 2,
                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                  1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1,
                  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'),
        np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
                  15, 14,
                  13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28,
                  27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
                  12, 11,
                  10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25,
                  24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,
                  9, 8,
                  7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22,
                  21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                  6, 5,
                  4, 3, 2, 1, 0], dtype='int64')]

    assert (not libalgos.is_lexsorted(failure))


def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype(np.int64)
    b = np.random.randint(0, 1000, 100).astype(np.int64)

    result = libalgos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    # np.argsort returns int, groupsort_indexer
    # always returns int64
    expected = np.argsort(a, kind='mergesort')
    expected = expected.astype(np.int64)

    tm.assert_numpy_array_equal(result, expected)

    # compare with lexsort
    # np.lexsort returns int, groupsort_indexer
    # always returns int64
    key = a * 1000 + b
    result = libalgos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    expected = expected.astype(np.int64)

    tm.assert_numpy_array_equal(result, expected)


def test_infinity_sort():
    # GH 13445
    # numpy's argsort can be unhappy if something is less than
    # itself.  Instead, let's give our infinities a self-consistent
    # ordering, but outside the float extended real line.

    Inf = libalgos.Infinity()
    NegInf = libalgos.NegInfinity()

    ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]

    assert all(Inf >= x for x in ref_nums)
    assert all(Inf > x or x is Inf for x in ref_nums)
    assert Inf >= Inf and Inf == Inf
    assert not Inf < Inf and not Inf > Inf
    assert libalgos.Infinity() == libalgos.Infinity()
    assert not libalgos.Infinity() != libalgos.Infinity()

    assert all(NegInf <= x for x in ref_nums)
    assert all(NegInf < x or x is NegInf for x in ref_nums)
    assert NegInf <= NegInf and NegInf == NegInf
    assert not NegInf < NegInf and not NegInf > NegInf
    assert libalgos.NegInfinity() == libalgos.NegInfinity()
    assert not libalgos.NegInfinity() != libalgos.NegInfinity()

    for perm in permutations(ref_nums):
        assert sorted(perm) == ref_nums

    # smoke tests
    np.array([libalgos.Infinity()] * 32).argsort()
    np.array([libalgos.NegInfinity()] * 32).argsort()


def test_infinity_against_nan():
    Inf = libalgos.Infinity()
    NegInf = libalgos.NegInfinity()

    assert not Inf > np.nan
    assert not Inf >= np.nan
    assert not Inf < np.nan
    assert not Inf <= np.nan
    assert not Inf == np.nan
    assert Inf != np.nan

    assert not NegInf > np.nan
    assert not NegInf >= np.nan
    assert not NegInf < np.nan
    assert not NegInf <= np.nan
    assert not NegInf == np.nan
    assert NegInf != np.nan


def test_ensure_platform_int():
    arr = np.arange(100, dtype=np.intp)

    result = libalgos.ensure_platform_int(arr)
    assert (result is arr)


def test_int64_add_overflow():
    # see gh-14068
    msg = "Overflow in int64 addition"
    m = np.iinfo(np.int64).max
    n = np.iinfo(np.int64).min

    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), m)
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([n, n]), n)
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   arr_mask=np.array([False, True]))
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   b_mask=np.array([False, True]))
    with tm.assert_raises_regex(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   arr_mask=np.array([False, True]),
                                   b_mask=np.array([False, True]))
    with tm.assert_raises_regex(OverflowError, msg):
        with tm.assert_produces_warning(RuntimeWarning):
            algos.checked_add_with_arr(np.array([m, m]),
                                       np.array([np.nan, m]))

    # Check that the nan boolean arrays override whether or not
    # the addition overflows. We don't check the result but just
    # the fact that an OverflowError is not raised.
    with pytest.raises(AssertionError):
        with tm.assert_raises_regex(OverflowError, msg):
            algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                       arr_mask=np.array([True, True]))
    with pytest.raises(AssertionError):
        with tm.assert_raises_regex(OverflowError, msg):
            algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                       b_mask=np.array([True, True]))
    with pytest.raises(AssertionError):
        with tm.assert_raises_regex(OverflowError, msg):
            algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                       arr_mask=np.array([True, False]),
                                       b_mask=np.array([False, True]))


class TestMode(object):

    def test_no_mode(self):
        exp = Series([], dtype=np.float64)
        tm.assert_series_equal(algos.mode([]), exp)

    def test_mode_single(self):
        # GH 15714
        exp_single = [1]
        data_single = [1]

        exp_multi = [1]
        data_multi = [1, 1]

        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
            s = Series(data_single, dtype=dt)
            exp = Series(exp_single, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

            s = Series(data_multi, dtype=dt)
            exp = Series(exp_multi, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

        exp = Series([1], dtype=np.int)
        tm.assert_series_equal(algos.mode([1]), exp)

        exp = Series(['a', 'b', 'c'], dtype=np.object)
        tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp)

    def test_number_mode(self):
        exp_single = [1]
        data_single = [1] * 5 + [2] * 3

        exp_multi = [1, 3]
        data_multi = [1] * 5 + [2] * 3 + [3] * 5

        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
            s = Series(data_single, dtype=dt)
            exp = Series(exp_single, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

            s = Series(data_multi, dtype=dt)
            exp = Series(exp_multi, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

    def test_strobj_mode(self):
        exp = ['b']
        data = ['a'] * 2 + ['b'] * 3

        s = Series(data, dtype='c')
        exp = Series(exp, dtype='c')
        tm.assert_series_equal(algos.mode(s), exp)

        exp = ['bar']
        data = ['foo'] * 2 + ['bar'] * 3

        for dt in [str, object]:
            s = Series(data, dtype=dt)
            exp = Series(exp, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

    def test_datelike_mode(self):
        exp = Series(['1900-05-03', '2011-01-03',
                      '2013-01-02'], dtype="M8[ns]")
        s = Series(['2011-01-03', '2013-01-02',
                    '1900-05-03'], dtype='M8[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

        exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]')
        s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
                    '2011-01-03', '2013-01-02'], dtype='M8[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

    def test_timedelta_mode(self):
        exp = Series(['-1 days', '0 days', '1 days'],
                     dtype='timedelta64[ns]')
        s = Series(['1 days', '-1 days', '0 days'],
                   dtype='timedelta64[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

        exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]')
        s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min',
                    '2 min', '2 min'], dtype='timedelta64[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

    def test_mixed_dtype(self):
        exp = Series(['foo'])
        s = Series([1, 'foo', 'foo'])
        tm.assert_series_equal(algos.mode(s), exp)

    def test_uint64_overflow(self):
        exp = Series([2**63], dtype=np.uint64)
        s = Series([1, 2**63, 2**63], dtype=np.uint64)
        tm.assert_series_equal(algos.mode(s), exp)

        exp = Series([1, 2**63], dtype=np.uint64)
        s = Series([1, 2**63], dtype=np.uint64)
        tm.assert_series_equal(algos.mode(s), exp)

    def test_categorical(self):
        c = Categorical([1, 2])
        exp = c
        tm.assert_categorical_equal(algos.mode(c), exp)
        tm.assert_categorical_equal(c.mode(), exp)

        c = Categorical([1, 'a', 'a'])
        exp = Categorical(['a'], categories=[1, 'a'])
        tm.assert_categorical_equal(algos.mode(c), exp)
        tm.assert_categorical_equal(c.mode(), exp)

        c = Categorical([1, 1, 2, 3, 3])
        exp = Categorical([1, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(algos.mode(c), exp)
        tm.assert_categorical_equal(c.mode(), exp)

    def test_index(self):
        idx = Index([1, 2, 3])
        exp = Series([1, 2, 3], dtype=np.int64)
        tm.assert_series_equal(algos.mode(idx), exp)

        idx = Index([1, 'a', 'a'])
        exp = Series(['a'], dtype=object)
        tm.assert_series_equal(algos.mode(idx), exp)

        idx = Index([1, 1, 2, 3, 3])
        exp = Series([1, 3], dtype=np.int64)
        tm.assert_series_equal(algos.mode(idx), exp)

        exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]')
        idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min',
                     '2 min', '2 min'], dtype='timedelta64[ns]')
        tm.assert_series_equal(algos.mode(idx), exp)
360core / pandas python

Products

About

Resources

Contact Gemfury