Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

Version: 0.24.2 

/ tests / indexes / multi / test_duplicates.py

# -*- coding: utf-8 -*-

from itertools import product

import numpy as np
import pytest

from pandas._libs import hashtable
from pandas.compat import range, u

from pandas import DatetimeIndex, MultiIndex
import pandas.util.testing as tm


@pytest.mark.parametrize('names', [None, ['first', 'second']])
def test_unique(names):
    mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)

    res = mi.unique()
    exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
    tm.assert_index_equal(res, exp)

    mi = MultiIndex.from_arrays([list('aaaa'), list('abab')],
                                names=names)
    res = mi.unique()
    exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names)
    tm.assert_index_equal(res, exp)

    mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names)
    res = mi.unique()
    exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names)
    tm.assert_index_equal(res, exp)

    # GH #20568 - empty MI
    mi = MultiIndex.from_arrays([[], []], names=names)
    res = mi.unique()
    tm.assert_index_equal(mi, res)


def test_unique_datetimelike():
    idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01',
                          '2015-01-01', 'NaT', 'NaT'])
    idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02',
                          '2015-01-02', 'NaT', '2015-01-01'],
                         tz='Asia/Tokyo')
    result = MultiIndex.from_arrays([idx1, idx2]).unique()

    eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT'])
    eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02',
                           'NaT', '2015-01-01'],
                          tz='Asia/Tokyo')
    exp = MultiIndex.from_arrays([eidx1, eidx2])
    tm.assert_index_equal(result, exp)


@pytest.mark.parametrize('level', [0, 'first', 1, 'second'])
def test_unique_level(idx, level):
    # GH #17896 - with level= argument
    result = idx.unique(level=level)
    expected = idx.get_level_values(level).unique()
    tm.assert_index_equal(result, expected)

    # With already unique level
    mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]],
                                names=['first', 'second'])
    result = mi.unique(level=level)
    expected = mi.get_level_values(level)
    tm.assert_index_equal(result, expected)

    # With empty MI
    mi = MultiIndex.from_arrays([[], []], names=['first', 'second'])
    result = mi.unique(level=level)
    expected = mi.get_level_values(level)


@pytest.mark.parametrize('dropna', [True, False])
def test_get_unique_index(idx, dropna):
    mi = idx[[0, 1, 0, 1, 1, 0, 0]]
    expected = mi._shallow_copy(mi[[0, 1]])

    result = mi._get_unique_index(dropna=dropna)
    assert result.unique
    tm.assert_index_equal(result, expected)


def test_duplicate_multiindex_codes():
    # GH 17464
    # Make sure that a MultiIndex with duplicate levels throws a ValueError
    with pytest.raises(ValueError):
        mi = MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])

    # And that using set_levels with duplicate levels fails
    mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
                                 [1, 2, 1, 2, 3]])
    with pytest.raises(ValueError):
        mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
                      inplace=True)


@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
                                   [1, 'a', 1]])
def test_duplicate_level_names(names):
    # GH18872, GH19029
    mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
    assert mi.names == names

    # With .rename()
    mi = MultiIndex.from_product([[0, 1]] * 3)
    mi = mi.rename(names)
    assert mi.names == names

    # With .rename(., level=)
    mi.rename(names[1], level=1, inplace=True)
    mi = mi.rename([names[0], names[2]], level=[0, 2])
    assert mi.names == names


def test_duplicate_meta_data():
    # GH 10115
    mi = MultiIndex(
        levels=[[0, 1], [0, 1, 2]],
        codes=[[0, 0, 0, 0, 1, 1, 1],
               [0, 1, 2, 0, 0, 1, 2]])

    for idx in [mi,
                mi.set_names([None, None]),
                mi.set_names([None, 'Num']),
                mi.set_names(['Upper', 'Num']), ]:
        assert idx.has_duplicates
        assert idx.drop_duplicates().names == idx.names


def test_has_duplicates(idx, idx_dup):
    # see fixtures
    assert idx.is_unique is True
    assert idx.has_duplicates is False
    assert idx_dup.is_unique is False
    assert idx_dup.has_duplicates is True

    mi = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                    codes=[[0, 0, 0, 0, 1, 1, 1],
                           [0, 1, 2, 0, 0, 1, 2]])
    assert mi.is_unique is False
    assert mi.has_duplicates is True

    # single instance of NaN
    mi_nan = MultiIndex(levels=[['a', 'b'], [0, 1]],
                        codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]])
    assert mi_nan.is_unique is True
    assert mi_nan.has_duplicates is False

    # multiple instances of NaN
    mi_nan_dup = MultiIndex(levels=[['a', 'b'], [0, 1]],
                            codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]])
    assert mi_nan_dup.is_unique is False
    assert mi_nan_dup.has_duplicates is True


def test_has_duplicates_from_tuples():
    # GH 9075
    t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
         (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
         (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135),
         (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145),
         (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158),
         (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122),
         (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160),
         (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180),
         (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143),
         (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128),
         (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129),
         (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111),
         (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114),
         (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121),
         (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126),
         (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155),
         (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
         (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]

    mi = MultiIndex.from_tuples(t)
    assert not mi.has_duplicates


def test_has_duplicates_overflow():
    # handle int64 overflow if possible
    def check(nlevels, with_nulls):
        codes = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:  # inject some null values
            codes[500] = -1  # common nan value
            codes = [codes.copy() for i in range(nlevels)]
            for i in range(nlevels):
                codes[i][500 + i - nlevels // 2] = -1

            codes += [np.array([-1, 1]).repeat(500)]
        else:
            codes = [codes] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        mi = MultiIndex(levels=levels, codes=codes)
        assert not mi.has_duplicates

        # with a dup
        if with_nulls:
            def f(a):
                return np.insert(a, 1000, a[0])
            codes = list(map(f, codes))
            mi = MultiIndex(levels=levels, codes=codes)
        else:
            values = mi.values.tolist()
            mi = MultiIndex.from_tuples(values + [values[0]])

        assert mi.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
    check(8, False)
    check(8, True)


@pytest.mark.parametrize('keep, expected', [
    ('first', np.array([False, False, False, True, True, False])),
    ('last', np.array([False, True, True, False, False, False])),
    (False, np.array([False, True, True, True, True, False]))
])
def test_duplicated(idx_dup, keep, expected):
    result = idx_dup.duplicated(keep=keep)
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize('keep', ['first', 'last', False])
def test_duplicated_large(keep):
    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    codes = [np.random.choice(n, k * n) for lev in levels]
    mi = MultiIndex(levels=levels, codes=codes)

    result = mi.duplicated(keep=keep)
    expected = hashtable.duplicated_object(mi.values, keep=keep)
    tm.assert_numpy_array_equal(result, expected)


def test_get_duplicates():
    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        with tm.assert_produces_warning(FutureWarning):
            # Deprecated - see GH20239
            assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(),
                                    np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            codes = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
                            codes=np.random.permutation(list(codes)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            with tm.assert_produces_warning(FutureWarning):
                # Deprecated - see GH20239
                assert mi.get_duplicates().equals(MultiIndex.from_arrays(
                    [[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))