# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime, timedelta
import inspect
import numpy as np
import pytest
from pandas.compat import PY2, lrange
from pandas.core.dtypes.common import (
is_categorical_dtype, is_interval_dtype, is_object_dtype)
from pandas import (
Categorical, DataFrame, DatetimeIndex, Index, IntervalIndex, MultiIndex,
RangeIndex, Series, Timestamp, cut, date_range, to_datetime)
import pandas.util.testing as tm
class TestDataFrameAlterAxes():
def test_set_index_directly(self, float_string_frame):
    """Assign to ``.index`` directly and check the stored index.

    A reversed integer index of the frame's length is assigned and read
    back; assigning a strided slice of it must raise ``ValueError``.
    """
    frame = float_string_frame
    n_rows = len(frame)
    reversed_idx = Index(np.arange(n_rows)[::-1])

    # plain attribute assignment must keep the index as given
    frame.index = reversed_idx
    tm.assert_index_equal(frame.index, reversed_idx)

    # every second label only -> expected to be rejected
    strided = reversed_idx[::2]
    with pytest.raises(ValueError, match='Length mismatch'):
        frame.index = strided
def test_set_index(self, float_string_frame):
    """Pass an ``Index`` object to ``set_index`` and check the result.

    The returned frame must carry the passed index; passing a strided
    slice of that index must raise ``ValueError``.
    """
    new_index = Index(np.arange(len(float_string_frame))[::-1])

    reindexed = float_string_frame.set_index(new_index)
    tm.assert_index_equal(reindexed.index, new_index)

    # every second label only -> expected to be rejected
    with pytest.raises(ValueError, match='Length mismatch'):
        reindexed.set_index(new_index[::2])
def test_set_index_cast(self):
    """Set an index produced by ``astype`` and compare with the original."""
    # issue casting an index then set_index
    years = [2010, 2011, 2012]
    data = {'A': [1.1, 2.2, 3.3], 'B': [5.0, 6.1, 7.2]}
    original = DataFrame(data, index=years)

    cast_index = original.index.astype(np.int32)
    recast = original.set_index(cast_index)

    tm.assert_frame_equal(original, recast)
# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
                                  ('tuple', 'as', 'label')])
@pytest.mark.parametrize('inplace', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_drop_inplace(self, frame_of_index_cols,
                                drop, inplace, keys):
    """Check ``set_index`` by column key(s) for all drop/inplace combos.

    The expected frame is built by hand: a ``MultiIndex`` when ``keys``
    is a list, otherwise a single ``Index`` named after the key, with
    the key column(s) removed only when ``drop`` is true.
    """
    df = frame_of_index_cols

    if isinstance(keys, list):
        idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)
    else:
        idx = Index(df[keys], name=keys)

    # Copy in the drop=False case: without it ``expected`` would alias the
    # fixture, and assigning the index below would already install ``idx``
    # on ``df`` before ``set_index`` is exercised.
    expected = df.drop(keys, axis=1) if drop else df.copy()
    expected.index = idx

    if inplace:
        # operate on a copy so the fixture itself is left untouched
        result = df.copy()
        result.set_index(keys, drop=drop, inplace=True)
    else:
        result = df.set_index(keys, drop=drop)

    tm.assert_frame_equal(result, expected)
# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
                                  ('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append(self, frame_of_index_cols, drop, keys):
    """Check ``set_index(..., append=True)`` against a hand-built index.

    The expected index stacks the existing (unnamed) index with one
    level per key column.
    """
    frame = frame_of_index_cols
    if not isinstance(keys, list):
        # a single label (including a tuple label) becomes a one-item list
        keys = [keys]

    levels = [frame.index]
    levels.extend(frame[label] for label in keys)
    level_names = [None] + keys
    appended = MultiIndex.from_arrays(levels, names=level_names)

    if drop:
        expected = frame.drop(keys, axis=1)
    else:
        expected = frame.copy()
    expected.index = appended

    result = frame.set_index(keys, drop=drop, append=True)
    tm.assert_frame_equal(result, expected)
# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
                                  ('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append_to_multiindex(self, frame_of_index_cols,
                                        drop, keys):
    """Appending keys in two steps must equal appending them in one."""
    base = frame_of_index_cols
    key_list = keys if isinstance(keys, list) else [keys]

    # append to existing multiindex: first 'D', then the keys
    with_d = base.set_index(['D'], drop=drop, append=True)
    result = with_d.set_index(key_list, drop=drop, append=True)

    # reference: 'D' and the keys appended in a single call
    all_keys = ['D'] + key_list
    expected = base.set_index(all_keys, drop=drop, append=True)

    tm.assert_frame_equal(result, expected)
def test_set_index_after_mutation(self):
# GH1590
df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']})
expected = DataFrame({'val': [1, 2]},
Index(['b', 'c'], name='key'))
df2 = df.loc[df.index.map(lambda indx: indx >= 1)]
result = df2.set_index('key')
tm.assert_frame_equal(result, expected)
# MultiIndex constructor does not work directly on Series -> lambda
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize('box', [Series, Index, np.array,
                                 list, lambda x: [list(x)],
                                 lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name', [(True, None),
                         (True, 'B'), (True, 'test'), (False, None)])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_pass_single_array(self, frame_of_index_cols,
                                     drop, append, index_name, box):
    """Pass column 'B' wrapped by ``box`` as the sole ``set_index`` key.

    A plain list is expected to raise ``KeyError``; every other wrapper
    must give the same frame as ``set_index(['B'], drop=False)`` with
    the index names adjusted to what the wrapper carries.
    """
    frame = frame_of_index_cols
    frame.index.name = index_name
    key = box(frame['B'])

    if box == list:
        # list of strings gets interpreted as list of keys
        # NOTE(review): ``match`` is applied as a regular expression, so
        # the brackets make this a character class -- confirm intent.
        msg = "['one', 'two', 'three', 'one', 'two']"
        with pytest.raises(KeyError, match=msg):
            frame.set_index(key, drop=drop, append=append)
        return

    # np.array/list-of-list "forget" the name of B
    level_names = getattr(key, 'names', None)
    if level_names is None:
        level_names = [getattr(key, 'name', None)]

    result = frame.set_index(key, drop=drop, append=append)

    # only valid column keys are dropped
    # since B is always passed as array above, nothing is dropped
    expected = frame.set_index(['B'], drop=False, append=append)
    if append:
        expected.index.names = [index_name] + level_names
    else:
        expected.index.names = level_names

    tm.assert_frame_equal(result, expected)
# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
                                 lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name',
                         [(True, None), (True, 'A'), (True, 'B'),
                          (True, 'test'), (False, None)])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_pass_arrays(self, frame_of_index_cols,
                               drop, append, index_name, box):
    """Mix the label 'A' with column 'B' wrapped by ``box`` as keys."""
    frame = frame_of_index_cols
    frame.index.name = index_name

    keys = ['A', box(frame['B'])]

    # np.array/list "forget" the name of B
    if box in [np.array, list, tuple, iter]:
        second_name = None
    else:
        second_name = 'B'
    names = ['A', second_name]

    result = frame.set_index(keys, drop=drop, append=append)

    # only valid column keys are dropped
    # since B is always passed as array above, only A is dropped, if at all
    expected = frame.set_index(['A', 'B'], drop=False, append=append)
    if drop:
        expected = expected.drop('A', axis=1)
    expected.index.names = [index_name] + names if append else names

    tm.assert_frame_equal(result, expected)
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
                                  lambda x: MultiIndex.from_arrays([x]),
                                  lambda x: x.name])
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
                                  lambda x: MultiIndex.from_arrays([x]),
                                  lambda x: x.name])
@pytest.mark.parametrize('append, index_name', [(True, None),
                         (True, 'A'), (True, 'test'), (False, None)])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
                                         append, index_name, box1, box2):
    """Use column 'A' twice as key, each time wrapped by its own box.

    The result of a single ``set_index`` call with both keys is compared
    with adding the two keys one after the other.
    """
    df = frame_of_index_cols
    df.index.name = index_name

    keys = [box1(df['A']), box2(df['A'])]

    result = df.set_index(keys, drop=drop, append=append)

    # need to adapt first drop for case that both keys are 'A' --
    # cannot drop the same column twice.
    # The isinstance check comes first so that containers are never compared
    # with ==, which would give an ambiguous Boolean error; identity
    # comparison against a string literal is avoided because it depends on
    # string interning.
    both_are_label = all(isinstance(key, str) and key == 'A'
                         for key in keys)
    first_drop = False if both_are_label else drop

    # to test against already-tested behaviour, we add sequentially,
    # hence second append always True; must wrap keys in list, otherwise
    # box = list would be interpreted as keys
    expected = df.set_index([keys[0]], drop=first_drop, append=append)
    expected = expected.set_index([keys[1]], drop=drop, append=True)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_pass_multiindex(self, frame_of_index_cols,
                                   drop, append):
    """Pass a ready-made ``MultiIndex`` built from columns 'A' and 'B'."""
    frame = frame_of_index_cols

    level_arrays = [frame['A'], frame['B']]
    multi = MultiIndex.from_arrays(level_arrays, names=['A', 'B'])

    result = frame.set_index(multi, drop=drop, append=append)

    # setting with a MultiIndex will never drop columns
    expected = frame.set_index(['A', 'B'], drop=False, append=append)

    tm.assert_frame_equal(result, expected)
def test_set_index_verify_integrity(self, frame_of_index_cols):
    """``verify_integrity=True`` must reject keys with duplicate entries."""
    frame = frame_of_index_cols
    duplicate_msg = 'Index has duplicate keys'

    # single column label
    with pytest.raises(ValueError, match=duplicate_msg):
        frame.set_index('A', verify_integrity=True)

    # with MultiIndex
    doubled = [frame['A'], frame['A']]
    with pytest.raises(ValueError, match=duplicate_msg):
        frame.set_index(doubled, verify_integrity=True)
@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
    """Keys that are not columns must make ``set_index`` raise ``KeyError``.

    NOTE(review): ``match`` is applied as a regular expression, so the
    bracketed patterns below act as character classes rather than
    literal reprs -- confirm whether a stricter match is intended.
    """
    frame = frame_of_index_cols
    options = {'drop': drop, 'append': append}

    with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
        # column names are A-E, as well as one tuple
        frame.set_index(['foo', 'bar', 'baz'], **options)

    # non-existent key in list with arrays
    with pytest.raises(KeyError, match='X'):
        frame.set_index([frame['A'], frame['B'], 'X'], **options)

    msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"

    # tuples always raise KeyError
    with pytest.raises(KeyError, match=msg):
        frame.set_index(tuple(frame['A']), **options)

    # also within a list
    with pytest.raises(KeyError, match=msg):
        frame.set_index(['A', frame['A'], tuple(frame['A'])], **options)
@pytest.mark.xfail(reason='broken due to revert, see GH 25085')
@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
@pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)],
                         ids=['set', 'iter', 'generator'])
def test_set_index_raise_on_type(self, frame_of_index_cols, box,
                                 drop, append):
    """Unsupported key containers are expected to raise ``TypeError``.

    Currently marked ``xfail`` (see the decorator's reason).
    """
    frame = frame_of_index_cols
    options = {'drop': drop, 'append': append}
    msg = 'The parameter "keys" may be a column key, .*'

    # forbidden type, e.g. set/iter/generator
    with pytest.raises(TypeError, match=msg):
        frame.set_index(box(frame['A']), **options)

    # forbidden type in list, e.g. set/iter/generator
    with pytest.raises(TypeError, match=msg):
        frame.set_index(['A', frame['A'], box(frame['A'])], **options)
def test_set_index_custom_label_type(self):
    """Use instances of a user-defined class as column labels and keys."""
    # GH 24969

    class Thing(object):
        def __init__(self, name, color):
            self.name = name
            self.color = color

        def __repr__(self):
            return "<Thing %r>" % (self.name,)

        # necessary for pretty KeyError: str and repr share one rendering
        __str__ = __repr__

    thing1 = Thing('One', 'red')
    thing2 = Thing('Two', 'blue')
    df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
    expected = DataFrame({thing1: [0, 1]},
                         index=Index([2, 3], name=thing2))

    # use custom label directly, then wrapped in a list
    for present in (thing2, [thing2]):
        result = df.set_index(present)
        tm.assert_frame_equal(result, expected)

    # missing key
    thing3 = Thing('Three', 'pink')
    msg = "<Thing 'Three'>"

    # missing label directly, then missing label in a list
    for absent in (thing3, [thing3]):
        with pytest.raises(KeyError, match=msg):
            df.set_index(absent)
def test_set_index_custom_label_hashable_iterable(self):
# GH 24969
# actual example discussed in GH 24984 was e.g. for shapely.geometry
# objects (e.g. a collection of Points) that can be both hashable and
# iterable; using frozenset as a stand-in for testing here
class Thing(frozenset):
# need to stabilize repr for KeyError (due to random order in sets)
def __repr__(self):
tmp = sorted(list(self))
# double curly brace prints one brace in format string
return "frozenset({{{}}})".format(', '.join(map(repr, tmp)))
thing1 = Thing(['One', 'red'])
thing2 = Thing(['Two', 'blue'])
df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
expected = DataFrame({thing1: [0, 1]},
index=Index([2, 3], name=thing2))
# use custom label directly
result = df.set_index(thing2)
tm.assert_frame_equal(result, expected)
# custom label wrapped in list
result = df.set_index([thing2])
tm.assert_frame_equal(result, expected)
# missing key
thing3 = Thing(['Three', 'pink'])
msg = '.*' # due to revert, see GH 25085
with pytest.raises(KeyError, match=msg):
Loading ...