Repository URL to install this package:
Version:
0.23.4 ▾
|
# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
import random
import numpy as np
import pandas as pd
from pandas.compat import lrange
from pandas.api.types import CategoricalDtype
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
date_range, NaT, IntervalIndex)
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
class TestDataFrameSorting(TestData):
def test_sort(self):
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# see gh-9816
with tm.assert_produces_warning(FutureWarning):
frame.sortlevel()
def test_sort_values(self):
frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]],
index=[1, 2, 3], columns=list('ABC'))
# by column (axis=0)
sorted_df = frame.sort_values(by='A')
indexer = frame['A'].argsort().values
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by='A', ascending=False)
indexer = indexer[::-1]
expected = frame.loc[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by='A', ascending=False)
assert_frame_equal(sorted_df, expected)
# GH4839
sorted_df = frame.sort_values(by=['A'], ascending=[False])
assert_frame_equal(sorted_df, expected)
# multiple bys
sorted_df = frame.sort_values(by=['B', 'C'])
expected = frame.loc[[2, 1, 3]]
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=['B', 'C'], ascending=False)
assert_frame_equal(sorted_df, expected[::-1])
sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False])
assert_frame_equal(sorted_df, expected)
pytest.raises(ValueError, lambda: frame.sort_values(
by=['A', 'B'], axis=2, inplace=True))
# by row (axis=1): GH 10806
sorted_df = frame.sort_values(by=3, axis=1)
expected = frame
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
expected = frame.reindex(columns=['C', 'B', 'A'])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 2], axis='columns')
expected = frame.reindex(columns=['B', 'A', 'C'])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1,
ascending=[True, False])
assert_frame_equal(sorted_df, expected)
sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
expected = frame.reindex(columns=['C', 'B', 'A'])
assert_frame_equal(sorted_df, expected)
msg = r'Length of ascending \(5\) != length of by \(2\)'
with tm.assert_raises_regex(ValueError, msg):
frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
def test_sort_values_inplace(self):
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
sorted_df = frame.copy()
sorted_df.sort_values(by='A', inplace=True)
expected = frame.sort_values(by='A')
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=1, axis=1, inplace=True)
expected = frame.sort_values(by=1, axis=1)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by='A', ascending=False, inplace=True)
expected = frame.sort_values(by='A', ascending=False)
assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True)
expected = frame.sort_values(by=['A', 'B'], ascending=False)
assert_frame_equal(sorted_df, expected)
def test_sort_nan(self):
# GH3917
nan = np.nan
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]})
# sort one column only
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A'], na_position='first')
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3])
sorted_df = df.sort_values(['A'], na_position='first', ascending=False)
assert_frame_equal(sorted_df, expected)
expected = df.reindex(columns=['B', 'A'])
sorted_df = df.sort_values(by=1, axis=1, na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='last', order
expected = DataFrame(
{'A': [1, 1, 2, 4, 6, 8, nan],
'B': [2, 9, nan, 5, 5, 4, 5]},
index=[3, 0, 1, 6, 4, 5, 2])
sorted_df = df.sort_values(['A', 'B'])
assert_frame_equal(sorted_df, expected)
# na_position='first', order
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 2, 9, nan, 5, 5, 4]},
index=[2, 3, 0, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='first', not order
expected = DataFrame(
{'A': [nan, 1, 1, 2, 4, 6, 8],
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], ascending=[
1, 0], na_position='first')
assert_frame_equal(sorted_df, expected)
# na_position='last', not order
expected = DataFrame(
{'A': [8, 6, 4, 2, 1, 1, nan],
'B': [4, 5, 5, nan, 2, 9, 5]},
index=[5, 4, 6, 1, 3, 0, 2])
sorted_df = df.sort_values(['A', 'B'], ascending=[
0, 1], na_position='last')
assert_frame_equal(sorted_df, expected)
# Test DataFrame with nan label
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan])
# NaN label, ascending=True, na_position='last'
sorted_df = df.sort_index(
kind='quicksort', ascending=True, na_position='last')
expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]},
index=[1, 2, 3, 4, 5, 6, nan])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=True, na_position='first'
sorted_df = df.sort_index(na_position='first')
expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8],
'B': [5, 9, nan, 5, 2, 5, 4]},
index=[nan, 1, 2, 3, 4, 5, 6])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='last'
sorted_df = df.sort_index(kind='quicksort', ascending=False)
expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4],
'B': [4, 5, 2, 5, nan, 9, 5]},
index=[6, 5, 4, 3, 2, 1, nan])
assert_frame_equal(sorted_df, expected)
# NaN label, ascending=False, na_position='first'
sorted_df = df.sort_index(
kind='quicksort', ascending=False, na_position='first')
expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1],
'B': [5, 4, 5, 2, 5, nan, 9]},
index=[nan, 6, 5, 4, 3, 2, 1])
assert_frame_equal(sorted_df, expected)
def test_stable_descending_sort(self):
# GH #6399
df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
columns=['sort_col', 'order'])
sorted_df = df.sort_values(by='sort_col', kind='mergesort',
ascending=False)
assert_frame_equal(df, sorted_df)
def test_stable_descending_multicolumn_sort(self):
nan = np.nan
df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4],
'B': [9, nan, 5, 2, 5, 4, 5]})
# test stable mergesort
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 2, 9]},
index=[2, 5, 4, 6, 1, 3, 0])
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1],
na_position='first',
kind='mergesort')
assert_frame_equal(sorted_df, expected)
expected = DataFrame(
{'A': [nan, 8, 6, 4, 2, 1, 1],
'B': [5, 4, 5, 5, nan, 9, 2]},
index=[2, 5, 4, 6, 1, 0, 3])
sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0],
na_position='first',
kind='mergesort')
assert_frame_equal(sorted_df, expected)
def test_stable_categorial(self):
# GH 16793
df = DataFrame({
'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)
})
expected = df.copy()
sorted_df = df.sort_values('x', kind='mergesort')
assert_frame_equal(sorted_df, expected)
def test_sort_datetimes(self):
# GH 3461, argsort / lexsort differences for a datetime column
df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],
columns=['A'],
index=date_range('20130101', periods=9))
dts = [Timestamp(x)
for x in ['2004-02-11', '2004-01-21', '2004-01-26',
'2005-09-20', '2010-10-04', '2009-05-12',
'2008-11-12', '2010-09-28', '2010-09-28']]
df['B'] = dts[::2] + dts[1::2]
df['C'] = 2.
df['A1'] = 3.
df1 = df.sort_values(by='A')
df2 = df.sort_values(by=['A'])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['B'])
assert_frame_equal(df1, df2)
df1 = df.sort_values(by='B')
df2 = df.sort_values(by=['C', 'B'])
assert_frame_equal(df1, df2)
def test_frame_column_inplace_sort_exception(self):
s = self.frame['A']
with tm.assert_raises_regex(ValueError, "This Series is a view"):
s.sort_values(inplace=True)
cp = s.copy()
cp.sort_values() # it works!
def test_sort_nat_values_in_int_column(self):
# GH 14922: "sorting with large float and multiple columns incorrect"
# cause was that the int64 value NaT was considered as "na". Which is
# only correct for datetime64 columns.
int_values = (2, int(NaT))
float_values = (2.0, -1.797693e308)
df = DataFrame(dict(int=int_values, float=float_values),
columns=["int", "float"])
df_reversed = DataFrame(dict(int=int_values[::-1],
float=float_values[::-1]),
columns=["int", "float"],
index=[1, 0])
# NaT is not a "na" for int64 columns, so na_position must not
# influence the result:
df_sorted = df.sort_values(["int", "float"], na_position="last")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["int", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
# reverse sorting order
df_sorted = df.sort_values(["int", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
# and now check if NaT is still considered as "na" for datetime64
# columns:
df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT],
float=float_values), columns=["datetime", "float"])
df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")],
float=float_values[::-1]),
columns=["datetime", "float"],
index=[1, 0])
df_sorted = df.sort_values(["datetime", "float"], na_position="first")
assert_frame_equal(df_sorted, df_reversed)
df_sorted = df.sort_values(["datetime", "float"], na_position="last")
assert_frame_equal(df_sorted, df)
# Ascending should not affect the results.
df_sorted = df.sort_values(["datetime", "float"], ascending=False)
assert_frame_equal(df_sorted, df)
def test_sort_nat(self):
# GH 16836
d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
np.nan, '2016-01-01']]
d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
'2016-01-01', '2015-01-01']]
df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
'2016-01-01', np.nan]]
d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
'2017-01-01', '2016-01-01']]
expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
sorted_df = df.sort_values(by=['a', 'b'], )
tm.assert_frame_equal(sorted_df, expected)
class TestDataFrameSortIndexKinds(TestData):
def test_sort_index_multicolumn(self):
A = np.arange(5).repeat(20)
B = np.tile(np.arange(5), 20)
random.shuffle(A)
random.shuffle(B)
frame = DataFrame({'A': A, 'B': B,
'C': np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['A', 'B'])
result = frame.sort_values(by=['A', 'B'])
indexer = np.lexsort((frame['B'], frame['A']))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['A', 'B'], ascending=False)
result = frame.sort_values(by=['A', 'B'], ascending=False)
indexer = np.lexsort((frame['B'].rank(ascending=False),
frame['A'].rank(ascending=False)))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
frame.sort_index(by=['B', 'A'])
result = frame.sort_values(by=['B', 'A'])
indexer = np.lexsort((frame['A'], frame['B']))
expected = frame.take(indexer)
assert_frame_equal(result, expected)
def test_sort_index_inplace(self):
frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# axis=0
unordered = frame.loc[[3, 2, 4, 1]]
a_id = id(unordered['A'])
df = unordered.copy()
df.sort_index(inplace=True)
expected = frame
assert_frame_equal(df, expected)
assert a_id != id(df['A'])
df = unordered.copy()
df.sort_index(ascending=False, inplace=True)
expected = frame[::-1]
assert_frame_equal(df, expected)
# axis=1
unordered = frame.loc[:, ['D', 'B', 'C', 'A']]
df = unordered.copy()
df.sort_index(axis=1, inplace=True)
expected = frame
assert_frame_equal(df, expected)
df = unordered.copy()
df.sort_index(axis=1, ascending=False, inplace=True)
expected = frame.iloc[:, ::-1]
assert_frame_equal(df, expected)
def test_sort_index_different_sortorder(self):
A = np.arange(20).repeat(5)
B = np.tile(np.arange(5), 20)
indexer = np.random.permutation(100)
A = A.take(indexer)
B = B.take(indexer)
df = DataFrame({'A': A, 'B': B,
'C': np.random.randn(100)})
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=['A', 'B'], ascending=[1, 0])
result = df.sort_values(by=['A', 'B'], ascending=[1, 0])
ex_indexer = np.lexsort((df.B.max() - df.B, df.A))
expected = df.take(ex_indexer)
assert_frame_equal(result, expected)
# test with multiindex, too
idf = df.set_index(['A', 'B'])
result = idf.sort_index(ascending=[1, 0])
expected = idf.take(ex_indexer)
assert_frame_equal(result, expected)
# also, Series!
result = idf['C'].sort_index(ascending=[1, 0])
assert_series_equal(result, expected['C'])
def test_sort_index_duplicates(self):
# with 9816, these are all translated to .sort_values
df = DataFrame([lrange(5, 9), lrange(4)],
columns=['a', 'a', 'b', 'b'])
with tm.assert_raises_regex(ValueError, 'not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by='a')
with tm.assert_raises_regex(ValueError, 'not unique'):
df.sort_values(by='a')
with tm.assert_raises_regex(ValueError, 'not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=['a'])
with tm.assert_raises_regex(ValueError, 'not unique'):
df.sort_values(by=['a'])
with tm.assert_raises_regex(ValueError, 'not unique'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
# multi-column 'by' is separate codepath
df.sort_index(by=['a', 'b'])
with tm.assert_raises_regex(ValueError, 'not unique'):
# multi-column 'by' is separate codepath
df.sort_values(by=['a', 'b'])
# with multi-index
# GH4370
df = DataFrame(np.random.randn(4, 2),
columns=MultiIndex.from_tuples([('a', 0), ('a', 1)]))
with tm.assert_raises_regex(ValueError, 'level'):
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by='a')
with tm.assert_raises_regex(ValueError, 'level'):
df.sort_values(by='a')
# convert tuples to a list of tuples
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=[('a', 1)])
expected = df.sort_values(by=[('a', 1)])
# use .sort_values #9816
with tm.assert_produces_warning(FutureWarning):
df.sort_index(by=('a', 1))
result = df.sort_values(by=('a', 1))
assert_frame_equal(result, expected)
def test_sort_index_level(self):
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4]], mi)
res = df.sort_index(level='A', sort_remaining=False)
assert_frame_equal(df, res)
res = df.sort_index(level=['A', 'B'], sort_remaining=False)
assert_frame_equal(df, res)
def test_sort_index_categorical_index(self):
df = (DataFrame({'A': np.arange(6, dtype='int64'),
'B': Series(list('aabbca'))
.astype(CategoricalDtype(list('cab')))})
.set_index('B'))
result = df.sort_index()
expected = df.iloc[[4, 0, 1, 5, 2, 3]]
assert_frame_equal(result, expected)
result = df.sort_index(ascending=False)
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
assert_frame_equal(result, expected)
def test_sort_index(self):
# GH13496
frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4],
columns=['A', 'B', 'C', 'D'])
# axis=0 : sort rows by index labels
unordered = frame.loc[[3, 2, 4, 1]]
result = unordered.sort_index(axis=0)
expected = frame
assert_frame_equal(result, expected)
result = unordered.sort_index(ascending=False)
expected = frame[::-1]
assert_frame_equal(result, expected)
# axis=1 : sort columns by column names
unordered = frame.iloc[:, [2, 1, 3, 0]]
result = unordered.sort_index(axis=1)
assert_frame_equal(result, frame)
result = unordered.sort_index(axis=1, ascending=False)
expected = frame.iloc[:, ::-1]
assert_frame_equal(result, expected)
@pytest.mark.parametrize("level", ['A', 0]) # GH 21052
def test_sort_index_multiindex(self, level):
# GH13496
# sort rows by specified level of multi-index
mi = MultiIndex.from_tuples([
[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC'))
df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)
expected_mi = MultiIndex.from_tuples([
[1, 1, 1],
[2, 1, 2],
[2, 1, 3]], names=list('ABC'))
expected = pd.DataFrame([
[5, 6],
[3, 4],
[1, 2]], index=expected_mi)
result = df.sort_index(level=level)
assert_frame_equal(result, expected)
# sort_remaining=False
expected_mi = MultiIndex.from_tuples([
[1, 1, 1],
[2, 1, 3],
[2, 1, 2]], names=list('ABC'))
expected = pd.DataFrame([
[5, 6],
[1, 2],
[3, 4]], index=expected_mi)
result = df.sort_index(level=level, sort_remaining=False)
assert_frame_equal(result, expected)
def test_sort_index_intervalindex(self):
# this is a de-facto sort via unstack
# confirming that we sort in the order of the bins
y = Series(np.random.randn(100))
x1 = Series(np.sign(np.random.randn(100)))
x2 = pd.cut(Series(np.random.randn(100)),
bins=[-3, -0.5, 0, 0.5, 3])
model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
expected = IntervalIndex.from_tuples(
[(-3.0, -0.5), (-0.5, 0.0),
(0.0, 0.5), (0.5, 3.0)],
closed='right')
result = result.columns.levels[1].categories
tm.assert_index_equal(result, expected)