Repository URL to install this package:
|
Version:
0.15.2 ▾
|
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
import pytest
import numpy as np
import pandas
from pandas.testing import assert_index_equal
import matplotlib
import modin.pandas as pd
import sys
from modin.pandas.test.utils import (
NROWS,
RAND_LOW,
RAND_HIGH,
df_equals,
arg_keys,
name_contains,
test_data,
test_data_values,
test_data_keys,
axis_keys,
axis_values,
int_arg_keys,
int_arg_values,
create_test_dfs,
eval_general,
generate_multiindex,
extra_test_parameters,
default_to_pandas_ignore_string,
)
from modin.config import NPartitions
from modin.utils import get_current_execution
from modin.test.test_utils import warns_that_defaulting_to_pandas
from modin.pandas.indexing import is_range_like
# Set Modin's ``NPartitions`` config value to 4 for every test in this module.
# NOTE(review): presumably chosen so frames are split across several partitions — confirm.
NPartitions.put(4)
# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")
# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, but some test modules, like this one,
# have too many such instances.
# TODO(https://github.com/modin-project/modin/issues/3655): catch all instances
# of defaulting to pandas.
pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string)
def eval_setitem(md_df, pd_df, value, col=None, loc=None):
    """
    Apply ``df[col] = value`` to both frames in place through ``eval_general``.

    Parameters
    ----------
    md_df : object
        First frame handed to ``eval_general`` (the Modin one, by naming).
    pd_df : object
        Second frame handed to ``eval_general``; also used to resolve `loc`.
    value : object or callable
        Value to assign. A callable is invoked with the frame being modified
        and its result is assigned instead.
    col : label, optional
        Column label to assign to.
    loc : int, optional
        Positional column index; when given, it overrides `col` with
        ``pd_df.columns[loc]``.
    """
    if loc is not None:
        col = pd_df.columns[loc]

    if callable(value):
        produce_value = value
    else:

        def produce_value(*args, **kwargs):
            return value

    def assign(df):
        return df.__setitem__(col, produce_value(df))

    eval_general(md_df, pd_df, assign, __inplace__=True)
def eval_loc(md_df, pd_df, value, key):
    """
    Apply ``df.loc[key] = value`` to both frames in place through ``eval_general``.

    Parameters
    ----------
    md_df : object
        First frame handed to ``eval_general``.
    pd_df : object
        Second frame handed to ``eval_general``.
    value : object or tuple of two objects
        Value to assign. A 2-tuple is unpacked as ``(md_value, pd_value)`` so
        the frames can receive different values; the second element goes to
        ``pandas.DataFrame`` instances and the first to anything else.
    key : object
        Indexer passed to ``.loc``.
    """
    if isinstance(value, tuple):
        assert len(value) == 2
        # The value used for pandas differs from the one used for Modin.
        md_value, pd_value = value
    else:
        md_value = pd_value = value

    def assign(df):
        chosen = pd_value if isinstance(df, pandas.DataFrame) else md_value
        return df.loc.__setitem__(key, chosen)

    eval_general(md_df, pd_df, assign, __inplace__=True)
@pytest.mark.parametrize(
    "dates",
    [
        ["2018-02-27 09:03:30", "2018-02-27 09:04:30"],
        ["2018-02-27 09:03:00", "2018-02-27 09:05:00"],
    ],
)
@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None])
def test_asof_with_nan(dates, subset):
    """Check ``asof`` on a datetime-indexed frame whose "b" column is mostly ``None``."""
    frame_data = {"a": [10, 20, 30, 40, 50], "b": [None, None, None, None, 500]}
    time_index = pd.DatetimeIndex(
        [
            "2018-02-27 09:01:00",
            "2018-02-27 09:02:00",
            "2018-02-27 09:03:00",
            "2018-02-27 09:04:00",
            "2018-02-27 09:05:00",
        ]
    )
    compare_asof(
        frame_data,
        time_index,
        pd.DatetimeIndex(dates),
        pandas.DatetimeIndex(dates),
        subset,
    )
@pytest.mark.parametrize(
    "dates",
    [
        ["2018-02-27 09:03:30", "2018-02-27 09:04:30"],
        ["2018-02-27 09:03:00", "2018-02-27 09:05:00"],
    ],
)
@pytest.mark.parametrize("subset", ["a", "b", ["a", "b"], None])
def test_asof_without_nan(dates, subset):
    """Check ``asof`` on a datetime-indexed frame that holds no missing values."""
    frame_data = {"a": [10, 20, 30, 40, 50], "b": [70, 600, 30, -200, 500]}
    time_index = pd.DatetimeIndex(
        [
            "2018-02-27 09:01:00",
            "2018-02-27 09:02:00",
            "2018-02-27 09:03:00",
            "2018-02-27 09:04:00",
            "2018-02-27 09:05:00",
        ]
    )
    compare_asof(
        frame_data,
        time_index,
        pd.DatetimeIndex(dates),
        pandas.DatetimeIndex(dates),
        subset,
    )
@pytest.mark.parametrize(
    "lookup",
    [[60, 70, 90], [60.5, 70.5, 100]],
)
@pytest.mark.parametrize("subset", ["col2", "col1", ["col1", "col2"], None])
def test_asof_large(lookup, subset):
    """Check ``asof`` on the "float_nan_data" dataset indexed by ``range(NROWS)``."""
    compare_asof(
        test_data["float_nan_data"],
        list(range(NROWS)),
        pd.Index(lookup),
        pandas.Index(lookup),
        subset,
    )
def compare_asof(
    data, index, modin_where: pd.Index, pandas_where: pandas.Index, subset
):
    """
    Build Modin and pandas frames from `data`/`index` and compare their ``asof`` results.

    The lookup is performed four times, passing the `where` argument as: the
    index object itself, its ``.values`` array, a list of those values, and
    the first value alone.

    Parameters
    ----------
    data : object
        Data passed to both DataFrame constructors.
    index : object
        Index passed to both DataFrame constructors.
    modin_where : pd.Index
        Lookup labels used for the Modin frame.
    pandas_where : pandas.Index
        Lookup labels used for the pandas frame.
    subset : object
        Forwarded to ``asof`` as ``subset``.
    """
    modin_df = pd.DataFrame(data, index=index)
    pandas_df = pandas.DataFrame(data, index=index)

    where_forms = (
        lambda where: where,
        lambda where: where.values,
        lambda where: list(where.values),
        lambda where: where.values[0],
    )
    for to_form in where_forms:
        df_equals(
            modin_df.asof(to_form(modin_where), subset=subset),
            pandas_df.asof(to_form(pandas_where), subset=subset),
        )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_first_valid_index(data):
    """Check that ``first_valid_index`` agrees between Modin and pandas."""
    modin_first = pd.DataFrame(data).first_valid_index()
    pandas_first = pandas.DataFrame(data).first_valid_index()
    assert modin_first == pandas_first
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys))
def test_head(data, n):
    """Check ``head`` on a plain frame and on a column-selected frame."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Normal dataframe head, including a request for more rows than exist.
    df_equals(modin_df.head(n), pandas_df.head(n))
    df_equals(modin_df.head(len(modin_df) + 1), pandas_df.head(len(pandas_df) + 1))

    # Head when called from a QueryCompilerView (note the repeated column).
    selected_columns = ["col1", "col3", "col3"]
    df_equals(
        modin_df.loc[:, selected_columns].head(n),
        pandas_df.loc[:, selected_columns].head(n),
    )
@pytest.mark.skip(reason="Defaulting to Pandas")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iat(data):
    """Skipped: expects calling ``modin_df.iat()`` to raise ``NotImplementedError``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841

    with pytest.raises(NotImplementedError):
        modin_df.iat()
@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iloc(request, data):
    """Check ``iloc`` reads and writes for scalar, list, slice and callable indexers."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # The empty dataset has nothing to index positionally.
    if name_contains(request.node.name, ["empty_data"]):
        with pytest.raises(IndexError):
            modin_df.iloc[0, 1]
        return

    def first_five_positions(df):
        return df.index.get_indexer_for(df.index[:5])

    # Scalar
    np.testing.assert_equal(modin_df.iloc[0, 1], pandas_df.iloc[0, 1])

    # Series
    df_equals(modin_df.iloc[0], pandas_df.iloc[0])
    df_equals(modin_df.iloc[1:, 0], pandas_df.iloc[1:, 0])
    df_equals(modin_df.iloc[1:2, 0], pandas_df.iloc[1:2, 0])

    # DataFrame
    df_equals(modin_df.iloc[[1, 2]], pandas_df.iloc[[1, 2]])
    # See issue #80
    # df_equals(modin_df.iloc[[1, 2], [1, 0]], pandas_df.iloc[[1, 2], [1, 0]])
    df_equals(modin_df.iloc[1:2, 0:2], pandas_df.iloc[1:2, 0:2])

    # Issue #43
    modin_df.iloc[0:3, :]

    # Write Item: scalar into a list of rows.
    modin_df.iloc[[1, 2]] = 42
    pandas_df.iloc[[1, 2]] = 42
    df_equals(modin_df, pandas_df)

    # Write Item: one row copied onto another, starting from fresh frames.
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    modin_df.iloc[0] = modin_df.iloc[1]
    pandas_df.iloc[0] = pandas_df.iloc[1]
    df_equals(modin_df, pandas_df)

    # Write Item: one column copied onto another, starting from fresh frames.
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    modin_df.iloc[:, 0] = modin_df.iloc[:, 1]
    pandas_df.iloc[:, 0] = pandas_df.iloc[:, 1]
    df_equals(modin_df, pandas_df)

    # From issue #1775
    df_equals(
        modin_df.iloc[first_five_positions],
        pandas_df.iloc[first_five_positions],
    )

    # Read values, selecting rows with callable and a column with a scalar.
    df_equals(
        pandas_df.iloc[first_five_positions, 0],
        modin_df.iloc[first_five_positions, 0],
    )
@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_index(data):
    """Check reading ``.index`` and replacing it with stringified labels on copies."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    df_equals(modin_df.index, pandas_df.index)

    modin_copy = modin_df.copy()
    pandas_copy = pandas_df.copy()
    modin_copy.index = [str(label) for label in modin_copy.index]
    pandas_copy.index = [str(label) for label in pandas_copy.index]
    df_equals(modin_copy.index, pandas_copy.index)
@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_indexing_duplicate_axis(data):
    """Check ``loc``/``iloc`` reads when the row index contains duplicate labels."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Every label is repeated three times (the last group may be shorter).
    repeated_labels = [position // 3 for position in range(len(modin_df))]
    modin_df.index = repeated_labels
    pandas_df.index = repeated_labels
    assert any(modin_df.index.duplicated())
    assert any(pandas_df.index.duplicated())

    df_equals(modin_df.iloc[0], pandas_df.iloc[0])
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4])
    df_equals(
        modin_df.loc[0, modin_df.columns[0:4]],
        pandas_df.loc[0, pandas_df.columns[0:4]],
    )
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_set_index(data):
    """Check ``set_index`` with an index/column-label pair and with a missing column."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    df_equals(
        modin_df.set_index([modin_df.index, modin_df.columns[0]]),
        pandas_df.set_index([pandas_df.index, pandas_df.columns[0]]),
    )

    # test for the case from https://github.com/modin-project/modin/issues/4308
    eval_general(modin_df, pandas_df, lambda df: df.set_index("inexistent_col"))
@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_keys(data):
    """Check that ``keys()`` agrees between Modin and pandas."""
    modin_keys = pd.DataFrame(data).keys()
    pandas_keys = pandas.DataFrame(data).keys()
    df_equals(modin_keys, pandas_keys)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_loc(data):
    """Check ``loc`` reads and writes for label, boolean, slice and callable keys."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    first_col = modin_df.columns[0]
    second_col = modin_df.columns[1]

    def first_col_in_range(df):
        return df[first_col].isin(list(range(1000)))

    # Scalar
    df_equals(modin_df.loc[0, first_col], pandas_df.loc[0, first_col])

    # Series
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.loc[1:, first_col], pandas_df.loc[1:, first_col])
    df_equals(modin_df.loc[1:2, first_col], pandas_df.loc[1:2, first_col])
    df_equals(modin_df.loc[:, first_col], pandas_df.loc[:, first_col])

    # DataFrame
    df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]])

    row_mask = [i % 3 == 0 for i in range(len(modin_df.index))]
    col_mask = [i % 5 == 0 for i in range(len(modin_df.columns))]

    # Key is a list of booleans
    df_equals(modin_df.loc[row_mask, col_mask], pandas_df.loc[row_mask, col_mask])

    # Key is a Modin or pandas series of booleans
    df_equals(
        modin_df.loc[pd.Series(row_mask), pd.Series(col_mask, index=modin_df.columns)],
        pandas_df.loc[
            pandas.Series(row_mask), pandas.Series(col_mask, index=modin_df.columns)
        ],
    )

    df_equals(modin_df.loc[:, col_mask], pandas_df.loc[:, col_mask])
    df_equals(modin_df.loc[row_mask], pandas_df.loc[row_mask])

    # See issue #80
    # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']])
    df_equals(
        modin_df.loc[1:2, first_col:second_col],
        pandas_df.loc[1:2, first_col:second_col],
    )

    # From issue #421
    df_equals(
        modin_df.loc[:, [second_col, first_col]],
        pandas_df.loc[:, [second_col, first_col]],
    )
    df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :])

    # From issue #1023
    penultimate_col = modin_df.columns[-2]
    df_equals(
        modin_df.loc[:, first_col:penultimate_col],
        pandas_df.loc[:, first_col:penultimate_col],
    )

    # Write Item
    modin_written = modin_df.copy()
    pandas_written = pandas_df.copy()
    modin_written.loc[[1, 2]] = 42
    pandas_written.loc[[1, 2]] = 42
    df_equals(modin_written, pandas_written)

    # Write an item, selecting rows with a callable.
    modin_written = modin_df.copy()
    pandas_written = pandas_df.copy()
    modin_written.loc[first_col_in_range] = 42
    pandas_written.loc[first_col_in_range] = 42
    df_equals(modin_written, pandas_written)

    # Write an item, selecting rows with a callable and a column with a scalar.
    modin_written = modin_df.copy()
    pandas_written = pandas_df.copy()
    modin_written.loc[first_col_in_range, first_col] = 42
    pandas_written.loc[first_col_in_range, first_col] = 42
    df_equals(modin_written, pandas_written)

    # Disabled for `BaseOnPython` because of the issue with `getitem_array`:
    # https://github.com/modin-project/modin/issues/3701
    if get_current_execution() != "BaseOnPython":
        # From issue #1775
        df_equals(
            modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))],
            pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))],
        )

    # Read values, selecting rows with a callable and a column with a scalar.
    df_equals(
        pandas_df.loc[first_col_in_range, first_col],
        modin_df.loc[first_col_in_range, first_col],
    )

    # From issue #1374
    with pytest.raises(KeyError):
        modin_df.loc["NO_EXIST"]
@pytest.mark.parametrize(
    "key_getter, value_getter",
    [
        pytest.param(
            lambda df, axis: (
                (slice(None), df.axes[axis][:2])
                if axis
                else (df.axes[axis][:2], slice(None))
            ),
            lambda df, axis: df.iloc[:, :1] if axis else df.iloc[:1, :],
            id="len(key)_>_len(value)",
        ),
        pytest.param(
            lambda df, axis: (
                (slice(None), df.axes[axis][:2])
                if axis
                else (df.axes[axis][:2], slice(None))
            ),
            lambda df, axis: df.iloc[:, :3] if axis else df.iloc[:3, :],
            id="len(key)_<_len(value)",
        ),
        pytest.param(
            lambda df, axis: (
                (slice(None), df.axes[axis][:2])
                if axis
                else (df.axes[axis][:2], slice(None))
            ),
            lambda df, axis: df.iloc[:, :2] if axis else df.iloc[:2, :],
            id="len(key)_==_len(value)",
        ),
    ],
)
@pytest.mark.parametrize("key_axis", [0, 1])
@pytest.mark.parametrize("reverse_value_index", [True, False])
@pytest.mark.parametrize("reverse_value_columns", [True, False])
def test_loc_4456(
    key_getter, value_getter, key_axis, reverse_value_index, reverse_value_columns
):
    """
    Check ``loc`` assignment of a frame whose length differs from, or equals, the key's.

    The key always selects the first two labels along `key_axis`; the value is
    a slice of a random integer frame, optionally with its rows and/or columns
    reversed. The assignment is evaluated once with a pandas value for both
    frames and once with a Modin value for the Modin frame.
    """
    data = test_data["float_nan_data"]
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # `df.loc` doesn't work right for range-like indexers. Converting them to a list.
    # https://github.com/modin-project/modin/issues/4497
    key = tuple(
        list(part) if is_range_like(part) else part
        for part in key_getter(pandas_df, key_axis)
    )

    random_frame = pandas.DataFrame(
        np.random.randint(0, 100, size=pandas_df.shape),
        index=pandas_df.index,
        columns=pandas_df.columns,
    )
    pdf_value = value_getter(random_frame, key_axis)
    mdf_value = value_getter(pd.DataFrame(random_frame), key_axis)

    if reverse_value_index:
        pdf_value = pdf_value.reindex(index=pdf_value.index[::-1])
        mdf_value = mdf_value.reindex(index=mdf_value.index[::-1])
    if reverse_value_columns:
        pdf_value = pdf_value.reindex(columns=pdf_value.columns[::-1])
        mdf_value = mdf_value.reindex(columns=mdf_value.columns[::-1])

    eval_loc(modin_df, pandas_df, pdf_value, key)
    eval_loc(modin_df, pandas_df, (mdf_value, pdf_value), key)
# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_single_categorical_column():
    """Check a ``loc`` slice assignment into a single categorical column."""
    frame_data = {"status": ["a", "b", "c"]}
    modin_df = pd.DataFrame(frame_data, dtype="category")
    pandas_df = pandas.DataFrame(frame_data, dtype="category")

    modin_df.loc[1:3, "status"] = "a"
    pandas_df.loc[1:3, "status"] = "a"
    df_equals(modin_df, pandas_df)
def test_loc_multi_index():
    """Check ``loc`` reads on frames with multi-level columns and multi-level rows."""
    # Part 1: four-level column header read from a CSV fixture.
    csv_path = "modin/pandas/test/data/blah.csv"
    modin_df = pd.read_csv(csv_path, header=[0, 1, 2, 3], index_col=0)
    pandas_df = pandas.read_csv(csv_path, header=[0, 1, 2, 3], index_col=0)

    df_equals(modin_df.loc[1], pandas_df.loc[1])
    df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"])
    df_equals(
        modin_df.loc[1, ("Presidents", "Pure mentions")],
        pandas_df.loc[1, ("Presidents", "Pure mentions")],
    )
    full_column_key = ("Presidents", "Pure mentions", "IND", "all")
    assert modin_df.loc[1, full_column_key] == pandas_df.loc[1, full_column_key]
    df_equals(modin_df.loc[(1, 2), "Presidents"], pandas_df.loc[(1, 2), "Presidents"])

    # Part 2: two-level row index over random integer data.
    tuples = [
        ("bar", "one"),
        ("bar", "two"),
        ("bar", "three"),
        ("bar", "four"),
        ("baz", "one"),
        ("baz", "two"),
        ("baz", "three"),
        ("baz", "four"),
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("foo", "four"),
        ("qux", "one"),
        ("qux", "two"),
        ("qux", "three"),
        ("qux", "four"),
    ]
    modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
    pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"])
    frame_data = np.random.randint(0, 100, size=(16, 100))
    modin_df = pd.DataFrame(
        frame_data,
        index=modin_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    pandas_df = pandas.DataFrame(
        frame_data,
        index=pandas_index,
        columns=["col{}".format(i) for i in range(100)],
    )

    df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"])
    assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"]
    df_equals(
        modin_df.loc["bar", ("col1", "col2")],
        pandas_df.loc["bar", ("col1", "col2")],
    )

    # From issue #1456
    modin_transposed = modin_df.T
    pandas_transposed = pandas_df.T
    df_equals(
        modin_transposed.loc[modin_transposed.index[:-2], :],
        pandas_transposed.loc[pandas_transposed.index[:-2], :],
    )

    # From issue #1610
    df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index])
    df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]])
def test_loc_empty():
    """Check ``loc`` read and scalar write on a frame with five rows and no columns."""
    row_labels = range(5)
    pandas_df = pandas.DataFrame(index=row_labels)
    modin_df = pd.DataFrame(index=row_labels)

    df_equals(pandas_df.loc[1], modin_df.loc[1])

    pandas_df.loc[1] = 3
    modin_df.loc[1] = 3
    df_equals(pandas_df, modin_df)
@pytest.mark.parametrize("index", [["row1", "row2", "row3"]])
@pytest.mark.parametrize("columns", [["col1", "col2"]])
def test_loc_assignment(index, columns):
    """Check chained ``df.loc[row][col] = value`` assignment for every cell."""
    md_df, pd_df = create_test_dfs(index=index, columns=columns)

    for row_pos, row_label in enumerate(index):
        for col_pos, col_label in enumerate(columns):
            # The cell value is the row and column positions written side by side.
            cell_value = int(f"{row_pos}{col_pos}")
            md_df.loc[row_label][col_label] = cell_value
            pd_df.loc[row_label][col_label] = cell_value

    df_equals(md_df, pd_df)
@pytest.fixture
def loc_iter_dfs():
    """
    Return what ``create_test_dfs`` builds for a 3x3 frame.

    Each column is filled with its own position (0, 1, 2); rows are labelled
    "row1".."row3" and columns "col1".."col3".
    """
    columns = ["col1", "col2", "col3"]
    index = ["row1", "row2", "row3"]

    frame_data = {}
    for position, name in enumerate(columns):
        frame_data[name] = [position] * len(index)

    return create_test_dfs(frame_data, columns=columns, index=index)
@pytest.mark.parametrize("reverse_order", [False, True])
@pytest.mark.parametrize("axis", [0, 1])
def test_loc_iter_assignment(loc_iter_dfs, reverse_order, axis):
    """Check ``loc`` assignment with a sorted (or reverse-sorted) list of labels."""
    if reverse_order and axis:
        pytest.xfail(
            "Due to internal sorting of lookup values assignment order is lost, see GH-#2552"
        )

    md_df, pd_df = loc_iter_dfs

    # All labels but the last along `axis`; the other axis is selected in full.
    labels = sorted(pd_df.axes[axis][:-1], reverse=reverse_order)
    select = (slice(None), labels) if axis else (labels, slice(None))

    pd_df.loc[select] = pd_df.loc[select] + pd_df.loc[select]
    md_df.loc[select] = md_df.loc[select] + md_df.loc[select]
    df_equals(md_df, pd_df)
@pytest.mark.parametrize("reverse_order", [False, True])
@pytest.mark.parametrize("axis", [0, 1])
def test_loc_order(loc_iter_dfs, reverse_order, axis):
    """Check that ``loc`` reads keep the order of a sorted/reverse-sorted label list."""
    md_df, pd_df = loc_iter_dfs

    # All labels but the last along `axis`; the other axis is selected in full.
    labels = sorted(pd_df.axes[axis][:-1], reverse=reverse_order)
    select = (slice(None), labels) if axis else (labels, slice(None))

    df_equals(pd_df.loc[select], md_df.loc[select])
@pytest.mark.gpu
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_loc_nested_assignment(data):
    """Check ``df[col].loc[0] = value`` for an integer and for ``None``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    first_col = modin_df.columns[0]
    second_col = modin_df.columns[1]

    modin_df[first_col].loc[0] = 500
    pandas_df[first_col].loc[0] = 500
    df_equals(modin_df, pandas_df)

    modin_df[second_col].loc[0] = None
    pandas_df[second_col].loc[0] = None
    df_equals(modin_df, pandas_df)
def test_iloc_assignment():
    """Check chained ``df.iloc[row][col] = value`` with scalar and callable indexers."""
    row_labels = ["row1", "row2", "row3"]
    col_labels = ["col1", "col2"]
    modin_df = pd.DataFrame(index=row_labels, columns=col_labels)
    pandas_df = pandas.DataFrame(index=row_labels, columns=col_labels)

    def fill_cells(frame):
        # "col1" is addressed with plain indexers only.
        frame.iloc[0]["col1"] = 11
        frame.iloc[1]["col1"] = 21
        frame.iloc[2]["col1"] = 31
        # "col2" is addressed with a callable on the row, the column, or both.
        frame.iloc[lambda df: 0]["col2"] = 12
        frame.iloc[1][lambda df: ["col2"]] = 22
        frame.iloc[lambda df: 2][lambda df: ["col2"]] = 32

    fill_cells(modin_df)
    fill_cells(pandas_df)
    df_equals(modin_df, pandas_df)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_iloc_nested_assignment(data):
    """Check ``df[col].iloc[0] = value`` for an integer and for ``None``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    first_col = modin_df.columns[0]
    second_col = modin_df.columns[1]

    modin_df[first_col].iloc[0] = 500
    pandas_df[first_col].iloc[0] = 500
    df_equals(modin_df, pandas_df)

    modin_df[second_col].iloc[0] = None
    pandas_df[second_col].iloc[0] = None
    df_equals(modin_df, pandas_df)
def test_iloc_empty():
    """Check ``iloc`` read and scalar write on a frame with five rows and no columns."""
    row_labels = range(5)
    pandas_df = pandas.DataFrame(index=row_labels)
    modin_df = pd.DataFrame(index=row_labels)

    df_equals(pandas_df.iloc[1], modin_df.iloc[1])

    pandas_df.iloc[1] = 3
    modin_df.iloc[1] = 3
    df_equals(pandas_df, modin_df)
def test_loc_series():
    """Check ``loc`` assignment of a Series, selecting rows with a boolean Series."""
    md_df, pd_df = create_test_dfs({"a": [1, 2], "b": [3, 4]})

    pandas_mask = pd_df["a"] > 1
    pd_df.loc[pandas_mask, "b"] = np.log(pd_df["b"])

    modin_mask = md_df["a"] > 1
    md_df.loc[modin_mask, "b"] = np.log(md_df["b"])

    df_equals(pd_df, md_df)
@pytest.mark.parametrize("locator_name", ["loc", "iloc"])
@pytest.mark.parametrize(
    "slice_indexer",
    [
        slice(None, None, -2),
        slice(1, 10, None),
        slice(None, 10, None),
        slice(10, None, None),
        slice(10, None, -2),
        slice(-10, None, -2),
        slice(None, 1_000_000_000, None),
    ],
)
def test_loc_iloc_slice_indexer(locator_name, slice_indexer):
    """Check slice indexing through ``loc`` and ``iloc`` on a one-based row index."""
    md_df, pd_df = create_test_dfs(test_data_values[0])

    # Shifting the index, so labels won't match its position
    one_based_index = pandas.RangeIndex(1, len(md_df) + 1)
    md_df.index = one_based_index
    pd_df.index = one_based_index

    def take_slice(df):
        return getattr(df, locator_name)[slice_indexer]

    eval_general(md_df, pd_df, take_slice)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_pop(request, data):
    """Check that ``pop`` returns the same column and leaves the same remainder."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Nothing to pop from the empty dataset.
    if "empty_data" in request.node.name:
        return

    first_col = modin_df.columns[0]
    modin_remainder = modin_df.copy()
    pandas_remainder = pandas_df.copy()

    modin_popped = modin_remainder.pop(first_col)
    pandas_popped = pandas_remainder.pop(first_col)

    df_equals(modin_popped, pandas_popped)
    df_equals(modin_remainder, pandas_remainder)
def test_reindex():
    """Check ``reindex`` over rows, columns, both axes, and on a transposed frame."""
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [0, 0, 0, 0],
    }
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    # Row labels: a permutation, then a list containing a missing label.
    for row_labels in ([0, 3, 2, 1], [0, 6, 2]):
        df_equals(modin_df.reindex(row_labels), pandas_df.reindex(row_labels))

    # Column labels: a subset, then a list containing missing labels.
    for col_labels in (
        ["col1", "col3", "col4", "col2"],
        ["col1", "col7", "col4", "col8"],
    ):
        df_equals(
            modin_df.reindex(col_labels, axis=1),
            pandas_df.reindex(col_labels, axis=1),
        )

    # Both axes at once.
    df_equals(
        modin_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]),
        pandas_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]),
    )

    # Transposed frame, where the former columns are now the row labels.
    df_equals(
        modin_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0),
        pandas_df.T.reindex(["col1", "col7", "col4", "col8"], axis=0),
    )
def test_reindex_4438():
    """Check ``reindex`` with reversed datetime and multi-level labels on either axis."""

    def assert_same_reindex(modin_frame, pandas_frame, *args, **kwargs):
        df_equals(
            modin_frame.reindex(*args, **kwargs),
            pandas_frame.reindex(*args, **kwargs),
        )

    index = pd.date_range(end="1/1/2018", periods=3, freq="h", name="some meta")
    new_index = list(reversed(index))

    # index case
    assert_same_reindex(
        pd.DataFrame([1, 2, 3], index=index),
        pandas.DataFrame([1, 2, 3], index=index),
        new_index,
    )

    # column case
    assert_same_reindex(
        pd.DataFrame(np.array([[1], [2], [3]]).T, columns=index),
        pandas.DataFrame(np.array([[1], [2], [3]]).T, columns=index),
        columns=new_index,
    )

    # multiindex case
    multi_index = pandas.MultiIndex.from_arrays(
        [("a", "b", "c"), ("a", "b", "c")], names=["first", "second"]
    )
    new_multi_index = list(reversed(multi_index))
    assert_same_reindex(
        pd.DataFrame([1, 2, 3], index=multi_index),
        pandas.DataFrame([1, 2, 3], index=multi_index),
        new_multi_index,
    )

    # multicolumn case
    assert_same_reindex(
        pd.DataFrame(np.array([[1], [2], [3]]).T, columns=multi_index),
        pandas.DataFrame(np.array([[1], [2], [3]]).T, columns=multi_index),
        columns=new_multi_index,
    )

    # index + multiindex case
    assert_same_reindex(
        pd.DataFrame([1, 2, 3], index=index),
        pandas.DataFrame([1, 2, 3], index=index),
        new_multi_index,
    )
def test_reindex_like():
    """Check that ``reindex_like`` emits the defaulting-to-pandas warning."""
    template_df = pd.DataFrame(
        [
            [24.3, 75.7, "high"],
            [31, 87.8, "high"],
            [22, 71.6, "medium"],
            [35, 95, "medium"],
        ],
        columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
        index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
    )
    sparse_df = pd.DataFrame(
        [[28, "low"], [30, "low"], [35.1, "medium"]],
        columns=["temp_celsius", "windspeed"],
        index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]),
    )

    # Only the warning is checked; the result itself is discarded.
    with warns_that_defaulting_to_pandas():
        sparse_df.reindex_like(template_df)
def test_rename_sanity():
    """
    Check ``rename`` against pandas for dict and callable mappers on both axes.

    Covers: full and callable column renames, index renames (dict, callable,
    positional mapper with ``axis``), the no-argument ``TypeError``, partial
    column renames, renames on a transposed frame, and preservation of the
    index name.
    """
    source_df = pandas.DataFrame(test_data["int_data"])[
        ["col1", "index", "col3", "col4"]
    ]
    mapping = {"col1": "a", "index": "b", "col3": "c", "col4": "d"}

    modin_df = pd.DataFrame(source_df)
    df_equals(modin_df.rename(columns=mapping), source_df.rename(columns=mapping))

    renamed2 = source_df.rename(columns=str.lower)
    df_equals(modin_df.rename(columns=str.lower), renamed2)

    modin_df = pd.DataFrame(renamed2)
    df_equals(modin_df.rename(columns=str.upper), renamed2.rename(columns=str.upper))

    # index
    data = {"A": {"foo": 0, "bar": 1}}

    # gets sorted alphabetical
    df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)
    assert_index_equal(
        modin_df.rename(index={"foo": "bar", "bar": "foo"}).index,
        df.rename(index={"foo": "bar", "bar": "foo"}).index,
    )

    assert_index_equal(
        modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index
    )

    # Using the `mapper` functionality with `axis`
    assert_index_equal(
        modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index
    )
    assert_index_equal(
        modin_df.rename(str.upper, axis=1).columns,
        df.rename(str.upper, axis=1).columns,
    )

    # have to pass something
    with pytest.raises(TypeError):
        modin_df.rename()

    # partial columns
    modin_df = pd.DataFrame(source_df)
    modin_partial = modin_df.rename(columns={"col3": "foo", "col4": "bar"})
    pandas_partial = source_df.rename(columns={"col3": "foo", "col4": "bar"})
    assert_index_equal(modin_partial.index, pandas_partial.index)
    # The renamed labels live on the columns axis, so that axis has to be
    # compared as well; checking the row index alone does not exercise the rename.
    assert_index_equal(modin_partial.columns, pandas_partial.columns)

    # other axis
    assert_index_equal(
        source_df.T.rename(index={"col3": "foo", "col4": "bar"}).index,
        modin_df.T.rename(index={"col3": "foo", "col4": "bar"}).index,
    )

    # index with name
    index = pandas.Index(["foo", "bar"], name="name")
    renamer = pandas.DataFrame(data, index=index)
    modin_df = pd.DataFrame(data, index=index)

    renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
    modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"})
    assert_index_equal(renamed.index, modin_renamed.index)
    assert renamed.index.name == modin_renamed.index.name
def test_rename_multiindex():
    """
    Check ``rename`` on a frame with two-level rows and two-level columns.

    Compares Modin against pandas when renaming across all levels, and when
    restricting the rename to one level given by position or by name, using
    both a dict and a callable as the mapper.
    """
    tuples_index = [("foo1", "bar1"), ("foo2", "bar2")]
    tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")]
    index = pandas.MultiIndex.from_tuples(tuples_index, names=["foo", "bar"])
    columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])

    frame_data = [(0, 0), (1, 1)]
    df = pandas.DataFrame(frame_data, index=index, columns=columns)
    modin_df = pd.DataFrame(frame_data, index=index, columns=columns)

    index_mapping = {"foo1": "foo3", "bar2": "bar3"}
    column_mapping = {"fizz1": "fizz3", "buzz2": "buzz3"}

    #
    # without specifying level -> across all levels
    # A single pandas result serves every assertion below; computing it a
    # second time with identical arguments added nothing.
    renamed = df.rename(index=index_mapping, columns=column_mapping)
    modin_renamed = modin_df.rename(index=index_mapping, columns=column_mapping)
    assert_index_equal(renamed.index, modin_renamed.index)
    assert_index_equal(renamed.columns, modin_renamed.columns)
    assert renamed.index.names == modin_renamed.index.names
    assert renamed.columns.names == modin_renamed.columns.names

    #
    # with specifying a level: first the dict mapper, then the function mapper,
    # each against every level given by position and by name
    for mapper in (column_mapping, str.upper):
        for level in (0, "fizz", 1, "buzz"):
            renamed = df.rename(columns=mapper, level=level)
            modin_renamed = modin_df.rename(columns=mapper, level=level)
            assert_index_equal(renamed.columns, modin_renamed.columns)

    # index
    renamed = df.rename(index=index_mapping, level=0)
    modin_renamed = modin_df.rename(index=index_mapping, level=0)
    assert_index_equal(modin_renamed.index, renamed.index)
@pytest.mark.xfail(reason="Pandas does not pass this test")
def test_rename_nocopy():
    """Expected failure: writing to a ``copy=False`` rename should reach the source frame."""
    selected_columns = ["col1", "index", "col3", "col4"]
    source_df = pandas.DataFrame(test_data["int_data"])[selected_columns]
    modin_df = pd.DataFrame(source_df)

    modin_renamed = modin_df.rename(columns={"col3": "foo"}, copy=False)
    modin_renamed["foo"] = 1

    assert (modin_df["col3"] == 1).all()
def test_rename_inplace():
    """Check ``rename`` both as a returning call and with ``inplace=True``."""
    selected_columns = ["col1", "index", "col3", "col4"]
    source_df = pandas.DataFrame(test_data["int_data"])[selected_columns]
    modin_df = pd.DataFrame(source_df)

    # Returning form.
    df_equals(
        modin_df.rename(columns={"col3": "foo"}),
        source_df.rename(columns={"col3": "foo"}),
    )

    # In-place form, applied to copies.
    pandas_copy = source_df.copy()
    modin_copy = modin_df.copy()
    pandas_copy.rename(columns={"col3": "foo"}, inplace=True)
    modin_copy.rename(columns={"col3": "foo"}, inplace=True)

    df_equals(modin_copy, pandas_copy)
def test_rename_bug():
    """Check two renames followed by ``set_index`` and a columns reassignment."""
    # rename set ref_locs, and set_index was not resetting
    frame_data = {0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]}

    def rename_then_index(frame):
        frame = frame.rename(columns={0: "a"})
        frame = frame.rename(columns={1: "b"})
        frame = frame.set_index(["a", "b"])
        frame.columns = ["2001-01-01"]
        return frame

    df = rename_then_index(pandas.DataFrame(frame_data))
    modin_df = rename_then_index(pd.DataFrame(frame_data))

    df_equals(modin_df, df)
def test_rename_axis():
    """Check ``rename_axis`` with scalar, dict, callable and list arguments."""
    data = {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]}
    animals = ["dog", "cat", "monkey"]
    modin_df = pd.DataFrame(data, animals)
    pandas_df = pandas.DataFrame(data, animals)

    # Scalar name for the index, then for the columns.
    df_equals(modin_df.rename_axis("animal"), pandas_df.rename_axis("animal"))
    df_equals(
        modin_df.rename_axis("limbs", axis="columns"),
        pandas_df.rename_axis("limbs", axis="columns"),
    )

    # Same column rename, in place.
    modin_df.rename_axis("limbs", axis="columns", inplace=True)
    pandas_df.rename_axis("limbs", axis="columns", inplace=True)
    df_equals(modin_df, pandas_df)

    # Switch both frames to a two-level index before the mapper-based renames.
    two_level_index = pd.MultiIndex.from_product(
        [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"]
    )
    modin_df.index = two_level_index
    pandas_df.index = two_level_index

    df_equals(
        modin_df.rename_axis(index={"type": "class"}),
        pandas_df.rename_axis(index={"type": "class"}),
    )
    df_equals(
        modin_df.rename_axis(columns=str.upper),
        pandas_df.rename_axis(columns=str.upper),
    )
    df_equals(
        modin_df.rename_axis(columns=[str.upper(o) for o in modin_df.columns.names]),
        pandas_df.rename_axis(columns=[str.upper(o) for o in pandas_df.columns.names]),
    )

    # A positional callable together with `axis` is expected to be rejected.
    with pytest.raises(ValueError):
        df_equals(
            modin_df.rename_axis(str.upper, axis=1),
            pandas_df.rename_axis(str.upper, axis=1),
        )
def test_rename_axis_inplace():
    """Check in-place ``rename_axis`` on the default axis and on ``axis=1``.

    For each case the return values of the pandas and Modin calls must be the
    same object, and the mutated frames must compare equal.
    """
    test_frame = pandas.DataFrame(test_data["int_data"])
    modin_df = pd.DataFrame(test_frame)

    cases = (
        ("foo", {}),
        ("bar", {"axis": 1}),
    )
    for axis_name, extra_kwargs in cases:
        expected = test_frame.copy()
        actual = modin_df.copy()
        pandas_return = expected.rename_axis(axis_name, inplace=True, **extra_kwargs)
        modin_return = actual.rename_axis(axis_name, inplace=True, **extra_kwargs)

        assert pandas_return is modin_return
        df_equals(actual, expected)
def test_reorder_levels():
    """Compare ``reorder_levels`` on frames indexed by a three-level MultiIndex."""
    data = np.random.randint(1, 100, 12)

    # 2 numbers x 3 letters x 2 colors = 12 index entries, one per data value.
    level_names = ["Number", "Letter", "Color"]
    index_tuples = [
        (num, letter, color)
        for num in range(1, 3)
        for letter in ["a", "b", "c"]
        for color in ["Red", "Green"]
    ]

    modin_index = pd.MultiIndex.from_tuples(index_tuples, names=level_names)
    pandas_index = pandas.MultiIndex.from_tuples(index_tuples, names=level_names)
    modin_df = pd.DataFrame(data, index=modin_index)
    pandas_df = pandas.DataFrame(data, index=pandas_index)

    new_order = ["Letter", "Color", "Number"]
    df_equals(
        modin_df.reorder_levels(new_order),
        pandas_df.reorder_levels(new_order),
    )
def test_reindex_multiindex():
    """Compare ``reindex`` of a flat-indexed frame onto a MultiIndex.

    The target index is the product of the first level of a second frame's
    MultiIndex with the first frame's own index. Three call shapes are
    compared: no ``axis``, ``axis=0``, and ``axis=0`` with ``level=0``.
    """
    data1 = np.random.randint(1, 20, (5, 5))
    data2 = np.random.randint(10, 25, 6)
    index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"])

    product_levels = [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]]
    product_names = ["Bank", "Curency"]
    modin_midx = pd.MultiIndex.from_product(product_levels, names=product_names)
    pandas_midx = pandas.MultiIndex.from_product(product_levels, names=product_names)

    modin_df1 = pd.DataFrame(data=data1, index=index, columns=index)
    modin_df2 = pd.DataFrame(data2, modin_midx)
    pandas_df1 = pandas.DataFrame(data=data1, index=index, columns=index)
    pandas_df2 = pandas.DataFrame(data2, pandas_midx)
    modin_df2.columns = ["Notional"]
    pandas_df2.columns = ["Notional"]

    md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index])
    pd_midx = pandas.MultiIndex.from_product(
        [pandas_df2.index.levels[0], pandas_df1.index]
    )

    reindex_variants = (
        {},  # reindex without axis, index, or columns
        {"axis": 0},  # reindex with only axis
        {"axis": 0, "level": 0},  # reindex with axis and level
    )
    for extra_kwargs in reindex_variants:
        modin_result = modin_df1.reindex(md_midx, fill_value=0, **extra_kwargs)
        pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, **extra_kwargs)
        df_equals(modin_result, pandas_result)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_reset_index(data):
    """Compare ``reset_index`` both out-of-place and in-place."""
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    # Out-of-place: the call returns a new frame.
    df_equals(
        modin_df.reset_index(inplace=False),
        pandas_df.reset_index(inplace=False),
    )

    # In-place: mutate copies so the source frames stay untouched.
    modin_copy = modin_df.copy()
    pandas_copy = pandas_df.copy()
    modin_copy.reset_index(inplace=True)
    pandas_copy.reset_index(inplace=True)
    df_equals(modin_copy, pandas_copy)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_reset_index_multiindex_groupby(data):
    """Reset a three-level index, then group by the first two data columns and count."""
    # GH#4394
    modin_df, pandas_df = create_test_dfs(data)

    def make_index(index_cls, nrows):
        # Levels are (i // 10, i // 5, i) for every row position i.
        return index_cls.from_tuples([(i // 10, i // 5, i) for i in range(nrows)])

    modin_df.index = make_index(pd.MultiIndex, len(modin_df))
    pandas_df.index = make_index(pandas.MultiIndex, len(pandas_df))

    def reset_group_count(df):
        return df.reset_index().groupby(list(df.columns[:2])).count()

    eval_general(modin_df, pandas_df, reset_group_count)
@pytest.mark.parametrize(
    "data",
    [
        pytest.param(
            test_data["int_data"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        test_data["float_nan_data"],
    ],
    ids=["int_data", "float_nan_data"],
)
@pytest.mark.parametrize("nlevels", [3])
@pytest.mark.parametrize("columns_multiindex", [True, False])
@pytest.mark.parametrize(
    "level",
    [
        "no_level",
        None,
        0,
        1,
        2,
        [2, 0],
        [2, 1],
        [1, 0],
        pytest.param(
            [2, 1, 2],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            [0, 0, 0, 0],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            ["level_name_1"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            ["level_name_2", "level_name_1"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            [2, "level_name_0"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
    ],
)
@pytest.mark.parametrize("col_level", ["no_col_level", 0, 1, 2])
@pytest.mark.parametrize("col_fill", ["no_col_fill", None, 0, "new"])
@pytest.mark.parametrize("drop", [False])
@pytest.mark.parametrize(
    "multiindex_levels_names_max_levels",
    [
        0,
        1,
        2,
        pytest.param(
            3, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra")
        ),
        pytest.param(
            4, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra")
        ),
    ],
)
@pytest.mark.parametrize(
    "none_in_index_names",
    [
        pytest.param(
            False,
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        True,
        "mixed_1st_None",
        pytest.param(
            "mixed_2nd_None",
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
    ],
)
def test_reset_index_with_multi_index_no_drop(
    data,
    nlevels,
    columns_multiindex,
    level,
    col_level,
    col_fill,
    drop,
    multiindex_levels_names_max_levels,
    none_in_index_names,
):
    """Compare ``reset_index`` on frames with a generated MultiIndex.

    The row index comes from ``generate_multiindex``; the columns are either a
    generated MultiIndex or a ``RangeIndex``. Level names are plain strings or
    tuples of strings depending on ``multiindex_levels_names_max_levels``, and
    some or all of them are replaced by ``None`` according to
    ``none_in_index_names``. The sentinel strings ``"no_level"``,
    ``"no_col_level"`` and ``"no_col_fill"`` mean "do not pass this keyword".
    """
    first_key = list(data.keys())[0]
    data_rows = len(data[first_key])
    index = generate_multiindex(data_rows, nlevels=nlevels)

    data_columns = len(data.keys())
    if columns_multiindex:
        columns = generate_multiindex(data_columns, nlevels=nlevels)
    else:
        columns = pandas.RangeIndex(0, data_columns)
    # Replace original data columns with generated
    data = {columns[position]: data[key] for position, key in enumerate(data)}

    if multiindex_levels_names_max_levels == 0:
        level_names = [f"level_{i}" for i in range(index.nlevels)]
    else:
        # Level ``i`` receives a tuple name of length ``base_depth + i``;
        # a length of zero falls back to a plain string name.
        base_depth = max(multiindex_levels_names_max_levels + 1 - index.nlevels, 0)
        level_names = []
        for i in range(index.nlevels):
            depth = base_depth + i
            if depth > 0:
                level_names.append(
                    tuple([f"level_{i}_name_{j}" for j in range(0, depth)])
                )
            else:
                level_names.append(f"level_{i}")
    index.names = level_names

    if none_in_index_names is True:
        index.names = [None] * len(index.names)
    elif none_in_index_names:
        # "mixed_1st_None" blanks every other name starting at position 0,
        # any other truthy value starts at position 1.
        names_list = list(index.names)
        start_index = 0 if none_in_index_names == "mixed_1st_None" else 1
        names_list[start_index::2] = [None] * len(names_list[start_index::2])
        index.names = names_list

    modin_df = pd.DataFrame(data, index=index, columns=columns)
    pandas_df = pandas.DataFrame(data, index=index, columns=columns)

    if isinstance(level, list):
        # Placeholders "level_name_<k>" stand for the actual name of level k.
        prefix = "level_name_"
        resolved_level = []
        for item in level:
            if isinstance(item, str) and item.startswith(prefix):
                resolved_level.append(index.names[int(item[len(prefix) :])])
            else:
                resolved_level.append(item)
        level = resolved_level

    kwargs = {"drop": drop}
    optional_arguments = (
        ("level", level, "no_level"),
        ("col_level", col_level, "no_col_level"),
        ("col_fill", col_fill, "no_col_fill"),
    )
    for keyword, value, sentinel in optional_arguments:
        if value != sentinel:
            kwargs[keyword] = value

    eval_general(modin_df, pandas_df, lambda df: df.reset_index(**kwargs))
@pytest.mark.parametrize(
    "data",
    [
        pytest.param(
            test_data["int_data"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        test_data["float_nan_data"],
    ],
    ids=["int_data", "float_nan_data"],
)
@pytest.mark.parametrize("nlevels", [3])
@pytest.mark.parametrize(
    "level",
    [
        "no_level",
        None,
        0,
        1,
        2,
        [2, 0],
        [2, 1],
        [1, 0],
        pytest.param(
            [2, 1, 2],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            [0, 0, 0, 0],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            ["level_name_1"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            ["level_name_2", "level_name_1"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        pytest.param(
            [2, "level_name_0"],
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
    ],
)
@pytest.mark.parametrize(
    "multiindex_levels_names_max_levels",
    [
        0,
        1,
        2,
        pytest.param(
            3, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra")
        ),
        pytest.param(
            4, marks=pytest.mark.skipif(not extra_test_parameters, reason="extra")
        ),
    ],
)
@pytest.mark.parametrize(
    "none_in_index_names",
    [
        pytest.param(
            False,
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
        True,
        "mixed_1st_None",
        pytest.param(
            "mixed_2nd_None",
            marks=pytest.mark.skipif(not extra_test_parameters, reason="extra"),
        ),
    ],
)
def test_reset_index_with_multi_index_drop(
    data, nlevels, level, multiindex_levels_names_max_levels, none_in_index_names
):
    """Run the no-drop ``reset_index`` scenario with ``drop=True``.

    Delegates to ``test_reset_index_with_multi_index_no_drop`` with MultiIndex
    columns enabled and the ``col_level`` / ``col_fill`` sentinels that make
    the delegate omit those keywords.
    """
    test_reset_index_with_multi_index_no_drop(
        data=data,
        nlevels=nlevels,
        columns_multiindex=True,
        level=level,
        col_level="no_col_level",
        col_fill="no_col_fill",
        drop=True,
        multiindex_levels_names_max_levels=multiindex_levels_names_max_levels,
        none_in_index_names=none_in_index_names,
    )
@pytest.mark.parametrize("index_levels_names_max_levels", [0, 1, 2])
def test_reset_index_with_named_index(index_levels_names_max_levels):
    """Compare ``reset_index`` on frames whose index has a string or tuple name."""
    if index_levels_names_max_levels > 0:
        index_name = tuple(
            [f"name_{j}" for j in range(0, index_levels_names_max_levels)]
        )
    else:
        index_name = "NAME_OF_INDEX"

    def build_named_frames():
        # Fresh Modin/pandas pair from the first test dataset, index named alike.
        modin_frame = pd.DataFrame(test_data_values[0])
        pandas_frame = pandas.DataFrame(test_data_values[0])
        modin_frame.index.name = pandas_frame.index.name = index_name
        return modin_frame, pandas_frame

    modin_df, pandas_df = build_named_frames()
    df_equals(modin_df, pandas_df)
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))

    modin_df.reset_index(drop=True, inplace=True)
    pandas_df.reset_index(drop=True, inplace=True)
    df_equals(modin_df, pandas_df)

    # Repeat the non-dropping reset on freshly built frames.
    modin_df, pandas_df = build_named_frames()
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))
@pytest.mark.parametrize(
    "index",
    [
        pandas.Index([11, 22, 33, 44], name="col0"),
        pandas.MultiIndex.from_product(
            [[100, 200], [300, 400]], names=["level1", "col0"]
        ),
    ],
    ids=["index", "multiindex"],
)
def test_reset_index_metadata_update(index):
    """Reset an index whose (level) name equals the frame's original column label.

    The column is renamed from ``col0`` to ``col1`` before the reset.
    """
    modin_df, pandas_df = create_test_dfs({"col0": [0, 1, 2, 3]}, index=index)

    renamed_columns = ["col1"]
    modin_df.columns = renamed_columns
    pandas_df.columns = renamed_columns

    def reset(df):
        return df.reset_index()

    eval_general(modin_df, pandas_df, reset)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
def test_sample(data, axis):
    """Check ``sample`` argument validation on Modin and result parity with pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    def assert_same_sample(**sample_kwargs):
        # Run the same ``sample`` call on both frames and compare the results.
        df_equals(
            modin_df.sample(axis=axis, **sample_kwargs),
            pandas_df.sample(axis=axis, **sample_kwargs),
        )

    # Argument combinations that must be rejected (checked on Modin only).
    with pytest.raises(ValueError):
        modin_df.sample(n=3, frac=0.4, axis=axis)

    with pytest.raises(KeyError):
        modin_df.sample(frac=0.5, weights="CoLuMn_No_ExIsT", axis=0)

    with pytest.raises(ValueError):
        modin_df.sample(frac=0.5, weights=modin_df.columns[0], axis=1)

    # Weight lists one element shorter than the sampled axis.
    short_row_weights = [0.5] * len(modin_df.index[:-1])
    with pytest.raises(ValueError):
        modin_df.sample(frac=0.5, weights=short_row_weights, axis=0)

    short_column_weights = [0.5] * len(modin_df.columns[:-1])
    with pytest.raises(ValueError):
        modin_df.sample(frac=0.5, weights=short_column_weights, axis=1)

    with pytest.raises(ValueError):
        modin_df.sample(n=-3, axis=axis)

    with pytest.raises(ValueError):
        modin_df.sample(frac=0.2, weights=pandas.Series(), axis=axis)

    num_axis = (
        pandas.DataFrame()._get_axis_number(axis) if isinstance(axis, str) else axis
    )
    axis_length = len(modin_df.axes[num_axis])
    alternating = [i % 2 for i in range(axis_length)]

    # weights that sum to 1
    total = sum(alternating)
    normalized = [value / total for value in alternating]
    assert_same_sample(frac=0.5, random_state=42, weights=normalized)

    # weights that don't sum to 1
    assert_same_sample(frac=0.5, random_state=42, weights=list(alternating))

    assert_same_sample(n=0)
    assert_same_sample(frac=0.5, random_state=42)
    assert_same_sample(n=2, random_state=42)

    # issue #1692, numpy RandomState object
    # We must create a new random state for each iteration because the values that
    # are selected will be impacted if the object has already been used.
    modin_result = modin_df.sample(
        frac=0.5, random_state=np.random.RandomState(42), axis=axis
    )
    pandas_result = pandas_df.sample(
        frac=0.5, random_state=np.random.RandomState(42), axis=axis
    )
    df_equals(modin_result, pandas_result)
def test_select_dtypes():
    """Check ``select_dtypes`` with include/exclude filters against pandas column picks.

    Also verifies that calling ``select_dtypes`` with neither ``include`` nor
    ``exclude`` raises ``ValueError``.
    """
    frame_data = {
        "test1": list("abc"),
        "test2": np.arange(3, 6).astype("u1"),
        "test3": np.arange(8.0, 11.0, dtype="float64"),
        "test4": [True, False, True],
        "test5": pandas.date_range("now", periods=3).values,
        "test6": list(range(5, 8)),
    }
    df = pandas.DataFrame(frame_data)
    rd = pd.DataFrame(frame_data)

    # The builtin ``float`` is used because the ``np.float`` alias (which was
    # just another name for it) has been removed from NumPy.
    include = float, "integer"
    exclude = (np.bool_,)
    r = rd.select_dtypes(include=include, exclude=exclude)
    e = df[["test2", "test3", "test6"]]
    df_equals(r, e)

    r = rd.select_dtypes(include=np.bool_)
    e = df[["test4"]]
    df_equals(r, e)

    r = rd.select_dtypes(exclude=np.bool_)
    e = df[["test1", "test2", "test3", "test5", "test6"]]
    df_equals(r, e)

    with pytest.raises(ValueError):
        pd.DataFrame().select_dtypes()
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys))
def test_tail(data, n):
    """Compare ``tail`` for the parametrized ``n`` and for the full frame length."""
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    modin_tail = modin_df.tail(n)
    pandas_tail = pandas_df.tail(n)
    df_equals(modin_tail, pandas_tail)

    modin_full = modin_df.tail(len(modin_df))
    pandas_full = pandas_df.tail(len(pandas_df))
    df_equals(modin_full, pandas_full)
def test_xs():
    """``xs`` on a MultiIndex frame must warn that it defaults to pandas."""
    animals = {
        "num_legs": [4, 4, 2, 2],
        "num_wings": [0, 0, 2, 2],
        "class": ["mammal", "mammal", "mammal", "bird"],
        "animal": ["cat", "dog", "bat", "penguin"],
        "locomotion": ["walks", "walks", "flies", "walks"],
    }
    index_columns = ["class", "animal", "locomotion"]
    df = pd.DataFrame(data=animals).set_index(index_columns)

    with warns_that_defaulting_to_pandas():
        df.xs("mammal")
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___getitem__(data):
    """Compare ``__getitem__`` for a single column label and for row slices."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Single column lookup yields a Modin Series equal to the pandas column.
    first_column = modin_df.columns[0]
    modin_col = modin_df.__getitem__(first_column)
    assert isinstance(modin_col, pd.Series)
    df_equals(pandas_df[first_column], modin_col)

    # slice test
    row_slices = [
        slice(None, -1),
        slice(-1, None),
        slice(1, 2),
        slice(1, None),
        slice(None, 1),
        slice(1, -1),
        slice(-3, -1),
        slice(1, -1, 2),
        slice(-1, 1, -1),
        slice(None, None, 2),
    ]
    for row_slice in row_slices:
        df_equals(modin_df[row_slice], pandas_df[row_slice])

    # Test empty
    empty_modin = pd.DataFrame([])
    empty_pandas = pandas.DataFrame([])
    df_equals(empty_modin[:10], empty_pandas[:10])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___getitem_bool_indexers(data):
    """Compare ``.loc`` with boolean row/column selectors given as lists and as Series."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Select every third row and every fifth column.
    row_mask = [position % 3 == 0 for position in range(len(modin_df.index))]
    column_mask = [position % 5 == 0 for position in range(len(modin_df.columns))]

    # Key is a list of booleans
    df_equals(
        modin_df.loc[row_mask, column_mask],
        pandas_df.loc[row_mask, column_mask],
    )

    # Key is a Modin or pandas series of booleans
    modin_rows = pd.Series(row_mask)
    modin_columns = pd.Series(column_mask, index=modin_df.columns)
    modin_result = modin_df.loc[modin_rows, modin_columns]

    pandas_rows = pandas.Series(row_mask)
    pandas_columns = pandas.Series(column_mask, index=modin_df.columns)
    pandas_result = pandas_df.loc[pandas_rows, pandas_columns]

    df_equals(modin_result, pandas_result)
def test_getitem_empty_mask():
    """Index a concatenation of three frames with an all-``False`` boolean list."""
    # modin-project/modin#517
    modin_frames = []
    pandas_frames = []
    column_labels = list("ABCD")
    for _ in range(3):
        values = np.random.randint(0, 100, size=(100, 4))
        modin_frames.append(pd.DataFrame(values, columns=column_labels))
        pandas_frames.append(pandas.DataFrame(values, columns=column_labels))

    modin_data = pd.concat(modin_frames)
    pandas_data = pandas.concat(pandas_frames)

    all_false = [False] * len(modin_data.index)
    df_equals(modin_data[all_false], pandas_data[list(all_false)])
def test_getitem_datetime_slice():
    """Slice frames with a daily date index using string date bounds."""
    index = pd.date_range("2017/1/4", periods=1000)
    data = {"data": range(1000)}
    modin_df = pd.DataFrame(data=data, index=index)
    pandas_df = pandas.DataFrame(data=data, index=index)

    date_window = slice("2017-01-06", "2017-01-09")
    modin_rows = modin_df[date_window]
    pandas_rows = pandas_df[date_window]
    df_equals(modin_rows, pandas_rows)
def test_getitem_same_name():
    """Compare column selection on a frame where the label ``c1`` appears twice."""
    columns = ["c1", "c2", "c1", "c3"]
    data = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
        [17, 18, 19, 20],
    ]
    modin_df = pd.DataFrame(data, columns=columns)
    pandas_df = pandas.DataFrame(data, columns=columns)

    # Duplicated label, unique label, list of labels, another unique label.
    for key in ("c1", "c2", ["c1", "c2"], "c3"):
        df_equals(modin_df[key], pandas_df[key])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___getattr__(request, data):
    """Check that columns are reachable as attributes and yield Modin Series.

    The checks are skipped for parametrizations whose node name contains
    ``"empty_data"``.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841

    if "empty_data" not in request.node.name:
        key = modin_df.columns[0]
        col = modin_df.__getattr__(key)
        # Verify this lookup as well instead of discarding its result.
        assert isinstance(col, pd.Series)

        col = modin_df.__getattr__("col1")
        assert isinstance(col, pd.Series)

        col = getattr(modin_df, "col1")
        assert isinstance(col, pd.Series)

        # Check that lookup in column doesn't override other attributes
        df2 = modin_df.rename(index=str, columns={key: "columns"})
        assert isinstance(df2.columns, pandas.Index)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___setitem__(data):
    """Compare ``__setitem__`` between Modin and pandas for many value kinds.

    Covers scalar, frame, 2-D array and 1-D array values through
    ``eval_setitem``, column-by-column filling of an empty frame, Series
    assignment, 2-D single-column ndarray assignment, slice assignment and
    assignment on transposed frames.
    """
    eval_setitem(*create_test_dfs(data), loc=-1, value=1)
    eval_setitem(
        *create_test_dfs(data), loc=-1, value=lambda df: type(df)(df[df.columns[0]])
    )

    nrows = len(data[list(data.keys())[0]])
    arr = np.arange(nrows * 2).reshape(-1, 2)

    eval_setitem(*create_test_dfs(data), loc=-1, value=arr)
    eval_setitem(*create_test_dfs(data), col="___NON EXISTENT COLUMN", value=arr)
    eval_setitem(*create_test_dfs(data), loc=0, value=np.arange(nrows))

    modin_df = pd.DataFrame(columns=data.keys())
    pandas_df = pandas.DataFrame(columns=data.keys())

    for col in modin_df.columns:
        modin_df[col] = np.arange(1000)

    for col in pandas_df.columns:
        pandas_df[col] = np.arange(1000)

    df_equals(modin_df, pandas_df)

    # Test series assignment to column
    modin_df = pd.DataFrame(columns=modin_df.columns)
    pandas_df = pandas.DataFrame(columns=pandas_df.columns)
    modin_df[modin_df.columns[-1]] = modin_df[modin_df.columns[0]]
    pandas_df[pandas_df.columns[-1]] = pandas_df[pandas_df.columns[0]]
    df_equals(modin_df, pandas_df)

    # Comparing the version tuple avoids the operator-precedence trap of
    # ``not major == 3 and minor > 6``, which is always False on Python 3
    # and therefore skipped this section on every interpreter.
    if sys.version_info >= (3, 7):
        # This test doesn't work correctly on Python 3.6
        # Test 2d ndarray assignment to column
        modin_df = pd.DataFrame(data)
        pandas_df = pandas.DataFrame(data)
        modin_df["new_col"] = modin_df[[modin_df.columns[0]]].values
        pandas_df["new_col"] = pandas_df[[pandas_df.columns[0]]].values
        df_equals(modin_df, pandas_df)
        assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0]))

        modin_df[1:5] = 10
        pandas_df[1:5] = 10
        df_equals(modin_df, pandas_df)

    # Transpose test
    modin_df = pd.DataFrame(data).T
    pandas_df = pandas.DataFrame(data).T

    modin_df[modin_df.columns[0]] = 0
    pandas_df[pandas_df.columns[0]] = 0
    df_equals(modin_df, pandas_df)

    modin_df.columns = [str(i) for i in modin_df.columns]
    pandas_df.columns = [str(i) for i in pandas_df.columns]

    modin_df[modin_df.columns[0]] = 0
    pandas_df[pandas_df.columns[0]] = 0
    df_equals(modin_df, pandas_df)

    # Chained assignment: set one cell through a column selection.
    modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345
    pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345
    df_equals(modin_df, pandas_df)

    modin_df[1:5] = 10
    pandas_df[1:5] = 10
    df_equals(modin_df, pandas_df)
def test___setitem__partitions_aligning():
    """Assign Series whose index differs from the frame's index.

    First a five-element Series is assigned to a three-row frame, then
    range-indexed Series are assigned to a frame indexed by timestamps that
    contain a duplicate.
    """
    # from issue #2390
    short_data = {"a": [1, 2, 3]}
    longer_values = [4, 5, 6, 7, 8]
    modin_df = pd.DataFrame(short_data)
    pandas_df = pandas.DataFrame(short_data)
    modin_df["b"] = pd.Series(longer_values)
    pandas_df["b"] = pandas.Series(longer_values)
    df_equals(modin_df, pandas_df)

    # from issue #2442
    # Index with duplicated timestamp
    index = pandas.to_datetime(["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"])
    md_df, pd_df = create_test_dfs({"a": [1, 2, 3, 4]}, index=index)

    # "b" is first created and then overwritten; "a" already exists.
    for column in ("b", "b", "a"):
        pd_df[column] = pandas.Series(np.arange(4))
        md_df[column] = pd.Series(np.arange(4))
        df_equals(md_df, pd_df)
def test___setitem__with_mismatched_partitions():
    """Assign a Series to a new column of a frame read from a 200000x99 CSV file.

    The CSV is written into a temporary directory that is removed when the
    test finishes, so no file is left behind in the working directory.
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        fname = os.path.join(tmp_dir, "200kx99.csv")
        np.savetxt(
            fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
        )
        modin_df = pd.read_csv(fname)
        pandas_df = pandas.read_csv(fname)
        modin_df["new"] = pd.Series(list(range(len(modin_df))))
        pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
        # Compare while the file still exists, in case any read is deferred.
        df_equals(modin_df, pandas_df)
def test___setitem__mask():
    """Assign a scalar through a boolean DataFrame mask and a boolean array mask."""
    data = test_data["int_data"]
    mean = int((RAND_HIGH + RAND_LOW) / 2)
    replacement = -50

    # DataFrame mask:
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)
    pandas_df[pandas_df > mean] = replacement
    modin_df[modin_df > mean] = replacement
    df_equals(modin_df, pandas_df)

    # Array mask:
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)
    mask = (pandas_df > mean).to_numpy()
    modin_df[mask] = replacement
    pandas_df[mask] = replacement
    df_equals(modin_df, pandas_df)

    # Array mask of wrong size:
    wrong_size_mask = np.array([[1, 2], [3, 4]])
    with pytest.raises(ValueError):
        modin_df[wrong_size_mask] = 20
@pytest.mark.parametrize(
    "data",
    [
        {},
        {"id": [], "max_speed": [], "health": []},
        {"id": [1], "max_speed": [2], "health": [3]},
        {"id": [4, 40, 400], "max_speed": [111, 222, 333], "health": [33, 22, 11]},
    ],
    ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"],
)
@pytest.mark.parametrize(
    "value",
    [[11, 22], [11, 22, 33]],
    ids=["2_length_val", "3_length_val"],
)
@pytest.mark.parametrize("convert_to_series", [False, True])
@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"])
def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id):
    """Assign a list or Series as a new column on empty and small frames."""
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    def assign_new_column(df):
        new_values = value
        if convert_to_series:
            # Wrap the value in the Series type of the frame's own library.
            series_cls = pandas.Series if isinstance(df, pandas.DataFrame) else pd.Series
            new_values = series_cls(value)
        df[new_col_id] = new_values
        return df

    eval_general(modin_df, pandas_df, assign_new_column)
def test_setitem_on_empty_df_4407():
    """Add a timestamp-labelled column to a frame whose columns are an empty date range.

    Besides frame equality, the ``freq`` of the resulting columns must match.
    """
    empty_columns = pd.date_range(end="1/1/2018", periods=0, freq="D")
    new_label = pd.date_range(end="1/1/2018", periods=1, freq="h")[0]

    modin_df = pd.DataFrame({}, columns=empty_columns)
    pandas_df = pandas.DataFrame({}, columns=empty_columns)

    modin_df[new_label] = pd.Series([1])
    pandas_df[new_label] = pandas.Series([1])

    df_equals(modin_df, pandas_df)
    modin_freq = modin_df.columns.freq
    pandas_freq = pandas_df.columns.freq
    assert modin_freq == pandas_freq
def test___setitem__unhashable_list():
    """Assign a frame's own two-column selection back under a list key."""
    # from #3258 and #3291
    cols = ["a", "b"]
    frames = []
    for frame_cls in (pd.DataFrame, pandas.DataFrame):
        frame = frame_cls([[0, 0]], columns=cols)
        frame[cols] = frame[cols]
        frames.append(frame)

    modin_df, pandas_df = frames
    df_equals(modin_df, pandas_df)
def test___setitem__single_item_in_series():
    """Assign a one-element Series into the ``[:1]`` slice of a one-element Series."""
    # Test assigning a single item in a Series for issue
    # https://github.com/modin-project/modin/issues/3860
    results = []
    for series_cls in (pd.Series, pandas.Series):
        series = series_cls(99)
        series[:1] = series_cls(100)
        results.append(series)

    modin_series, pandas_series = results
    df_equals(modin_series, pandas_series)
def test___setitem__assigning_single_categorical_sets_correct_dtypes():
    """Overwrite a one-row string column with a one-element ``Categorical``."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/3895
    frames = []
    for library in (pd, pandas):
        frame = library.DataFrame({"categories": ["A"]})
        frame["categories"] = library.Categorical(["A"])
        frames.append(frame)

    modin_df, pandas_df = frames
    df_equals(modin_df, pandas_df)
def test_iloc_assigning_scalar_none_to_string_frame():
    """Set the only cell of a ``"string"``-dtype frame to ``None`` via ``iloc``."""
    # This test case comes from
    # https://github.com/modin-project/modin/issues/3981
    data = [["A"]]
    frames = []
    for frame_cls in (pd.DataFrame, pandas.DataFrame):
        frame = frame_cls(data, dtype="string")
        frame.iloc[0, 0] = None
        frames.append(frame)

    modin_df, pandas_df = frames
    df_equals(modin_df, pandas_df)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___len__(data):
    """``len`` of a Modin frame must equal ``len`` of the pandas frame."""
    modin_length = len(pd.DataFrame(data))
    pandas_length = len(pandas.DataFrame(data))
    assert modin_length == pandas_length
def test_index_order():
    """Compare the result index of level-0 reductions on a random two-level index."""
    # see #1708 and #1869 for details
    source = test_data["float_nan_data"]
    df_modin = pd.DataFrame(source)
    df_pandas = pandas.DataFrame(source)

    rows_number = len(df_modin.index)
    candidates = list(range(10))
    level_0 = np.random.choice(candidates, rows_number)
    level_1 = np.random.choice(candidates, rows_number)
    index = pandas.MultiIndex.from_arrays([level_0, level_1])

    df_modin.index = index
    df_pandas.index = index

    for func in ["all", "any", "mad", "count"]:
        modin_reduced = getattr(df_modin, func)(level=0)
        pandas_reduced = getattr(df_pandas, func)(level=0)
        df_equals(modin_reduced.index, pandas_reduced.index)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("sortorder", [0, 3, 5])
def test_multiindex_from_frame(data, sortorder):
    """Compare ``MultiIndex.from_frame`` built from a Modin frame and a pandas frame."""
    modin_df, pandas_df = create_test_dfs(data)

    def call_from_frame(df):
        # Pick the namespace by the module that defines the frame's class.
        defined_in_pandas = type(df).__module__.startswith("pandas")
        multiindex_cls = pandas.MultiIndex if defined_in_pandas else pd.MultiIndex
        return multiindex_cls.from_frame(df, sortorder)

    eval_general(modin_df, pandas_df, call_from_frame, comparator=assert_index_equal)