from datetime import datetime
from io import StringIO
import re
from typing import Dict
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Index, Series, Timestamp, date_range
from pandas.tests.frame.common import TestData
from pandas.util.testing import assert_frame_equal, assert_series_equal
@pytest.fixture
def mix_ab() -> Dict[str, list]:
return {"a": list(range(4)), "b": list("ab..")}
@pytest.fixture
def mix_abc() -> Dict[str, list]:
return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]}
class TestDataFrameReplace(TestData):
def test_replace_inplace(self):
self.tsframe["A"][:5] = np.nan
self.tsframe["A"][-5:] = np.nan
tsframe = self.tsframe.copy()
tsframe.replace(np.nan, 0, inplace=True)
assert_frame_equal(tsframe, self.tsframe.fillna(0))
# mixed type
mf = self.mixed_frame
mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
result = self.mixed_frame.replace(np.nan, 0)
expected = self.mixed_frame.fillna(value=0)
assert_frame_equal(result, expected)
tsframe = self.tsframe.copy()
tsframe.replace([np.nan], [0], inplace=True)
assert_frame_equal(tsframe, self.tsframe.fillna(0))
def test_regex_replace_scalar(self, mix_ab):
obj = {"a": list("ab.."), "b": list("efgh")}
dfobj = DataFrame(obj)
dfmix = DataFrame(mix_ab)
# simplest cases
# regex -> value
# obj frame
res = dfobj.replace(r"\s*\.\s*", np.nan, regex=True)
assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.replace(r"\s*\.\s*", np.nan, regex=True)
assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True)
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
assert_frame_equal(res, expec)
# with mixed
res = dfmix.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True)
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
# everything with compiled regexs as well
res = dfobj.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True)
assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True)
assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1")
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
assert_frame_equal(res, expec)
# with mixed
res = dfmix.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1")
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
res = dfmix.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1")
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
res = dfmix.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1")
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
def test_regex_replace_scalar_inplace(self, mix_ab):
obj = {"a": list("ab.."), "b": list("efgh")}
dfobj = DataFrame(obj)
dfmix = DataFrame(mix_ab)
# simplest cases
# regex -> value
# obj frame
res = dfobj.copy()
res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True)
assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True)
assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True)
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
assert_frame_equal(res, expec)
# with mixed
res = dfmix.copy()
res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True)
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
# everything with compiled regexs as well
res = dfobj.copy()
res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True)
assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True)
assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True)
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
assert_frame_equal(res, expec)
# with mixed
res = dfmix.copy()
res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True)
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
res = dfobj.copy()
res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True)
assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True)
assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True)
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
assert_frame_equal(res, expec)
# with mixed
res = dfmix.copy()
res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True)
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
# everything with compiled regexs as well
res = dfobj.copy()
res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True)
assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True)
assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True)
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
assert_frame_equal(res, expec)
# with mixed
res = dfmix.copy()
res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True)
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
assert_frame_equal(res, expec)
def test_regex_replace_list_obj(self):
obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")}
dfobj = DataFrame(obj)
# lists of regexes and values
# list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
to_replace_res = [r"\s*\.\s*", r"e|f|g"]
values = [np.nan, "crap"]
res = dfobj.replace(to_replace_res, values, regex=True)
expec = DataFrame(
{
"a": ["a", "b", np.nan, np.nan],
"b": ["crap"] * 3 + ["h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"]
values = [r"\1\1", r"\1_crap"]
res = dfobj.replace(to_replace_res, values, regex=True)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["e_crap", "f_crap", "g_crap", "h"],
"c": ["h", "e_crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
# or vN)]
to_replace_res = [r"\s*(\.)\s*", r"e"]
values = [r"\1\1", r"crap"]
res = dfobj.replace(to_replace_res, values, regex=True)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["crap", "f", "g", "h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
to_replace_res = [r"\s*(\.)\s*", r"e"]
values = [r"\1\1", r"crap"]
res = dfobj.replace(value=values, regex=to_replace_res)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["crap", "f", "g", "h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
def test_regex_replace_list_obj_inplace(self):
# same as above with inplace=True
# lists of regexes and values
obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")}
dfobj = DataFrame(obj)
# lists of regexes and values
# list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
to_replace_res = [r"\s*\.\s*", r"e|f|g"]
values = [np.nan, "crap"]
res = dfobj.copy()
res.replace(to_replace_res, values, inplace=True, regex=True)
expec = DataFrame(
{
"a": ["a", "b", np.nan, np.nan],
"b": ["crap"] * 3 + ["h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"]
values = [r"\1\1", r"\1_crap"]
res = dfobj.copy()
res.replace(to_replace_res, values, inplace=True, regex=True)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["e_crap", "f_crap", "g_crap", "h"],
"c": ["h", "e_crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
# or vN)]
to_replace_res = [r"\s*(\.)\s*", r"e"]
values = [r"\1\1", r"crap"]
res = dfobj.copy()
res.replace(to_replace_res, values, inplace=True, regex=True)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["crap", "f", "g", "h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
to_replace_res = [r"\s*(\.)\s*", r"e"]
values = [r"\1\1", r"crap"]
res = dfobj.copy()
res.replace(value=values, regex=to_replace_res, inplace=True)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["crap", "f", "g", "h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
def test_regex_replace_list_mixed(self, mix_ab):
# mixed frame to make sure this doesn't break things
dfmix = DataFrame(mix_ab)
# lists of regexes and values
# list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
to_replace_res = [r"\s*\.\s*", r"a"]
values = [np.nan, "crap"]
mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")}
dfmix2 = DataFrame(mix2)
res = dfmix2.replace(to_replace_res, values, regex=True)
expec = DataFrame(
{
"a": mix2["a"],
"b": ["crap", "b", np.nan, np.nan],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
to_replace_res = [r"\s*(\.)\s*", r"(a|b)"]
values = [r"\1\1", r"\1_crap"]
res = dfmix.replace(to_replace_res, values, regex=True)
expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]})
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
# or vN)]
to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
values = [r"\1\1", r"crap", r"\1_crap"]
res = dfmix.replace(to_replace_res, values, regex=True)
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
assert_frame_equal(res, expec)
to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
values = [r"\1\1", r"crap", r"\1_crap"]
res = dfmix.replace(regex=to_replace_res, value=values)
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
assert_frame_equal(res, expec)
def test_regex_replace_list_mixed_inplace(self, mix_ab):
dfmix = DataFrame(mix_ab)
# the same inplace
# lists of regexes and values
# list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
to_replace_res = [r"\s*\.\s*", r"a"]
values = [np.nan, "crap"]
res = dfmix.copy()
res.replace(to_replace_res, values, inplace=True, regex=True)
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]})
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
to_replace_res = [r"\s*(\.)\s*", r"(a|b)"]
values = [r"\1\1", r"\1_crap"]
res = dfmix.copy()
res.replace(to_replace_res, values, inplace=True, regex=True)
expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]})
assert_frame_equal(res, expec)
# list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
# or vN)]
to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
values = [r"\1\1", r"crap", r"\1_crap"]
res = dfmix.copy()
res.replace(to_replace_res, values, inplace=True, regex=True)
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
assert_frame_equal(res, expec)
to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
values = [r"\1\1", r"crap", r"\1_crap"]
res = dfmix.copy()
res.replace(regex=to_replace_res, value=values, inplace=True)
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
assert_frame_equal(res, expec)
def test_regex_replace_dict_mixed(self, mix_abc):
dfmix = DataFrame(mix_abc)
# dicts
# single dict {re1: v1}, search the whole frame
# need test for this...
# list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
# frame
res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True)
res2 = dfmix.copy()
res2.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
# list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
# whole frame
res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True)
res2 = dfmix.copy()
res2.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"})
res2 = dfmix.copy()
res2.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
# scalar -> dict
# to_replace regex, {value: value}
expec = DataFrame(
{"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]}
)
res = dfmix.replace("a", {"b": np.nan}, regex=True)
res2 = dfmix.copy()
res2.replace("a", {"b": np.nan}, regex=True, inplace=True)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
res = dfmix.replace("a", {"b": np.nan}, regex=True)
res2 = dfmix.copy()
res2.replace(regex="a", value={"b": np.nan}, inplace=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
def test_regex_replace_dict_nested(self, mix_abc):
# nested dicts will not work until this is implemented for Series
dfmix = DataFrame(mix_abc)
res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True)
res2 = dfmix.copy()
res4 = dfmix.copy()
res2.replace({"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True)
res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}})
res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
assert_frame_equal(res3, expec)
assert_frame_equal(res4, expec)
def test_regex_replace_dict_nested_non_first_character(self):
# GH 25259
df = pd.DataFrame({"first": ["abc", "bca", "cab"]})
expected = pd.DataFrame({"first": [".bc", "bc.", "c.b"]})
result = df.replace({"a": "."}, regex=True)
assert_frame_equal(result, expected)
def test_regex_replace_dict_nested_gh4115(self):
df = pd.DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
result = df.replace({"Type": {"Q": 0, "T": 1}})
assert_frame_equal(result, expected)
def test_regex_replace_list_to_scalar(self, mix_abc):
df = DataFrame(mix_abc)
expec = DataFrame(
{
"a": mix_abc["a"],
"b": np.array([np.nan] * 4),
"c": [np.nan, np.nan, np.nan, "d"],
}
)
res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True)
res2 = df.copy()
res3 = df.copy()
res2.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True)
res3.replace(regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
assert_frame_equal(res3, expec)
def test_regex_replace_str_to_numeric(self, mix_abc):
# what happens when you try to replace a numeric value with a regex?
df = DataFrame(mix_abc)
res = df.replace(r"\s*\.\s*", 0, regex=True)
res2 = df.copy()
res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True)
res3 = df.copy()
res3.replace(regex=r"\s*\.\s*", value=0, inplace=True)
expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]})
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
assert_frame_equal(res3, expec)
def test_regex_replace_regex_list_to_numeric(self, mix_abc):
df = DataFrame(mix_abc)
res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
res2 = df.copy()
res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True)
res3 = df.copy()
res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
assert_frame_equal(res3, expec)
def test_regex_replace_series_of_regexes(self, mix_abc):
df = DataFrame(mix_abc)
s1 = Series({"b": r"\s*\.\s*"})
s2 = Series({"b": np.nan})
res = df.replace(s1, s2, regex=True)
res2 = df.copy()
res2.replace(s1, s2, inplace=True, regex=True)
res3 = df.copy()
res3.replace(regex=s1, value=s2, inplace=True)
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
)
assert_frame_equal(res, expec)
assert_frame_equal(res2, expec)
assert_frame_equal(res3, expec)
def test_regex_replace_numeric_to_object_conversion(self, mix_abc):
df = DataFrame(mix_abc)
expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]})
res = df.replace(0, "a")
assert_frame_equal(res, expec)
assert res.a.dtype == np.object_
@pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"])
def test_replace_regex_metachar(self, metachar):
df = DataFrame({"a": [metachar, "else"]})
result = df.replace({"a": {metachar: "paren"}})
expected = DataFrame({"a": ["paren", "else"]})
assert_frame_equal(result, expected)
def test_replace(self):
self.tsframe["A"][:5] = np.nan
self.tsframe["A"][-5:] = np.nan
zero_filled = self.tsframe.replace(np.nan, -1e8)
assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8))
assert_frame_equal(zero_filled.replace(-1e8, np.nan), self.tsframe)
self.tsframe["A"][:5] = np.nan
self.tsframe["A"][-5:] = np.nan
self.tsframe["B"][:5] = -1e8
# empty
df = DataFrame(index=["a", "b"])
assert_frame_equal(df, df.replace(5, 7))
# GH 11698
# test for mixed data types.
df = pd.DataFrame(
[("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))]
)
df1 = df.replace("-", np.nan)
expected_df = pd.DataFrame(
[(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))]
)
assert_frame_equal(df1, expected_df)
def test_replace_list(self):
obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")}
dfobj = DataFrame(obj)
# lists of regexes and values
# list of [v1, v2, ..., vN] -> [v1, v2, ..., vN]
to_replace_res = [r".", r"e"]
values = [np.nan, "crap"]
res = dfobj.replace(to_replace_res, values)
expec = DataFrame(
{
"a": ["a", "b", np.nan, np.nan],
"b": ["crap", "f", "g", "h"],
"c": ["h", "crap", "l", "o"],
}
)
assert_frame_equal(res, expec)
# list of [v1, v2, ..., vN] -> [v1, v2, .., vN]
to_replace_res = [r".", r"f"]
values = [r"..", r"crap"]
res = dfobj.replace(to_replace_res, values)
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
"b": ["e", "crap", "g", "h"],
"c": ["h", "e", "l", "o"],
}
)
assert_frame_equal(res, expec)
def test_replace_with_empty_list(self):
# GH 21977
s = pd.Series([["a", "b"], [], np.nan, [1]])
df = pd.DataFrame({"col": s})
expected = df
result = df.replace([], np.nan)
assert_frame_equal(result, expected)
# GH 19266
with pytest.raises(ValueError, match="cannot assign mismatch"):
df.replace({np.nan: []})
with pytest.raises(ValueError, match="cannot assign mismatch"):
df.replace({np.nan: ["dummy", "alt"]})
def test_replace_series_dict(self):
# from GH 3064
df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
result = df.replace(0, {"zero": 0.5, "one": 1.0})
expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
assert_frame_equal(result, expected)
result = df.replace(0, df.mean())
assert_frame_equal(result, expected)
# series to series/dict
df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
s = Series({"zero": 0.0, "one": 2.0})
result = df.replace(s, {"zero": 0.5, "one": 1.0})
expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}})
assert_frame_equal(result, expected)
result = df.replace(s, df.mean())
assert_frame_equal(result, expected)
def test_replace_convert(self):
# gh 3907
df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
m = {"foo": 1, "bar": 2, "bah": 3}
rep = df.replace(m)
expec = Series([np.int64] * 3)
res = rep.dtypes
assert_series_equal(expec, res)
def test_replace_mixed(self):
mf = self.mixed_frame
mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
result = self.mixed_frame.replace(np.nan, -18)
expected = self.mixed_frame.fillna(value=-18)
assert_frame_equal(result, expected)
assert_frame_equal(result.replace(-18, np.nan), self.mixed_frame)
result = self.mixed_frame.replace(np.nan, -1e8)
expected = self.mixed_frame.fillna(value=-1e8)
assert_frame_equal(result, expected)
assert_frame_equal(result.replace(-1e8, np.nan), self.mixed_frame)
# int block upcasting
df = DataFrame(
{
"A": Series([1.0, 2.0], dtype="float64"),
"B": Series([0, 1], dtype="int64"),
}
)
expected = DataFrame(
{
"A": Series([1.0, 2.0], dtype="float64"),
"B": Series([0.5, 1], dtype="float64"),
}
)
result = df.replace(0, 0.5)
assert_frame_equal(result, expected)
df.replace(0, 0.5, inplace=True)
assert_frame_equal(df, expected)
# int block splitting
df = DataFrame(
{
"A": Series([1.0, 2.0], dtype="float64"),
"B": Series([0, 1], dtype="int64"),
"C": Series([1, 2], dtype="int64"),
}
)
expected = DataFrame(
{
"A": Series([1.0, 2.0], dtype="float64"),
"B": Series([0.5, 1], dtype="float64"),
"C": Series([1, 2], dtype="int64"),
}
)
result = df.replace(0, 0.5)
assert_frame_equal(result, expected)
# to object block upcasting
df = DataFrame(
{
"A": Series([1.0, 2.0], dtype="float64"),
"B": Series([0, 1], dtype="int64"),
}
)
expected = DataFrame(
{
"A": Series([1, "foo"], dtype="object"),
"B": Series([0, 1], dtype="int64"),
}
)
result = df.replace(2, "foo")
assert_frame_equal(result, expected)
expected = DataFrame(
{
"A": Series(["foo", "bar"], dtype="object"),
"B": Series([0, "foo"], dtype="object"),
}
)
result = df.replace([1, 2], ["foo", "bar"])
assert_frame_equal(result, expected)
# test case from
df = DataFrame(
{"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")}
)
result = df.replace(3, df.mean().to_dict())
expected = df.copy().astype("float64")
m = df.mean()
expected.iloc[0, 0] = m[0]
expected.iloc[1, 1] = m[1]
assert_frame_equal(result, expected)
def test_replace_simple_nested_dict(self):
df = DataFrame({"col": range(1, 5)})
expected = DataFrame({"col": ["a", 2, 3, "b"]})
result = df.replace({"col": {1: "a", 4: "b"}})
assert_frame_equal(expected, result)
# in this case, should be the same as the not nested version
result = df.replace({1: "a", 4: "b"})
assert_frame_equal(expected, result)
def test_replace_simple_nested_dict_with_nonexistent_value(self):
df = DataFrame({"col": range(1, 5)})
expected = DataFrame({"col": ["a", 2, 3, "b"]})
result = df.replace({-1: "-", 1: "a", 4: "b"})
assert_frame_equal(expected, result)
result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}})
assert_frame_equal(expected, result)
def test_replace_value_is_none(self):
orig_value = self.tsframe.iloc[0, 0]
orig2 = self.tsframe.iloc[1, 0]
self.tsframe.iloc[0, 0] = np.nan
self.tsframe.iloc[1, 0] = 1
result = self.tsframe.replace(to_replace={np.nan: 0})
expected = self.tsframe.T.replace(to_replace={np.nan: 0}).T
assert_frame_equal(result, expected)
result = self.tsframe.replace(to_replace={np.nan: 0, 1: -1e8})
tsframe = self.tsframe.copy()
tsframe.iloc[0, 0] = 0
tsframe.iloc[1, 0] = -1e8
expected = tsframe
assert_frame_equal(expected, result)
self.tsframe.iloc[0, 0] = orig_value
self.tsframe.iloc[1, 0] = orig2
def test_replace_for_new_dtypes(self):
# dtypes
tsframe = self.tsframe.copy().astype(np.float32)
tsframe["A"][:5] = np.nan
tsframe["A"][-5:] = np.nan
zero_filled = tsframe.replace(np.nan, -1e8)
assert_frame_equal(zero_filled, tsframe.fillna(-1e8))
assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe)
tsframe["A"][:5] = np.nan
tsframe["A"][-5:] = np.nan
tsframe["B"][:5] = -1e8
b = tsframe["B"]
b[b == -1e8] = np.nan
tsframe["B"] = b
result = tsframe.fillna(method="bfill")
assert_frame_equal(result, tsframe.fillna(method="bfill"))
@pytest.mark.parametrize(
"frame, to_replace, value, expected",
[
(DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})),
(
DataFrame({"ints": [1, 2, 3]}, dtype=np.int32),
1,
0,
DataFrame({"ints": [0, 2, 3]}, dtype=np.int32),
),
(
DataFrame({"ints": [1, 2, 3]}, dtype=np.int16),
1,
0,
DataFrame({"ints": [0, 2, 3]}, dtype=np.int16),
),
(
DataFrame({"bools": [True, False, True]}),
False,
True,
DataFrame({"bools": [True, True, True]}),
),
(
DataFrame({"complex": [1j, 2j, 3j]}),
1j,
0,
DataFrame({"complex": [0j, 2j, 3j]}),
),
(
DataFrame(
{
"datetime64": Index(
[
datetime(2018, 5, 28),
datetime(2018, 7, 28),
datetime(2018, 5, 28),
]
)
}
),
datetime(2018, 5, 28),
datetime(2018, 7, 28),
DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}),
),
# GH 20380
(
DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}),
"foo",
"bar",
DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}),
),
(
DataFrame(
{
"A": date_range("20130101", periods=3, tz="US/Eastern"),
"B": [0, np.nan, 2],
}
),
Timestamp("20130102", tz="US/Eastern"),
Timestamp("20130104", tz="US/Eastern"),
DataFrame(
{
"A": [
Timestamp("20130101", tz="US/Eastern"),
Timestamp("20130104", tz="US/Eastern"),
Timestamp("20130103", tz="US/Eastern"),
],
"B": [0, np.nan, 2],
}
),
),
],
)
def test_replace_dtypes(self, frame, to_replace, value, expected):
result = getattr(frame, "replace")(to_replace, value)
assert_frame_equal(result, expected)
def test_replace_input_formats_listlike(self):
# both dicts
to_rep = {"A": np.nan, "B": 0, "C": ""}
values = {"A": 0, "B": -1, "C": "missing"}
df = DataFrame(
{"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
)
filled = df.replace(to_rep, values)
expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()}
assert_frame_equal(filled, DataFrame(expected))
result = df.replace([0, 2, 5], [5, 2, 0])
expected = DataFrame(
{"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]}
)
assert_frame_equal(result, expected)
# scalar to dict
values = {"A": 0, "B": -1, "C": "missing"}
df = DataFrame(
{"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
)
filled = df.replace(np.nan, values)
expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()}
assert_frame_equal(filled, DataFrame(expected))
# list to list
to_rep = [np.nan, 0, ""]
values = [-2, -1, "missing"]
result = df.replace(to_rep, values)
expected = df.copy()
for i in range(len(to_rep)):
expected.replace(to_rep[i], values[i], inplace=True)
assert_frame_equal(result, expected)
msg = r"Replacement lists must match in length\. Expecting 3 got 2"
with pytest.raises(ValueError, match=msg):
df.replace(to_rep, values[1:])
def test_replace_input_formats_scalar(self):
df = DataFrame(
{"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
)
# dict to scalar
to_rep = {"A": np.nan, "B": 0, "C": ""}
filled = df.replace(to_rep, 0)
expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()}
assert_frame_equal(filled, DataFrame(expected))
msg = "value argument must be scalar, dict, or Series"
with pytest.raises(TypeError, match=msg):
df.replace(to_rep, [np.nan, 0, ""])
# list to scalar
to_rep = [np.nan, 0, ""]
result = df.replace(to_rep, -1)
expected = df.copy()
for i in range(len(to_rep)):
expected.replace(to_rep[i], -1, inplace=True)
assert_frame_equal(result, expected)
def test_replace_limit(self):
pass
def test_replace_dict_no_regex(self):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
)
weights = {
"Agree": 4,
"Disagree": 2,
"Neutral": 3,
"Strongly Agree": 5,
"Strongly Disagree": 1,
}
expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
result = answer.replace(weights)
assert_series_equal(result, expected)
def test_replace_series_no_regex(self):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
)
weights = Series(
{
"Agree": 4,
"Disagree": 2,
"Neutral": 3,
"Strongly Agree": 5,
"Strongly Disagree": 1,
}
)
expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
result = answer.replace(weights)
assert_series_equal(result, expected)
def test_replace_dict_tuple_list_ordering_remains_the_same(self):
df = DataFrame(dict(A=[np.nan, 1]))
res1 = df.replace(to_replace={np.nan: 0, 1: -1e8})
res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0])
res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0])
expected = DataFrame({"A": [0, -1e8]})
assert_frame_equal(res1, res2)
assert_frame_equal(res2, res3)
assert_frame_equal(res3, expected)
def test_replace_doesnt_replace_without_regex(self):
raw = """fol T_opp T_Dir T_Enh
0 1 0 0 vo
1 2 vr 0 0
2 2 0 0 0
3 3 0 bt 0"""
df = pd.read_csv(StringIO(raw), sep=r"\s+")
res = df.replace({r"\D": 1})
assert_frame_equal(df, res)
def test_replace_bool_with_string(self):
df = DataFrame({"a": [True, False], "b": list("ab")})
result = df.replace(True, "a")
expected = DataFrame({"a": ["a", False], "b": df.b})
assert_frame_equal(result, expected)
def test_replace_pure_bool_with_string_no_op(self):
df = DataFrame(np.random.rand(2, 2) > 0.5)
result = df.replace("asdf", "fdsa")
assert_frame_equal(df, result)
def test_replace_bool_with_bool(self):
df = DataFrame(np.random.rand(2, 2) > 0.5)
result = df.replace(False, True)
expected = DataFrame(np.ones((2, 2), dtype=bool))
assert_frame_equal(result, expected)
def test_replace_with_dict_with_bool_keys(self):
df = DataFrame({0: [True, False], 1: [False, True]})
with pytest.raises(TypeError, match="Cannot compare types .+"):
df.replace({"asdf": "asdb", True: "yes"})
def test_replace_truthy(self):
df = DataFrame({"a": [True, True]})
r = df.replace([np.inf, -np.inf], np.nan)
e = df
assert_frame_equal(r, e)
def test_replace_int_to_int_chain(self):
df = DataFrame({"a": list(range(1, 5))})
with pytest.raises(ValueError, match="Replacement not allowed .+"):
df.replace({"a": dict(zip(range(1, 5), range(2, 6)))})
def test_replace_str_to_str_chain(self):
a = np.arange(1, 5)
astr = a.astype(str)
bstr = np.arange(2, 6).astype(str)
df = DataFrame({"a": astr})
with pytest.raises(ValueError, match="Replacement not allowed .+"):
df.replace({"a": dict(zip(astr, bstr))})
def test_replace_swapping_bug(self):
df = pd.DataFrame({"a": [True, False, True]})
res = df.replace({"a": {True: "Y", False: "N"}})
expect = pd.DataFrame({"a": ["Y", "N", "Y"]})
assert_frame_equal(res, expect)
df = pd.DataFrame({"a": [0, 1, 0]})
res = df.replace({"a": {0: "Y", 1: "N"}})
expect = pd.DataFrame({"a": ["Y", "N", "Y"]})
assert_frame_equal(res, expect)
def test_replace_period(self):
d = {
"fname": {
"out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"),
"out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"),
"out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"),
"out_augmented_SUBSIDY_WEEK.json": pd.Period(
year=2011, month=4, freq="M"
),
"out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"),
"out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"),
"out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"),
}
}
df = pd.DataFrame(
[
"out_augmented_AUG_2012.json",
"out_augmented_SEP_2013.json",
"out_augmented_SUBSIDY_WEEK.json",
"out_augmented_MAY_2012.json",
"out_augmented_MAY_2011.json",
"out_augmented_AUG_2011.json",
"out_augmented_JAN_2011.json",
],
columns=["fname"],
)
assert set(df.fname.values) == set(d["fname"].keys())
# We don't support converting object -> specialized EA in
# replace yet.
expected = DataFrame(
{"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object
)
result = df.replace(d)
assert_frame_equal(result, expected)
def test_replace_datetime(self):
d = {
"fname": {
"out_augmented_AUG_2011.json": pd.Timestamp("2011-08"),
"out_augmented_JAN_2011.json": pd.Timestamp("2011-01"),
"out_augmented_MAY_2012.json": pd.Timestamp("2012-05"),
"out_augmented_SUBSIDY_WEEK.json": pd.Timestamp("2011-04"),
"out_augmented_AUG_2012.json": pd.Timestamp("2012-08"),
"out_augmented_MAY_2011.json": pd.Timestamp("2011-05"),
"out_augmented_SEP_2013.json": pd.Timestamp("2013-09"),
}
}
df = pd.DataFrame(
[
"out_augmented_AUG_2012.json",
"out_augmented_SEP_2013.json",
"out_augmented_SUBSIDY_WEEK.json",
"out_augmented_MAY_2012.json",
"out_augmented_MAY_2011.json",
"out_augmented_AUG_2011.json",
"out_augmented_JAN_2011.json",
],
columns=["fname"],
)
assert set(df.fname.values) == set(d["fname"].keys())
expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]})
result = df.replace(d)
assert_frame_equal(result, expected)
def test_replace_datetimetz(self):
# GH 11326
# behaving poorly when presented with a datetime64[ns, tz]
df = DataFrame(
{
"A": date_range("20130101", periods=3, tz="US/Eastern"),
"B": [0, np.nan, 2],
}
)
result = df.replace(np.nan, 1)
expected = DataFrame(
{
"A": date_range("20130101", periods=3, tz="US/Eastern"),
"B": Series([0, 1, 2], dtype="float64"),
}
)
assert_frame_equal(result, expected)
result = df.fillna(1)
assert_frame_equal(result, expected)
result = df.replace(0, np.nan)
expected = DataFrame(
{
"A": date_range("20130101", periods=3, tz="US/Eastern"),
"B": [np.nan, np.nan, 2],
}
)
assert_frame_equal(result, expected)
result = df.replace(
Timestamp("20130102", tz="US/Eastern"),
Timestamp("20130104", tz="US/Eastern"),
)
expected = DataFrame(
{
"A": [
Timestamp("20130101", tz="US/Eastern"),
Timestamp("20130104", tz="US/Eastern"),
Timestamp("20130103", tz="US/Eastern"),
],
"B": [0, np.nan, 2],
}
)
assert_frame_equal(result, expected)
result = df.copy()
result.iloc[1, 0] = np.nan
result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern"))
assert_frame_equal(result, expected)
# coerce to object
result = df.copy()
result.iloc[1, 0] = np.nan
result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific"))
expected = DataFrame(
{
"A": [
Timestamp("20130101", tz="US/Eastern"),
Timestamp("20130104", tz="US/Pacific"),
Timestamp("20130103", tz="US/Eastern"),
],
"B": [0, np.nan, 2],
}
)
assert_frame_equal(result, expected)
result = df.copy()
result.iloc[1, 0] = np.nan
result = result.replace({"A": np.nan}, Timestamp("20130104"))
expected = DataFrame(
{
"A": [
Timestamp("20130101", tz="US/Eastern"),
Timestamp("20130104"),
Timestamp("20130103", tz="US/Eastern"),
],
"B": [0, np.nan, 2],
}
)
assert_frame_equal(result, expected)
def test_replace_with_empty_dictlike(self, mix_abc):
# GH 15289
df = DataFrame(mix_abc)
assert_frame_equal(df, df.replace({}))
assert_frame_equal(df, df.replace(Series([])))
assert_frame_equal(df, df.replace({"b": {}}))
assert_frame_equal(df, df.replace(Series({"b": {}})))
@pytest.mark.parametrize(
"to_replace, method, expected",
[
(0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}),
(
np.nan,
"bfill",
{"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]},
),
("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}),
(
[0, 2],
"bfill",
{"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
),
(
[1, 2],
"pad",
{"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
),
(
(1, 2),
"bfill",
{"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
),
(
["b", "c"],
"ffill",
{"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]},
),
],
)
def test_replace_method(self, to_replace, method, expected):
# GH 19632
df = DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]})
result = df.replace(to_replace=to_replace, value=None, method=method)
expected = DataFrame(expected)
assert_frame_equal(result, expected)