Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / pandas   python

Repository URL to install this package:

Version: 1.1.1 

/ tests / indexing / test_chaining_and_caching.py

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, Series, Timestamp, date_range, option_context
import pandas._testing as tm
import pandas.core.common as com


class TestCaching:
    def test_slice_consolidate_invalidate_item_cache(self):

        # this is chained assignment, but will 'work'
        with option_context("chained_assignment", None):

            # #3970
            df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5})

            # Creates a second float block
            df["cc"] = 0.0

            # caches a reference to the 'bb' series
            df["bb"]

            # repr machinery triggers consolidation
            repr(df)

            # Assignment to wrong series
            df["bb"].iloc[0] = 0.17
            df._clear_item_cache()
            tm.assert_almost_equal(df["bb"][0], 0.17)

    def test_setitem_cache_updating(self):
        # GH 5424
        cont = ["one", "two", "three", "four", "five", "six", "seven"]

        for do_ref in [False, False]:
            df = DataFrame({"a": cont, "b": cont[3:] + cont[:3], "c": np.arange(7)})

            # ref the cache
            if do_ref:
                df.loc[0, "c"]

            # set it
            df.loc[7, "c"] = 1

            assert df.loc[0, "c"] == 0.0
            assert df.loc[7, "c"] == 1.0

        # GH 7084
        # not updating cache on series setting with slices
        expected = DataFrame(
            {"A": [600, 600, 600]}, index=date_range("5/7/2014", "5/9/2014")
        )
        out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014"))
        df = DataFrame({"C": ["A", "A", "A"], "D": [100, 200, 300]})

        # loop through df to update out
        six = Timestamp("5/7/2014")
        eix = Timestamp("5/9/2014")
        for ix, row in df.iterrows():
            out.loc[six:eix, row["C"]] = out.loc[six:eix, row["C"]] + row["D"]

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out["A"], expected["A"])

        # try via a chain indexing
        # this actually works
        out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014"))
        for ix, row in df.iterrows():
            v = out[row["C"]][six:eix] + row["D"]
            out[row["C"]][six:eix] = v

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out["A"], expected["A"])

        out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014"))
        for ix, row in df.iterrows():
            out.loc[six:eix, row["C"]] += row["D"]

        tm.assert_frame_equal(out, expected)
        tm.assert_series_equal(out["A"], expected["A"])


class TestChaining:
    def test_setitem_chained_setfault(self):

        # GH6026
        data = ["right", "left", "left", "left", "right", "left", "timeout"]
        mdata = ["right", "left", "left", "left", "right", "left", "none"]

        df = DataFrame({"response": np.array(data)})
        mask = df.response == "timeout"
        df.response[mask] = "none"
        tm.assert_frame_equal(df, DataFrame({"response": mdata}))

        recarray = np.rec.fromarrays([data], names=["response"])
        df = DataFrame(recarray)
        mask = df.response == "timeout"
        df.response[mask] = "none"
        tm.assert_frame_equal(df, DataFrame({"response": mdata}))

        df = DataFrame({"response": data, "response1": data})
        mask = df.response == "timeout"
        df.response[mask] = "none"
        tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data}))

        # GH 6056
        expected = DataFrame(dict(A=[np.nan, "bar", "bah", "foo", "bar"]))
        df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"])))
        df["A"].iloc[0] = np.nan
        result = df.head()
        tm.assert_frame_equal(result, expected)

        df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"])))
        df.A.iloc[0] = np.nan
        result = df.head()
        tm.assert_frame_equal(result, expected)

    def test_detect_chained_assignment(self):

        pd.set_option("chained_assignment", "raise")

        # work with the chain
        expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB"))
        df = DataFrame(np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64")
        assert df._is_copy is None

        df["A"][0] = -5
        df["A"][1] = -6
        tm.assert_frame_equal(df, expected)

        # test with the chaining
        df = DataFrame(
            {
                "A": Series(range(2), dtype="int64"),
                "B": np.array(np.arange(2, 4), dtype=np.float64),
            }
        )
        assert df._is_copy is None

        with pytest.raises(com.SettingWithCopyError):
            df["A"][0] = -5

        with pytest.raises(com.SettingWithCopyError):
            df["A"][1] = np.nan

        assert df["A"]._is_copy is None

        # Using a copy (the chain), fails
        df = DataFrame(
            {
                "A": Series(range(2), dtype="int64"),
                "B": np.array(np.arange(2, 4), dtype=np.float64),
            }
        )

        with pytest.raises(com.SettingWithCopyError):
            df.loc[0]["A"] = -5

        # Doc example
        df = DataFrame(
            {
                "a": ["one", "one", "two", "three", "two", "one", "six"],
                "c": Series(range(7), dtype="int64"),
            }
        )
        assert df._is_copy is None

        with pytest.raises(com.SettingWithCopyError):
            indexer = df.a.str.startswith("o")
            df[indexer]["c"] = 42

        expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]})
        df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})

        with pytest.raises(com.SettingWithCopyError):
            df["A"][0] = 111

        with pytest.raises(com.SettingWithCopyError):
            df.loc[0]["A"] = 111

        df.loc[0, "A"] = 111
        tm.assert_frame_equal(df, expected)

        # gh-5475: Make sure that is_copy is picked up reconstruction
        df = DataFrame({"A": [1, 2]})
        assert df._is_copy is None

        with tm.ensure_clean("__tmp__pickle") as path:
            df.to_pickle(path)
            df2 = pd.read_pickle(path)
            df2["B"] = df2["A"]
            df2["B"] = df2["A"]

        # gh-5597: a spurious raise as we are setting the entire column here
        from string import ascii_letters as letters

        def random_text(nobs=100):
            df = []
            for i in range(nobs):
                idx = np.random.randint(len(letters), size=2)
                idx.sort()

                df.append([letters[idx[0] : idx[1]]])

            return DataFrame(df, columns=["letters"])

        df = random_text(100000)

        # Always a copy
        x = df.iloc[[0, 1, 2]]
        assert x._is_copy is not None

        x = df.iloc[[0, 1, 2, 4]]
        assert x._is_copy is not None

        # Explicitly copy
        indexer = df.letters.apply(lambda x: len(x) > 10)
        df = df.loc[indexer].copy()

        assert df._is_copy is None
        df["letters"] = df["letters"].apply(str.lower)

        # Implicitly take
        df = random_text(100000)
        indexer = df.letters.apply(lambda x: len(x) > 10)
        df = df.loc[indexer]

        assert df._is_copy is not None
        df["letters"] = df["letters"].apply(str.lower)

        # Implicitly take 2
        df = random_text(100000)
        indexer = df.letters.apply(lambda x: len(x) > 10)

        df = df.loc[indexer]
        assert df._is_copy is not None
        df.loc[:, "letters"] = df["letters"].apply(str.lower)

        # Should be ok even though it's a copy!
        assert df._is_copy is None

        df["letters"] = df["letters"].apply(str.lower)
        assert df._is_copy is None

        df = random_text(100000)
        indexer = df.letters.apply(lambda x: len(x) > 10)
        df.loc[indexer, "letters"] = df.loc[indexer, "letters"].apply(str.lower)

        # an identical take, so no copy
        df = DataFrame({"a": [1]}).dropna()
        assert df._is_copy is None
        df["a"] += 1

        df = DataFrame(np.random.randn(10, 4))
        s = df.iloc[:, 0].sort_values()

        tm.assert_series_equal(s, df.iloc[:, 0].sort_values())
        tm.assert_series_equal(s, df[0].sort_values())

        # see gh-6025: false positives
        df = DataFrame({"column1": ["a", "a", "a"], "column2": [4, 8, 9]})
        str(df)

        df["column1"] = df["column1"] + "b"
        str(df)

        df = df[df["column2"] != 8]
        str(df)

        df["column1"] = df["column1"] + "c"
        str(df)

        # from SO:
        # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
        df = DataFrame(np.arange(0, 9), columns=["count"])
        df["group"] = "b"

        with pytest.raises(com.SettingWithCopyError):
            df.iloc[0:5]["group"] = "a"

        # Mixed type setting but same dtype & changing dtype
        df = DataFrame(
            dict(
                A=date_range("20130101", periods=5),
                B=np.random.randn(5),
                C=np.arange(5, dtype="int64"),
                D=list("abcde"),
            )
        )

        with pytest.raises(com.SettingWithCopyError):
            df.loc[2]["D"] = "foo"

        with pytest.raises(com.SettingWithCopyError):
            df.loc[2]["C"] = "foo"

        with pytest.raises(com.SettingWithCopyError):
            df["C"][2] = "foo"

    def test_setting_with_copy_bug(self):

        # operating on a copy
        df = DataFrame(
            {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]}
        )
        mask = pd.isna(df.c)

        msg = "A value is trying to be set on a copy of a slice from a DataFrame"
        with pytest.raises(com.SettingWithCopyError, match=msg):
            df[["c"]][mask] = df[["b"]][mask]

        # invalid warning as we are returning a new object
        # GH 8730
        df1 = DataFrame({"x": Series(["a", "b", "c"]), "y": Series(["d", "e", "f"])})
        df2 = df1[["x"]]

        # this should not raise
        df2["y"] = ["g", "h", "i"]

    def test_detect_chained_assignment_warnings(self):
        with option_context("chained_assignment", "warn"):
            df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})

            with tm.assert_produces_warning(com.SettingWithCopyWarning):
                df.loc[0]["A"] = 111

    def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self):
        # xref gh-13017.
        with option_context("chained_assignment", "warn"):
            df = pd.DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]
            )

            with tm.assert_produces_warning(com.SettingWithCopyWarning):
                df.c.loc[df.c > 0] = None

            expected = pd.DataFrame(
                [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]
            )
            tm.assert_frame_equal(df, expected)

    def test_chained_getitem_with_lists(self):

        # GH6394
        # Regression in chained getitem indexing with embedded list-like from
        # 0.12

        df = DataFrame({"A": 5 * [np.zeros(3)], "B": 5 * [np.ones(3)]})
        expected = df["A"].iloc[2]
        result = df.loc[2, "A"]
        tm.assert_numpy_array_equal(result, expected)
        result2 = df.iloc[2]["A"]
        tm.assert_numpy_array_equal(result2, expected)
        result3 = df["A"].loc[2]
        tm.assert_numpy_array_equal(result3, expected)
        result4 = df["A"].iloc[2]
        tm.assert_numpy_array_equal(result4, expected)

    def test_cache_updating(self):
        # GH 4939, make sure to update the cache on setitem

        df = tm.makeDataFrame()
        df["A"]  # cache series
        df.loc["Hello Friend"] = df.iloc[0]
        assert "Hello Friend" in df["A"].index
        assert "Hello Friend" in df["B"].index

        # 10264
        df = DataFrame(
            np.zeros((5, 5), dtype="int64"),
            columns=["a", "b", "c", "d", "e"],
            index=range(5),
        )
        df["f"] = 0
        df.f.values[3] = 1

        df.f.values[3] = 2
        expected = DataFrame(
            np.zeros((5, 6), dtype="int64"),
            columns=["a", "b", "c", "d", "e", "f"],
            index=range(5),
        )
        expected.at[3, "f"] = 2
        tm.assert_frame_equal(df, expected)
        expected = Series([0, 0, 0, 2, 0], name="f")
        tm.assert_series_equal(df.f, expected)