tests/indexing/test_loc.py · alkaline-ml/pandas

alkaline-ml / pandas python

Repository URL to install this package:
Version: 1.1.1

/ tests / indexing / test_loc.py

""" test label based indexing with loc """
from io import StringIO
import re

import numpy as np
import pytest

from pandas.compat.numpy import _is_numpy_dev

import pandas as pd
from pandas import DataFrame, Series, Timestamp, date_range
import pandas._testing as tm
from pandas.api.types import is_scalar
from pandas.tests.indexing.common import Base


class TestLoc(Base):
    def test_loc_getitem_int(self):

        # int label
        self.check_result("loc", 2, typs=["labels"], fails=KeyError)

    def test_loc_getitem_label(self):

        # label
        self.check_result("loc", "c", typs=["empty"], fails=KeyError)

    def test_loc_getitem_label_out_of_range(self):

        # out of range label
        self.check_result(
            "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError,
        )
        self.check_result("loc", "f", typs=["floats"], fails=KeyError)
        self.check_result("loc", "f", typs=["floats"], fails=KeyError)
        self.check_result(
            "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError,
        )
        self.check_result("loc", 20, typs=["labels"], fails=KeyError)
        self.check_result("loc", 20, typs=["ts"], axes=0, fails=KeyError)
        self.check_result("loc", 20, typs=["floats"], axes=0, fails=KeyError)

    def test_loc_getitem_label_list(self):
        # TODO: test something here?
        # list of labels
        pass

    def test_loc_getitem_label_list_with_missing(self):
        self.check_result(
            "loc", [0, 1, 2], typs=["empty"], fails=KeyError,
        )
        self.check_result(
            "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError,
        )

        self.check_result(
            "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError,
        )

        # GH 17758 - MultiIndex and missing keys
        self.check_result(
            "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError,
        )

    def test_loc_getitem_label_list_fails(self):
        # fails
        self.check_result(
            "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError,
        )

    def test_loc_getitem_label_array_like(self):
        # TODO: test something?
        # array like
        pass

    def test_loc_getitem_bool(self):
        # boolean indexers
        b = [True, False, True, False]

        self.check_result("loc", b, typs=["empty"], fails=IndexError)

    def test_loc_getitem_label_slice(self):

        # label slices (with ints)

        # real label slices

        # GH 14316

        self.check_result(
            "loc",
            slice(1, 3),
            typs=["labels", "mixed", "empty", "ts", "floats"],
            fails=TypeError,
        )

        self.check_result(
            "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError,
        )

        self.check_result(
            "loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError,
        )
        self.check_result(
            "loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError,
        )

        self.check_result(
            "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError,
        )

    def test_setitem_from_duplicate_axis(self):
        # GH#34034
        df = DataFrame(
            [[20, "a"], [200, "a"], [200, "a"]],
            columns=["col1", "col2"],
            index=[10, 1, 1],
        )
        df.loc[1, "col1"] = np.arange(2)
        expected = DataFrame(
            [[20, "a"], [0, "a"], [1, "a"]], columns=["col1", "col2"], index=[10, 1, 1]
        )
        tm.assert_frame_equal(df, expected)


class TestLoc2:
    # TODO: better name, just separating out things that rely on base class

    def test_loc_getitem_dups(self):
        # GH 5678
        # repeated getitems on a dup index returning a ndarray
        df = DataFrame(
            np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)]
        )
        expected = df.loc["A", 0]
        result = df.loc[:, 0].loc["A"]
        tm.assert_series_equal(result, expected)

    def test_loc_getitem_dups2(self):

        # GH4726
        # dup indexing with iloc/loc
        df = DataFrame(
            [[1, 2, "foo", "bar", Timestamp("20130101")]],
            columns=["a", "a", "a", "a", "a"],
            index=[1],
        )
        expected = Series(
            [1, 2, "foo", "bar", Timestamp("20130101")],
            index=["a", "a", "a", "a", "a"],
            name=1,
        )

        result = df.iloc[0]
        tm.assert_series_equal(result, expected)

        result = df.loc[1]
        tm.assert_series_equal(result, expected)

    def test_loc_setitem_dups(self):

        # GH 6541
        df_orig = DataFrame(
            {
                "me": list("rttti"),
                "foo": list("aaade"),
                "bar": np.arange(5, dtype="float64") * 1.34 + 2,
                "bar2": np.arange(5, dtype="float64") * -0.34 + 2,
            }
        ).set_index("me")

        indexer = tuple(["r", ["bar", "bar2"]])
        df = df_orig.copy()
        df.loc[indexer] *= 2.0
        tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])

        indexer = tuple(["r", "bar"])
        df = df_orig.copy()
        df.loc[indexer] *= 2.0
        assert df.loc[indexer] == 2.0 * df_orig.loc[indexer]

        indexer = tuple(["t", ["bar", "bar2"]])
        df = df_orig.copy()
        df.loc[indexer] *= 2.0
        tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer])

    def test_loc_setitem_slice(self):
        # GH10503

        # assigning the same type should not change the type
        df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")})
        ix = df1["a"] == 1
        newb1 = df1.loc[ix, "b"] + 1
        df1.loc[ix, "b"] = newb1
        expected = DataFrame(
            {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")}
        )
        tm.assert_frame_equal(df1, expected)

        # assigning a new type should get the inferred type
        df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
        ix = df1["a"] == 1
        newb2 = df2.loc[ix, "b"]
        df1.loc[ix, "b"] = newb2
        expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64")
        tm.assert_frame_equal(df2, expected)

    def test_loc_setitem_dtype(self):
        # GH31340
        df = DataFrame({"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]})
        cols = ["a", "b", "c"]
        df.loc[:, cols] = df.loc[:, cols].astype("float32")

        expected = DataFrame(
            {"id": ["A"], "a": [1.2], "b": [0.0], "c": [-2.5]}, dtype="float32"
        )  # id is inferred as object

        tm.assert_frame_equal(df, expected)

    def test_getitem_label_list_with_missing(self):
        s = Series(range(3), index=["a", "b", "c"])

        # consistency
        with pytest.raises(KeyError, match="with any missing labels"):
            s[["a", "d"]]

        s = Series(range(3))
        with pytest.raises(KeyError, match="with any missing labels"):
            s[[0, 3]]

    @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]])
    def test_loc_getitem_bool_diff_len(self, index):
        # GH26658
        s = Series([1, 2, 3])
        msg = f"Boolean index has wrong length: {len(index)} instead of {len(s)}"
        with pytest.raises(IndexError, match=msg):
            _ = s.loc[index]

    def test_loc_getitem_int_slice(self):
        # TODO: test something here?
        pass

    def test_loc_to_fail(self):

        # GH3449
        df = DataFrame(
            np.random.random((3, 3)), index=["a", "b", "c"], columns=["e", "f", "g"]
        )

        # raise a KeyError?
        msg = (
            r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are "
            r"in the \[index\]\""
        )
        with pytest.raises(KeyError, match=msg):
            df.loc[[1, 2], [1, 2]]

        # GH  7496
        # loc should not fallback

        s = Series(dtype=object)
        s.loc[1] = 1
        s.loc["a"] = 2

        with pytest.raises(KeyError, match=r"^-1$"):
            s.loc[-1]

        msg = (
            r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are "
            r"in the \[index\]\""
        )
        with pytest.raises(KeyError, match=msg):
            s.loc[[-1, -2]]

        msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\""
        with pytest.raises(KeyError, match=msg):
            s.loc[["4"]]

        s.loc[-1] = 3
        with pytest.raises(KeyError, match="with any missing labels"):
            s.loc[[-1, -2]]

        s["a"] = 2
        msg = (
            r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are "
            r"in the \[index\]\""
        )
        with pytest.raises(KeyError, match=msg):
            s.loc[[-2]]

        del s["a"]

        with pytest.raises(KeyError, match=msg):
            s.loc[[-2]] = 0

        # inconsistency between .loc[values] and .loc[values,:]
        # GH 7999
        df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"])

        msg = (
            r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are "
            r"in the \[index\]\""
        )
        with pytest.raises(KeyError, match=msg):
            df.loc[[3], :]

        with pytest.raises(KeyError, match=msg):
            df.loc[[3]]

    def test_loc_getitem_list_with_fail(self):
        # 15747
        # should KeyError if *any* missing labels

        s = Series([1, 2, 3])

        s.loc[[2]]

        with pytest.raises(
            KeyError,
            match=re.escape(
                "\"None of [Int64Index([3], dtype='int64')] are in the [index]\""
            ),
        ):
            s.loc[[3]]

        # a non-match and a match
        with pytest.raises(KeyError, match="with any missing labels"):
            s.loc[[2, 3]]

    def test_loc_index(self):
        # gh-17131
        # a boolean index should index like a boolean numpy array

        df = DataFrame(
            np.random.random(size=(5, 10)),
            index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"],
        )

        mask = df.index.map(lambda x: "alpha" in x)
        expected = df.loc[np.array(mask)]

        result = df.loc[mask]
        tm.assert_frame_equal(result, expected)

        result = df.loc[mask.values]
        tm.assert_frame_equal(result, expected)

        result = df.loc[pd.array(mask, dtype="boolean")]
        tm.assert_frame_equal(result, expected)

    def test_loc_general(self):

        df = DataFrame(
            np.random.rand(4, 4),
            columns=["A", "B", "C", "D"],
            index=["A", "B", "C", "D"],
        )

        # want this to work
        result = df.loc[:, "A":"B"].iloc[0:2, :]
        assert (result.columns == ["A", "B"]).all()
        assert (result.index == ["A", "B"]).all()

        # mixed type
        result = DataFrame({"a": [Timestamp("20130101")], "b": [1]}).iloc[0]
        expected = Series([Timestamp("20130101"), 1], index=["a", "b"], name=0)
        tm.assert_series_equal(result, expected)
        assert result.dtype == object

    def test_loc_setitem_consistency(self):
        # GH 6149
        # coerce similarly for setitem and loc when rows have a null-slice
        expected = DataFrame(
            {
                "date": Series(0, index=range(5), dtype=np.int64),
                "val": Series(range(5), dtype=np.int64),
            }
        )

        df = DataFrame(
            {
                "date": date_range("2000-01-01", "2000-01-5"),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df.loc[:, "date"] = 0
        tm.assert_frame_equal(df, expected)

        df = DataFrame(
            {
                "date": date_range("2000-01-01", "2000-01-5"),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df.loc[:, "date"] = np.array(0, dtype=np.int64)
        tm.assert_frame_equal(df, expected)

        df = DataFrame(
            {
                "date": date_range("2000-01-01", "2000-01-5"),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df.loc[:, "date"] = np.array([0, 0, 0, 0, 0], dtype=np.int64)
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(
            {
                "date": Series("foo", index=range(5)),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df = DataFrame(
            {
                "date": date_range("2000-01-01", "2000-01-5"),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df.loc[:, "date"] = "foo"
        tm.assert_frame_equal(df, expected)

        expected = DataFrame(
            {
                "date": Series(1.0, index=range(5)),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df = DataFrame(
            {
                "date": date_range("2000-01-01", "2000-01-5"),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        df.loc[:, "date"] = 1.0
        tm.assert_frame_equal(df, expected)

        # GH 15494
        # setting on frame with single row
        df = DataFrame({"date": Series([Timestamp("20180101")])})
        df.loc[:, "date"] = "string"
        expected = DataFrame({"date": Series(["string"])})
        tm.assert_frame_equal(df, expected)

    def test_loc_setitem_consistency_empty(self):
        # empty (essentially noops)
        expected = DataFrame(columns=["x", "y"])
        expected["x"] = expected["x"].astype(np.int64)
        df = DataFrame(columns=["x", "y"])
        df.loc[:, "x"] = 1
        tm.assert_frame_equal(df, expected)

        df = DataFrame(columns=["x", "y"])
        df["x"] = 1
        tm.assert_frame_equal(df, expected)

    def test_loc_setitem_consistency_slice_column_len(self):
        # .loc[:,column] setting with slice == len of the column
        # GH10408
        data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat
Level_1,,,Something,StartDate,EndDate,Yes/No,SomethingElse
Region,Site,RespondentID,,,,,
Region_1,Site_1,3987227376,A,5/25/2015 10:59,5/25/2015 11:22,Yes,
Region_1,Site_1,3980680971,A,5/21/2015 9:40,5/21/2015 9:52,Yes,Yes
Region_1,Site_2,3977723249,A,5/20/2015 8:27,5/20/2015 8:41,Yes,
Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No"""

        df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2])
        df.loc[:, ("Respondent", "StartDate")] = pd.to_datetime(
            df.loc[:, ("Respondent", "StartDate")]
        )
        df.loc[:, ("Respondent", "EndDate")] = pd.to_datetime(
            df.loc[:, ("Respondent", "EndDate")]
        )
        df.loc[:, ("Respondent", "Duration")] = (
            df.loc[:, ("Respondent", "EndDate")]
            - df.loc[:, ("Respondent", "StartDate")]
        )

        df.loc[:, ("Respondent", "Duration")] = df.loc[
            :, ("Respondent", "Duration")
        ].astype("timedelta64[s]")
        expected = Series(
            [1380, 720, 840, 2160.0], index=df.index, name=("Respondent", "Duration")
        )
        tm.assert_series_equal(df[("Respondent", "Duration")], expected)

    @pytest.mark.parametrize("unit", ["Y", "M", "D", "h", "m", "s", "ms", "us"])
    def test_loc_assign_non_ns_datetime(self, unit):
        # GH 27395, non-ns dtype assignment via .loc should work
        # and return the same result when using simple assignment
        df = DataFrame(
            {
                "timestamp": [
                    np.datetime64("2017-02-11 12:41:29"),
                    np.datetime64("1991-11-07 04:22:37"),
                ]
            }
        )

        df.loc[:, unit] = df.loc[:, "timestamp"].values.astype(f"datetime64[{unit}]")
        df["expected"] = df.loc[:, "timestamp"].values.astype(f"datetime64[{unit}]")
        expected = Series(df.loc[:, "expected"], name=unit)
        tm.assert_series_equal(df.loc[:, unit], expected)

    def test_loc_modify_datetime(self):
        # see gh-28837
        df = DataFrame.from_dict(
            {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]}
        )

        df["date_dt"] = pd.to_datetime(df["date"], unit="ms", cache=True)

        df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"]
        df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"]

        expected = DataFrame(
            [
                [1485264372711, "2017-01-24 13:26:12.711", "2017-01-24 13:26:12.711"],
                [1485265925110, "2017-01-24 13:52:05.110", "2017-01-24 13:52:05.110"],
                [1540215845888, "2018-10-22 13:44:05.888", "2018-10-22 13:44:05.888"],
                [1540282121025, "2018-10-23 08:08:41.025", "2018-10-23 08:08:41.025"],
            ],
            columns=["date", "date_dt", "date_dt_cp"],
        )

        columns = ["date_dt", "date_dt_cp"]
        expected[columns] = expected[columns].apply(pd.to_datetime)

        tm.assert_frame_equal(df, expected)

    def test_loc_setitem_frame(self):
        df = DataFrame(np.random.randn(4, 4), index=list("abcd"), columns=list("ABCD"))

        result = df.iloc[0, 0]

        df.loc["a", "A"] = 1
        result = df.loc["a", "A"]
        assert result == 1

        result = df.iloc[0, 0]
        assert result == 1

        df.loc[:, "B":"D"] = 0
        expected = df.loc[:, "B":"D"]
        result = df.iloc[:, 1:]
        tm.assert_frame_equal(result, expected)

        # GH 6254
        # setting issue
        df = DataFrame(index=[3, 5, 4], columns=["A"])
        df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64")
        expected = DataFrame(dict(A=Series([1, 2, 3], index=[4, 3, 5]))).reindex(
            index=[3, 5, 4]
        )
        tm.assert_frame_equal(df, expected)

        # GH 6252
        # setting with an empty frame
        keys1 = ["@" + str(i) for i in range(5)]
        val1 = np.arange(5, dtype="int64")

        keys2 = ["@" + str(i) for i in range(4)]
        val2 = np.arange(4, dtype="int64")

        index = list(set(keys1).union(keys2))
        df = DataFrame(index=index)
        df["A"] = np.nan
        df.loc[keys1, "A"] = val1

        df["B"] = np.nan
        df.loc[keys2, "B"] = val2

        expected = DataFrame(
            dict(A=Series(val1, index=keys1), B=Series(val2, index=keys2))
        ).reindex(index=index)
        tm.assert_frame_equal(df, expected)

        # GH 8669
        # invalid coercion of nan -> int
        df = DataFrame({"A": [1, 2, 3], "B": np.nan})
        df.loc[df.B > df.A, "B"] = df.A
        expected = DataFrame({"A": [1, 2, 3], "B": np.nan})
        tm.assert_frame_equal(df, expected)

        # GH 6546
        # setting with mixed labels
        df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]})

        result = df.loc[0, [1, 2]]
        expected = Series([1, 3], index=[1, 2], dtype=object, name=0)
        tm.assert_series_equal(result, expected)

        expected = DataFrame({1: [5, 2], 2: [6, 4], "a": ["a", "b"]})
        df.loc[0, [1, 2]] = [5, 6]
        tm.assert_frame_equal(df, expected)

    def test_loc_setitem_frame_multiples(self):
        # multiple setting
        df = DataFrame(
            {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)}
        )
        rhs = df.loc[1:2]
        rhs.index = df.index[0:2]
        df.loc[0:1] = rhs
        expected = DataFrame(
            {"A": ["bar", "baz", "baz"], "B": Series([1, 2, 2], dtype=np.int64)}
        )
        tm.assert_frame_equal(df, expected)

        # multiple setting with frame on rhs (with M8)
        df = DataFrame(
            {
                "date": date_range("2000-01-01", "2000-01-5"),
                "val": Series(range(5), dtype=np.int64),
            }
        )
        expected = DataFrame(
            {
                "date": [
                    Timestamp("20000101"),
                    Timestamp("20000102"),
                    Timestamp("20000101"),
                    Timestamp("20000102"),
                    Timestamp("20000103"),
                ],
                "val": Series([0, 1, 0, 1, 2], dtype=np.int64),
            }
        )
        rhs = df.loc[0:2]
        rhs.index = df.index[2:5]
        df.loc[2:4] = rhs
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "indexer", [["A"], slice(None, "A", None), np.array(["A"])]
    )
    @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])])
    def test_loc_setitem_with_scalar_index(self, indexer, value):
        # GH #19474
        # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated
        # elementwisely, not using "setter('A', ['Z'])".

        df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
        df.loc[0, indexer] = value
        result = df.loc[0, "A"]

        assert is_scalar(result) and result == "Z"

    @pytest.mark.parametrize(
        "index,box,expected",
        [
            (
                ([0, 2], ["A", "B", "C", "D"]),
                7,
                pd.DataFrame(
                    [[7, 7, 7, 7], [3, 4, np.nan, np.nan], [7, 7, 7, 7]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                (1, ["C", "D"]),
                [7, 8],
                pd.DataFrame(
                    [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                (1, ["A", "B", "C"]),
                np.array([7, 8, 9], dtype=np.int64),
                pd.DataFrame(
                    [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]],
                    columns=["A", "B", "C"],
                ),
            ),
            (
                (slice(1, 3, None), ["B", "C", "D"]),
                [[7, 8, 9], [10, 11, 12]],
                pd.DataFrame(
                    [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                (slice(1, 3, None), ["C", "A", "D"]),
                np.array([[7, 8, 9], [10, 11, 12]], dtype=np.int64),
                pd.DataFrame(
                    [[1, 2, np.nan, np.nan], [8, 4, 7, 9], [11, 6, 10, 12]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                (slice(None, None, None), ["A", "C"]),
                pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]),
                pd.DataFrame(
                    [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"]
                ),
            ),
        ],
    )
    def test_loc_setitem_missing_columns(self, index, box, expected):
        # GH 29334
        df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"])
        df.loc[index] = box
        tm.assert_frame_equal(df, expected)

    def test_loc_coercion(self):

        # 12411
        df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC"), pd.NaT]})
        expected = df.dtypes

        result = df.iloc[[0]]
        tm.assert_series_equal(result.dtypes, expected)

        result = df.iloc[[1]]
        tm.assert_series_equal(result.dtypes, expected)

        # 12045
        import datetime

        df = DataFrame(
            {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]}
        )
        expected = df.dtypes

        result = df.iloc[[0]]
        tm.assert_series_equal(result.dtypes, expected)

        result = df.iloc[[1]]
        tm.assert_series_equal(result.dtypes, expected)

        # 11594
        df = DataFrame({"text": ["some words"] + [None] * 9})
        expected = df.dtypes

        result = df.iloc[0:2]
        tm.assert_series_equal(result.dtypes, expected)

        result = df.iloc[3:]
        tm.assert_series_equal(result.dtypes, expected)

    def test_setitem_new_key_tz(self):
        # GH#12862 should not raise on assigning the second value
        vals = [
            pd.to_datetime(42).tz_localize("UTC"),
            pd.to_datetime(666).tz_localize("UTC"),
        ]
        expected = pd.Series(vals, index=["foo", "bar"])

        ser = pd.Series(dtype=object)
        ser["foo"] = vals[0]
        ser["bar"] = vals[1]

        tm.assert_series_equal(ser, expected)

        ser = pd.Series(dtype=object)
        ser.loc["foo"] = vals[0]
        ser.loc["bar"] = vals[1]

        tm.assert_series_equal(ser, expected)

    def test_loc_non_unique(self):
        # GH3659
        # non-unique indexer with loc slice
        # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs

        # these are going to raise because the we are non monotonic
        df = DataFrame(
            {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]
        )
        msg = "'Cannot get left slice bound for non-unique label: 1'"
        with pytest.raises(KeyError, match=msg):
            df.loc[1:]
        msg = "'Cannot get left slice bound for non-unique label: 0'"
        with pytest.raises(KeyError, match=msg):
            df.loc[0:]
        msg = "'Cannot get left slice bound for non-unique label: 1'"
        with pytest.raises(KeyError, match=msg):
            df.loc[1:2]

        # monotonic are ok
        df = DataFrame(
            {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]
        ).sort_index(axis=0)
        result = df.loc[1:]
        expected = DataFrame({"A": [2, 4, 5, 6], "B": [4, 6, 7, 8]}, index=[1, 1, 2, 3])
        tm.assert_frame_equal(result, expected)

        result = df.loc[0:]
        tm.assert_frame_equal(result, df)

        result = df.loc[1:2]
        expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2])
        tm.assert_frame_equal(result, expected)

    def test_loc_non_unique_memory_error(self):

        # GH 4280
        # non_unique index with a large selection triggers a memory error

        columns = list("ABCDEFG")

        def gen_test(l, l2):
            return pd.concat(
                [
                    DataFrame(
                        np.random.randn(l, len(columns)),
                        index=np.arange(l),
                        columns=columns,
                    ),
                    DataFrame(
                        np.ones((l2, len(columns))), index=[0] * l2, columns=columns
                    ),
                ]
            )

        def gen_expected(df, mask):
            len_mask = len(mask)
            return pd.concat(
                [
                    df.take([0]),
                    DataFrame(
                        np.ones((len_mask, len(columns))),
                        index=[0] * len_mask,
                        columns=columns,
                    ),
                    df.take(mask[1:]),
                ]
            )

        df = gen_test(900, 100)
        assert df.index.is_unique is False

        mask = np.arange(100)
        result = df.loc[mask]
        expected = gen_expected(df, mask)
        tm.assert_frame_equal(result, expected)

        df = gen_test(900000, 100000)
        assert df.index.is_unique is False

        mask = np.arange(100000)
        result = df.loc[mask]
        expected = gen_expected(df, mask)
        tm.assert_frame_equal(result, expected)

    def test_loc_name(self):
        # GH 3880
        df = DataFrame([[1, 1], [1, 1]])
        df.index.name = "index_name"
        result = df.iloc[[0, 1]].index.name
        assert result == "index_name"

        result = df.loc[[0, 1]].index.name
        assert result == "index_name"

    def test_loc_empty_list_indexer_is_ok(self):

        df = tm.makeCustomDataframe(5, 2)
        # vertical empty
        tm.assert_frame_equal(
            df.loc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True
        )
        # horizontal empty
        tm.assert_frame_equal(
            df.loc[[], :], df.iloc[:0, :], check_index_type=True, check_column_type=True
        )
        # horizontal empty
        tm.assert_frame_equal(
            df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
        )

    def test_identity_slice_returns_new_object(self):
        # GH13873
        original_df = DataFrame({"a": [1, 2, 3]})
        sliced_df = original_df.loc[:]
        assert sliced_df is not original_df
        assert original_df[:] is not original_df

        # should be a shallow copy
        original_df["a"] = [4, 4, 4]
        assert (sliced_df["a"] == 4).all()

        # These should not return copies
        assert original_df is original_df.loc[:, :]
        df = DataFrame(np.random.randn(10, 4))
        assert df[0] is df.loc[:, 0]

        # Same tests for Series
        original_series = Series([1, 2, 3, 4, 5, 6])
        sliced_series = original_series.loc[:]
        assert sliced_series is not original_series
        assert original_series[:] is not original_series

        original_series[:3] = [7, 8, 9]
        assert all(sliced_series[:3] == [7, 8, 9])

    def test_loc_copy_vs_view(self):
        # GH 15631
        x = DataFrame(zip(range(3), range(3)), columns=["a", "b"])

        y = x.copy()
        q = y.loc[:, "a"]
        q += 2

        tm.assert_frame_equal(x, y)

        z = x.copy()
        q = z.loc[x.index, "a"]
        q += 2

        tm.assert_frame_equal(x, z)

    def test_loc_uint64(self):
        # GH20722
        # Test whether loc accept uint64 max value as index.
        s = pd.Series(
            [1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]
        )

        result = s.loc[np.iinfo("uint64").max - 1]
        expected = s.iloc[0]
        assert result == expected

        result = s.loc[[np.iinfo("uint64").max - 1]]
        expected = s.iloc[[0]]
        tm.assert_series_equal(result, expected)

        result = s.loc[[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]]
        tm.assert_series_equal(result, s)

    def test_loc_setitem_empty_append(self):
        # GH6173, various appends to an empty dataframe

        data = [1, 2, 3]
        expected = DataFrame({"x": data, "y": [None] * len(data)})

        # appends to fit length of data
        df = DataFrame(columns=["x", "y"])
        df.loc[:, "x"] = data
        tm.assert_frame_equal(df, expected)

        # only appends one value
        expected = DataFrame({"x": [1.0], "y": [np.nan]})
        df = DataFrame(columns=["x", "y"], dtype=float)
        df.loc[0, "x"] = expected.loc[0, "x"]
        tm.assert_frame_equal(df, expected)

    @pytest.mark.xfail(_is_numpy_dev, reason="gh-35481")
    def test_loc_setitem_empty_append_raises(self):
        # GH6173, various appends to an empty dataframe

        data = [1, 2]
        df = DataFrame(columns=["x", "y"])
        df.index = df.index.astype(np.int64)
        msg = (
            r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] "
            r"are in the \[index\]"
        )
        with pytest.raises(KeyError, match=msg):
            df.loc[[0, 1], "x"] = data

        msg = "cannot copy sequence with size 2 to array axis with dimension 0"
        with pytest.raises(ValueError, match=msg):
            df.loc[0:2, "x"] = data

    def test_indexing_zerodim_np_array(self):
        # GH24924
        df = DataFrame([[1, 2], [3, 4]])
        result = df.loc[np.array(0)]
        s = pd.Series([1, 2], name=0)
        tm.assert_series_equal(result, s)

    def test_series_indexing_zerodim_np_array(self):
        # GH24924
        s = Series([1, 2])
        result = s.loc[np.array(0)]
        assert result == 1

    def test_loc_reverse_assignment(self):
        # GH26939
        data = [1, 2, 3, 4, 5, 6] + [None] * 4
        expected = Series(data, index=range(2010, 2020))

        result = pd.Series(index=range(2010, 2020), dtype=np.float64)
        result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1]

        tm.assert_series_equal(result, expected)


def test_series_loc_getitem_label_list_missing_values():
    # gh-11428
    key = np.array(
        ["2001-01-04", "2001-01-02", "2001-01-04", "2001-01-14"], dtype="datetime64"
    )
    s = Series([2, 5, 8, 11], date_range("2001-01-01", freq="D", periods=4))
    with pytest.raises(KeyError, match="with any missing labels"):
        s.loc[key]


@pytest.mark.parametrize(
    "columns, column_key, expected_columns, check_column_type",
    [
        ([2011, 2012, 2013], [2011, 2012], [0, 1], True),
        ([2011, 2012, "All"], [2011, 2012], [0, 1], False),
        ([2011, 2012, "All"], [2011, "All"], [0, 2], True),
    ],
)
def test_loc_getitem_label_list_integer_labels(
    columns, column_key, expected_columns, check_column_type
):
    # gh-14836
    df = DataFrame(np.random.rand(3, 3), columns=columns, index=list("ABC"))
    expected = df.iloc[:, expected_columns]
    result = df.loc[["A", "B", "C"], column_key]
    tm.assert_frame_equal(result, expected, check_column_type=check_column_type)


def test_loc_setitem_float_intindex():
    # GH 8720
    rand_data = np.random.randn(8, 4)
    result = pd.DataFrame(rand_data)
    result.loc[:, 0.5] = np.nan
    expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1)))
    expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5])
    tm.assert_frame_equal(result, expected)

    result = pd.DataFrame(rand_data)
    result.loc[:, 0.5] = np.nan
    tm.assert_frame_equal(result, expected)


def test_loc_axis_1_slice():
    # GH 10586
    cols = [(yr, m) for yr in [2014, 2015] for m in [7, 8, 9, 10]]
    df = pd.DataFrame(
        np.ones((10, 8)),
        index=tuple("ABCDEFGHIJ"),
        columns=pd.MultiIndex.from_tuples(cols),
    )
    result = df.loc(axis=1)[(2014, 9):(2015, 8)]
    expected = pd.DataFrame(
        np.ones((10, 4)),
        index=tuple("ABCDEFGHIJ"),
        columns=pd.MultiIndex.from_tuples(
            [(2014, 9), (2014, 10), (2015, 7), (2015, 8)]
        ),
    )
    tm.assert_frame_equal(result, expected)


def test_loc_set_dataframe_multiindex():
    # GH 14592
    expected = pd.DataFrame(
        "a", index=range(2), columns=pd.MultiIndex.from_product([range(2), range(2)])
    )
    result = expected.copy()
    result.loc[0, [(0, 1)]] = result.loc[0, [(0, 1)]]
    tm.assert_frame_equal(result, expected)


def test_loc_mixed_int_float():
    # GH#19456
    ser = pd.Series(range(2), pd.Index([1, 2.0], dtype=object))

    result = ser.loc[1]
    assert result == 0


def test_loc_with_positional_slice_deprecation():
    # GH#31840
    ser = pd.Series(range(4), index=["A", "B", "C", "D"])

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        ser.loc[:3] = 2

    expected = pd.Series([2, 2, 2, 3], index=["A", "B", "C", "D"])
    tm.assert_series_equal(ser, expected)


def test_loc_slice_disallows_positional():
    # GH#16121, GH#24612, GH#31810
    dti = pd.date_range("2016-01-01", periods=3)
    df = pd.DataFrame(np.random.random((3, 2)), index=dti)

    ser = df[0]

    msg = (
        "cannot do slice indexing on DatetimeIndex with these "
        r"indexers \[1\] of type int"
    )

    for obj in [df, ser]:
        with pytest.raises(TypeError, match=msg):
            obj.loc[1:3]

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            # GH#31840 deprecated incorrect behavior
            obj.loc[1:3] = 1

    with pytest.raises(TypeError, match=msg):
        df.loc[1:3, 1]

    with tm.assert_produces_warning(FutureWarning):
        # GH#31840 deprecated incorrect behavior
        df.loc[1:3, 1] = 2


def test_loc_datetimelike_mismatched_dtypes():
    # GH#32650 dont mix and match datetime/timedelta/period dtypes

    df = pd.DataFrame(
        np.random.randn(5, 3),
        columns=["a", "b", "c"],
        index=pd.date_range("2012", freq="H", periods=5),
    )
    # create dataframe with non-unique DatetimeIndex
    df = df.iloc[[0, 2, 2, 3]].copy()

    dti = df.index
    tdi = pd.TimedeltaIndex(dti.asi8)  # matching i8 values

    msg = r"None of \[TimedeltaIndex.* are in the \[index\]"
    with pytest.raises(KeyError, match=msg):
        df.loc[tdi]

    with pytest.raises(KeyError, match=msg):
        df["a"].loc[tdi]


def test_loc_with_period_index_indexer():
    # GH#4125
    idx = pd.period_range("2002-01", "2003-12", freq="M")
    df = pd.DataFrame(np.random.randn(24, 10), index=idx)
    tm.assert_frame_equal(df, df.loc[idx])
    tm.assert_frame_equal(df, df.loc[list(idx)])
    tm.assert_frame_equal(df, df.loc[list(idx)])
    tm.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]])
    tm.assert_frame_equal(df, df.loc[list(idx)])
alkaline-ml / pandas python

Version: 1.1.1

/ tests / indexing / test_loc.py

Products

About

Resources

Contact Gemfury