Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / pandas   python

Repository URL to install this package:

Version: 1.1.1 

/ tests / frame / methods / test_isin.py

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame, MultiIndex, Series
import pandas._testing as tm


class TestDataFrameIsIn:
    def test_isin(self):
        # GH#4211
        df = DataFrame(
            {
                "vals": [1, 2, 3, 4],
                "ids": ["a", "b", "f", "n"],
                "ids2": ["a", "n", "c", "n"],
            },
            index=["foo", "bar", "baz", "qux"],
        )
        other = ["a", "b", "c"]

        result = df.isin(other)
        expected = DataFrame([df.loc[s].isin(other) for s in df.index])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
    def test_isin_empty(self, empty):
        # GH#16991
        df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
        expected = DataFrame(False, df.index, df.columns)

        result = df.isin(empty)
        tm.assert_frame_equal(result, expected)

    def test_isin_dict(self):
        df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
        d = {"A": ["a"]}

        expected = DataFrame(False, df.index, df.columns)
        expected.loc[0, "A"] = True

        result = df.isin(d)
        tm.assert_frame_equal(result, expected)

        # non unique columns
        df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]})
        df.columns = ["A", "A"]
        expected = DataFrame(False, df.index, df.columns)
        expected.loc[0, "A"] = True
        result = df.isin(d)
        tm.assert_frame_equal(result, expected)

    def test_isin_with_string_scalar(self):
        # GH#4763
        df = DataFrame(
            {
                "vals": [1, 2, 3, 4],
                "ids": ["a", "b", "f", "n"],
                "ids2": ["a", "n", "c", "n"],
            },
            index=["foo", "bar", "baz", "qux"],
        )
        msg = (
            r"only list-like or dict-like objects are allowed "
            r"to be passed to DataFrame.isin\(\), you passed a 'str'"
        )
        with pytest.raises(TypeError, match=msg):
            df.isin("a")

        with pytest.raises(TypeError, match=msg):
            df.isin("aaa")

    def test_isin_df(self):
        df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
        df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]})
        expected = DataFrame(False, df1.index, df1.columns)
        result = df1.isin(df2)
        expected["A"].loc[[1, 3]] = True
        expected["B"].loc[[0, 2]] = True
        tm.assert_frame_equal(result, expected)

        # partial overlapping columns
        df2.columns = ["A", "C"]
        result = df1.isin(df2)
        expected["B"] = False
        tm.assert_frame_equal(result, expected)

    def test_isin_tuples(self):
        # GH#16394
        df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]})
        df["C"] = list(zip(df["A"], df["B"]))
        result = df["C"].isin([(1, "a")])
        tm.assert_series_equal(result, Series([True, False, False], name="C"))

    def test_isin_df_dupe_values(self):
        df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]})
        # just cols duped
        df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"])
        msg = r"cannot compute isin with a duplicate axis\."
        with pytest.raises(ValueError, match=msg):
            df1.isin(df2)

        # just index duped
        df2 = DataFrame(
            [[0, 2], [12, 4], [2, np.nan], [4, 5]],
            columns=["A", "B"],
            index=[0, 0, 1, 1],
        )
        with pytest.raises(ValueError, match=msg):
            df1.isin(df2)

        # cols and index:
        df2.columns = ["B", "B"]
        with pytest.raises(ValueError, match=msg):
            df1.isin(df2)

    def test_isin_dupe_self(self):
        other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]})
        df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"])
        result = df.isin(other)
        expected = DataFrame(False, index=df.index, columns=df.columns)
        expected.loc[0] = True
        expected.iloc[1, 1] = True
        tm.assert_frame_equal(result, expected)

    def test_isin_against_series(self):
        df = pd.DataFrame(
            {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"]
        )
        s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"])
        expected = DataFrame(False, index=df.index, columns=df.columns)
        expected["A"].loc["a"] = True
        expected.loc["d"] = True
        result = df.isin(s)
        tm.assert_frame_equal(result, expected)

    def test_isin_multiIndex(self):
        idx = MultiIndex.from_tuples(
            [
                (0, "a", "foo"),
                (0, "a", "bar"),
                (0, "b", "bar"),
                (0, "b", "baz"),
                (2, "a", "foo"),
                (2, "a", "bar"),
                (2, "c", "bar"),
                (2, "c", "baz"),
                (1, "b", "foo"),
                (1, "b", "bar"),
                (1, "c", "bar"),
                (1, "c", "baz"),
            ]
        )
        df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx)
        df2 = DataFrame(
            {
                "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
                "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1],
            }
        )
        # against regular index
        expected = DataFrame(False, index=df1.index, columns=df1.columns)
        result = df1.isin(df2)
        tm.assert_frame_equal(result, expected)

        df2.index = idx
        expected = df2.values.astype(bool)
        expected[:, 1] = ~expected[:, 1]
        expected = DataFrame(expected, columns=["A", "B"], index=idx)

        result = df1.isin(df2)
        tm.assert_frame_equal(result, expected)

    def test_isin_empty_datetimelike(self):
        # GH#15473
        df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])})
        df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]})
        df2 = DataFrame({"date": []})
        df3 = DataFrame()

        expected = DataFrame({"date": [False, False]})

        result = df1_ts.isin(df2)
        tm.assert_frame_equal(result, expected)
        result = df1_ts.isin(df3)
        tm.assert_frame_equal(result, expected)

        result = df1_td.isin(df2)
        tm.assert_frame_equal(result, expected)
        result = df1_td.isin(df3)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "values",
        [
            pd.DataFrame({"a": [1, 2, 3]}, dtype="category"),
            pd.Series([1, 2, 3], dtype="category"),
        ],
    )
    def test_isin_category_frame(self, values):
        # GH#34256
        df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        expected = DataFrame({"a": [True, True, True], "b": [False, False, False]})

        result = df.isin(values)
        tm.assert_frame_equal(result, expected)