# Gemfury
#
# duality-group / dask python
#
# Repository URL to install this package:
# Details
# dask / dask / dataframe / tests / test_indexing.py
import numpy as np
import pandas as pd
import pytest

import dask
import dask.dataframe as dd
from dask.base import tokenize
from dask.dataframe._compat import PANDAS_GT_110, PANDAS_GT_120, tm
from dask.dataframe.indexing import _coerce_loc_index
from dask.dataframe.utils import assert_eq, make_meta

# Raw (column data, index labels) for the three partitions of the shared
# fixture frame; the last partition holds a repeated index label (9).
_partitions = [
    ({"a": [1, 2, 3], "b": [4, 5, 6]}, [0, 1, 3]),
    ({"a": [4, 5, 6], "b": [3, 2, 1]}, [5, 6, 8]),
    ({"a": [7, 8, 9], "b": [0, 0, 0]}, [9, 9, 9]),
]
dsk = {
    ("x", number): pd.DataFrame(columns, index=labels)
    for number, (columns, labels) in enumerate(_partitions)
}
meta = make_meta(
    {"a": "i8", "b": "i8"}, index=pd.Index([], "i8"), parent_meta=pd.DataFrame()
)
# ``d`` is the dask frame used by several tests below; ``full`` is its
# computed pandas counterpart used as the expected value.
d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
full = d.compute()
# Extra keyword arguments forwarded to ``assert_eq`` by some datetime tests.
CHECK_FREQ = {"check_freq": False} if PANDAS_GT_110 else {}


def test_loc():
    """Row selection with ``.loc`` on the shared frame ``d`` matches pandas."""
    window = d.loc[3:8]
    assert window.divisions[0] == 3
    assert window.divisions[-1] == 8

    assert d.loc[5].divisions == (5, 5)

    row_slices = (slice(3, 8), slice(None, 8), slice(3, None))

    # DataFrame: a scalar label is compared against the pandas ``5:5`` slice
    assert_eq(d.loc[5], full.loc[5:5])
    for rows in (*row_slices, [5]):
        assert_eq(d.loc[rows], full.loc[rows])

    # Series: same indexers plus empty list / empty array
    assert_eq(d.a.loc[5], full.a.loc[5:5])
    for rows in (*row_slices, [5], [], np.array([])):
        assert_eq(d.a.loc[rows], full.a.loc[rows])

    # Labels outside the index: scalar lookup raises, slices are compared
    # against whatever pandas returns for the same slice
    with pytest.raises(KeyError):
        d.loc[1000]
    out_of_range = (
        slice(1000, None),
        slice(1000, 2000),
        slice(None, -1000),
        slice(-2000, -1000),
    )
    for rows in out_of_range:
        assert_eq(d.loc[rows], full.loc[rows])

    # Graph keys are reproducible for equal indexers and differ otherwise
    keys_for_five = sorted(d.loc[5].dask)
    assert keys_for_five == sorted(d.loc[5].dask)
    assert keys_for_five != sorted(d.loc[6].dask)


def test_loc_non_informative_index():
    """``.loc`` still works after the divisions are replaced with ``None``."""
    pdf = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 30, 40])
    unknown = dd.from_pandas(pdf, npartitions=2, sort=True)
    unknown.divisions = (None,) * 3
    assert not unknown.known_divisions

    # Must be computable with the synchronous scheduler
    unknown.loc[20:30].compute(scheduler="sync")

    assert_eq(unknown.loc[20:30], pdf.loc[20:30])

    # Repeated index label: scalar lookup compared with the pandas slice
    pdf_dup = pd.DataFrame({"x": [1, 2, 3, 4]}, index=[10, 20, 20, 40])
    ddf_dup = dd.from_pandas(pdf_dup, npartitions=2, sort=True)
    assert_eq(ddf_dup.loc[20], pdf_dup.loc[20:20])


def test_loc_with_text_dates():
    """String date slices select from a two-partition time series."""
    first = dd._compat.makeTimeSeries().iloc[:5]
    second = dd._compat.makeTimeSeries().iloc[5:]
    graph = {("df", 0): first, ("df", 1): second}
    boundaries = [first.index.min(), second.index.min(), second.index.max()]
    s = dd.Series(graph, "df", first, boundaries)

    everything = s.loc["2000":"2010"]
    assert everything.divisions == s.divisions
    assert_eq(everything, s)

    three_days = s.loc["2000-01-03":"2000-01-05"].compute()
    assert len(three_days) == 3


def test_loc_with_series():
    """Boolean dask Series can be used as a ``.loc`` row mask."""

    def divisible_by(k):
        # Build a fresh mask on every call so key comparisons are meaningful
        return d.a % k == 0

    assert_eq(d.loc[divisible_by(2)], full.loc[full.a % 2 == 0])

    even_keys = sorted(d.loc[divisible_by(2)].dask)
    assert even_keys == sorted(d.loc[divisible_by(2)].dask)
    assert even_keys != sorted(d.loc[divisible_by(3)].dask)


def test_loc_with_array():
    """The ``.values`` of a boolean dask Series can be used as a ``.loc`` mask."""

    def divisible_by(k):
        # Build a fresh mask array on every call
        return (d.a % k == 0).values

    assert_eq(d.loc[divisible_by(2)], full.loc[(full.a % 2 == 0).values])

    even_keys = sorted(d.loc[divisible_by(2)].dask)
    assert even_keys == sorted(d.loc[divisible_by(2)].dask)
    assert even_keys != sorted(d.loc[divisible_by(3)].dask)


def test_loc_with_function():
    """Callables are accepted as row and as column indexers in ``.loc``."""

    def _rows_above_three(_df):
        return _df["a"] > 3

    def _columns_named_b(_df):
        return _df.columns.str.contains("b")

    assert_eq(d.loc[_rows_above_three, :], full.loc[_rows_above_three, :])
    assert_eq(d.loc[:, _columns_named_b], full.loc[:, _columns_named_b])


def test_loc_with_array_different_partition():
    """A mask array taken from a repartitioned Series is rejected."""
    labels = list("abcdefghijklmnopqrst")
    pdf = pd.DataFrame(np.random.randn(20, 5), index=labels, columns=list("ABCDE"))
    ddf = dd.from_pandas(pdf, 3)

    positive = (ddf.A > 0).values
    assert_eq(ddf.loc[positive], pdf.loc[(pdf.A > 0).values])

    with pytest.raises(ValueError):
        ddf.loc[(ddf.A > 0).repartition(["a", "g", "k", "o", "t"]).values]


def test_loc_with_series_different_partition():
    """A boolean Series mask may be partitioned differently from the frame."""
    labels = list("abcdefghijklmnopqrst")
    pdf = pd.DataFrame(np.random.randn(20, 5), index=labels, columns=list("ABCDE"))
    ddf = dd.from_pandas(pdf, 3)
    expected = pdf.loc[pdf.A > 0]

    assert_eq(ddf.loc[ddf.A > 0], expected)

    repartitioned_mask = (ddf.A > 0).repartition(["a", "g", "k", "o", "t"])
    assert_eq(ddf.loc[repartitioned_mask], pdf.loc[pdf.A > 0])


def test_loc_with_non_boolean_series():
    """Label collections work when given as pandas/NumPy, not as dask objects."""
    pser = pd.Series(
        np.random.randn(20),
        index=list("abcdefghijklmnopqrst"),
    )
    dser = dd.from_pandas(pser, 3)

    labels = pd.Series(list("bdmnat"))
    dask_labels = dd.from_pandas(labels, npartitions=3)

    msg = (
        "Cannot index with non-boolean dask Series. Try passing computed values instead"
    )

    # dask Series of labels: rejected; pandas Series of labels: accepted
    with pytest.raises(KeyError, match=msg):
        dser.loc[dask_labels]
    assert_eq(dser.loc[labels], pser.loc[labels])

    # Same pair of checks for the array forms
    with pytest.raises(KeyError, match=msg):
        dser.loc[dask_labels.values]
    assert_eq(dser.loc[labels.values], pser.loc[labels])

    # After clearing the divisions both forms raise, with different messages
    dser = dser.clear_divisions()
    with pytest.raises(KeyError, match=msg):
        dser.loc[dask_labels]

    unknown_division_msg = "Cannot index with list against unknown division"
    with pytest.raises(KeyError, match=unknown_division_msg):
        dser.loc[labels]


def test_loc2d():
    """Two-axis ``.loc`` on ``d`` matches pandas; extra axes raise."""
    # index indexer is always regarded as slice for duplicated values
    assert_eq(d.loc[5, "a"], full.loc[5:5, "a"])
    # assert_eq(d.loc[[5], 'a'], full.loc[[5], 'a'])
    assert_eq(d.loc[5, ["a"]], full.loc[5:5, ["a"]])
    # assert_eq(d.loc[[5], ['a']], full.loc[[5], ['a']])

    row_slices = (slice(3, 8), slice(None, 8), slice(3, None))

    # Single column label -> Series result
    for rows in (*row_slices, [8]):
        assert_eq(d.loc[rows, "a"], full.loc[rows, "a"])

    # Column list -> DataFrame result
    for rows in row_slices:
        assert_eq(d.loc[rows, ["a"]], full.loc[rows, ["a"]])

    too_many_indexers = pd.core.indexing.IndexingError

    # 3d
    with pytest.raises(too_many_indexers):
        d.loc[3, 3, 3]

    # Series should raise
    with pytest.raises(too_many_indexers):
        d.a.loc[3, 3]

    with pytest.raises(too_many_indexers):
        d.a.loc[3:, 3]

    with pytest.raises(too_many_indexers):
        d.a.loc[d.a % 2 == 0, 3]


def test_loc2d_with_known_divisions():
    """Two-axis ``.loc`` on a frame created by ``from_pandas`` matches pandas."""
    pdf = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("ABCDE"),
    )
    ddf = dd.from_pandas(pdf, 3)

    column_forms = ("A", ["A"])

    # Scalar row label on the dask side corresponds to a one-element list in pandas
    for cols in column_forms:
        assert_eq(ddf.loc["a", cols], pdf.loc[["a"], cols])

    for cols in column_forms:
        assert_eq(ddf.loc["a":"o", cols], pdf.loc["a":"o", cols])

    # Row label lists: single, sorted, unsorted, and with a repeated label
    row_lists = (
        ["n"],
        ["a", "c", "n"],
        ["t", "b"],
        ["r", "r", "c", "g", "h"],
    )
    for rows in row_lists:
        assert_eq(ddf.loc[rows, ["A"]], pdf.loc[rows, ["A"]])


def test_loc2d_with_unknown_divisions():
    """Two-axis ``.loc`` still matches pandas once divisions are all ``None``."""
    pdf = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("ABCDE"),
    )
    ddf = dd.from_pandas(pdf, 3)

    n_divisions = len(ddf.divisions)
    ddf.divisions = (None,) * n_divisions
    assert ddf.known_divisions is False

    column_forms = ("A", ["A"])

    for cols in column_forms:
        assert_eq(ddf.loc["a", cols], pdf.loc[["a"], cols])

    for cols in column_forms:
        assert_eq(ddf.loc["a":"o", cols], pdf.loc["a":"o", cols])


def test_loc2d_duplicated_columns():
    """Two-axis ``.loc`` matches pandas when a column label ("A") is repeated.

    Each indexer combination is asserted exactly once; verbatim repeated
    assertions add runtime without adding coverage.
    """
    df = pd.DataFrame(
        np.random.randn(20, 5),
        index=list("abcdefghijklmnopqrst"),
        columns=list("AABCD"),
    )
    ddf = dd.from_pandas(df, 3)

    # Scalar row label (compared with a one-element list on the pandas side)
    assert_eq(ddf.loc["a", "A"], df.loc[["a"], "A"])
    assert_eq(ddf.loc["a", ["A"]], df.loc[["a"], ["A"]])
    assert_eq(ddf.loc["j", "B"], df.loc[["j"], "B"])
    assert_eq(ddf.loc["j", ["B"]], df.loc[["j"], ["B"]])

    # Row slices with a column label or a column list
    assert_eq(ddf.loc["a":"o", "A"], df.loc["a":"o", "A"])
    assert_eq(ddf.loc["a":"o", ["A"]], df.loc["a":"o", ["A"]])
    assert_eq(ddf.loc["j":"q", "B"], df.loc["j":"q", "B"])
    assert_eq(ddf.loc["j":"q", ["B"]], df.loc["j":"q", ["B"]])

    # Row slices combined with column slices
    assert_eq(ddf.loc["a":"o", "B":"D"], df.loc["a":"o", "B":"D"])
    assert_eq(ddf.loc["j":"q", "B":"A"], df.loc["j":"q", "B":"A"])

    # Boolean row mask with a column label or a column list
    assert_eq(ddf.loc[ddf.B > 0, "B"], df.loc[df.B > 0, "B"])
    assert_eq(ddf.loc[ddf.B > 0, ["A", "C"]], df.loc[df.B > 0, ["A", "C"]])


def test_getitem():
    """``[]`` column selection and boolean filtering agree with pandas.

    Also checks that the ``_meta`` of a selection equals the same selection
    applied to the parent's ``_meta``, and that unknown column labels raise
    when requested from the dask frame.
    """
    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._meta, ddf._meta["A"])

    assert_eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._meta, ddf._meta[["A", "B"]])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # Exercise the dask collection (not the pandas frame) so that the test
    # covers dask's own handling of unknown column labels.
    with pytest.raises(KeyError):
        ddf["X"]
    with pytest.raises(KeyError):
        ddf[["A", "X"]]
    with pytest.raises(AttributeError):
        ddf.X

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    with pytest.raises(KeyError):
        ddf[8]
    with pytest.raises(KeyError):
        ddf[[1, 8]]


def test_getitem_slice():
    """Label slices passed to ``[]`` select the same rows as in pandas."""
    pdf = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
            "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
            "C": [True, False, True] * 3,
        },
        index=list("abcdefghi"),
    )
    ddf = dd.from_pandas(pdf, 3)

    for rows in (slice("a", "e"), slice("a", "b"), slice("f", None)):
        assert_eq(ddf[rows], pdf[rows])


def test_getitem_integer_slice():
    """Integer slices via ``[]`` raise, except on a float index."""
    int_indexed = dd.from_pandas(pd.DataFrame({"A": range(6)}), 2)
    # integer slicing is iloc based
    with pytest.raises(NotImplementedError):
        int_indexed[1:3]

    # except for float dtype indexes
    float_labels = [1.0, 2.0, 3.0, 5.0, 10.0, 11.0]
    pdf = pd.DataFrame({"A": range(6)}, index=float_labels)
    ddf = dd.from_pandas(pdf, 2)
    for rows in (slice(2, 8), slice(2, None), slice(None, 8)):
        assert_eq(ddf[rows], pdf[rows])


def test_loc_on_numpy_datetimes():
    """String slices work when the divisions are ``np.datetime64`` values."""
    df = pd.DataFrame(
        {"x": [1, 2, 3]}, index=list(map(np.datetime64, ["2014", "2015", "2016"]))
    )
    a = dd.from_pandas(df, 2)
    a.divisions = tuple(map(np.datetime64, a.divisions))

    # Compare with pandas: comparing the dask expression with itself would
    # pass regardless of which rows are selected.
    assert_eq(a.loc["2014":"2015"], df.loc["2014":"2015"])


def test_loc_on_pandas_datetimes():
    """String slices work when the divisions are ``pd.Timestamp`` values."""
    df = pd.DataFrame(
        {"x": [1, 2, 3]}, index=list(map(pd.Timestamp, ["2014", "2015", "2016"]))
    )
    a = dd.from_pandas(df, 2)
    a.divisions = tuple(map(pd.Timestamp, a.divisions))

    # Compare with pandas: comparing the dask expression with itself would
    # pass regardless of which rows are selected.
    assert_eq(a.loc["2014":"2015"], df.loc["2014":"2015"])


def test_loc_datetime_no_freq():
    """String slicing works on a DatetimeIndex whose ``freq`` was cleared."""
    # https://github.com/dask/dask/issues/2389

    stamps = pd.date_range("2016-01-01", "2016-01-31", freq="12h")
    stamps.freq = None  # FORGET FREQUENCY
    pdf = pd.DataFrame({"num": range(len(stamps))}, index=stamps)
    ddf = dd.from_pandas(pdf, npartitions=1)

    window = slice("2016-01-03", "2016-01-05")
    assert_eq(ddf.loc[window, :], pdf.loc[window, :])


def test_coerce_loc_index():
    """A string label is coerced to the type of the given divisions."""
    for stamp_type in (pd.Timestamp, np.datetime64):
        divisions = [stamp_type("2014")]
        coerced = _coerce_loc_index(divisions, "2014")
        assert isinstance(coerced, stamp_type)


def test_loc_timestamp_str():
    """String and ``Timestamp`` ``.loc`` lookups on a DatetimeIndex match pandas."""
    hourly = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="H", periods=100),
    )
    dask_hourly = dd.from_pandas(hourly, 10)

    day, last_day, instant = "2011-01-02", "2011-01-10", "2011-01-02 10:00"

    # partial string slice
    assert_eq(hourly.loc[day], dask_hourly.loc[day])
    assert_eq(hourly.loc[day:last_day], dask_hourly.loc[day:last_day])
    # same reso, dask result is always DataFrame
    assert_eq(
        hourly.loc[instant].to_frame().T,
        dask_hourly.loc[instant],
        **CHECK_FREQ,
    )

    # series
    assert_eq(hourly.A.loc[day], dask_hourly.A.loc[day], **CHECK_FREQ)
    assert_eq(
        hourly.A.loc[day:last_day],
        dask_hourly.A.loc[day:last_day],
        **CHECK_FREQ,
    )

    # slice with timestamp (dask result must be DataFrame)
    ts_day = pd.Timestamp(day)
    ts_last_day = pd.Timestamp(last_day)
    ts_instant = pd.Timestamp(instant)
    assert_eq(
        hourly.loc[ts_day].to_frame().T,
        dask_hourly.loc[ts_day],
        **CHECK_FREQ,
    )
    assert_eq(
        hourly.loc[ts_day:ts_last_day],
        dask_hourly.loc[ts_day:ts_last_day],
        **CHECK_FREQ,
    )
    assert_eq(
        hourly.loc[ts_instant].to_frame().T,
        dask_hourly.loc[ts_instant],
        **CHECK_FREQ,
    )

    monthly = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="M", periods=100),
    )
    dask_monthly = dd.from_pandas(monthly, 50)

    partial_keys = (
        "2011-01",
        "2011",
        slice("2011-01", "2012-05"),
        slice("2011", "2015"),
    )
    for key in partial_keys:
        assert_eq(monthly.loc[key], dask_monthly.loc[key])

    # series
    for key in partial_keys:
        assert_eq(monthly.B.loc[key], dask_monthly.B.loc[key])


def test_getitem_timestamp_str():
    """Date strings passed to ``[]`` on a DatetimeIndex frame match pandas."""
    hourly = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="H", periods=100),
    )
    dask_hourly = dd.from_pandas(hourly, 10)

    def _check_single_day():
        assert_eq(hourly.loc["2011-01-02"], dask_hourly["2011-01-02"])

    # With PANDAS_GT_120 the string lookup is expected to emit a FutureWarning
    if not PANDAS_GT_120:
        _check_single_day()
    else:
        with pytest.warns(
            FutureWarning, match="Indexing a DataFrame with a datetimelike"
        ):
            _check_single_day()

    day_range = slice("2011-01-02", "2011-01-10")
    assert_eq(hourly[day_range], dask_hourly[day_range])

    daily = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.date_range("2011-01-01", freq="D", periods=100),
    )
    dask_daily = dd.from_pandas(daily, 50)

    for key in ("2011-01", "2011"):
        assert_eq(daily.loc[key], dask_daily.loc[key])

    for rows in (slice("2011-01", "2012-05"), slice("2011", "2015")):
        assert_eq(daily[rows], dask_daily[rows])


@pytest.mark.xfail(
    not PANDAS_GT_110, reason=".loc partial index with PeriodIndex not yet supported"
)
def test_loc_period_str():
    """Partial-string ``.loc`` lookups on a PeriodIndex match pandas."""
    # .loc with PeriodIndex doesn't support partial string indexing
    # https://github.com/pydata/pandas/issues/13429
    # -> this started working in pandas 1.1
    hourly = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="H", periods=100),
    )
    dask_hourly = dd.from_pandas(hourly, 10)

    # partial string slice
    for key in ("2011-01-02", slice("2011-01-02", "2011-01-10")):
        assert_eq(hourly.loc[key], dask_hourly.loc[key])

    daily = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="D", periods=100),
    )
    dask_daily = dd.from_pandas(daily, 50)

    partial_keys = (
        "2011-01",
        "2011",
        slice("2011-01", "2012-05"),
        slice("2011", "2015"),
    )
    for key in partial_keys:
        assert_eq(daily.loc[key], dask_daily.loc[key])


def test_getitem_period_str():
    """Date strings passed to ``[]`` on a PeriodIndex frame match pandas."""

    def _check_partial(pdf, ddf, key):
        # With PANDAS_GT_120 a FutureWarning is expected and the pandas side
        # goes through ``.loc``; otherwise plain ``[]`` is used on both sides.
        if PANDAS_GT_120:
            with pytest.warns(
                FutureWarning, match="Indexing a DataFrame with a datetimelike"
            ):
                assert_eq(pdf.loc[key], ddf[key])
        else:
            assert_eq(pdf[key], ddf[key])

    hourly = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="H", periods=100),
    )
    dask_hourly = dd.from_pandas(hourly, 10)

    # partial string slice
    _check_partial(hourly, dask_hourly, "2011-01-02")
    day_range = slice("2011-01-02", "2011-01-10")
    assert_eq(hourly[day_range], dask_hourly[day_range])

    daily = pd.DataFrame(
        {"A": np.random.randn(100), "B": np.random.randn(100)},
        index=pd.period_range("2011-01-01", freq="D", periods=100),
    )
    dask_daily = dd.from_pandas(daily, 50)

    _check_partial(daily, dask_daily, "2011-01")
    _check_partial(daily, dask_daily, "2011")

    for rows in (slice("2011-01", "2012-05"), slice("2011", "2015")):
        assert_eq(daily[rows], dask_daily[rows])


@pytest.mark.parametrize(
    "index",
    [
        pd.date_range("2011-01-01", freq="H", periods=100),  # time index
        range(100),  # numerical index
    ],
)
def test_to_series(index):
    """``index.to_series()`` keeps known divisions and matches pandas."""
    pdf = pd.DataFrame({"A": np.random.randn(100)}, index=index)
    ddf = dd.from_pandas(pdf, 10)

    from_dask = ddf.index.to_series()
    from_pandas = pdf.index.to_series()

    assert from_dask.known_divisions
    assert_eq(from_pandas, from_dask)


@pytest.mark.parametrize(
    "index",
    [
        pd.date_range("2011-01-01", freq="H", periods=100),  # time index
        range(100),  # numerical index
    ],
)
def test_to_frame(index):
    """``index.to_frame()`` keeps known divisions and matches pandas."""
    pdf = pd.DataFrame({"A": np.random.randn(100)}, index=index)
    ddf = dd.from_pandas(pdf, 10)

    from_dask = ddf.index.to_frame()
    from_pandas = pdf.index.to_frame()

    assert from_dask.known_divisions
    assert_eq(from_pandas, from_dask)

    # test name option
    named_pandas = pdf.index.to_frame(name="foo")
    named_dask = ddf.index.to_frame(name="foo")
    assert_eq(named_pandas, named_dask)


@pytest.mark.parametrize("indexer", [0, [0], [0, 1], [1, 0], [False, True, True]])
def test_iloc(indexer):
    """Positional column selection with ``iloc[:, indexer]`` matches pandas."""
    pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
    ddf = dd.from_pandas(pdf, 2)

    assert_eq(ddf.iloc[:, indexer], pdf.iloc[:, indexer])


def test_iloc_series():
    """Accessing ``iloc`` on a dask Series raises ``AttributeError``."""
    dask_series = dd.from_pandas(pd.Series([1, 2, 3]), 2)
    with pytest.raises(AttributeError):
        dask_series.iloc[:]


def test_iloc_raises():
    """Unsupported or invalid ``iloc`` indexers raise the expected errors."""
    pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
    ddf = dd.from_pandas(pdf, 2)

    # (expected exception, indexer tuple handed to ``iloc``)
    failing_cases = [
        (NotImplementedError, ([0, 1], slice(None))),
        (NotImplementedError, ([0, 1], [0, 1])),
        (ValueError, ([0, 1], [0, 1], [1, 2])),
        (IndexError, (slice(None), [5, 6])),
    ]
    for expected_error, key in failing_cases:
        with pytest.raises(expected_error):
            ddf.iloc[key]


def test_iloc_duplicate_columns():
    """``iloc`` column selection on a frame with a repeated column label."""
    pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
    ddf = dd.from_pandas(pdf, 2)
    pdf.columns = list("AAC")
    ddf.columns = list("AAC")

    # Check that `iloc` is called instead of getitem
    layer_names = ddf.iloc[:, 2].dask.layers.keys()
    assert any(layer.startswith("iloc") for layer in layer_names)

    # first, zeroth, list of positions, negative-step slice
    for columns in (1, 0, [0, 2], slice(-1, -3, -1)):
        assert_eq(ddf.iloc[:, columns], pdf.iloc[:, columns])


def test_iloc_dispatch_to_getitem():
    """With unique column labels, ``iloc`` produces ``getitem`` graph layers."""
    pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
    ddf = dd.from_pandas(pdf, 2)

    layer_names = list(ddf.iloc[:, 2].dask.layers.keys())
    assert not any(layer.startswith("iloc") for layer in layer_names)
    assert any(layer.startswith("getitem") for layer in layer_names)

    # first, zeroth, list of positions, negative-step slice
    for columns in (1, 0, [0, 2], slice(-1, -3, -1)):
        assert_eq(ddf.iloc[:, columns], pdf.iloc[:, columns])


def test_iloc_out_of_order_selection():
    """``iloc`` positions follow the column order of a reordered selection."""
    pdf = pd.DataFrame({"A": [1] * 100, "B": [2] * 100, "C": [3] * 100, "D": [4] * 100})
    reordered = dd.from_pandas(pdf, 2)[["C", "A", "B"]]

    expected_names = ("C", "A", "B")
    lazy_columns = [reordered.iloc[:, position] for position in range(3)]

    # Names are correct on the lazy objects...
    for column, name in zip(lazy_columns, expected_names):
        assert column.name == name

    # ...and on the computed pandas Series
    computed_columns = dask.compute(*lazy_columns)
    for column, name in zip(computed_columns, expected_names):
        assert column.name == name


def test_pandas_nullable_boolean_data_type():
    """A ``"boolean"``-dtype mask containing ``pd.NA`` filters like pandas."""
    values = pd.Series([0, 1, 2])
    mask = pd.Series([True, False, pd.NA], dtype="boolean")

    dask_values = dd.from_pandas(values, npartitions=1)
    dask_mask = dd.from_pandas(mask, npartitions=1)

    assert_eq(dask_values[dask_mask], values[mask])
    assert_eq(dask_values.loc[dask_mask], values.loc[mask])


def test_deterministic_hashing_series():
    """Equal ``.loc`` slices of a Series tokenize equally; different ones do not."""
    dask_series = dd.from_pandas(pd.Series([0, 1, 2]), npartitions=1)

    reference = dask_series.loc[0:1]
    same_slice = dask_series.loc[0:1]
    assert tokenize(reference) == tokenize(same_slice)

    wider_slice = dask_series.loc[0:2]
    assert tokenize(reference) != tokenize(wider_slice)


def test_deterministic_hashing_dataframe():
    """Equal ``loc``/``iloc`` selections of a frame tokenize equally."""
    # Add duplicate column names in order to use _iLocIndexer._iloc path
    pdf = pd.DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=["a", "b", "c", "c"])
    dask_df = dd.from_pandas(pdf, npartitions=1)

    # ``loc`` with a column list, then with the repeated column label
    for columns in (["a", "c"], "c"):
        left = dask_df.loc[0:1, columns]
        right = dask_df.loc[0:1, columns]
        assert tokenize(left) == tokenize(right)

    # ``iloc``: same positions hash equally, different positions do not
    reference = dask_df.iloc[:, [0, 1]]
    same_positions = dask_df.iloc[:, [0, 1]]
    assert tokenize(reference) == tokenize(same_positions)

    other_positions = dask_df.iloc[:, [0, 2]]
    assert tokenize(reference) != tokenize(other_positions)
# duality-group / dask python
#
# Products
#
# About
#
# Resources
#
# Contact Gemfury