# Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages,
# Maven artifacts, PHP packages, Go Modules, Bower components, Debian packages,
# RPM packages, NuGet packages.
#
# alkaline-ml / pandas (python)
#
# Repository URL to install this package:
#
# Version: 1.1.1
#
# / tests / internals / test_internals.py

from collections import OrderedDict
from datetime import date, datetime
import itertools
import operator
import re

import numpy as np
import pytest

from pandas._libs.internals import BlockPlacement

import pandas as pd
from pandas import Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series
import pandas._testing as tm
import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray
from pandas.core.internals import BlockManager, SingleBlockManager, make_block


@pytest.fixture
def mgr():
    """
    Fixture: a BlockManager with twelve items ("a" through "l").

    The description mixes float, object, bool, int, complex, two separately
    suffixed datetime components and two tz-aware datetime components; see
    `create_mgr` for the description syntax.
    """
    description = (
        "a: f8; b: object; c: f8; d: object; e: f8;"
        "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;"
        "k: M8[ns, US/Eastern]; l: M8[ns, CET];"
    )
    return create_mgr(description)


def assert_block_equal(left, right):
    """
    Assert that two blocks hold equal values, dtypes and manager locations.

    Both ``mgr_locs`` attributes must be BlockPlacement instances; their
    ``as_array`` representations are compared element-wise.
    """
    tm.assert_numpy_array_equal(left.values, right.values)
    assert left.dtype == right.dtype
    for blk in (left, right):
        assert isinstance(blk.mgr_locs, BlockPlacement)
    left_locs = left.mgr_locs.as_array
    right_locs = right.mgr_locs.as_array
    tm.assert_numpy_array_equal(left_locs, right_locs)


def get_numeric_mat(shape):
    """
    Return a fresh integer array of ``shape`` whose entries equal their
    index along the first axis.

    For example, ``shape=(3, 2)`` gives ``[[0, 0], [1, 1], [2, 2]]``.

    Parameters
    ----------
    shape : tuple of int
        Desired shape; must have at least one dimension.

    Returns
    -------
    np.ndarray
        A newly allocated (writeable, C-contiguous) array.
    """
    arr = np.arange(shape[0])
    # Append length-1 axes so the row index broadcasts over the remaining
    # dimensions; this avoids hand-built strides via ``as_strided``.
    column = arr[(slice(None),) + (np.newaxis,) * (len(shape) - 1)]
    return np.broadcast_to(column, shape).copy()


# Default length of each item in the blocks/managers built by the helper
# functions in this file (used when no item_shape / num_rows is passed).
N = 10


def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Build a Block of the kind named by ``typestr`` at ``placement``.

    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool, b
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    Parameters
    ----------
    typestr : str
        One of the identifiers listed above.
    placement : anything accepted by BlockPlacement
        Manager locations of the block's items.
    item_shape : tuple of int, optional
        Shape of a single item; defaults to ``(N,)``.
    num_offset : int, default 0
        Offset added to the generated values for the numeric, complex,
        object and sparse kinds.

    Raises
    ------
    ValueError
        If ``typestr`` is not supported.
    """
    real_kinds = (
        "float",
        "f8",
        "f4",
        "f2",
        "int",
        "i8",
        "i4",
        "i2",
        "i1",
        "uint",
        "u8",
        "u4",
        "u2",
        "u1",
    )
    complex_kinds = ("complex", "c16", "c8")
    object_kinds = ("object", "string", "O")
    bool_kinds = ("b", "bool")
    naive_datetime_kinds = ("datetime", "dt", "M8[ns]")
    timedelta_kinds = ("timedelta", "td", "m8[ns]")

    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N,)

    shape = (num_items,) + item_shape

    # Base matrix: every entry equals its row (item) index.
    mat = get_numeric_mat(shape)

    if typestr in real_kinds:
        values = mat.astype(typestr) + num_offset
    elif typestr in complex_kinds:
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in object_kinds:
        labels = [f"A{i:d}" for i in mat.ravel() + num_offset]
        values = np.reshape(labels, shape)
    elif typestr in bool_kinds:
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in naive_datetime_kinds:
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz
        tz_match = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert tz_match is not None, f"incompatible typestr -> {typestr}"
        tz = tz_match.group(1)
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in timedelta_kinds:
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category",):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2",):
        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        fill_value = np.nan if typestr.endswith("_na") else 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value,
        )
        # Shift the stored (non-fill) values in place through a view.
        sp_view = values.sp_values.view()
        sp_view += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return make_block(values, placement=placement, ndim=len(shape))


def create_single_mgr(typestr, num_rows=None):
    """
    Build a SingleBlockManager holding one block of kind ``typestr``.

    The block covers locations ``0 .. num_rows - 1`` (``num_rows`` defaults
    to ``N``) and the manager's axis is an Index over the same range.
    """
    n_rows = N if num_rows is None else num_rows
    block = create_block(typestr, placement=slice(0, n_rows), item_shape=())
    index = Index(np.arange(n_rows))
    return SingleBlockManager(block, index)


def create_mgr(descr, item_shape=None):
    """
    Construct BlockManager from string description.

    The description syntax resembles an np.matrix initializer, e.g.::

        a,b,c: f8; d,e,f: i8

    Rules:

    * the supported datatypes are listed in `create_block`
    * components are separated by semicolons
    * each component has the form `NAME,NAME,NAME: DTYPE_ID`
    * whitespace around colons & semicolons is stripped
    * components sharing a DTYPE_ID end up in a single block
    * a '-SUFFIX' forces separate blocks of the same dtype::

        'a:f8-1; b:f8-2; c:f8-foobar'

    """
    if item_shape is None:
        item_shape = (N,)

    mgr_items = []
    # block description -> manager locations, in order of first appearance
    block_placements = OrderedDict()
    for component in descr.split(";"):
        component = component.strip()
        if not component:
            continue
        names_part, _, block_part = component.partition(":")
        blockstr = block_part.strip()
        names = names_part.strip().split(",")

        # Locations continue from the number of items collected so far.
        start = len(mgr_items)
        mgr_items.extend(names)
        locs = list(np.arange(len(names)) + start)
        block_placements.setdefault(blockstr, []).extend(locs)

    blocks = []
    running_offset = 0
    for blockstr, locs in block_placements.items():
        # Drop any '-SUFFIX' to recover the dtype identifier.
        typestr = blockstr.partition("-")[0]
        blk = create_block(
            typestr, locs, item_shape=item_shape, num_offset=running_offset
        )
        blocks.append(blk)
        running_offset += len(locs)

    blocks.sort(key=lambda blk: blk.mgr_locs[0])
    axes = [Index(mgr_items)] + [np.arange(n) for n in item_shape]
    return BlockManager(blocks, axes)


class TestBlock:
    """Tests of basic Block behaviour on blocks built with `create_block`."""

    def setup_method(self, method):
        # Fresh float / complex / object / bool blocks for every test.
        self.fblock = create_block("float", [0, 2, 4])
        self.cblock = create_block("complex", [7])
        self.oblock = create_block("object", [1, 3])
        self.bool_block = create_block("bool", [5])

    def test_constructor(self):
        """An "i4" block reports an int32 dtype."""
        int32block = create_block("i4", [0])
        assert int32block.dtype == np.int32

    def test_pickle(self):
        """Every fixture block survives a pickle round trip unchanged."""
        for blk in (self.fblock, self.cblock, self.oblock, self.bool_block):
            assert_block_equal(tm.round_trip_pickle(blk), blk)

    def test_mgr_locs(self):
        """mgr_locs is a BlockPlacement carrying the requested locations."""
        assert isinstance(self.fblock.mgr_locs, BlockPlacement)
        tm.assert_numpy_array_equal(
            self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64)
        )

    def test_attrs(self):
        """shape, dtype and len mirror the underlying values array."""
        assert self.fblock.shape == self.fblock.values.shape
        assert self.fblock.dtype == self.fblock.values.dtype
        assert len(self.fblock) == len(self.fblock.values)

    def test_copy(self):
        """copy() returns a distinct but equal block."""
        cop = self.fblock.copy()
        assert cop is not self.fblock
        assert_block_equal(self.fblock, cop)

    def test_delete(self):
        """delete() removes one item and keeps mgr_locs a BlockPlacement."""
        # (deleted position, remaining mgr_locs, row to inspect, its value)
        cases = [
            (0, [2, 4], 0, 1),
            (1, [0, 4], 1, 2),
            (2, [0, 2], 1, 1),
        ]
        for loc, expected_locs, row, expected_value in cases:
            newb = self.fblock.copy()
            newb.delete(loc)
            assert isinstance(newb.mgr_locs, BlockPlacement)
            tm.assert_numpy_array_equal(
                newb.mgr_locs.as_array, np.array(expected_locs, dtype=np.int64)
            )
            assert (newb.values[row] == expected_value).all()

        # Deleting past the last item is an error.
        newb = self.fblock.copy()

        with pytest.raises(IndexError, match=None):
            newb.delete(3)


class TestBlockManager:
    def test_attrs(self):
        """Two suffix-separated f8 components give 2 blocks and 6 items."""
        manager = create_mgr("a,b,c: f8-1; d,e,f: f8-2")
        assert manager.nblocks == 2
        assert len(manager) == 6

    def test_is_mixed_dtype(self):
        """is_mixed_type is False for one dtype, True once dtypes differ."""
        for descr in ("a,b:f8", "a:f8-1; b:f8-2"):
            assert not create_mgr(descr).is_mixed_type

        for descr in ("a,b:f8; c,d: f4", "a,b:f8; c,d: object"):
            assert create_mgr(descr).is_mixed_type

    def test_duplicate_ref_loc_failure(self):
        """Overlapping block placements are rejected; distinct ones work."""
        source = create_mgr("a:bool; a: f8")
        axes = source.axes
        blocks = source.blocks

        # Point both blocks at location 0 so that their ref locs overlap.
        for blk in blocks:
            blk.mgr_locs = np.array([0])

        with pytest.raises(AssertionError, match="Gaps in blk ref_locs"):
            mgr = BlockManager(blocks, axes)
            mgr._rebuild_blknos_and_blklocs()

        # With one location per block the manager can be built and queried.
        for loc, blk in enumerate(blocks):
            blk.mgr_locs = np.array([loc])
        mgr = BlockManager(blocks, axes)
        mgr.iget(1)

    def test_pickle(self, mgr):
        """A pickled manager round-trips and resets its consolidation flags."""
        restored = tm.round_trip_pickle(mgr)
        tm.assert_frame_equal(DataFrame(mgr), DataFrame(restored))

        # GH2431
        for flag in ("_is_consolidated", "_known_consolidated"):
            assert hasattr(restored, flag)

        # reset to False on load
        assert not restored._is_consolidated
        assert not restored._known_consolidated

    @pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"])
    def test_non_unique_pickle(self, mgr_string):
        """Managers with duplicate item names survive a pickle round trip."""
        original = create_mgr(mgr_string)
        restored = tm.round_trip_pickle(original)
        tm.assert_frame_equal(DataFrame(original), DataFrame(restored))

    def test_categorical_block_pickle(self):
        """Categorical data round-trips through pickle in both manager kinds."""
        # BlockManager -> compared as DataFrame
        frame_mgr = create_mgr("a: category")
        frame_mgr_restored = tm.round_trip_pickle(frame_mgr)
        tm.assert_frame_equal(DataFrame(frame_mgr), DataFrame(frame_mgr_restored))

        # SingleBlockManager -> compared as Series
        series_mgr = create_single_mgr("category")
        series_mgr_restored = tm.round_trip_pickle(series_mgr)
        tm.assert_series_equal(Series(series_mgr), Series(series_mgr_restored))

    def test_iget(self):
        """iget(i) returns the i-th row of the block's values."""
        cols = Index(list("abc"))
        values = np.random.rand(3, 3)
        block = make_block(values=values.copy(), placement=np.arange(3))
        mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

        for loc in range(3):
            tm.assert_almost_equal(mgr.iget(loc).internal_values(), values[loc])

    def test_set(self):
        """insert() appends a new item and iset() replaces an existing one."""
        mgr = create_mgr("a,b,c: int", item_shape=(3,))

        mgr.insert(len(mgr.items), "d", np.array(["foo"] * 3))
        mgr.iset(1, np.array(["bar"] * 3))

        # Expected contents by item location after the two mutations.
        expected_by_loc = [
            np.array([0] * 3),
            np.array(["bar"] * 3, dtype=np.object_),
            np.array([2] * 3),
            np.array(["foo"] * 3, dtype=np.object_),
        ]
        for loc, expected in enumerate(expected_by_loc):
            tm.assert_numpy_array_equal(mgr.iget(loc).internal_values(), expected)

    def test_set_change_dtype(self, mgr):
        """iset() with values of another dtype changes the item's dtype."""
        mgr.insert(len(mgr.items), "baz", np.zeros(N, dtype=bool))

        # bool -> object
        mgr.iset(mgr.items.get_loc("baz"), np.repeat("foo", N))
        idx = mgr.items.get_loc("baz")
        assert mgr.iget(idx).dtype == np.object_

        # Same replacement on a consolidated copy of the manager.
        mgr2 = mgr.consolidate()
        mgr2.iset(mgr2.items.get_loc("baz"), np.repeat("foo", N))
        idx = mgr2.items.get_loc("baz")
        assert mgr2.iget(idx).dtype == np.object_

        mgr2.insert(len(mgr2.items), "quux", tm.randn(N).astype(int))
        idx = mgr2.items.get_loc("quux")
        assert mgr2.iget(idx).dtype == np.int_

        # int -> float; np.float64 is spelled out because the np.float_
        # alias no longer exists in NumPy 2.0.
        mgr2.iset(mgr2.items.get_loc("quux"), tm.randn(N))
        assert mgr2.iget(idx).dtype == np.float64

    def test_copy(self, mgr):
        """Shallow copies share block data; deep copies do not."""
        shallow = mgr.copy(deep=False)
        for orig_blk, new_blk in zip(mgr.blocks, shallow.blocks):

            # view assertion
            tm.assert_equal(new_blk.values, orig_blk.values)
            if isinstance(orig_blk.values, np.ndarray):
                assert new_blk.values.base is orig_blk.values.base
            else:
                # DatetimeTZBlock has DatetimeIndex values
                assert new_blk.values._data.base is orig_blk.values._data.base

        deep = mgr.copy(deep=True)
        for orig_blk, new_blk in zip(mgr.blocks, deep.blocks):

            # copy assertion we either have a None for a base or in case of
            # some blocks it is an array (e.g. datetimetz), but was copied
            tm.assert_equal(new_blk.values, orig_blk.values)
            if isinstance(new_blk.values, np.ndarray):
                assert new_blk.values.base is None and orig_blk.values.base is None
            else:
                assert new_blk.values._data.base is not orig_blk.values._data.base

    def test_sparse(self):
        """Two sparse blocks convert to a float64 array."""
        sparse_mgr = create_mgr("a: sparse-1; b: sparse-2")
        # what to test here?
        result_dtype = sparse_mgr.as_array().dtype
        assert result_dtype == np.float64

    def test_sparse_mixed(self):
        """Sparse and dense components yield a three-block BlockManager."""
        mixed = create_mgr("a: sparse-1; b: sparse-2; c: f8")
        assert len(mixed.blocks) == 3
        assert isinstance(mixed, BlockManager)

        # TODO: what to test here?

    @pytest.mark.parametrize(
        "mgr_string, dtype",
        [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)],
    )
    def test_as_array_float(self, mgr_string, dtype):
        """as_array() of mixed float widths has the expected float dtype."""
        result = create_mgr(mgr_string).as_array()
        assert result.dtype == dtype

    @pytest.mark.parametrize(
        "mgr_string, dtype",
        [
            ("a: bool-1; b: bool-2", np.bool_),
            ("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64),
            ("c: i4; d: i2; e: u1", np.int32),
        ],
    )
    def test_as_array_int_bool(self, mgr_string, dtype):
        """as_array() of bool / integer components has the expected dtype."""
        result = create_mgr(mgr_string).as_array()
        assert result.dtype == dtype

    def test_as_array_datetime(self):
        """Two naive datetime blocks convert to an M8[ns] array."""
        dt_mgr = create_mgr("h: datetime-1; g: datetime-2")
        result = dt_mgr.as_array()
        assert result.dtype == "M8[ns]"

    def test_as_array_datetime_tz(self):
        """tz-aware items keep their dtype; as_array() falls back to object."""
        tz_mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]")
        expected_item_dtypes = ["datetime64[ns, US/Eastern]", "datetime64[ns, CET]"]
        for loc, expected in enumerate(expected_item_dtypes):
            assert tz_mgr.iget(loc).dtype == expected
        assert tz_mgr.as_array().dtype == "object"

    @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"])
    def test_astype(self, t):
        """astype() casts every castable item and leaves the rest alone."""
        t = np.dtype(t)

        # coerce all
        tmgr = create_mgr("c: f4; d: f2; e: f8").astype(t)
        for loc in range(3):
            assert tmgr.iget(loc).dtype.type == t

        # mixed
        mixed = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8")
        tmgr = mixed.astype(t, errors="ignore")

        # bool and float items are cast
        for loc in (2, 4, 5, 6):
            assert tmgr.iget(loc).dtype.type == t

        # object items are left as they are
        for loc in (0, 1):
            assert tmgr.iget(loc).dtype.type == np.object_

        # the datetime item is only expected to change for int64
        expected_dt_type = t if t == np.int64 else np.datetime64
        assert tmgr.iget(3).dtype.type == expected_dt_type

    def test_convert(self):
        """convert() is a no-op on numeric data and parses numeric strings."""

        def _compare(old_mgr, new_mgr):
            """
            Assert both managers have the same number of blocks and that
            every block of one has an equal-valued block in the other.
            """
            old_blocks = set(old_mgr.blocks)
            new_blocks = set(new_mgr.blocks)
            assert len(old_blocks) == len(new_blocks)

            for b in old_blocks:
                assert any((b.values == nb.values).all() for nb in new_blocks)

            for b in new_blocks:
                assert any((b.values == ob.values).all() for ob in old_blocks)

        # noops
        mgr = create_mgr("f: i8; g: f8")
        new_mgr = mgr.convert()
        _compare(mgr, new_mgr)

        # convert
        mgr = create_mgr("a,b,foo: object; f: i8; g: f8")
        mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
        mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
        mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
        new_mgr = mgr.convert(numeric=True)
        assert new_mgr.iget(0).dtype == np.int64
        assert new_mgr.iget(1).dtype == np.float64
        assert new_mgr.iget(2).dtype == np.object_
        assert new_mgr.iget(3).dtype == np.int64
        assert new_mgr.iget(4).dtype == np.float64

        mgr = create_mgr(
            "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2"
        )
        mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
        mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
        mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
        new_mgr = mgr.convert(numeric=True)
        assert new_mgr.iget(0).dtype == np.int64
        assert new_mgr.iget(1).dtype == np.float64
        assert new_mgr.iget(2).dtype == np.object_
        assert new_mgr.iget(3).dtype == np.int32
        assert new_mgr.iget(4).dtype == np.bool_
        # Compare with ``==``: ``assert x, y`` would only use y as the
        # failure message and pass for any truthy x.
        assert new_mgr.iget(5).dtype.type == np.datetime64
        assert new_mgr.iget(6).dtype == np.int64
        assert new_mgr.iget(7).dtype == np.float64
        assert new_mgr.iget(8).dtype == np.float16

    def test_invalid_ea_block(self):
        """Two items sharing one categorical block cannot be constructed."""
        for descr in ("a: category; b: category", "a: category2; b: category2"):
            with pytest.raises(AssertionError, match="block.size != values.size"):
                create_mgr(descr)

    def test_interleave(self):
        """One or two items of the same dtype keep that dtype in as_array()."""
        # self
        for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]:
            for descr in (f"a: {dtype}", f"a: {dtype}; b: {dtype}"):
                result = create_mgr(descr).as_array()
                assert result.dtype == dtype

    @pytest.mark.parametrize(
        "mgr_string, dtype",
        [
            ("a: category", "i8"),
            ("a: category; b: category2", "object"),
            ("a: category2", "object"),
            # combinations
            ("a: f8", "f8"),
            ("a: f8; b: i8", "f8"),
            ("a: f4; b: i8", "f8"),
            ("a: f4; b: i8; d: object", "object"),
            ("a: bool; b: i8", "object"),
            ("a: complex", "complex"),
            ("a: f8; b: category", "f8"),
            ("a: M8[ns]; b: category", "object"),
            ("a: M8[ns]; b: bool", "object"),
            ("a: M8[ns]; b: i8", "object"),
            ("a: m8[ns]; b: bool", "object"),
            ("a: m8[ns]; b: i8", "object"),
            ("a: M8[ns]; b: m8[ns]", "object"),
        ],
    )
    def test_interleave_dtype(self, mgr_string, dtype):
        """
        as_array() picks the expected common dtype for each combination.

        Each parametrized case is checked individually. Descriptions that
        put two items into a single categorical block are not listed here
        because they cannot be constructed (see test_invalid_ea_block).
        """
        # will be converted according the actual dtype of the underlying
        mgr = create_mgr(mgr_string)
        assert mgr.as_array().dtype == dtype

    def test_consolidate_ordering_issues(self, mgr):
        """Consolidation after replacing several items keeps ordered locs."""
        for name in ("f", "d", "b", "g", "h"):
            mgr.iset(mgr.items.get_loc(name), tm.randn(N))

        # we have datetime/tz blocks in mgr
        consolidated = mgr.consolidate()
        assert consolidated.nblocks == 4

        numeric = mgr.consolidate().get_numeric_data()
        assert numeric.nblocks == 1
        only_block = numeric.blocks[0]
        assert isinstance(only_block.mgr_locs, BlockPlacement)
        expected_locs = np.arange(len(numeric.items), dtype=np.int64)
        tm.assert_numpy_array_equal(only_block.mgr_locs.as_array, expected_locs)

    def test_reindex_items(self):
        """reindex_axis on the items axis reorders items and their data."""
        # mgr is not consolidated, f8 & f8-2 blocks
        mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2")

        new_items = ["g", "c", "a", "d"]
        reindexed = mgr.reindex_axis(new_items, axis=0)
        assert reindexed.nblocks == 2
        tm.assert_index_equal(reindexed.items, pd.Index(new_items))

        # (location in the original manager, location after reindexing)
        for old_loc, new_loc in [(6, 0), (2, 1), (0, 2), (3, 3)]:
            tm.assert_almost_equal(
                mgr.iget(old_loc).internal_values(),
                reindexed.iget(new_loc).internal_values(),
            )

    def test_get_numeric_data(self):
        """get_numeric_data() selects numeric items, sharing data by default."""
        mgr = create_mgr(
            "int: int; float: float; complex: complex;"
            "str: object; bool: bool; obj: object; dt: datetime",
            item_shape=(3,),
        )
        mgr.iset(5, np.array([1, 2, 3], dtype=np.object_))

        numeric = mgr.get_numeric_data()
        tm.assert_index_equal(
            numeric.items, pd.Index(["int", "float", "complex", "bool"])
        )
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("float")).internal_values(),
            numeric.iget(numeric.items.get_loc("float")).internal_values(),
        )

        # Check sharing
        numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("float")).internal_values(),
            np.array([100.0, 200.0, 300.0]),
        )

        # With copy=True, writes to the result must not reach the original.
        numeric2 = mgr.get_numeric_data(copy=True)
        tm.assert_index_equal(
            numeric2.items, pd.Index(["int", "float", "complex", "bool"])
        )
        numeric2.iset(
            numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
        )
        tm.assert_almost_equal(
            mgr.iget(mgr.items.get_loc("float")).internal_values(),
            np.array([100.0, 200.0, 300.0]),
        )

    def test_get_bool_data(self):
        """get_bool_data() selects the bool item, sharing data by default."""
        mgr = create_mgr(
            "int: int; float: float; complex: complex;"
            "str: object; bool: bool; obj: object; dt: datetime",
            item_shape=(3,),
        )
        mgr.iset(6, np.array([True, False, True], dtype=np.object_))
        bool_loc = mgr.items.get_loc("bool")

        bools = mgr.get_bool_data()
        tm.assert_index_equal(bools.items, pd.Index(["bool"]))
        tm.assert_almost_equal(
            mgr.iget(bool_loc).internal_values(),
            bools.iget(bools.items.get_loc("bool")).internal_values(),
        )

        # Check sharing: a write through the result shows up in the original.
        pattern = np.array([True, False, True])
        bools.iset(0, pattern)
        tm.assert_numpy_array_equal(
            mgr.iget(bool_loc).internal_values(), np.array([True, False, True])
        )

        # With copy=True a write to the result leaves the original untouched.
        bools_copy = mgr.get_bool_data(copy=True)
        bools_copy.iset(0, np.array([False, True, False]))
        tm.assert_numpy_array_equal(
            mgr.iget(bool_loc).internal_values(), np.array([True, False, True])
        )

    def test_unicode_repr_doesnt_raise(self):
        """repr() of a manager with a non-ASCII item name does not raise."""
        unicode_mgr = create_mgr("b,\u05d0: object")
        repr(unicode_mgr)

    @pytest.mark.parametrize(
        "mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"]
    )
    def test_equals(self, mgr_string):
        """equals() ignores block order, for unique and repeated item names."""
        original = create_mgr(mgr_string)
        reversed_blocks = original.blocks[::-1]
        shuffled = BlockManager(reversed_blocks, original.axes)
        assert original.equals(shuffled)

    @pytest.mark.parametrize(
        "mgr_string",
        [
            "a:i8;b:f8",  # basic case
            "a:i8;b:f8;c:c8;d:b",  # many types
            "a:i8;e:dt;f:td;g:string",  # more types
            "a:i8;b:category;c:category2",  # categories
            "c:sparse;d:sparse_na;b:f8",  # sparse
        ],
    )
    def test_equals_block_order_different_dtypes(self, mgr_string):
        """equals() is symmetric and holds for every permutation of blocks."""
        # GH 9330
        reference = create_mgr(mgr_string)
        for permuted_blocks in itertools.permutations(reference.blocks):
            candidate = BlockManager(permuted_blocks, reference.axes)
            assert reference.equals(candidate)
            assert candidate.equals(reference)

    def test_single_mgr_ctor(self):
        """A 5-row f8 single manager holds the floats 0.0 through 4.0."""
        single = create_single_mgr("f8", num_rows=5)
        contents = single.as_array().tolist()
        assert contents == [0.0, 1.0, 2.0, 3.0, 4.0]

    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
    def test_validate_bool_args(self, value):
        """replace_list() rejects a non-bool ``inplace`` with ValueError."""
        manager = create_mgr("a,b,c: i8-1; d,e,f: i8-2")

        type_name = type(value).__name__
        msg = (
            'For argument "inplace" expected type bool, '
            f"received type {type_name}."
        )
        with pytest.raises(ValueError, match=msg):
            manager.replace_list([1], [2], inplace=value)


class TestIndexing:
    # Nosetests-style data-driven tests.
    #
    # This test applies different indexing routines to block managers and
    # compares the outcome to the result of same operations on np.ndarray.
    #
    # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests
    #       and are disabled.

    # Managers fed to every parametrized test in this class. The list is
    # built once, when the class body is executed.
    MANAGERS = [
        # 1-dim (built with create_single_mgr)
        create_single_mgr("f8", N),
        create_single_mgr("i8", N),
        # 2-dim
        create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)),
        create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)),
        create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)),
        create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)),
    ]

    @pytest.mark.parametrize("mgr", MANAGERS)
    def test_get_slice(self, mgr):
        """get_slice() agrees with indexing the manager's ndarray form."""

        def assert_slice_ok(mgr, axis, slobj):
            reference = mgr.as_array()

            # we maybe using an ndarray to test slicing and
            # might not be the full length of the axis
            if isinstance(slobj, np.ndarray):
                axis_labels = mgr.axes[axis]
                if len(axis_labels) and len(slobj) and len(slobj) != len(axis_labels):
                    padding = np.zeros(len(axis_labels) - len(slobj), dtype=bool)
                    slobj = np.concatenate([slobj, padding])
            sliced = mgr.get_slice(slobj, axis=axis)
            # Take everything on the leading axes, slobj on the tested one.
            selector = (slice(None),) * axis + (slobj,)
            tm.assert_numpy_array_equal(
                reference[selector], sliced.as_array(), check_dtype=False
            )
            tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])

        assert mgr.ndim <= 2, mgr.ndim
        for ax in range(mgr.ndim):
            size = mgr.shape[ax]

            # slice
            slobjs = [
                slice(None),
                slice(3),
                slice(100),
                slice(1, 4),
                slice(3, 0, -2),
            ]

            # boolean mask
            slobjs += [
                np.array([], dtype=np.bool_),
                np.ones(size, dtype=np.bool_),
                np.zeros(size, dtype=np.bool_),
            ]
            if size >= 3:
                slobjs += [
                    np.arange(size) % 3 == 0,
                    np.array([True, True, False], dtype=np.bool_),
                ]

            # fancy indexer
            slobjs += [[], list(range(size))]
            if size >= 3:
                slobjs += [[0, 1, 2], [-1, -2, -3]]

            for slobj in slobjs:
                assert_slice_ok(mgr, ax, slobj)

    @pytest.mark.parametrize("mgr", MANAGERS)
    def test_take(self, mgr):
        """take() agrees with np.take on the manager's ndarray form."""

        def assert_take_ok(mgr, axis, indexer):
            reference = mgr.as_array()
            taken = mgr.take(indexer, axis)
            tm.assert_numpy_array_equal(
                np.take(reference, indexer, axis), taken.as_array(), check_dtype=False
            )
            tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis])

        for ax in range(mgr.ndim):
            size = mgr.shape[ax]

            # take/fancy indexer
            indexers = [[], [0, 0, 0], list(range(size))]
            if size >= 3:
                indexers += [[0, 1, 2], [-1, -2, -3]]

            for indexer in indexers:
                assert_take_ok(mgr, ax, indexer=indexer)

    @pytest.mark.parametrize("mgr", MANAGERS)
    @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
    def test_reindex_axis(self, fill_value, mgr):
        """reindex_axis() agrees with take_nd on the manager's ndarray form."""

        def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
            reference = mgr.as_array()
            indexer = mgr.axes[axis].get_indexer_for(new_labels)

            reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value)
            expected = algos.take_nd(reference, indexer, axis, fill_value=fill_value)
            tm.assert_numpy_array_equal(
                expected, reindexed.as_array(), check_dtype=False,
            )
            tm.assert_index_equal(reindexed.axes[axis], new_labels)

        for ax in range(mgr.ndim):
            axis_labels = mgr.axes[ax]
            label_sets = [
                pd.Index([]),
                axis_labels,
                axis_labels[[0, 0, 0]],
                pd.Index(["foo", "bar", "baz"]),
                pd.Index(["foo", axis_labels[0], "baz"]),
            ]
            if mgr.shape[ax] >= 3:
                label_sets += [
                    axis_labels[:-3],
                    axis_labels[-3::-1],
                    axis_labels[[0, 1, 2, 0, 1, 2]],
                ]

            for new_labels in label_sets:
                assert_reindex_axis_is_ok(mgr, ax, new_labels, fill_value)

    @pytest.mark.parametrize("mgr", MANAGERS)
    @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
    def test_reindex_indexer(self, fill_value, mgr):
        """reindex_indexer() agrees with take_nd on the ndarray form."""

        def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
            reference = mgr.as_array()
            expected = algos.take_nd(reference, indexer, axis, fill_value=fill_value)
            reindexed = mgr.reindex_indexer(
                new_labels, indexer, axis, fill_value=fill_value
            )
            tm.assert_numpy_array_equal(
                expected, reindexed.as_array(), check_dtype=False
            )
            tm.assert_index_equal(reindexed.axes[axis], new_labels)

        for ax in range(mgr.ndim):
            size = mgr.shape[ax]
            axis_labels = mgr.axes[ax]

            # (new labels, indexer) pairs, checked in this order
            cases = [
                (pd.Index([]), []),
                (axis_labels, np.arange(size)),
                (pd.Index(["foo"] * size), np.arange(size)),
                (axis_labels[::-1], np.arange(size)),
                (axis_labels, np.arange(size)[::-1]),
                (pd.Index(["foo", "bar", "baz"]), [0, 0, 0]),
                (pd.Index(["foo", "bar", "baz"]), [-1, 0, -1]),
                (pd.Index(["foo", axis_labels[0], "baz"]), [-1, -1, -1]),
            ]
            if size >= 3:
                cases.append((pd.Index(["foo", "bar", "baz"]), [0, 1, 2]))

            for new_labels, indexer in cases:
                assert_reindex_indexer_is_ok(
                    mgr, ax, new_labels, indexer, fill_value
                )


class TestBlockPlacement:
    """Tests for building, converting and shifting ``BlockPlacement`` objects."""

    @pytest.mark.parametrize(
        "slc, expected",
        [
            (slice(0, 4), 4),
            (slice(0, 4, 2), 2),
            (slice(0, 3, 2), 2),
            (slice(0, 1, 2), 1),
            (slice(1, 0, -1), 1),
        ],
    )
    def test_slice_len(self, slc, expected):
        # len() counts the positions the slice selects, honouring the step
        assert len(BlockPlacement(slc)) == expected

    @pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)])
    def test_zero_step_raises(self, slc):
        # a zero step is rejected at construction time
        msg = "slice step cannot be zero"
        with pytest.raises(ValueError, match=msg):
            BlockPlacement(slc)

    @pytest.mark.parametrize(
        "slc",
        [
            slice(None, None),
            slice(10, None),
            slice(None, None, -1),
            slice(None, 10, -1),
            # These are "unbounded" because negative index will
            #  change depending on container shape.
            slice(-1, None),
            slice(None, -1),
            slice(-1, -1),
            slice(-1, None, -1),
            slice(None, -1, -1),
            slice(-1, -1, -1),
        ],
    )
    def test_unbounded_slice_raises(self, slc):
        # slices with an open or negative bound are rejected
        msg = "unbounded slice"
        with pytest.raises(ValueError, match=msg):
            BlockPlacement(slc)

    @pytest.mark.parametrize(
        "slc",
        [
            slice(0, 0),
            slice(100, 0),
            slice(100, 100),
            slice(100, 100, -1),
            slice(0, 100, -1),
        ],
    )
    def test_not_slice_like_slices(self, slc):
        # every slice here selects no positions at all
        assert not BlockPlacement(slc).is_slice_like

    @pytest.mark.parametrize(
        "arr, slc",
        [
            ([0], slice(0, 1, 1)),
            ([100], slice(100, 101, 1)),
            ([0, 1, 2], slice(0, 3, 1)),
            ([0, 5, 10], slice(0, 15, 5)),
            ([0, 100], slice(0, 200, 100)),
            ([2, 1], slice(2, 0, -1)),
        ],
    )
    def test_array_to_slice_conversion(self, arr, slc):
        # evenly spaced positions are exposed as the equivalent slice
        assert BlockPlacement(arr).as_slice == slc

    @pytest.mark.parametrize(
        "arr",
        [
            # each case is listed once; a repeated entry would only generate
            # an identical test a second time
            [],
            [-1],
            [-1, -2, -3],
            [-10],
            [-1, 0, 1, 2],
            [-2, 0, 2, 4],
            [1, 0, -1],
            [1, 1, 1],
        ],
    )
    def test_not_slice_like_arrays(self, arr):
        # empty input, negative positions and repeated values are not slice-like
        assert not BlockPlacement(arr).is_slice_like

    @pytest.mark.parametrize(
        "slc, expected",
        [(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])],
    )
    def test_slice_iter(self, slc, expected):
        # iterating yields the individual positions
        assert list(BlockPlacement(slc)) == expected

    @pytest.mark.parametrize(
        "slc, arr",
        [
            (slice(0, 3), [0, 1, 2]),
            (slice(0, 0), []),
            (slice(3, 0), []),
            (slice(3, 0, -1), [3, 2, 1]),
        ],
    )
    def test_slice_to_array_conversion(self, slc, arr):
        # as_array is compared against an int64 array of the same positions
        tm.assert_numpy_array_equal(
            BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64)
        )

    def test_blockplacement_add(self):
        bpl = BlockPlacement(slice(0, 5))
        # scalar: every position shifts by the same amount
        assert bpl.add(1).as_slice == slice(1, 6, 1)
        # array: added elementwise, 0+0, 1+1, ... -> 0, 2, 4, 6, 8
        assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
        # descending offsets cancel the ascending positions -> all 5
        assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]

    @pytest.mark.parametrize(
        "val, inc, expected",
        [
            (slice(0, 0), 0, []),
            (slice(1, 4), 0, [1, 2, 3]),
            (slice(3, 0, -1), 0, [3, 2, 1]),
            ([1, 2, 4], 0, [1, 2, 4]),
            (slice(0, 0), 10, []),
            (slice(1, 4), 10, [11, 12, 13]),
            (slice(3, 0, -1), 10, [13, 12, 11]),
            ([1, 2, 4], 10, [11, 12, 14]),
            (slice(0, 0), -1, []),
            (slice(1, 4), -1, [0, 1, 2]),
            ([1, 2, 4], -1, [0, 1, 3]),
        ],
    )
    def test_blockplacement_add_int(self, val, inc, expected):
        # integer offsets apply to slice-backed and array-backed placements
        assert list(BlockPlacement(val).add(inc)) == expected

    @pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]])
    def test_blockplacement_add_int_raises(self, val):
        # -10 would move every position below zero; a ValueError is expected
        msg = "iadd causes length change"
        with pytest.raises(ValueError, match=msg):
            BlockPlacement(val).add(-10)


class DummyElement:
    """
    Minimal scalar-like test double holding a ``value`` and a numpy ``dtype``.

    Parameters
    ----------
    value : object
        The wrapped scalar.
    dtype : str or numpy.dtype
        Anything accepted by ``np.dtype``; stored as a ``numpy.dtype``.
    """

    def __init__(self, value, dtype):
        self.value = value
        self.dtype = np.dtype(dtype)

    def __array__(self, dtype=None, copy=None):
        """
        Return ``value`` as an ndarray of ``self.dtype``.

        ``dtype`` and ``copy`` are accepted because NumPy's ``__array__``
        protocol passes them (``np.asarray(elem, dtype=...)`` hands the dtype
        over positionally); without them that call raises ``TypeError``.
        When ``dtype`` is given the result is converted to it.
        """
        arr = np.array(self.value, dtype=self.dtype)
        if dtype is not None:
            arr = arr.astype(dtype, copy=False)
        # ``copy`` needs no handling: a new array is built on every call.
        return arr

    def __str__(self) -> str:
        return f"DummyElement({self.value}, {self.dtype})"

    def __repr__(self) -> str:
        return str(self)

    def astype(self, dtype, copy=False):
        # Only records the dtype on this instance; ``value`` is not converted
        # and ``copy`` is ignored.
        self.dtype = dtype
        return self

    def view(self, dtype):
        # Requires ``value`` to provide ``.view`` itself (e.g. a numpy scalar).
        return type(self)(self.value.view(dtype), dtype)

    def any(self, axis=None):
        # Truthiness of the wrapped value; ``axis`` is accepted but unused.
        return bool(self.value)


class TestCanHoldElement:
    """Checks on what a block accepts, plus DataFrame-vs-scalar binary ops."""

    def test_datetime_block_can_hold_element(self):
        """A datetime block and the matching array agree on acceptable values."""
        blk = create_block("datetime", [0])

        # block._can_hold_element should be True exactly when assignment
        # into the equivalent array succeeds
        values = pd.array(blk.values.ravel())

        # None is accepted and stored as NaT
        assert blk._can_hold_element(None)
        values[0] = None
        assert values[0] is pd.NaT

        # both datetime flavours are accepted
        for accepted in (np.datetime64("2010-10-10"), datetime(2010, 10, 10)):
            assert blk._can_hold_element(accepted)
            values[0] = accepted

        # a plain date is refused by the block and by array assignment
        rejected = date(2010, 10, 10)
        assert not blk._can_hold_element(rejected)

        expected_msg = (
            "'value' should be a 'Timestamp', 'NaT', "
            "or array of those. Got 'date' instead."
        )
        with pytest.raises(TypeError, match=expected_msg):
            values[0] = rejected

    @pytest.mark.parametrize(
        "value, dtype",
        [
            (1, "i8"),
            (1.0, "f8"),
            (2 ** 63, "f8"),
            (1j, "complex128"),
            (2 ** 63, "complex128"),
            (True, "bool"),
            (np.timedelta64(20, "ns"), "<m8[ns]"),
            (np.datetime64(20, "ns"), "<M8[ns]"),
        ],
    )
    @pytest.mark.parametrize(
        "op",
        [
            operator.add,
            operator.sub,
            operator.mul,
            operator.truediv,
            operator.mod,
            operator.pow,
        ],
        ids=lambda x: x.__name__,
    )
    def test_binop_other(self, op, value, dtype):
        """Apply ``op`` between a two-row frame of ``dtype`` and ``value``."""
        # operator/dtype pairs that are not exercised at all
        not_applicable = {
            (operator.add, "bool"),
            (operator.sub, "bool"),
            (operator.mul, "bool"),
            (operator.truediv, "bool"),
            (operator.mod, "i8"),
            (operator.mod, "complex128"),
            (operator.pow, "bool"),
        }
        if (op, dtype) in not_applicable:
            pytest.skip(f"Invalid combination {op},{dtype}")

        elem = DummyElement(value, dtype)
        frame = pd.DataFrame({"A": [elem.value, elem.value]}, dtype=elem.dtype)

        # operator/dtype pairs expected to raise TypeError
        expect_type_error = {
            (operator.pow, "<M8[ns]"),
            (operator.mod, "<M8[ns]"),
            (operator.truediv, "<M8[ns]"),
            (operator.mul, "<M8[ns]"),
            (operator.add, "<M8[ns]"),
            (operator.pow, "<m8[ns]"),
            (operator.mul, "<m8[ns]"),
        }

        if (op, dtype) not in expect_type_error:
            # FIXME: Since dispatching to Series, this test no longer
            # asserts anything meaningful
            result = op(frame, elem.value).dtypes
            expected = op(frame, value).dtypes
            tm.assert_series_equal(result, expected)
            return

        datetime_add = dtype == "<M8[ns]" and op == operator.add
        timedelta_mul = dtype == "<m8[ns]" and op == operator.mul
        if datetime_add or timedelta_mul:
            # for these two only the exception type is checked
            msg = None
        else:
            msg = (
                f"cannot perform __{op.__name__}__ with this "
                "index type: (DatetimeArray|TimedeltaArray)"
            )

        with pytest.raises(TypeError, match=msg):
            op(frame, elem.value)


class TestShouldStore:
    """Checks for ``Block.should_store`` on a categorical block."""

    def test_should_store_categorical(self):
        """Only Categorical values with the block's exact dtype are storable."""
        categorical = pd.Categorical(["A", "B", "C"])
        block = pd.DataFrame(categorical)._mgr.blocks[0]

        # same dtype: the full values and a shorter slice of them
        for candidate in (categorical, categorical[:-1]):
            assert block.should_store(candidate)

        # ordered variant has a different dtype; a plain ndarray of the same
        # values is not a Categorical
        for candidate in (categorical.as_ordered(), np.asarray(categorical)):
            assert not block.should_store(candidate)


@pytest.mark.parametrize(
    "typestr, holder",
    [
        ("category", Categorical),
        ("M8[ns]", DatetimeArray),
        ("M8[ns, US/Central]", DatetimeArray),
        ("m8[ns]", TimedeltaArray),
        ("sparse", SparseArray),
    ],
)
def test_holder(typestr, holder):
    """The block built for ``typestr`` reports ``holder`` as its ``_holder``."""
    block = create_block(typestr, [1])
    actual = block._holder
    assert actual is holder


def test_validate_ndim():
    """``make_block`` raises when 1-D values are passed together with ndim=2."""
    one_dim = np.array([1.0, 2.0])
    expected_msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"

    with pytest.raises(ValueError, match=expected_msg):
        make_block(one_dim, slice(2), ndim=2)


def test_block_shape():
    """Reindexed plain and categorical Series share the same block placement."""
    target = pd.Index([0, 1, 2, 3, 4])
    plain = pd.Series([1, 2, 3]).reindex(target)
    categorical = pd.Series(pd.Categorical([1, 2, 3])).reindex(target)

    plain_locs = plain._mgr.blocks[0].mgr_locs
    categorical_locs = categorical._mgr.blocks[0].mgr_locs
    assert plain_locs.indexer == categorical_locs.indexer


def test_make_block_no_pandas_array():
    """``make_block`` yields a plain integer block for PandasArray inputs."""
    # https://github.com/pandas-dev/pandas/pull/24866
    arr = pd.arrays.PandasArray(np.array([1, 2]))

    cases = [
        (arr, {}),  # PandasArray, no dtype
        (arr, {"dtype": arr.dtype}),  # PandasArray, PandasDtype
        (arr.to_numpy(), {"dtype": arr.dtype}),  # ndarray, PandasDtype
    ]
    for values, kwargs in cases:
        result = make_block(values, slice(len(arr)), **kwargs)
        assert result.is_integer is True
        assert result.is_extension is False


def test_dataframe_not_equal():
    """Frames whose column contents are swapped must not compare equal."""
    # see GH28839
    numbers = [1, 2]
    letters = ["s", "d"]
    left = pd.DataFrame({"a": numbers, "b": letters})
    right = pd.DataFrame({"a": letters, "b": numbers})
    assert left.equals(right) is False


def test_missing_unicode_key():
    """Looking up an absent non-ASCII column label raises ``KeyError``."""
    frame = DataFrame({"a": [1]})
    missing = "\u05d0"
    with pytest.raises(KeyError, match=missing):
        frame.loc[:, missing]  # should not raise UnicodeEncodeError


def test_set_change_dtype_slice():
    """Assigning a float result to one column group regroups the blocks by dtype."""
    # GH#8850
    columns = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")])
    frame = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=columns)
    frame["2nd"] = frame["2nd"] * 2.0

    by_dtype = frame._to_dict_of_blocks()
    assert sorted(by_dtype) == ["float64", "int64"]

    # first two columns are float now, the last one stays integer
    expected_float = DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=columns[:2])
    expected_int = DataFrame([[3], [6]], columns=columns[2:])
    tm.assert_frame_equal(by_dtype["float64"], expected_float)
    tm.assert_frame_equal(by_dtype["int64"], expected_int)


def test_interleave_non_unique_cols():
    """``.values`` is the same whether or not the column labels are unique."""
    rows = [
        [pd.Timestamp("20130101"), 3.5],
        [pd.Timestamp("20130102"), 4.5],
    ]
    duplicated = DataFrame(rows, columns=["x", "x"], index=[1, 2])

    # same data, but with distinct column labels
    relabelled = duplicated.copy()
    relabelled.columns = ["x", "y"]

    assert relabelled.values.shape == duplicated.values.shape
    for row in (0, 1):
        tm.assert_numpy_array_equal(relabelled.values[row], duplicated.values[row])


def test_single_block_manager_fastpath_deprecated():
    """Passing ``fastpath`` to ``SingleBlockManager`` emits a ``FutureWarning``."""
    # GH#33092
    ser = pd.Series(range(3))
    # reach the manager through ``_mgr``, the spelling the other tests in
    # this module use, instead of ``_data``
    blk = ser._mgr.blocks[0]
    with tm.assert_produces_warning(FutureWarning):
        SingleBlockManager(blk, ser.index, fastpath=True)