Repository URL to install this package:
|
Version:
0.15.2 ▾
|
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.
# noqa: MD02
"""
Details about how Indexing Helper Class works.
_LocationIndexerBase provide methods framework for __getitem__
and __setitem__ that work with Modin DataFrame's internal index. Base
class's __{get,set}item__ takes in partitions & idx_in_partition data
and perform lookup/item write.
_LocIndexer and _iLocIndexer is responsible for indexer specific logic and
lookup computation. Loc will take care of enlarge DataFrame. Both indexer
will take care of translating pandas's lookup to Modin DataFrame's internal
lookup.
An illustration is available at
https://github.com/ray-project/ray/pull/1955#issuecomment-386781826
"""
import numpy as np
import pandas
import itertools
from pandas.api.types import is_list_like, is_bool
from pandas.core.dtypes.common import is_integer, is_bool_dtype, is_integer_dtype
from pandas.core.indexing import IndexingError
from modin.error_message import ErrorMessage
from modin.logging import LoggerMetaClass, metaclass_resolver
from .dataframe import DataFrame
from .series import Series
from .utils import is_scalar
def is_slice(x):
"""
Check that argument is an instance of slice.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
True if argument is a slice, False otherwise.
"""
return isinstance(x, slice)
def compute_sliced_len(slc, sequence_len):
"""
Compute length of sliced object.
Parameters
----------
slc : slice
Slice object.
sequence_len : int
Length of sequence, to which slice will be applied.
Returns
-------
int
Length of object after applying slice object on it.
"""
# This will translate slice to a range, from which we can retrieve length
return len(range(*slc.indices(sequence_len)))
def is_2d(x):
"""
Check that argument is a list or a slice.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
`True` if argument is a list or slice, `False` otherwise.
"""
return is_list_like(x) or is_slice(x)
def is_tuple(x):
"""
Check that argument is a tuple.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
True if argument is a tuple, False otherwise.
"""
return isinstance(x, tuple)
def is_boolean_array(x):
"""
Check that argument is an array of bool.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
True if argument is an array of bool, False otherwise.
"""
if isinstance(x, (np.ndarray, Series, pandas.Series, pandas.Index)):
return is_bool_dtype(x.dtype)
elif isinstance(x, (DataFrame, pandas.DataFrame)):
return all(map(is_bool_dtype, x.dtypes))
return is_list_like(x) and all(map(is_bool, x))
def is_integer_array(x):
"""
Check that argument is an array of integers.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
True if argument is an array of integers, False otherwise.
"""
if isinstance(x, (np.ndarray, Series, pandas.Series, pandas.Index)):
return is_integer_dtype(x.dtype)
elif isinstance(x, (DataFrame, pandas.DataFrame)):
return all(map(is_integer_dtype, x.dtypes))
return is_list_like(x) and all(map(is_integer, x))
def is_integer_slice(x):
"""
Check that argument is an array of int.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
True if argument is an array of int, False otherwise.
"""
if not is_slice(x):
return False
for pos in [x.start, x.stop, x.step]:
if not ((pos is None) or is_integer(pos)):
return False # one position is neither None nor int
return True
def is_range_like(obj):
"""
Check if the object is range-like.
Objects that are considered range-like have information about the range (start and
stop positions, and step) and also have to be iterable. Examples of range-like
objects are: Python range, pandas.RangeIndex.
Parameters
----------
obj : object
Returns
-------
bool
"""
return (
hasattr(obj, "__iter__")
and hasattr(obj, "start")
and hasattr(obj, "stop")
and hasattr(obj, "step")
)
def boolean_mask_to_numeric(indexer):
"""
Convert boolean mask to numeric indices.
Parameters
----------
indexer : list-like of booleans
Returns
-------
np.ndarray of ints
Numerical positions of ``True`` elements in the passed `indexer`.
"""
if isinstance(indexer, (np.ndarray, Series, pandas.Series)):
return np.where(indexer)[0]
else:
# It's faster to build the resulting numpy array from the reduced amount of data via
# `compress` iterator than convert non-numpy-like `indexer` to numpy and apply `np.where`.
return np.fromiter(
# `itertools.compress` masks `data` with the `selectors` mask,
# works about ~10% faster than a pure list comprehension
itertools.compress(data=range(len(indexer)), selectors=indexer),
dtype=np.int64,
)
_ILOC_INT_ONLY_ERROR = """
Location based indexing can only have [integer, integer slice (START point is
INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types.
"""
_VIEW_IS_COPY_WARNING = """
Modin is making a copy of of the DataFrame. This behavior diverges from Pandas.
This will be fixed in future releases.
"""
def _compute_ndim(row_loc, col_loc):
"""
Compute the number of dimensions of result from locators.
Parameters
----------
row_loc : list or scalar
Row locator.
col_loc : list or scalar
Column locator.
Returns
-------
{0, 1, 2}
Number of dimensions in located dataset.
"""
row_scalar = is_scalar(row_loc) or is_tuple(row_loc)
col_scalar = is_scalar(col_loc) or is_tuple(col_loc)
if row_scalar and col_scalar:
ndim = 0
elif row_scalar ^ col_scalar:
ndim = 1
else:
ndim = 2
return ndim
class _LocationIndexerBase(object, metaclass=LoggerMetaClass):
"""
Base class for location indexer like loc and iloc.
Parameters
----------
modin_df : modin.pandas.DataFrame
DataFrame to operate on.
"""
def __init__(self, modin_df):
self.df = modin_df
self.qc = modin_df._query_compiler
self.row_scalar = False
self.col_scalar = False
def __getitem__(self, row_lookup, col_lookup, ndim):
"""
Retrieve dataset according to `row_lookup` and `col_lookup`.
Parameters
----------
row_lookup : slice(None), range or np.ndarray
The global row index to retrieve data from.
col_lookup : slice(None), range or np.ndarray
The global col index to retrieve data from.
ndim : {0, 1, 2}
Number of dimensions in dataset to be retrieved.
Returns
-------
modin.pandas.DataFrame or modin.pandas.Series
Located dataset.
Notes
-----
Usage of `slice(None)` as a lookup is a hack to pass information about
full-axis grab without computing actual indices that triggers lazy computations.
Ideally, this API should get rid of using slices as indexers and either use a
common ``Indexer`` object or range and ``np.ndarray`` only.
"""
if isinstance(row_lookup, slice):
ErrorMessage.catch_bugs_and_request_email(
failure_condition=row_lookup != slice(None),
extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}",
)
row_lookup = None
if isinstance(col_lookup, slice):
ErrorMessage.catch_bugs_and_request_email(
failure_condition=col_lookup != slice(None),
extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}",
)
col_lookup = None
qc_view = self.qc.view(row_lookup, col_lookup)
if ndim == 2:
return self.df.__constructor__(query_compiler=qc_view)
if isinstance(self.df, Series) and not self.row_scalar:
return self.df.__constructor__(query_compiler=qc_view)
if isinstance(self.df, Series):
axis = 0
elif ndim == 0:
axis = None
else:
axis = (
None
if self.col_scalar and self.row_scalar
else 1
if self.col_scalar
else 0
)
return self.df.__constructor__(query_compiler=qc_view).squeeze(axis=axis)
def __setitem__(self, row_lookup, col_lookup, item, axis=None):
"""
Assign `item` value to located dataset.
Parameters
----------
row_lookup : slice or scalar
The global row index to write item to.
col_lookup : slice or scalar
The global col index to write item to.
item : DataFrame, Series or scalar
The new item needs to be set. It can be any shape that's
broadcast-able to the product of the lookup tables.
axis : {None, 0, 1}, default: None
If not None, it means that whole axis is used to assign a value.
0 means assign to whole column, 1 means assign to whole row.
If None, it means that partial assignment is done on both axes.
"""
# Convert slices to indices for the purposes of application.
# TODO (devin-petersohn): Apply to slice without conversion to list
if isinstance(row_lookup, slice):
row_lookup = range(len(self.qc.index))[row_lookup]
if isinstance(col_lookup, slice):
col_lookup = range(len(self.qc.columns))[col_lookup]
# This is True when we dealing with assignment of a full column. This case
# should be handled in a fastpath with `df[col] = item`.
if axis == 0:
self.df[self.df.columns[col_lookup][0]] = item
# This is True when we are assigning to a full row. We want to reuse the setitem
# mechanism to operate along only one axis for performance reasons.
elif axis == 1:
if hasattr(item, "_query_compiler"):
item = item._query_compiler
new_qc = self.qc.setitem(1, self.qc.index[row_lookup[0]], item)
self.df._create_or_update_from_compiler(new_qc, inplace=True)
# Assignment to both axes.
else:
if isinstance(row_lookup, slice):
new_row_len = len(self.df.index[row_lookup])
else:
new_row_len = len(row_lookup)
if isinstance(col_lookup, slice):
new_col_len = len(self.df.columns[col_lookup])
else:
new_col_len = len(col_lookup)
to_shape = new_row_len, new_col_len
if not is_scalar(item):
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
self._write_items(row_lookup, col_lookup, item)
def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
"""
Use NumPy to broadcast or reshape item.
Parameters
----------
row_lookup : slice or scalar
The global row index to locate inside of `item`.
col_lookup : slice or scalar
The global col index to locate inside of `item`.
item : DataFrame, Series, or query_compiler
Value that should be broadcast to a new shape of `to_shape`.
to_shape : tuple of two int
Shape of dataset that `item` should be broadcasted to.
Returns
-------
numpy.ndarray
`item` after it was broadcasted to `to_shape`.
Raises
------
ValueError
If `row_lookup` or `col_lookup` contain values missing in
`self` index or columns correspondingly.
If `item` cannot be broadcast from its own shape to `to_shape`.
Notes
-----
NumPy is memory efficient, there shouldn't be performance issue.
"""
# It is valid to pass a DataFrame or Series to __setitem__ that is larger than
# the target the user is trying to overwrite. This
if isinstance(item, (pandas.Series, pandas.DataFrame, Series, DataFrame)):
# convert indices in lookups to names, as Pandas reindex expects them to be so
axes_to_reindex = {}
index_values = self.qc.index[row_lookup]
if not index_values.equals(item.index):
axes_to_reindex["index"] = index_values
if hasattr(item, "columns"):
column_values = self.qc.columns[col_lookup]
if not column_values.equals(item.columns):
axes_to_reindex["columns"] = column_values
# New value for columns/index make that reindex add NaN values
if axes_to_reindex:
item = item.reindex(**axes_to_reindex)
try:
item = np.array(item)
if np.prod(to_shape) == np.prod(item.shape):
return item.reshape(to_shape)
else:
return np.broadcast_to(item, to_shape)
except ValueError:
from_shape = np.array(item).shape
raise ValueError(
f"could not broadcast input array from shape {from_shape} into shape "
+ f"{to_shape}"
)
def _write_items(self, row_lookup, col_lookup, item):
"""
Perform remote write and replace blocks.
Parameters
----------
row_lookup : slice or scalar
The global row index to write item to.
col_lookup : slice or scalar
The global col index to write item to.
item : numpy.ndarray
The new item value that needs to be assigned to `self`.
"""
new_qc = self.qc.write_items(row_lookup, col_lookup, item)
self.df._create_or_update_from_compiler(new_qc, inplace=True)
def _determine_setitem_axis(self, row_lookup, col_lookup, row_scalar, col_scalar):
"""
Determine an axis along which we should do an assignment.
Parameters
----------
row_lookup : slice or list
Indexer for rows.
col_lookup : slice or list
Indexer for columns.
row_scalar : bool
Whether indexer for rows is scalar or not.
col_scalar : bool
Whether indexer for columns is scalar or not.
Returns
-------
int or None
None if this will be a both axis assignment, number of axis to assign in other cases.
Notes
-----
axis = 0: column assignment df[col] = item
axis = 1: row assignment df.loc[row] = item
axis = None: assignment along both axes
"""
if self.df.shape == (1, 1):
return None if not (row_scalar ^ col_scalar) else 1 if row_scalar else 0
def get_axis(axis):
return self.qc.index if axis == 0 else self.qc.columns
row_lookup_len, col_lookup_len = [
len(lookup)
if not isinstance(lookup, slice)
else compute_sliced_len(lookup, len(get_axis(i)))
for i, lookup in enumerate([row_lookup, col_lookup])
]
if col_lookup_len == 1 and row_lookup_len == 1:
axis = None
elif (
row_lookup_len == len(self.qc.index)
and col_lookup_len == 1
and isinstance(self.df, DataFrame)
):
axis = 0
elif col_lookup_len == len(self.qc.columns) and row_lookup_len == 1:
axis = 1
else:
axis = None
return axis
def _parse_row_and_column_locators(self, tup):
"""
Unpack the user input for getitem and setitem and compute ndim.
loc[a] -> ([a], :), 1D
loc[[a,b]] -> ([a,b], :),
loc[a,b] -> ([a], [b]), 0D
Parameters
----------
tup : tuple
User input to unpack.
Returns
-------
row_loc : scalar or list
Row locator(s) as a scalar or List.
col_list : scalar or list
Column locator(s) as a scalar or List.
ndim : {0, 1, 2}
Number of dimensions of located dataset.
"""
row_loc, col_loc = slice(None), slice(None)
if is_tuple(tup):
row_loc = tup[0]
if len(tup) == 2:
col_loc = tup[1]
if len(tup) > 2:
raise IndexingError("Too many indexers")
else:
row_loc = tup
row_loc = row_loc(self.df) if callable(row_loc) else row_loc
col_loc = col_loc(self.df) if callable(col_loc) else col_loc
return row_loc, col_loc, _compute_ndim(row_loc, col_loc)
# HACK: This method bypasses regular ``loc/iloc.__getitem__`` flow in order to ensure better
# performance in the case of boolean masking. The only purpose of this method is to compensate
# for a lack of backend's indexing API, there is no Query Compiler method allowing masking
# along both axis when any of the indexers is a boolean. That's why rows and columns masking
# phases are separate in this case.
# TODO: Remove this method and handle this case naturally via ``loc/iloc.__getitem__`` flow
# when QC API would support both-axis masking with boolean indexers.
def _handle_boolean_masking(self, row_loc, col_loc):
"""
Retrieve dataset according to the boolean mask for rows and an indexer for columns.
In comparison with the regular ``loc/iloc.__getitem__`` flow this method efficiently
masks rows with a Modin Series boolean mask without materializing it (if the selected
execution implements such masking).
Parameters
----------
row_loc : modin.pandas.Series of bool dtype
Boolean mask to index rows with.
col_loc : object
An indexer along column axis.
Returns
-------
modin.pandas.DataFrame or modin.pandas.Series
Located dataset.
"""
ErrorMessage.catch_bugs_and_request_email(
failure_condition=not isinstance(row_loc, Series),
extra_log=f"Only ``modin.pandas.Series`` boolean masks are acceptable, got: {type(row_loc)}",
)
masked_df = self.df.__constructor__(
query_compiler=self.qc.getitem_array(row_loc._query_compiler)
)
# Passing `slice(None)` as a row indexer since we've just applied it
return type(self)(masked_df)[(slice(None), col_loc)]
class _LocIndexer(metaclass_resolver(_LocationIndexerBase)):
"""
An indexer for modin_df.loc[] functionality.
Parameters
----------
modin_df : modin.pandas.DataFrame
DataFrame to operate on.
"""
def __getitem__(self, key):
"""
Retrieve dataset according to `key`.
Parameters
----------
key : callable, scalar, or tuple
The global row index to retrieve data from.
Returns
-------
modin.pandas.DataFrame or modin.pandas.Series
Located dataset.
See Also
--------
pandas.DataFrame.loc
"""
if self.df.empty:
return self.df._default_to_pandas(lambda df: df.loc[key])
row_loc, col_loc, ndim = self._parse_row_and_column_locators(key)
self.row_scalar = is_scalar(row_loc)
self.col_scalar = is_scalar(col_loc)
if isinstance(row_loc, Series) and is_boolean_array(row_loc):
return self._handle_boolean_masking(row_loc, col_loc)
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
result = super(_LocIndexer, self).__getitem__(row_lookup, col_lookup, ndim)
if isinstance(result, Series):
result._parent = self.df
result._parent_axis = 0
col_loc_as_list = [col_loc] if self.col_scalar else col_loc
row_loc_as_list = [row_loc] if self.row_scalar else row_loc
# Pandas drops the levels that are in the `loc`, so we have to as well.
if (
isinstance(result, (Series, DataFrame))
and result._query_compiler.has_multiindex()
):
if (
isinstance(result, Series)
and not isinstance(col_loc_as_list, slice)
and all(
col_loc_as_list[i] in result.index.levels[i]
for i in range(len(col_loc_as_list))
)
):
result.index = result.index.droplevel(list(range(len(col_loc_as_list))))
elif not isinstance(row_loc_as_list, slice) and all(
not isinstance(row_loc_as_list[i], slice)
and row_loc_as_list[i] in result.index.levels[i]
for i in range(len(row_loc_as_list))
):
result.index = result.index.droplevel(list(range(len(row_loc_as_list))))
if (
hasattr(result, "columns")
and not isinstance(col_loc_as_list, slice)
and result._query_compiler.has_multiindex(axis=1)
and all(
col_loc_as_list[i] in result.columns.levels[i]
for i in range(len(col_loc_as_list))
)
):
result.columns = result.columns.droplevel(list(range(len(col_loc_as_list))))
# This is done for cases where the index passed in has other state, like a
# frequency in the case of DateTimeIndex.
if (
row_lookup is not None
and isinstance(col_loc, slice)
and col_loc == slice(None)
and isinstance(key, pandas.Index)
):
result.index = key
return result
def __setitem__(self, key, item):
"""
Assign `item` value to dataset located by `key`.
Parameters
----------
key : callable or tuple
The global row index to assign data to.
item : modin.pandas.DataFrame, modin.pandas.Series or scalar
Value that should be assigned to located dataset.
See Also
--------
pandas.DataFrame.loc
"""
row_loc, col_loc, _ = self._parse_row_and_column_locators(key)
if isinstance(row_loc, list) and len(row_loc) == 1:
if row_loc[0] not in self.qc.index:
index = self.qc.index.insert(len(self.qc.index), row_loc[0])
self.qc = self.qc.reindex(labels=index, axis=0)
self.df._update_inplace(new_query_compiler=self.qc)
if (
isinstance(col_loc, list)
and len(col_loc) == 1
and col_loc[0] not in self.qc.columns
):
new_col = pandas.Series(index=self.df.index)
new_col[row_loc] = item
self.df.insert(loc=len(self.df.columns), column=col_loc[0], value=new_col)
self.qc = self.df._query_compiler
else:
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
super(_LocIndexer, self).__setitem__(
row_lookup,
col_lookup,
item,
axis=self._determine_setitem_axis(
row_lookup, col_lookup, is_scalar(row_loc), is_scalar(col_loc)
),
)
def _compute_enlarge_labels(self, locator, base_index):
"""
Help to _enlarge_axis, compute common labels and extra labels.
Parameters
----------
locator : pandas.Index
Index from locator.
base_index : pandas.Index
Current index.
Returns
-------
nan_labels : pandas.Index
The labels that need to be added.
"""
# base_index_type can be pd.Index or pd.DatetimeIndex
# depending on user input and pandas behavior
# See issue #2264
base_index_type = type(base_index)
locator_as_index = base_index_type(locator)
nan_labels = locator_as_index.difference(base_index)
common_labels = locator_as_index.intersection(base_index)
if len(common_labels) == 0:
raise KeyError(
"None of [{labels}] are in the [{base_index_name}]".format(
labels=list(locator_as_index), base_index_name=base_index
)
)
return nan_labels
def _compute_lookup(self, row_loc, col_loc):
"""
Compute index and column labels from index and column locators.
Parameters
----------
row_loc : scalar, slice, list, array or tuple
Row locator.
col_loc : scalar, slice, list, array or tuple
Columns locator.
Returns
-------
row_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
List of index labels.
col_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
List of columns labels.
Notes
-----
Usage of `slice(None)` as a resulting lookup is a hack to pass information about
full-axis grab without computing actual indices that triggers lazy computations.
Ideally, this API should get rid of using slices as indexers and either use a
common ``Indexer`` object or range and ``np.ndarray`` only.
"""
lookups = []
for axis, axis_loc in enumerate((row_loc, col_loc)):
if is_scalar(axis_loc):
axis_loc = np.array([axis_loc])
if isinstance(axis_loc, slice) or is_range_like(axis_loc):
if isinstance(axis_loc, slice) and axis_loc == slice(None):
axis_lookup = axis_loc
else:
axis_labels = self.qc.get_axis(axis)
# `slice_indexer` returns a fully-defined numeric slice for a non-fully-defined labels-based slice
axis_lookup = axis_labels.slice_indexer(
axis_loc.start, axis_loc.stop, axis_loc.step
)
# Converting negative indices to their actual positions:
axis_lookup = pandas.RangeIndex(
start=(
axis_lookup.start
if axis_lookup.start >= 0
else axis_lookup.start + len(axis_labels)
),
stop=(
axis_lookup.stop
if axis_lookup.stop >= 0
else axis_lookup.stop + len(axis_labels)
),
step=axis_lookup.step,
)
elif self.qc.has_multiindex(axis):
# `Index.get_locs` raises an IndexError by itself if missing labels were provided,
# we don't have to do missing-check for the received `axis_lookup`.
if isinstance(axis_loc, pandas.MultiIndex):
axis_lookup = self.qc.get_axis(axis).get_indexer_for(axis_loc)
else:
axis_lookup = self.qc.get_axis(axis).get_locs(axis_loc)
elif is_boolean_array(axis_loc):
axis_lookup = boolean_mask_to_numeric(axis_loc)
else:
axis_labels = self.qc.get_axis(axis)
if is_list_like(axis_loc) and not isinstance(
axis_loc, (np.ndarray, pandas.Index)
):
# `Index.get_indexer_for` works much faster with numpy arrays than with python lists,
# so although we lose some time here on converting to numpy, `Index.get_indexer_for`
# speedup covers the loss that we gain here.
axis_loc = np.array(axis_loc, dtype=axis_labels.dtype)
axis_lookup = axis_labels.get_indexer_for(axis_loc)
# `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether
# there are any -1 in the received indexer to raise a KeyError here.
missing_mask = axis_lookup < 0
if missing_mask.any():
missing_labels = (
# Converting `axis_loc` to maskable `np.array` to not fail
# on masking non-maskable list-like
np.array(axis_loc)[missing_mask]
if is_list_like(axis_loc)
# If `axis_loc` is not a list-like then we can't select certain
# labels that are missing and so printing the whole indexer
else axis_loc
)
raise KeyError(missing_labels)
if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup):
axis_lookup = axis_lookup.values
lookups.append(axis_lookup)
return lookups
class _iLocIndexer(metaclass_resolver(_LocationIndexerBase)):
"""
An indexer for modin_df.iloc[] functionality.
Parameters
----------
modin_df : modin.pandas.DataFrame
DataFrame to operate on.
"""
def __getitem__(self, key):
"""
Retrieve dataset according to `key`.
Parameters
----------
key : callable or tuple
The global row numbers to retrieve data from.
Returns
-------
DataFrame or Series
Located dataset.
See Also
--------
pandas.DataFrame.iloc
"""
if self.df.empty:
return self.df._default_to_pandas(lambda df: df.iloc[key])
row_loc, col_loc, ndim = self._parse_row_and_column_locators(key)
self.row_scalar = is_scalar(row_loc)
self.col_scalar = is_scalar(col_loc)
self._check_dtypes(row_loc)
self._check_dtypes(col_loc)
if isinstance(row_loc, Series) and is_boolean_array(row_loc):
return self._handle_boolean_masking(row_loc, col_loc)
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
result = super(_iLocIndexer, self).__getitem__(row_lookup, col_lookup, ndim)
if isinstance(result, Series):
result._parent = self.df
result._parent_axis = 0
return result
def __setitem__(self, key, item):
"""
Assign `item` value to dataset located by `key`.
Parameters
----------
key : callable or tuple
The global row numbers to assign data to.
item : modin.pandas.DataFrame, modin.pandas.Series or scalar
Value that should be assigned to located dataset.
See Also
--------
pandas.DataFrame.iloc
"""
row_loc, col_loc, _ = self._parse_row_and_column_locators(key)
row_scalar = is_scalar(row_loc)
col_scalar = is_scalar(col_loc)
self._check_dtypes(row_loc)
self._check_dtypes(col_loc)
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
super(_iLocIndexer, self).__setitem__(
row_lookup,
col_lookup,
item,
axis=self._determine_setitem_axis(
row_lookup, col_lookup, row_scalar, col_scalar
),
)
def _compute_lookup(self, row_loc, col_loc):
"""
Compute index and column labels from index and column integer locators.
Parameters
----------
row_loc : slice, list, array or tuple
Row locator.
col_loc : slice, list, array or tuple
Columns locator.
Returns
-------
row_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
List of index labels.
col_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
List of columns labels.
Notes
-----
Usage of `slice(None)` as a resulting lookup is a hack to pass information about
full-axis grab without computing actual indices that triggers lazy computations.
Ideally, this API should get rid of using slices as indexers and either use a
common ``Indexer`` object or range and ``np.ndarray`` only.
"""
lookups = []
for axis, axis_loc in enumerate((row_loc, col_loc)):
if is_scalar(axis_loc):
axis_loc = np.array([axis_loc])
if isinstance(axis_loc, slice):
axis_lookup = (
axis_loc
if axis_loc == slice(None)
else pandas.RangeIndex(
*axis_loc.indices(len(self.qc.get_axis(axis)))
)
)
elif is_range_like(axis_loc):
axis_lookup = pandas.RangeIndex(
axis_loc.start, axis_loc.stop, axis_loc.step
)
elif is_boolean_array(axis_loc):
axis_lookup = boolean_mask_to_numeric(axis_loc)
else:
if isinstance(axis_loc, pandas.Index):
axis_loc = axis_loc.values
elif is_list_like(axis_loc) and not isinstance(axis_loc, np.ndarray):
# `Index.__getitem__` works much faster with numpy arrays than with python lists,
# so although we lose some time here on converting to numpy, `Index.__getitem__`
# speedup covers the loss that we gain here.
axis_loc = np.array(axis_loc, dtype=np.int64)
# Relatively fast check allows us to not trigger `self.qc.get_axis()` computation
# if there're no negative indices and so they don't not depend on the axis length.
if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any():
axis_lookup = axis_loc
else:
axis_lookup = pandas.RangeIndex(len(self.qc.get_axis(axis)))[
axis_loc
]
if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup):
axis_lookup = axis_lookup.values
lookups.append(axis_lookup)
return lookups
def _check_dtypes(self, locator):
"""
Check that `locator` is an integer scalar, integer slice, integer list or array of booleans.
Parameters
----------
locator : scalar, list, slice or array
Object to check.
Raises
------
ValueError
If check fails.
"""
is_int = is_integer(locator)
is_int_slice = is_integer_slice(locator)
is_int_arr = is_integer_array(locator)
is_bool_arr = is_boolean_array(locator)
if not any([is_int, is_int_slice, is_int_arr, is_bool_arr]):
raise ValueError(_ILOC_INT_ONLY_ERROR)