Repository URL to install this package:
|
Version:
5.5.0 ▾
|
# DataFrame Interchange Protocol Types
# Copied from https://data-apis.org/dataframe-protocol/latest/API.html,
# changed ABCs to Protocols, and subset the type hints to only those that are
# relevant for Altair.
#
# These classes are only for use in type signatures
from __future__ import annotations
import enum
from typing import TYPE_CHECKING, Any, Protocol
if TYPE_CHECKING:
from collections.abc import Iterable
class DtypeKind(enum.IntEnum):
"""
Integer enum for data types.
Attributes
----------
INT : int
Matches to signed integer data type.
UINT : int
Matches to unsigned integer data type.
FLOAT : int
Matches to floating point data type.
BOOL : int
Matches to boolean data type.
STRING : int
Matches to string data type (UTF-8 encoded).
DATETIME : int
Matches to datetime data type.
CATEGORICAL : int
Matches to categorical data type.
"""
INT = 0
UINT = 1
FLOAT = 2
BOOL = 20
STRING = 21 # UTF-8
DATETIME = 22
CATEGORICAL = 23
# Type hint of first element would actually be DtypeKind but can't use that
# as other libraries won't use an instance of our own Enum in this module but have
# their own. Type checkers will raise an error on that even though the enums
# are identical.
class Column(Protocol):
@property
def dtype(self) -> tuple[Any, int, str, str]:
"""
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
Bit-width : the number of bits as an integer
Format string : data type description format string in Apache Arrow C
Data Interface format.
Endianness : current only native endianness (``=``) is supported
Notes
-----
- Kind specifiers are aligned with DLPack where possible (hence the
jump to 20, leave enough room for future extension)
- Masks must be specified as boolean with either bit width 1 (for bit
masks) or 8 (for byte masks).
- Dtype width in bits was preferred over bytes
- Endianness isn't too useful, but included now in case in the future
we need to support non-native endianness
- Went with Apache Arrow format strings over NumPy format strings
because they're more complete from a dataframe perspective
- Format strings are mostly useful for datetime specification, and
for categoricals.
- For categoricals, the format string describes the type of the
categorical in the data buffer. In case of a separate encoding of
the categorical (e.g. an integer to string mapping), this can
be derived from ``self.describe_categorical``.
- Data types not included: complex, Arrow-style null, binary, decimal,
and nested (list, struct, map, union) dtypes.
"""
...
# Have to use a generic Any return type as not all libraries who implement
# the dataframe interchange protocol implement the TypedDict that is usually
# returned here in the same way. As TypedDicts are invariant, even a slight change
# will lead to an error by a type checker. See PR in which this code was added
# for details.
@property
def describe_categorical(self) -> Any:
"""
If the dtype is categorical, there are two options.
- There are only values in the data buffer.
- There is a separate non-categorical Column encoding categorical values.
Raises TypeError if the dtype is not categorical
Returns the dictionary with description on how to interpret the data buffer:
- "is_ordered" : bool, whether the ordering of dictionary indices is
semantically meaningful.
- "is_dictionary" : bool, whether a mapping of
categorical values to other objects exists
- "categories" : Column representing the (implicit) mapping of indices to
category values (e.g. an array of cat1, cat2, ...).
None if not a dictionary-style categorical.
TBD: are there any other in-memory representations that are needed?
"""
...
class DataFrame(Protocol):
"""
A data frame class, with only the methods required by the interchange protocol defined.
A "data frame" represents an ordered collection of named columns.
A column's "name" must be a unique string.
Columns may be accessed by name or by position.
This could be a public data frame class, or an object with the methods and
attributes defined on this DataFrame class could be returned from the
``__dataframe__`` method of a public data frame class in a library adhering
to the dataframe interchange protocol specification.
"""
def __dataframe__(
self, nan_as_null: bool = False, allow_copy: bool = True
) -> DataFrame:
"""
Construct a new exchange object, potentially changing the parameters.
``nan_as_null`` is a keyword intended for the consumer to tell the
producer to overwrite null values in the data with ``NaN``.
It is intended for cases where the consumer does not support the bit
mask or byte mask that is the producer's native representation.
``allow_copy`` is a keyword that defines whether or not the library is
allowed to make a copy of the data. For example, copying data would be
necessary if a library supports strided buffers, given that this protocol
specifies contiguous buffers.
"""
...
def column_names(self) -> Iterable[str]:
"""Return an iterator yielding the column names."""
...
def get_column_by_name(self, name: str) -> Column:
"""Return the column whose name is the indicated name."""
...
def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
"""
Return an iterator yielding the chunks.
By default (None), yields the chunks that the data is stored as by the
producer. If given, ``n_chunks`` must be a multiple of
``self.num_chunks()``, meaning the producer must subdivide each chunk
before yielding it.
Note that the producer must ensure that all columns are chunked the
same way.
"""
...