Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / nanoarrow   python

Repository URL to install this package:

Version: 0.7.0.dev132 

/ src / nanoarrow / schema.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import enum
import reprlib
from functools import cached_property
from typing import List, Mapping, Union

from nanoarrow._schema import (
    CArrowTimeUnit,
    CSchemaBuilder,
    CSchemaView,
    SchemaMetadata,
)
from nanoarrow.c_schema import c_schema

from nanoarrow import _repr_utils, _types


class Type(enum.Enum):
    """Enumeration identifying each category of data type.

    A member of this enumeration may be passed wherever a :class:`Schema`
    is accepted, provided the type it names takes no parameters.
    """

    UNINITIALIZED = int(_types.UNINITIALIZED)
    NULL = int(_types.NA)
    BOOL = int(_types.BOOL)
    UINT8 = int(_types.UINT8)
    INT8 = int(_types.INT8)
    UINT16 = int(_types.UINT16)
    INT16 = int(_types.INT16)
    UINT32 = int(_types.UINT32)
    INT32 = int(_types.INT32)
    UINT64 = int(_types.UINT64)
    INT64 = int(_types.INT64)
    HALF_FLOAT = int(_types.HALF_FLOAT)
    FLOAT = int(_types.FLOAT)
    DOUBLE = int(_types.DOUBLE)
    STRING = int(_types.STRING)
    BINARY = int(_types.BINARY)
    FIXED_SIZE_BINARY = int(_types.FIXED_SIZE_BINARY)
    DATE32 = int(_types.DATE32)
    DATE64 = int(_types.DATE64)
    TIMESTAMP = int(_types.TIMESTAMP)
    TIME32 = int(_types.TIME32)
    TIME64 = int(_types.TIME64)
    INTERVAL_MONTHS = int(_types.INTERVAL_MONTHS)
    INTERVAL_DAY_TIME = int(_types.INTERVAL_DAY_TIME)
    DECIMAL128 = int(_types.DECIMAL128)
    DECIMAL256 = int(_types.DECIMAL256)
    LIST = int(_types.LIST)
    STRUCT = int(_types.STRUCT)
    SPARSE_UNION = int(_types.SPARSE_UNION)
    DENSE_UNION = int(_types.DENSE_UNION)
    DICTIONARY = int(_types.DICTIONARY)
    MAP = int(_types.MAP)
    EXTENSION = int(_types.EXTENSION)
    FIXED_SIZE_LIST = int(_types.FIXED_SIZE_LIST)
    DURATION = int(_types.DURATION)
    LARGE_STRING = int(_types.LARGE_STRING)
    LARGE_BINARY = int(_types.LARGE_BINARY)
    LARGE_LIST = int(_types.LARGE_LIST)
    INTERVAL_MONTH_DAY_NANO = int(_types.INTERVAL_MONTH_DAY_NANO)
    RUN_END_ENCODED = int(_types.RUN_END_ENCODED)
    BINARY_VIEW = int(_types.BINARY_VIEW)
    STRING_VIEW = int(_types.STRING_VIEW)

    def __arrow_c_schema__(self):
        """Build a schema for this type id with an empty name and return
        its capsule.

        Only the type id is set on the builder, so this is only expected
        to work for types that take no parameters.
        """
        builder = CSchemaBuilder.allocate()
        builder = builder.set_type(self.value)
        builder = builder.set_name("")
        built_schema = builder.finish()
        return built_schema._capsule


class TimeUnit(enum.Enum):
    """Enumeration of the units used by timestamp, duration, and time types."""

    SECOND = CArrowTimeUnit.SECOND
    MILLI = CArrowTimeUnit.MILLI
    MICRO = CArrowTimeUnit.MICRO
    NANO = CArrowTimeUnit.NANO

    @staticmethod
    def create(obj):
        """Resolve parameter input to a TimeUnit member.

        The string abbreviations "s", "ms", "us", and "ns" are mapped to
        their members; every other input is handed to the ``TimeUnit``
        constructor unchanged.

        >>> import nanoarrow as na
        >>> na.TimeUnit.create("s")
        <TimeUnit.SECOND: 0>
        """

        if isinstance(obj, str):
            # Built here rather than in the class body, where a plain
            # assignment would be turned into an enum member.
            by_abbreviation = {
                "s": TimeUnit.SECOND,
                "ms": TimeUnit.MILLI,
                "us": TimeUnit.MICRO,
                "ns": TimeUnit.NANO,
            }
            if obj in by_abbreviation:
                return by_abbreviation[obj]

        # Unrecognized strings fall through to the regular enum lookup
        return TimeUnit(obj)


class ExtensionAccessor:
    """Read-only view of the extension attributes of a schema.

    Wraps the schema passed at construction and reads its extension name,
    extension metadata, and field metadata on demand.
    """

    def __init__(self, schema) -> None:
        self._schema = schema

    @property
    def name(self) -> str:
        """Name of the extension as reported by the schema view"""
        view = self._schema._c_schema_view
        return view.extension_name

    @property
    def metadata(self) -> Union[bytes, None]:
        """Extension metadata, or ``None`` when it is absent or empty"""
        raw_metadata = self._schema._c_schema_view.extension_metadata
        return raw_metadata or None

    @property
    def storage(self):
        """Schema of the wrapped field with the extension markers removed

        The result is built from the wrapped schema with its metadata
        replaced by a copy that omits the extension name and extension
        metadata keys.
        """
        remaining = dict(self._schema.metadata.items())

        # The name key is deleted unconditionally (a missing key raises
        # KeyError); the metadata key is optional and dropped only if present.
        del remaining[b"ARROW:extension:name"]
        remaining.pop(b"ARROW:extension:metadata", None)

        return Schema(self._schema, metadata=remaining)


class Schema:
    """Create a nanoarrow Schema

    The Schema is nanoarrow's high-level data type representation, encompassing
    the role of PyArrow's ``Schema``, ``Field``, and ``DataType``. This scope
    maps to that of the ArrowSchema in the Arrow C Data interface.

    Parameters
    ----------
    obj :
        A :class:`Type` specifier or a schema-like object. A schema-like object
        includes:
        * A ``pyarrow.Schema``, ``pyarrow.Field``, or ``pyarrow.DataType``
        * A nanoarrow :class:`Schema`, :class:`CSchema`, or :class:`Type`
        * Any object implementing the Arrow PyCapsule interface protocol method.

    name : str, optional
        An optional name to bind to this field.

    nullable : bool, optional
        Explicitly specify field nullability. Fields are nullable by default.

    metadata : mapping, optional
        Explicitly specify field metadata.

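    fields : optional
        Explicitly specify child fields. When given, these are applied as
        the children of the resulting schema.

    params :
        Type-specific parameters when ``obj`` is a :class:`Type`.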

    Examples
    --------

    >>> import nanoarrow as na
    >>> import pyarrow as pa
    >>> na.Schema(na.Type.INT32)
    <Schema> int32
    >>> na.Schema(na.Type.DURATION, unit=na.TimeUnit.SECOND)
    <Schema> duration('s')
    >>> na.Schema(pa.int32())
    <Schema> int32
    """

    def __init__(
        self,
        obj,
        *,
        name=None,
        nullable=None,
        metadata=None,
        fields=None,
        **params,
    ) -> None:
        # Resolve the underlying C schema: Type members are built from the
        # type plus its parameters; anything else is converted as-is and
        # may not be combined with type parameters.
        if not isinstance(obj, Type):
            if params:
                raise ValueError("params are only supported for obj of class Type")
            self._c_schema = c_schema(obj)
        else:
            self._c_schema = _c_schema_from_type_and_params(obj, params)

        # Only modify the schema when at least one override was supplied
        overrides = (name, nullable, metadata, fields)
        if any(value is not None for value in overrides):
            self._c_schema = self._c_schema.modify(
                name=name,
                nullable=nullable,
                metadata=metadata,
                children=_clean_fields(fields),
            )

        # The view is created last so that it reflects any modification
        self._c_schema_view = CSchemaView(self._c_schema)

    @property
    def params(self) -> Mapping:
        """Parameter names and values for this type

        The returned dictionary maps each parameter name registered for
        this type identifier to the value of the same-named attribute of
        this Schema. Types with no registered parameters give an empty
        dictionary.

        >>> import nanoarrow as na
        >>> na.fixed_size_binary(123).params
        {'byte_width': 123}
        """
        type_id = self._c_schema_view.type_id
        if type_id in _PARAM_NAMES:
            return {key: getattr(self, key) for key in _PARAM_NAMES[type_id]}

        return {}

    @property
    def type(self) -> Type:
        """The :class:`Type` member describing this Schema

        A schema whose view reports an extension name is always
        ``Type.EXTENSION``; otherwise the member is looked up from the
        type identifier.

        >>> import nanoarrow as na
        >>> na.int32().type
        <Type.INT32: 8>
        """
        view = self._c_schema_view
        return Type.EXTENSION if view.extension_name else Type(view.type_id)

    @property
    def name(self) -> Union[str, None]:
        """The field name stored on the underlying schema

        >>> import nanoarrow as na
        >>> schema = na.struct({"col1": na.int32()})
        >>> schema.field(0).name
        'col1'
        """
        underlying = self._c_schema
        return underlying.name

    @property
    def nullable(self) -> bool:
        """Whether this field is nullable, as reported by the schema view

        >>> import nanoarrow as na
        >>> na.int32().nullable
        True
        >>> na.int32(nullable=False).nullable
        False
        """
        view = self._c_schema_view
        return view.nullable

    @cached_property
    def metadata(self) -> Mapping[bytes, bytes]:
        """Field metadata of this field

        When the underlying schema carries no metadata, an empty metadata
        object is returned in its place. The value is computed once per
        instance and then cached.

        >>> import nanoarrow as na
        >>> schema = na.Schema(na.int32(), metadata={"key": "value"})
        >>> dict(schema.metadata.items())
        {b'key': b'value'}
        """
        stored = self._c_schema.metadata
        if stored is None:
            return SchemaMetadata.empty()

        return stored

    @cached_property
    def extension(self) -> Union[ExtensionAccessor, None]:
        """Accessor for extension type attributes

        Returns ``None`` when the schema view reports no extension name.

        >>> import nanoarrow as na
        >>> schema = na.extension_type(na.int32(), "arrow.example", b"{}")
        >>> schema.extension.name
        'arrow.example'
        >>> schema.extension.metadata
        b'{}'
        """
        if not self._c_schema_view.extension_name:
            return None

        return ExtensionAccessor(self)

    @property
    def byte_width(self) -> Union[int, None]:
        """Width in bytes of each element of a fixed-size binary type

        For every other type this property is ``None``.

        >>> import nanoarrow as na
        >>> na.fixed_size_binary(123).byte_width
        123
        """
        view = self._c_schema_view
        is_fixed_size_binary = view.type_id == _types.FIXED_SIZE_BINARY
        return view.fixed_size if is_fixed_size_binary else None

    @property
    def unit(self) -> Union[TimeUnit, None]:
        """The :class:`TimeUnit` of a timestamp, time, or duration type

        This property is ``None`` when the schema view reports no time
        unit for the type.

        >>> import nanoarrow as na
        >>> na.timestamp(na.TimeUnit.SECOND).unit
        <TimeUnit.SECOND: 0>
        """
        time_unit_id = self._c_schema_view.time_unit_id
        return None if time_unit_id is None else TimeUnit(time_unit_id)

    @property
    def timezone(self) -> Union[str, None]:
        """Timezone for timestamp types

        Returns ``None`` for types for which this property is not relevant or
        for timestamp types for which the timezone is not set.

        >>> import nanoarrow as na
        >>> na.timestamp(na.TimeUnit.SECOND, timezone="America/Halifax").timezone
        'America/Halifax'
Loading ...