# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import enum
import reprlib
from functools import cached_property
from typing import List, Mapping, Union
from nanoarrow._schema import (
CArrowTimeUnit,
CSchemaBuilder,
CSchemaView,
SchemaMetadata,
)
from nanoarrow.c_schema import c_schema
from nanoarrow import _repr_utils, _types
class Type(enum.Enum):
    """Enumeration of the type identifiers understood by nanoarrow.

    Each member names one category of data type. For types that need no
    parameters, a member may generally be passed wherever a :class:`Schema`
    instance is expected.
    """

    # Placeholder, null, and boolean
    UNINITIALIZED = int(_types.UNINITIALIZED)
    NULL = int(_types.NA)
    BOOL = int(_types.BOOL)

    # Integers
    UINT8 = int(_types.UINT8)
    INT8 = int(_types.INT8)
    UINT16 = int(_types.UINT16)
    INT16 = int(_types.INT16)
    UINT32 = int(_types.UINT32)
    INT32 = int(_types.INT32)
    UINT64 = int(_types.UINT64)
    INT64 = int(_types.INT64)

    # Floating point
    HALF_FLOAT = int(_types.HALF_FLOAT)
    FLOAT = int(_types.FLOAT)
    DOUBLE = int(_types.DOUBLE)

    # Strings and binary
    STRING = int(_types.STRING)
    BINARY = int(_types.BINARY)
    FIXED_SIZE_BINARY = int(_types.FIXED_SIZE_BINARY)

    # Dates, times, and intervals
    DATE32 = int(_types.DATE32)
    DATE64 = int(_types.DATE64)
    TIMESTAMP = int(_types.TIMESTAMP)
    TIME32 = int(_types.TIME32)
    TIME64 = int(_types.TIME64)
    INTERVAL_MONTHS = int(_types.INTERVAL_MONTHS)
    INTERVAL_DAY_TIME = int(_types.INTERVAL_DAY_TIME)

    # Decimals
    DECIMAL128 = int(_types.DECIMAL128)
    DECIMAL256 = int(_types.DECIMAL256)

    # Nested, dictionary, and extension types
    LIST = int(_types.LIST)
    STRUCT = int(_types.STRUCT)
    SPARSE_UNION = int(_types.SPARSE_UNION)
    DENSE_UNION = int(_types.DENSE_UNION)
    DICTIONARY = int(_types.DICTIONARY)
    MAP = int(_types.MAP)
    EXTENSION = int(_types.EXTENSION)
    FIXED_SIZE_LIST = int(_types.FIXED_SIZE_LIST)

    # Remaining identifiers, kept in their original declaration order
    DURATION = int(_types.DURATION)
    LARGE_STRING = int(_types.LARGE_STRING)
    LARGE_BINARY = int(_types.LARGE_BINARY)
    LARGE_LIST = int(_types.LARGE_LIST)
    INTERVAL_MONTH_DAY_NANO = int(_types.INTERVAL_MONTH_DAY_NANO)
    RUN_END_ENCODED = int(_types.RUN_END_ENCODED)
    BINARY_VIEW = int(_types.BINARY_VIEW)
    STRING_VIEW = int(_types.STRING_VIEW)

    def __arrow_c_schema__(self):
        """Return the capsule of a schema built from this type identifier.

        The schema is built from the member's value alone and given an empty
        name, so this is only meaningful for parameter-free types.
        """
        builder = CSchemaBuilder.allocate()
        builder = builder.set_type(self.value)
        builder = builder.set_name("")
        built_schema = builder.finish()
        return built_schema._capsule
class TimeUnit(enum.Enum):
    """Enumeration of the units used by timestamp, duration, and time types."""

    SECOND = CArrowTimeUnit.SECOND
    MILLI = CArrowTimeUnit.MILLI
    MICRO = CArrowTimeUnit.MICRO
    NANO = CArrowTimeUnit.NANO

    @staticmethod
    def create(obj):
        """Resolve user input to a TimeUnit member.

        The string abbreviations "s", "ms", "us", and "ns" map to the
        corresponding member. Any other input is handed to the ``TimeUnit``
        constructor.

        >>> import nanoarrow as na
        >>> na.TimeUnit.create("s")
        <TimeUnit.SECOND: 0>
        """
        if isinstance(obj, str):
            # Built inside the function: a class-level dict would be turned
            # into an enum member.
            by_abbreviation = {
                "s": TimeUnit.SECOND,
                "ms": TimeUnit.MILLI,
                "us": TimeUnit.MICRO,
                "ns": TimeUnit.NANO,
            }
            unit = by_abbreviation.get(obj)
            if unit is not None:
                return unit

        # Unrecognized strings and non-string input go through value lookup
        return TimeUnit(obj)
class ExtensionAccessor:
    """Read-only view of the extension type parameters of a schema"""

    def __init__(self, schema) -> None:
        # The wrapped schema; its ``_c_schema_view`` and ``metadata`` are read
        self._schema = schema

    @property
    def name(self) -> str:
        """Name of the extension type"""
        view = self._schema._c_schema_view
        return view.extension_name

    @property
    def metadata(self) -> Union[bytes, None]:
        """Metadata of the extension type, or ``None`` when it is empty or absent"""
        return self._schema._c_schema_view.extension_metadata or None

    @property
    def storage(self):
        """Schema of the underlying storage type

        This is the wrapped schema with the extension-defining metadata keys
        removed.
        """
        storage_metadata = dict(self._schema.metadata.items())

        # Drop the keys that cause this type to be treated as an extension.
        # The name key is required (KeyError if missing); the metadata key
        # is optional.
        storage_metadata.pop(b"ARROW:extension:name")
        storage_metadata.pop(b"ARROW:extension:metadata", None)

        return Schema(self._schema, metadata=storage_metadata)
class Schema:
"""Create a nanoarrow Schema
The Schema is nanoarrow's high-level data type representation, encompassing
the role of PyArrow's ``Schema``, ``Field``, and ``DataType``. This scope
maps to that of the ArrowSchema in the Arrow C Data interface.
Parameters
----------
obj :
A :class:`Type` specifier or a schema-like object. A schema-like object
includes:
* A ``pyarrow.Schema``, ``pyarrow.Field``, or ``pyarrow.DataType``
* A nanoarrow :class:`Schema`, :class:`CSchema`, or :class:`Type`
* Any object implementing the Arrow PyCapsule interface protocol method.
name : str, optional
An optional name to bind to this field.
nullable : bool, optional
Explicitly specify field nullability. Fields are nullable by default.
metadata : mapping, optional
Explicitly specify field metadata.
params :
Type-specific parameters when ``obj`` is a :class:`Type`.
Examples
--------
>>> import nanoarrow as na
>>> import pyarrow as pa
>>> na.Schema(na.Type.INT32)
<Schema> int32
>>> na.Schema(na.Type.DURATION, unit=na.TimeUnit.SECOND)
<Schema> duration('s')
>>> na.Schema(pa.int32())
<Schema> int32
"""
def __init__(
    self,
    obj,
    *,
    name=None,
    nullable=None,
    metadata=None,
    fields=None,
    **params,
) -> None:
    # Type-specific keyword parameters only make sense together with a Type
    # enumerator; every other input is converted as a schema-like object.
    if not isinstance(obj, Type):
        if params:
            raise ValueError("params are only supported for obj of class Type")
        self._c_schema = c_schema(obj)
    else:
        self._c_schema = _c_schema_from_type_and_params(obj, params)

    # Only rebuild the schema when at least one override was supplied
    overrides = (name, nullable, metadata, fields)
    if any(value is not None for value in overrides):
        self._c_schema = self._c_schema.modify(
            name=name,
            nullable=nullable,
            metadata=metadata,
            children=_clean_fields(fields),
        )

    self._c_schema_view = CSchemaView(self._c_schema)
@property
def params(self) -> Mapping:
    """Parameter names and values for this type

    The returned dictionary, combined with the type identifier, is enough
    to reconstruct this type. Types without registered parameter names
    yield an empty dictionary.

    >>> import nanoarrow as na
    >>> na.fixed_size_binary(123).params
    {'byte_width': 123}
    """
    type_id = self._c_schema_view.type_id
    if type_id not in _PARAM_NAMES:
        return {}

    # Each parameter name is also the name of an attribute on this object
    values = {}
    for param_name in _PARAM_NAMES[type_id]:
        values[param_name] = getattr(self, param_name)
    return values
@property
def type(self) -> Type:
    """Type enumerator value for this Schema

    A schema that carries an extension name is always reported as
    ``Type.EXTENSION``.

    >>> import nanoarrow as na
    >>> na.int32().type
    <Type.INT32: 8>
    """
    view = self._c_schema_view
    return Type.EXTENSION if view.extension_name else Type(view.type_id)
@property
def name(self) -> Union[str, None]:
"""Field name of this Schema
>>> import nanoarrow as na
>>> schema = na.struct({"col1": na.int32()})
>>> schema.field(0).name
'col1'
"""
return self._c_schema.name
@property
def nullable(self) -> bool:
"""Nullability of this field
>>> import nanoarrow as na
>>> na.int32().nullable
True
>>> na.int32(nullable=False).nullable
False
"""
return self._c_schema_view.nullable
@cached_property
def metadata(self) -> Mapping[bytes, bytes]:
"""Access field metadata of this field
>>> import nanoarrow as na
>>> schema = na.Schema(na.int32(), metadata={"key": "value"})
>>> dict(schema.metadata.items())
{b'key': b'value'}
"""
c_schema_metadata = self._c_schema.metadata
return (
SchemaMetadata.empty() if c_schema_metadata is None else c_schema_metadata
)
@cached_property
def extension(self) -> Union[ExtensionAccessor, None]:
"""Access extension type attributes
>>> import nanoarrow as na
>>> schema = na.extension_type(na.int32(), "arrow.example", b"{}")
>>> schema.extension.name
'arrow.example'
>>> schema.extension.metadata
b'{}'
"""
extension_name = self._c_schema_view.extension_name
if extension_name:
return ExtensionAccessor(self)
@property
def byte_width(self) -> Union[int, None]:
    """Byte width of each element of a fixed-size binary type

    Returns ``None`` for types for which this property is not relevant.

    >>> import nanoarrow as na
    >>> na.fixed_size_binary(123).byte_width
    123
    """
    view = self._c_schema_view
    is_fixed_size_binary = view.type_id == _types.FIXED_SIZE_BINARY
    if not is_fixed_size_binary:
        return None
    return view.fixed_size
@property
def unit(self) -> Union[TimeUnit, None]:
"""TimeUnit for timestamp, time, and duration types
Returns ``None`` for types for which this property is not relevant.
>>> import nanoarrow as na
>>> na.timestamp(na.TimeUnit.SECOND).unit
<TimeUnit.SECOND: 0>
"""
unit_id = self._c_schema_view.time_unit_id
if unit_id is not None:
return TimeUnit(unit_id)
@property
def timezone(self) -> Union[str, None]:
"""Timezone for timestamp types
Returns ``None`` for types for which this property is not relevant or
for timezone types for which the timezone is not set.
>>> import nanoarrow as na
>>> na.timestamp(na.TimeUnit.SECOND, timezone="America/Halifax").timezone
'America/Halifax'
Loading ...