# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: language_level = 3
from libc.stdint cimport uintptr_t, uint8_t, int64_t
from cpython.pycapsule cimport PyCapsule_GetPointer
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cpython cimport (
Py_buffer,
PyObject_GetBuffer,
PyBuffer_Release,
PyBUF_ANY_CONTIGUOUS,
PyBUF_FORMAT,
)
from nanoarrow_c cimport (
ArrowArray,
ArrowArrayAppendBytes,
ArrowArrayAppendNull,
ArrowArrayAppendString,
ArrowArrayBuffer,
ArrowArrayFinishBuilding,
ArrowArrayInitFromSchema,
ArrowArrayInitFromType,
ArrowArrayMove,
ArrowArrayRelease,
ArrowArrayStartAppending,
ArrowArrayView,
ArrowArrayViewComputeNullCount,
ArrowArrayViewInitFromSchema,
ArrowArrayViewSetArray,
ArrowArrayViewSetArrayMinimal,
ArrowBitCountSet,
ArrowBuffer,
ArrowBufferMove,
ArrowBufferType,
ArrowBufferView,
ArrowSchemaInitFromType,
ArrowStringView,
ArrowType,
ArrowTypeString,
ArrowValidationLevel,
NANOARROW_BUFFER_TYPE_DATA,
NANOARROW_BUFFER_TYPE_DATA_OFFSET,
NANOARROW_BUFFER_TYPE_TYPE_ID,
NANOARROW_BUFFER_TYPE_UNION_OFFSET,
NANOARROW_BUFFER_TYPE_VALIDITY,
NANOARROW_VALIDATION_LEVEL_DEFAULT,
NANOARROW_VALIDATION_LEVEL_FULL,
NANOARROW_VALIDATION_LEVEL_MINIMAL,
NANOARROW_VALIDATION_LEVEL_NONE,
NANOARROW_OK,
)
from nanoarrow_device_c cimport (
ARROW_DEVICE_CPU,
ArrowDeviceType,
ArrowDeviceArray,
ArrowDeviceArrayInit,
)
from nanoarrow._device cimport Device, CSharedSyncEvent
from nanoarrow._buffer cimport CBuffer, CBufferView
from nanoarrow._schema cimport CSchema, CLayout
from nanoarrow._utils cimport (
alloc_c_array,
alloc_c_device_array,
alloc_c_array_view,
c_array_shallow_copy,
c_device_array_shallow_copy,
Error
)
from typing import Iterable, Tuple, Union
from nanoarrow import _repr_utils
from nanoarrow._device import DEVICE_CPU, DeviceType
cdef class CArrayView:
    """Low-level ArrowArrayView wrapper

    This object is a literal wrapper around an ArrowArrayView. It provides field accessors
    that return Python objects and handles the structure lifecycle (i.e., initialized
    ArrowArrayView structures are always released).

    See `nanoarrow.c_array_view()` for construction and usage examples.
    """

    def __cinit__(self, object base, uintptr_t addr):
        # ``base`` is stored next to the raw pointer; ``addr`` is the address of
        # the ArrowArrayView struct being wrapped.
        self._base = base
        self._ptr = <ArrowArrayView*>addr
        # Until an array is attached, the view is associated with the CPU device.
        self._event = CSharedSyncEvent(DEVICE_CPU)

    def _set_array(self, CArray array, Device device=DEVICE_CPU):
        """Point this view at ``array`` and return ``self``.

        ArrowArrayViewSetArray() is used when ``device`` is the CPU device and
        ArrowArrayViewSetArrayMinimal() otherwise (presumably because buffer
        content on another device cannot be inspected from here -- TODO confirm).

        Raises whatever ``Error.raise_message_not_ok()`` raises when the
        nanoarrow call does not succeed.
        """
        cdef Error error = Error()
        cdef int code

        if device is DEVICE_CPU:
            code = ArrowArrayViewSetArray(self._ptr, array._ptr, &error.c_error)
        else:
            code = ArrowArrayViewSetArrayMinimal(self._ptr, array._ptr, &error.c_error)

        error.raise_message_not_ok("ArrowArrayViewSetArray()", code)

        # Remember the array's base so buffer views created later can reference it.
        self._array_base = array._base
        self._event = CSharedSyncEvent(device, <uintptr_t>array._sync_event)

        return self

    @property
    def storage_type_id(self):
        """The ``storage_type`` field of the underlying ArrowArrayView."""
        return self._ptr.storage_type

    @property
    def storage_type(self):
        """The storage type name from ArrowTypeString(), or None if it is NULL."""
        cdef const char* type_str = ArrowTypeString(self._ptr.storage_type)
        if type_str != NULL:
            return type_str.decode('UTF-8')
        return None

    @property
    def layout(self):
        """A CLayout wrapping this view's ``layout`` member."""
        return CLayout(self, <uintptr_t>&self._ptr.layout)

    def __len__(self):
        return self._ptr.length

    @property
    def length(self):
        """The ``length`` field of the underlying ArrowArrayView."""
        return len(self)

    @property
    def offset(self):
        """The ``offset`` field of the underlying ArrowArrayView."""
        return self._ptr.offset

    @property
    def null_count(self):
        """The null count, computed and cached on first access when possible.

        A stored value other than -1 is returned as is. Otherwise the count is
        0 when the first buffer is not a validity buffer or the validity buffer
        is NULL, and is computed with ArrowArrayViewComputeNullCount() when the
        view is on the CPU device. For a non-CPU view with a validity buffer
        the stored value is left untouched, so -1 is returned.
        """
        if self._ptr.null_count != -1:
            return self._ptr.null_count

        cdef ArrowBufferType buffer_type = self._ptr.layout.buffer_type[0]
        cdef const uint8_t* validity_bits = self._ptr.buffer_views[0].data.as_uint8

        if buffer_type != NANOARROW_BUFFER_TYPE_VALIDITY:
            self._ptr.null_count = 0
        elif validity_bits == NULL:
            self._ptr.null_count = 0
        elif self._event.device is DEVICE_CPU:
            self._ptr.null_count = ArrowArrayViewComputeNullCount(self._ptr)

        return self._ptr.null_count

    @property
    def n_children(self):
        """The ``n_children`` field of the underlying ArrowArrayView."""
        return self._ptr.n_children

    def child(self, int64_t i):
        """Return a CArrayView wrapping child ``i``.

        Raises IndexError when ``i`` is outside ``[0, n_children)``.
        """
        if i < 0 or i >= self._ptr.n_children:
            raise IndexError(f"{i} out of range [0, {self._ptr.n_children})")

        cdef CArrayView child = CArrayView(
            self._base,
            <uintptr_t>self._ptr.children[i]
        )

        child._event = self._event
        # buffer() hands _array_base to CBufferView as its base object; without
        # propagating it here, buffer views of a child would get None instead.
        child._array_base = self._array_base

        return child

    @property
    def children(self):
        """Iterate over ``child(i)`` for every child index."""
        for i in range(self.n_children):
            yield self.child(i)

    @property
    def n_buffers(self):
        """The ``n_buffers`` value reported by this view's layout."""
        return self.layout.n_buffers

    def buffer_type(self, int64_t i):
        """Return the type of buffer ``i`` as a string.

        One of "validity", "type_id", "union_offset", "data_offset", "data",
        or "none" for any other value. Raises IndexError when ``i`` is outside
        ``[0, n_buffers)``.
        """
        if i < 0 or i >= self.n_buffers:
            raise IndexError(f"{i} out of range [0, {self.n_buffers})")

        buffer_type = self._ptr.layout.buffer_type[i]
        if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
            return "validity"
        elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
            return "type_id"
        elif buffer_type == NANOARROW_BUFFER_TYPE_UNION_OFFSET:
            return "union_offset"
        elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_OFFSET:
            return "data_offset"
        elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
            return "data"
        else:
            return "none"

    def buffer(self, int64_t i):
        """Return a CBufferView over buffer ``i``.

        Raises IndexError when ``i`` is outside ``[0, n_buffers)`` and
        RuntimeError when the buffer's size is negative (i.e., unknown).
        """
        if i < 0 or i >= self.n_buffers:
            raise IndexError(f"{i} out of range [0, {self.n_buffers})")

        cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])

        # Check the buffer size here because the error later is cryptic.
        # Buffer sizes are set to -1 when they are "unknown", so because of errors
        # in nanoarrow/C or because the array is on a non-CPU device, that -1 value
        # could leak its way here.
        if buffer_view.size_bytes < 0:
            raise RuntimeError(f"ArrowArrayView buffer {i} has size_bytes < 0")

        return CBufferView(
            self._array_base,
            <uintptr_t>buffer_view.data.data,
            buffer_view.size_bytes,
            self._ptr.layout.buffer_data_type[i],
            self._ptr.layout.element_size_bits[i],
            self._event
        )

    @property
    def buffers(self):
        """Iterate over ``buffer(i)`` for every buffer index."""
        for i in range(self.n_buffers):
            yield self.buffer(i)

    @property
    def dictionary(self):
        """A CArrayView wrapping the dictionary, or None if there is none."""
        if self._ptr.dictionary == NULL:
            return None

        cdef CArrayView dictionary = CArrayView(
            self,
            <uintptr_t>self._ptr.dictionary
        )
        dictionary._event = self._event
        # As in child(): give buffer views of the dictionary the same base
        # object that buffer views of this view receive.
        dictionary._array_base = self._array_base

        return dictionary

    def __repr__(self):
        return _repr_utils.array_view_repr(self)

    @staticmethod
    def from_schema(CSchema schema):
        """Create a CArrayView initialized from ``schema`` with no array attached.

        Raises whatever ``Error.raise_message_not_ok()`` raises when
        ArrowArrayViewInitFromSchema() does not succeed.
        """
        cdef ArrowArrayView* c_array_view
        base = alloc_c_array_view(&c_array_view)

        cdef Error error = Error()
        cdef int code = ArrowArrayViewInitFromSchema(c_array_view,
                                                     schema._ptr, &error.c_error)
        error.raise_message_not_ok("ArrowArrayViewInitFromSchema()", code)

        return CArrayView(base, <uintptr_t>c_array_view)

    @staticmethod
    def from_array(CArray array, Device device=DEVICE_CPU):
        """Create a CArrayView from ``array``'s schema and point it at ``array``."""
        out = CArrayView.from_schema(array._schema)
        return out._set_array(array, device)
cdef class CArray:
"""Low-level ArrowArray wrapper
This object is a literal wrapper around a read-only ArrowArray. It provides field accessors
that return Python objects and handles the C Data interface lifecycle (i.e., initialized
ArrowArray structures are always released).
See `nanoarrow.c_array()` for construction and usage examples.
"""
@staticmethod
def allocate(CSchema schema) -> CArray:
    """Allocate a released ArrowArray

    The ArrowArray struct is obtained from alloc_c_array() and wrapped in a
    CArray associated with ``schema``.
    """
    cdef ArrowArray* array_ptr
    cdef uintptr_t array_addr

    # alloc_c_array() fills in the pointer and returns the object passed on
    # to the CArray constructor as its base.
    owner = alloc_c_array(&array_ptr)
    array_addr = <uintptr_t>array_ptr
    return CArray(owner, array_addr, schema)
def __cinit__(self, object base, uintptr_t addr, CSchema schema):
    # Device defaults: CPU device type, device id -1 and no sync event.
    # _set_device() overwrites these three fields.
    self._sync_event = NULL
    self._device_id = -1
    self._device_type = ARROW_DEVICE_CPU

    # ``addr`` is the address of the ArrowArray struct being wrapped; ``base``
    # and ``schema`` are stored alongside it.
    self._schema = schema
    self._ptr = <ArrowArray*>addr
    self._base = base
cdef _set_device(self, ArrowDeviceType device_type, int64_t device_id, void* sync_event):
    # Record the device type, device id and sync event for this array,
    # replacing the defaults assigned in __cinit__.
    self._sync_event = sync_event
    self._device_id = device_id
    self._device_type = device_type
@staticmethod
def _import_from_c_capsule(schema_capsule, array_capsule) -> CArray:
    """Import from an ArrowSchema and ArrowArray PyCapsule tuple.

    Parameters
    ----------
    schema_capsule : PyCapsule
        A valid PyCapsule with name 'arrow_schema' containing an
        ArrowSchema pointer.
    array_capsule : PyCapsule
        A valid PyCapsule with name 'arrow_array' containing an
        ArrowArray pointer.
    """
    # The schema is imported first, then the array pointer is read from its capsule.
    cdef CSchema imported_schema = CSchema._import_from_c_capsule(schema_capsule)
    cdef uintptr_t array_addr = <uintptr_t>PyCapsule_GetPointer(array_capsule, 'arrow_array')

    # The capsule itself is passed as the base of the resulting CArray.
    return CArray(array_capsule, array_addr, imported_schema)
def __getitem__(self, k) -> CArray:
self._assert_valid()
if not isinstance(k, slice):
raise TypeError(
f"Can't subset CArray with object of type {type(k).__name__}")
if k.step is not None:
raise ValueError("Can't slice CArray with step")
cdef int64_t start = 0 if k.start is None else k.start
cdef int64_t stop = self._ptr.length if k.stop is None else k.stop
if start < 0:
start = self._ptr.length + start
if stop < 0:
stop = self._ptr.length + stop
Loading ...