Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / nanoarrow   python

Repository URL to install this package:

Version: 0.7.0.dev132 

/ src / nanoarrow / _array.pyx

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from libc.stdint cimport uintptr_t, uint8_t, int64_t
from cpython.pycapsule cimport PyCapsule_GetPointer
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cpython cimport (
    Py_buffer,
    PyBuffer_Release,
    PyBUF_ANY_CONTIGUOUS,
    PyBUF_FORMAT,
    PyBytes_FromStringAndSize,
    PyObject_GetBuffer,
    PyUnicode_FromStringAndSize,
)

from nanoarrow_c cimport (
    ArrowArray,
    ArrowArrayAppendBytes,
    ArrowArrayAppendNull,
    ArrowArrayAppendString,
    ArrowArrayBuffer,
    ArrowArrayFinishBuilding,
    ArrowArrayInitFromSchema,
    ArrowArrayInitFromType,
    ArrowArrayMove,
    ArrowArrayRelease,
    ArrowArrayStartAppending,
    ArrowArrayView,
    ArrowArrayViewComputeNullCount,
    ArrowArrayViewInitFromSchema,
    ArrowArrayViewIsNull,
    ArrowArrayViewGetBytesUnsafe,
    ArrowArrayViewGetBufferDataType,
    ArrowArrayViewGetBufferElementSizeBits,
    ArrowArrayViewGetBufferType,
    ArrowArrayViewGetBufferView,
    ArrowArrayViewGetNumBuffers,
    ArrowArrayViewGetStringUnsafe,
    ArrowArrayViewSetArray,
    ArrowArrayViewSetArrayMinimal,
    ArrowBitCountSet,
    ArrowBuffer,
    ArrowBufferMove,
    ArrowBufferType,
    ArrowBufferView,
    ArrowSchemaInitFromType,
    ArrowStringView,
    ArrowType,
    ArrowTypeString,
    ArrowValidationLevel,
    NANOARROW_BUFFER_TYPE_DATA,
    NANOARROW_BUFFER_TYPE_DATA_OFFSET,
    NANOARROW_BUFFER_TYPE_VARIADIC_DATA,
    NANOARROW_BUFFER_TYPE_VARIADIC_SIZE,
    NANOARROW_BUFFER_TYPE_TYPE_ID,
    NANOARROW_BUFFER_TYPE_UNION_OFFSET,
    NANOARROW_BUFFER_TYPE_VALIDITY,
    NANOARROW_VALIDATION_LEVEL_DEFAULT,
    NANOARROW_VALIDATION_LEVEL_FULL,
    NANOARROW_VALIDATION_LEVEL_MINIMAL,
    NANOARROW_VALIDATION_LEVEL_NONE,
    NANOARROW_OK,
)

from nanoarrow_device_c cimport (
    ARROW_DEVICE_CPU,
    ArrowDeviceType,
    ArrowDeviceArray,
    ArrowDeviceArrayInit,
)

from nanoarrow._device cimport Device, CSharedSyncEvent

from nanoarrow._buffer cimport CBuffer, CBufferView
from nanoarrow._schema cimport CSchema, CLayout
from nanoarrow._utils cimport (
    alloc_c_array,
    alloc_c_device_array,
    alloc_c_array_view,
    c_array_shallow_copy,
    c_device_array_shallow_copy,
    Error
)

from typing import Iterable, Tuple, Union

from nanoarrow import _repr_utils
from nanoarrow._device import DEVICE_CPU, DeviceType


cdef class CArrayView:
    """Low-level ArrowArrayView wrapper

    This object is a literal wrapper around an ArrowArrayView. It provides field accessors
    that return Python objects and handles the structure lifecycle (i.e., initialized
    ArrowArrayView structures are always released).

    See `nanoarrow.c_array_view()` for construction and usage examples.
    """

    def __cinit__(self, object base, uintptr_t addr):
        # ``base`` is stored so that it stays referenced for as long as this
        # wrapper does (presumably it owns the struct at ``addr`` -- see
        # from_schema(), where it comes from alloc_c_array_view()).
        self._base = base
        self._ptr = <ArrowArrayView*>addr
        # Default to a CPU sync event until _set_array() provides the real one.
        self._event = CSharedSyncEvent(DEVICE_CPU)

    def _set_array(self, CArray array, Device device=DEVICE_CPU):
        """Populate this view from ``array``

        Calls ArrowArrayViewSetArray() when ``device`` is ``DEVICE_CPU`` and
        ArrowArrayViewSetArrayMinimal() otherwise. The return code is passed
        to ``Error.raise_message_not_ok()``. On success, the array's base
        object and sync event are recorded on this view.

        Returns ``self`` so that the call can be chained.
        """
        cdef Error error = Error()
        cdef int code

        if device is DEVICE_CPU:
            code = ArrowArrayViewSetArray(self._ptr, array._ptr, &error.c_error)
        else:
            code = ArrowArrayViewSetArrayMinimal(self._ptr, array._ptr, &error.c_error)

        error.raise_message_not_ok("ArrowArrayViewSetArray()", code)

        # Buffer views handed out by buffer() use this as their owner object.
        self._array_base = array._base
        self._event = CSharedSyncEvent(device, <uintptr_t>array._sync_event)

        return self

    @property
    def storage_type_id(self):
        """The ``storage_type`` field of the wrapped ArrowArrayView"""
        return self._ptr.storage_type

    @property
    def storage_type(self):
        """The storage type name from ArrowTypeString(), or None if it is NULL"""
        cdef const char* type_str = ArrowTypeString(self._ptr.storage_type)
        if type_str != NULL:
            return type_str.decode('UTF-8')

        return None

    @property
    def layout(self):
        """A CLayout wrapping this view's ``layout`` member"""
        return CLayout(self, <uintptr_t>&self._ptr.layout)

    def __len__(self):
        return self._ptr.length

    @property
    def length(self):
        """The ``length`` field of the wrapped ArrowArrayView"""
        return len(self)

    @property
    def offset(self):
        """The ``offset`` field of the wrapped ArrowArrayView"""
        return self._ptr.offset

    @property
    def null_count(self):
        """The null count, computed and cached on first access when possible

        If the stored null count is -1 it is resolved as follows: 0 when the
        first buffer is not a validity buffer or the validity buffer is NULL;
        the result of ArrowArrayViewComputeNullCount() when the view is on the
        CPU device. Otherwise (a validity buffer on a non-CPU device) the
        stored value is left alone and -1 is returned.
        """
        if self._ptr.null_count != -1:
            return self._ptr.null_count

        cdef ArrowBufferType buffer_type = self._ptr.layout.buffer_type[0]
        cdef const uint8_t* validity_bits = self._ptr.buffer_views[0].data.as_uint8

        if buffer_type != NANOARROW_BUFFER_TYPE_VALIDITY:
            self._ptr.null_count = 0
        elif validity_bits == NULL:
            self._ptr.null_count = 0
        elif self._event.device is DEVICE_CPU:
            self._ptr.null_count = ArrowArrayViewComputeNullCount(self._ptr)

        return self._ptr.null_count

    @property
    def n_children(self):
        """The ``n_children`` field of the wrapped ArrowArrayView"""
        return self._ptr.n_children

    def child(self, int64_t i):
        """Wrap child ``i`` of this view as a CArrayView

        Raises IndexError if ``i`` is outside ``[0, n_children)``.
        """
        if i < 0 or i >= self._ptr.n_children:
            raise IndexError(f"{i} out of range [0, {self._ptr.n_children})")

        cdef CArrayView child = CArrayView(
            self._base,
            <uintptr_t>self._ptr.children[i]
        )

        # The child shares the parent's sync event and array owner. Without
        # the owner, buffer() on the child would hand out buffer views whose
        # base is None and which therefore would not keep the array alive.
        child._event = self._event
        child._array_base = self._array_base

        return child

    @property
    def children(self):
        """Iterate over the result of child() for each child index"""
        for i in range(self.n_children):
            yield self.child(i)

    @property
    def n_buffers(self):
        """The result of ArrowArrayViewGetNumBuffers() for this view"""
        return ArrowArrayViewGetNumBuffers(self._ptr)

    def _buffer_info(self, int64_t i):
        """Collect the raw description of buffer ``i``

        Returns a tuple of (buffer type, buffer data type, element size in
        bits, data address as an integer, size in bytes). Raises IndexError
        if ``i`` is outside ``[0, n_buffers)``.
        """
        if i < 0 or i >= self.n_buffers:
            raise IndexError(f"{i} out of range [0, {self.n_buffers})")

        cdef ArrowBufferView view = ArrowArrayViewGetBufferView(self._ptr, i)

        return (
            ArrowArrayViewGetBufferType(self._ptr, i),
            ArrowArrayViewGetBufferDataType(self._ptr, i),
            ArrowArrayViewGetBufferElementSizeBits(self._ptr, i),
            <uintptr_t>view.data.data,
            view.size_bytes
        )

    def buffer_type(self, int64_t i):
        """The type of buffer ``i`` as a string

        One of "validity", "type_id", "union_offset", "data_offset", "data",
        "variadic_data", "variadic_size", or "none" for any other value.
        """
        buffer_type = self._buffer_info(i)[0]
        if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
            return "validity"
        elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
            return "type_id"
        elif buffer_type == NANOARROW_BUFFER_TYPE_UNION_OFFSET:
            return "union_offset"
        elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_OFFSET:
            return "data_offset"
        elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
            return "data"
        elif buffer_type == NANOARROW_BUFFER_TYPE_VARIADIC_DATA:
            return "variadic_data"
        elif buffer_type == NANOARROW_BUFFER_TYPE_VARIADIC_SIZE:
            return "variadic_size"
        else:
            return "none"

    def buffer(self, int64_t i):
        """Wrap buffer ``i`` as a CBufferView

        Raises IndexError if ``i`` is out of range and RuntimeError if the
        buffer's size is negative (i.e., unknown).
        """
        _, data_type, element_size_bits, addr, size = self._buffer_info(i)

        # Check the buffer size here because the error later is cryptic.
        # Buffer sizes are set to -1 when they are "unknown", so because of errors
        # in nanoarrow/C or because the array is on a non-CPU device, that -1 value
        # could leak its way here.
        if size < 0:
            raise RuntimeError(f"ArrowArrayView buffer {i} has size_bytes < 0")

        return CBufferView(
            self._array_base,
            addr,
            size,
            data_type,
            element_size_bits,
            self._event
        )

    @property
    def buffers(self):
        """Iterate over the result of buffer() for each buffer index"""
        for i in range(self.n_buffers):
            yield self.buffer(i)

    @property
    def dictionary(self):
        """The dictionary of this view as a CArrayView, or None if there is none"""
        if self._ptr.dictionary == NULL:
            return None

        cdef CArrayView dictionary = CArrayView(
            self,
            <uintptr_t>self._ptr.dictionary
        )

        # As in child(): share the sync event and the array owner so that
        # buffer views created from the dictionary keep the array alive.
        dictionary._event = self._event
        dictionary._array_base = self._array_base

        return dictionary

    def _iter_bytes(self, int64_t offset, int64_t length) -> bytes | None:
        """Iterate over ``length`` elements starting at ``offset`` as bytes

        This is a generator: it yields None for each element for which
        ArrowArrayViewIsNull() is true and a bytes object otherwise. No
        bounds checking is performed on ``offset`` or ``length``.
        """
        cdef ArrowBufferView item_view
        # ``length`` is a count of elements, not an end index.
        for i in range(offset, offset + length):
            if ArrowArrayViewIsNull(self._ptr, i):
                yield None
            else:
                item_view = ArrowArrayViewGetBytesUnsafe(self._ptr, i)
                yield PyBytes_FromStringAndSize(item_view.data.as_char, item_view.size_bytes)

    def _iter_str(self, int64_t offset, int64_t length) -> str | None:
        """Iterate over ``length`` elements starting at ``offset`` as str

        This is a generator: it yields None for each element for which
        ArrowArrayViewIsNull() is true and a str object otherwise. No
        bounds checking is performed on ``offset`` or ``length``.
        """
        cdef ArrowStringView item_view
        # ``length`` is a count of elements, not an end index.
        for i in range(offset, offset + length):
            if ArrowArrayViewIsNull(self._ptr, i):
                yield None
            else:
                item_view = ArrowArrayViewGetStringUnsafe(self._ptr, i)
                yield PyUnicode_FromStringAndSize(item_view.data, item_view.size_bytes)

    def __repr__(self):
        return _repr_utils.array_view_repr(self)

    @staticmethod
    def from_schema(CSchema schema):
        """Create a CArrayView initialized from ``schema`` with no array set

        The return code of ArrowArrayViewInitFromSchema() is passed to
        ``Error.raise_message_not_ok()``.
        """
        cdef ArrowArrayView* c_array_view
        base = alloc_c_array_view(&c_array_view)

        cdef Error error = Error()
        cdef int code = ArrowArrayViewInitFromSchema(c_array_view,
                                                     schema._ptr, &error.c_error)
        error.raise_message_not_ok("ArrowArrayViewInitFromSchema()", code)

        return CArrayView(base, <uintptr_t>c_array_view)

    @staticmethod
    def from_array(CArray array, Device device=DEVICE_CPU):
        """Create a CArrayView from the schema of ``array`` and point it at ``array``"""
        out = CArrayView.from_schema(array._schema)
        return out._set_array(array, device)


cdef class CArray:
    """Low-level ArrowArray wrapper

    This object is a literal wrapper around a read-only ArrowArray. It provides field accessors
    that return Python objects and handles the C Data interface lifecycle (i.e., initialized
    ArrowArray structures are always released).

    See `nanoarrow.c_array()` for construction and usage examples.
    """

    @staticmethod
    def allocate(CSchema schema) -> CArray:
        """Allocate a released ArrowArray

        The structure is obtained from ``alloc_c_array()`` and wrapped,
        together with the object that call returns, in a new CArray
        associated with ``schema``.
        """
        cdef ArrowArray* released_array
        owner = alloc_c_array(&released_array)
        return CArray(owner, <uintptr_t>released_array, schema)

    def __cinit__(self, object base, uintptr_t addr, CSchema schema):
        # Device information first: a freshly wrapped array is treated as a
        # CPU array with device id -1 and no sync event.
        self._device_type = ARROW_DEVICE_CPU
        self._device_id = -1
        self._sync_event = NULL

        # The wrapped structure, the object passed as its base, and its schema.
        self._ptr = <ArrowArray*>addr
        self._base = base
        self._schema = schema

    cdef _set_device(self, ArrowDeviceType device_type, int64_t device_id, void* sync_event):
        self._device_type = device_type
Loading ...