Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev70 

/ include / arrow / c / abi.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

/// \file abi.h Arrow C Data Interface
///
/// The Arrow C Data interface defines a very small, stable set
/// of C definitions which can be easily copied into any project's
/// source code and vendored to be used for columnar data interchange
/// in the Arrow format. For non-C/C++ languages and runtimes,
/// it should be almost as easy to translate the C definitions into
/// the corresponding C FFI declarations.
///
/// Applications and libraries can therefore work with Arrow memory
/// without necessarily using the Arrow libraries or reinventing
/// the wheel. Developers can choose between tight integration
/// with the Arrow software project or minimal integration with
/// the Arrow format only.

#pragma once

#include <stdint.h>

// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html

#ifdef __cplusplus
extern "C" {
#endif

#ifndef ARROW_C_DATA_INTERFACE
#  define ARROW_C_DATA_INTERFACE

// Schema flag bits. Every constant occupies its own bit, so several of them
// can be OR-ed together into a single integer.
// NOTE(review): presumably stored in ArrowSchema::flags; the authoritative
// meaning of each flag lives in the C Data Interface spec -- confirm there.
#  define ARROW_FLAG_DICTIONARY_ORDERED 0x1
#  define ARROW_FLAG_NULLABLE 0x2
#  define ARROW_FLAG_MAP_KEYS_SORTED 0x4

// Type description of an array. Child and dictionary types are reachable
// through pointers to further ArrowSchema structures.
//
// This header is meant to be a stable set of definitions: keep the order and
// the types of the members exactly as they are.
struct ArrowSchema {
  // --- Type description ---
  // NOTE(review): the encoding of the three strings below is defined by the
  // C Data Interface spec, not by this header -- confirm there.
  const char* format;
  const char* name;
  const char* metadata;
  // Presumably a combination of ARROW_FLAG_* bits -- confirm against the spec.
  int64_t flags;
  // Presumably the number of entries in `children` -- confirm against the spec.
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;

  // --- Lifetime management ---
  // Callback used to release the structure it is given.
  void (*release)(struct ArrowSchema* self);
  // Opaque pointer reserved for producer-specific data.
  void* private_data;
};

// Data description of an array. Child and dictionary data are reachable
// through pointers to further ArrowArray structures.
//
// This header is meant to be a stable set of definitions: keep the order and
// the types of the members exactly as they are.
struct ArrowArray {
  // --- Data description ---
  int64_t length;
  int64_t null_count;
  int64_t offset;
  // Presumably the number of entries in `buffers` -- confirm against the spec.
  int64_t n_buffers;
  // Presumably the number of entries in `children` -- confirm against the spec.
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;

  // --- Lifetime management ---
  // Callback used to release the structure it is given.
  void (*release)(struct ArrowArray* self);
  // Opaque pointer reserved for producer-specific data.
  void* private_data;
};

// String keys naming statistics. Every key has the form
// "ARROW:<statistic>:<exact|approximate>", and each statistic listed here is
// available in both an exact and an approximate variant.
// NOTE(review): where these keys are consumed is not visible in this header --
// presumably the Arrow statistics schema; confirm against the Arrow docs.
#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact"
#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
    "ARROW:average_byte_width:approximate"
#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact"
#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
    "ARROW:distinct_count:approximate"
#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact"
#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
    "ARROW:max_byte_width:approximate"
#  define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
#  define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate"
#  define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
#  define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate"
#  define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
#  define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate"
#  define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
#  define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate"

#endif  // ARROW_C_DATA_INTERFACE

#ifndef ARROW_C_DEVICE_DATA_INTERFACE
#  define ARROW_C_DEVICE_DATA_INTERFACE

// Spec and documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html

// DeviceType for the allocated memory.
// A signed 32-bit integer, expected to hold one of the ARROW_DEVICE_*
// constants defined below.
typedef int32_t ArrowDeviceType;

// NOTE(review): the values 5 and 6 are not assigned below. Presumably the
// numbering mirrors an external device enumeration (DLPack's device types) and
// the gap is intentional -- confirm against the spec before adding new values.

// CPU device, same as using ArrowArray directly
#  define ARROW_DEVICE_CPU 1
// CUDA GPU device
#  define ARROW_DEVICE_CUDA 2
// Pinned CUDA CPU memory allocated by cudaMallocHost
#  define ARROW_DEVICE_CUDA_HOST 3
// OpenCL device
#  define ARROW_DEVICE_OPENCL 4
// Vulkan buffer for next-gen graphics
#  define ARROW_DEVICE_VULKAN 7
// Metal for Apple GPU
#  define ARROW_DEVICE_METAL 8
// Verilog simulator buffer
#  define ARROW_DEVICE_VPI 9
// ROCm GPUs for AMD GPUs
#  define ARROW_DEVICE_ROCM 10
// Pinned ROCm CPU memory allocated by hipMallocHost
#  define ARROW_DEVICE_ROCM_HOST 11
// Reserved for extension
#  define ARROW_DEVICE_EXT_DEV 12
// CUDA managed/unified memory allocated by cudaMallocManaged
#  define ARROW_DEVICE_CUDA_MANAGED 13
// Unified shared memory allocated on a oneAPI non-partitioned device
#  define ARROW_DEVICE_ONEAPI 14
// GPU support for next-gen WebGPU standard
#  define ARROW_DEVICE_WEBGPU 15
// Qualcomm Hexagon DSP
#  define ARROW_DEVICE_HEXAGON 16

// An ArrowArray bundled with the identification of the device holding its
// buffers and an optional synchronization handle.
//
// The ArrowArray is embedded by value, so the layout of this structure depends
// on the layout of struct ArrowArray.
struct ArrowDeviceArray {
  // The allocated array.
  //
  // The buffers in the array (along with the buffers of any
  // children) are what is allocated on the device.
  struct ArrowArray array;
  // The device id to identify a specific device
  int64_t device_id;
  // The type of device which can access this memory
  // (expected to be one of the ARROW_DEVICE_* constants).
  ArrowDeviceType device_type;
  // An event-like object to synchronize on if needed.
  // NOTE(review): the concrete type behind this pointer is not visible here;
  // presumably it depends on device_type -- confirm against the spec.
  void* sync_event;
  // Reserved bytes for future expansion.
  // NOTE(review): presumably producers must zero these -- confirm against the spec.
  int64_t reserved[3];
};

#endif  // ARROW_C_DEVICE_DATA_INTERFACE

#ifndef ARROW_C_STREAM_INTERFACE
#  define ARROW_C_STREAM_INTERFACE

// A source of ArrowArray chunks that all share a single schema.
// Each callback takes a pointer to the stream as its first argument.
struct ArrowArrayStream {
  // Retrieve the type shared by every array of the stream.
  //
  // Returns 0 on success, otherwise an `errno`-compatible error code.
  //
  // On success the ArrowSchema written to `out` has a lifetime of its own: it
  // must be released independently from the stream.
  int (*get_schema)(struct ArrowArrayStream* self, struct ArrowSchema* out);

  // Retrieve the next array of the stream.
  //
  // Returns 0 on success, otherwise an `errno`-compatible error code.
  // A successful call that leaves `out` released means the stream has ended.
  //
  // On success the ArrowArray written to `out` has a lifetime of its own: it
  // must be released independently from the stream.
  int (*get_next)(struct ArrowArrayStream* self, struct ArrowArray* out);

  // Retrieve optional details about the most recent failure.
  // Only call this when the last stream operation returned a non-0 code.
  //
  // Returns a null-terminated character array describing the last error, or
  // NULL when no description is available.
  //
  // The pointer stays valid only until the next operation on this stream,
  // release included.
  const char* (*get_last_error)(struct ArrowArrayStream* self);

  // Release the resources owned by the stream itself.
  // Arrays obtained through `get_next` are not covered: each of them must be
  // released individually.
  void (*release)(struct ArrowArrayStream* self);

  // Opaque pointer reserved for producer-specific data.
  void* private_data;
};

#endif  // ARROW_C_STREAM_INTERFACE

#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
#  define ARROW_C_DEVICE_STREAM_INTERFACE

// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
//
// This stream is intended to provide a stream of data on a single
// device. If a producer wants data to be produced on multiple devices,
// then multiple streams should be provided, one per device.
struct ArrowDeviceArrayStream {
  // The device that this stream produces data on.
  ArrowDeviceType device_type;

  // Callback to get the stream schema
  // (will be the same for all arrays in the stream).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowSchema must be released independently from the stream.
  // The schema should be accessible via CPU memory.
  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);

  // Callback to get the next array
  // (if no error and the array is released, the stream has ended)
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowDeviceArray must be released independently from the stream.
  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowDeviceArrayStream* self);

  // Opaque producer-specific data
  void* private_data;
};

#endif  // ARROW_C_DEVICE_STREAM_INTERFACE

#ifndef ARROW_C_ASYNC_STREAM_INTERFACE
#  define ARROW_C_ASYNC_STREAM_INTERFACE

// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was passed
// to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler.
//
// The reason for this Task approach instead of the Async interface returning
// the Array directly is to allow for more complex thread handling and reducing
// context switching and data transfers between CPU cores (e.g. from one L1/L2
// cache to another) if desired.
//
// For example, the `on_next_task` callback can be called when data is ready, while
// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This
// allows for the producer to manage the I/O on one thread which calls `on_next_task`
// and the consumer can determine when the decoding (producer logic in the `extract_data`
// callback of the task) occurs and on which thread, to avoid a CPU core transfer
// (data staying in the L2 cache).
struct ArrowAsyncTask {
  // This callback should populate the ArrowDeviceArray associated with this task.
  // The order of ArrowAsyncTasks provided by the producer enables a consumer to
  // ensure the order of data to process.
  //
  // This function is expected to be synchronous, but should not perform any blocking
  // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer
  // thread unnecessarily.
  //
  // Returns: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If a non-0 value is returned then it should be followed by a call to `on_error`
  // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly
  // likely that whatever is calling this function may be entirely disconnected from
  // the current control flow. Indicating an error here with a non-zero return allows
  // the current flow to be aware of the error occurring, while still allowing any
  // logging or error handling to be centralized in the `on_error` callback of
  // the original Async handler.
  //
  // Rather than a release callback, any required cleanup should be performed as part
  // of the invocation of `extract_data`. Ownership of the Array is passed to the consumer
  // calling this, and so it must be released separately.
  //
  // It is only valid to call this method exactly once.
  int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out);

  // Opaque task-specific data
  void* private_data;
};

// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async
// producer and consumer. This object allows the consumer to perform backpressure and flow
// control on the asynchronous stream processing. This object must be owned by the
// producer who creates it, and thus is responsible for cleaning it up.
struct ArrowAsyncProducer {
  // The device type that this stream produces data on.
  ArrowDeviceType device_type;

  // A consumer must call this function to start receiving on_next_task calls.
  //
  // It *must* be valid to call this synchronously from within `on_next_task` or
  // `on_schema`, but this function *must not* immediately call `on_next_task` so as
  // to avoid recursion and reentrant callbacks.
  //
  // After cancel has been called, additional calls to this function must be NOPs,
  // but allowed. While not cancelled, calling this function must register the
  // given number of additional arrays/batches to be produced with the producer.
  // The producer should only call `on_next_task` at most the registered number
  // of arrays before propagating backpressure.
  //
  // Any error encountered by calling request must be propagated by calling the `on_error`
  // callback of the ArrowAsyncDeviceStreamHandler.
  //
  // While not cancelled, any subsequent calls to `on_next_task`, `on_error` or
  // `release` should be scheduled by the producer to be called later.
  //
  // It is invalid for a consumer to call this with a value of n <= 0, producers should
  // error if given such a value.
  void (*request)(struct ArrowAsyncProducer* self, int64_t n);

  // This cancel callback signals a producer that it must eventually stop making calls
  // to on_next_task. It must be idempotent and thread-safe. After calling cancel once,
  // subsequent calls must be NOPs. This must not call any consumer-side handlers other
  // than `on_error`.
  //
  // It is not required that calling cancel affect the producer immediately, only that it
  // must eventually stop calling on_next_task and subsequently call release on the
  // async handler. As such, a consumer must be prepared to receive one or more calls to
  // `on_next_task` even after calling cancel if there are still requested arrays pending.
  //
  // Successful cancellation should *not* result in the producer calling `on_error`, it
  // should finish out any remaining tasks and eventually call `release`.
  //
  // Any error encountered during handling a call to cancel must be reported via the
  // on_error callback on the async stream handler.
  void (*cancel)(struct ArrowAsyncProducer* self);

  // Any additional metadata tied to a specific stream of data. This must either be NULL
  // or a valid pointer to metadata which is encoded in the same way schema metadata
  // would be. Non-null metadata must be valid for the lifetime of this object. As an
Loading ...