Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev259 

/ include / arrow / ipc / options.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <vector>

#include "arrow/io/caching.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compression.h"
#include "arrow/util/visibility.h"

namespace arrow {

class MemoryPool;

namespace ipc {

/// \brief Default limit on schema nesting depth for IPC reading and writing.
///
/// ARROW-109: this number was chosen arbitrarily to help catch user mistakes.
/// For deeply nested schemas, the user is expected to state the maximum
/// allowed recursion depth explicitly (see the max_recursion_depth members of
/// IpcWriteOptions and IpcReadOptions).
///
/// Declared `inline` so that every translation unit including this header
/// refers to one and the same variable instead of a per-TU internal-linkage
/// copy.
inline constexpr int kMaxNestingDepth = 64;

/// \brief Settings controlling how Arrow IPC messages are written
struct ARROW_EXPORT IpcWriteOptions {
  /// \brief Permit field lengths that do not fit in a signed 32-bit int.
  ///
  /// Streams created with this option enabled may not be parseable by some
  /// implementations.
  bool allow_64bit{false};

  /// \brief Upper bound on the permitted schema nesting depth.
  int max_recursion_depth{kMaxNestingDepth};

  /// \brief Memory buffers are followed by padding up to this multiple of bytes.
  int32_t alignment{8};

  /// \brief Emit the IPC message format used before 0.15.0
  ///
  /// The legacy format uses a 4-byte prefix rather than an 8-byte one.
  bool write_legacy_ipc_format{false};

  /// \brief Memory pool used for allocations made while writing IPC data
  ///
  /// Arrow IPC is predominantly zero-copy, but some cases still require
  /// allocating memory (for example when compression is enabled).
  MemoryPool* memory_pool{default_memory_pool()};

  /// \brief Compression codec applied to record batch body buffers
  ///
  /// May only be UNCOMPRESSED, LZ4_FRAME and ZSTD.
  std::shared_ptr<util::Codec> codec;

  /// \brief Minimum space savings percentage required before compression is applied
  ///
  /// Space savings is calculated as (1.0 - compressed_size / uncompressed_size).
  ///
  /// For example, with min_space_savings = 0.1 a 100-byte body buffer will not
  /// be compressed if its expected compressed size exceeds 90 bytes.
  ///
  /// - If this option is unset, compression is used indiscriminately.
  /// - If no codec was supplied, this option is ignored.
  /// - Values outside of the range [0,1] are handled as errors.
  ///
  /// Note that enabling this option may result in data that Arrow C++ versions
  /// prior to 12.0.0 cannot read.
  std::optional<double> min_space_savings;

  /// \brief Parallelize computational tasks such as compression on the
  /// global CPU thread pool
  bool use_threads{true};

  /// \brief Whether dictionary deltas are emitted
  ///
  /// When false, a changed dictionary for a given field is written as a full
  /// dictionary replacement.
  /// When true, a changed dictionary is compared against its previous version;
  /// a dictionary delta is emitted if possible, and a full dictionary
  /// replacement otherwise.
  ///
  /// The default is false, to maximize stream compatibility.
  ///
  /// Also note that a delta is never emitted when the changed dictionary is a
  /// nested dictionary, for compatibility with the read path.
  bool emit_dictionary_deltas{false};

  /// \brief Whether dictionaries are unified for the IPC file format
  ///
  /// Dictionary replacements are not supported by the IPC file format.
  /// Chunks of a column with a dictionary type must therefore have the same
  /// dictionary in each record batch (or an extended dictionary + delta).
  ///
  /// With this option set to true, RecordBatchWriter::WriteTable will attempt
  /// to unify dictionaries across each table column.  With it set to false,
  /// incompatible dictionaries across a table column will simply raise an
  /// error.
  ///
  /// Enabling this option has a runtime cost, and not all types currently
  /// support dictionary unification.
  ///
  /// IPC streams support dictionary replacement and deltas, so this option is
  /// ignored for them.
  bool unify_dictionaries{false};

  /// \brief Format version used for IPC messages and their metadata.
  ///
  /// V5 is presently used (readable by 1.0.0 and later).
  /// V4 is available as well (readable by 0.8.0 and later).
  MetadataVersion metadata_version{MetadataVersion::V5};

  static IpcWriteOptions Defaults();
};

/// \brief Settings controlling how Arrow IPC messages are read
struct ARROW_EXPORT IpcReadOptions {
  /// \brief Upper bound on the permitted schema nesting depth.
  int max_recursion_depth{kMaxNestingDepth};

  /// \brief Memory pool used for allocations made while reading IPC data
  ///
  /// Arrow IPC is predominantly zero-copy, but some cases still require
  /// allocating memory (for example when compression is enabled).
  MemoryPool* memory_pool{default_memory_pool()};

  /// \brief Top-level schema fields to include when deserializing a RecordBatch.
  ///
  /// When empty (the default), all deserialized fields are returned.
  /// When non-empty, the values are indices of fields in the top-level schema.
  std::vector<int> included_fields;

  /// \brief Parallelize computational tasks such as decompression on the
  /// global CPU thread pool
  bool use_threads{true};

  /// \brief Whether incoming data is converted to platform-native endianness
  ///
  /// When the endianness of the received schema differs from the
  /// platform-native endianness, every buffer holding endian-sensitive data is
  /// byte-swapped.  That covers the value buffers of numeric types, temporal
  /// types and decimal types, as well as the offset buffers of variable-sized
  /// binary and list-like types.
  ///
  /// The RecordBatchFileReader, RecordBatchStreamReader and StreamDecoder
  /// classes carry out the endianness conversion.
  bool ensure_native_endian{true};

  /// \brief Options controlling caching behavior when pre-buffering is requested
  ///
  /// The lazy property will always be reset to true to deliver the expected behavior
  io::CacheOptions pre_buffer_cache_options{io::CacheOptions::LazyDefaults()};

  static IpcReadOptions Defaults();
};

namespace internal {

/// \brief Check a compression type and report the outcome as a Status.
///
/// \param[in] codec the compression type to check
/// \return a Status describing the result of the check
///
/// NOTE(review): only the declaration is visible here. Presumably this rejects
/// compression types other than those IpcWriteOptions::codec documents as
/// permitted -- confirm against the definition.
Status CheckCompressionSupported(Compression::type codec);

}  // namespace internal
}  // namespace ipc
}  // namespace arrow