Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev70 

/ include / arrow / buffer.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "arrow/device.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/span.h"
#include "arrow/util/visibility.h"

namespace arrow {

// ----------------------------------------------------------------------
// Buffer classes

/// \class Buffer
/// \brief Object containing a pointer to a piece of contiguous memory with a
/// particular size.
///
/// Buffers have two related notions of length: size and capacity. Size is
/// the number of bytes that might have valid data. Capacity is the number
/// of bytes that were allocated for the buffer in total.
///
/// The Buffer base class does not own its memory, but subclasses often do.
///
/// The following invariant is always true: Size <= Capacity
class ARROW_EXPORT Buffer {
 public:
  // Copy construction and copy assignment are disabled for Buffer
  // (as the name of the ARROW_DISALLOW_COPY_AND_ASSIGN macro indicates).
  ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);

  /// \brief Construct a non-owning, immutable CPU buffer over existing memory
  ///
  /// No memory is copied. The resulting buffer is flagged as immutable and
  /// CPU-resident, its capacity equals its size, its device type is kCPU and
  /// it is associated with the default CPU memory manager.
  ///
  /// \param[in] data pointer to the start of the memory region
  /// \param[in] size size of the memory region in bytes
  ///
  /// \note The passed memory must be kept alive through some other means
  Buffer(const uint8_t* data, int64_t size)
      : is_mutable_(false),
        is_cpu_(true),
        data_(data),
        size_(size),
        capacity_(size),
        device_type_(DeviceAllocationType::kCPU) {
    SetMemoryManager(default_cpu_memory_manager());
  }

  Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
         std::shared_ptr<Buffer> parent = NULLPTR,
         std::optional<DeviceAllocationType> device_type_override = std::nullopt)
      : is_mutable_(false),
        data_(data),
        size_(size),
        capacity_(size),
        parent_(std::move(parent)) {
    // SetMemoryManager will also set device_type_
    SetMemoryManager(std::move(mm));
    // If a device type is specified, use that instead. Example of when this can be
    // useful: the CudaMemoryManager can set device_type_ to kCUDA, but you can specify
    // device_type_override=kCUDA_HOST as the device type to override it.
    if (device_type_override != std::nullopt) {
      device_type_ = *device_type_override;
    }
  }

  /// \brief Construct a non-owning, immutable buffer from a raw address
  ///
  /// The address is reinterpreted as a data pointer and forwarded, together with
  /// `size`, `mm` and `parent`, to the constructor taking a data pointer and a
  /// MemoryManager. No device type override is passed along.
  ///
  /// \param[in] address address of the start of the memory region
  /// \param[in] size size of the memory region in bytes
  /// \param[in] mm the memory manager to associate with the buffer
  /// \param[in] parent optional buffer stored as this buffer's parent
  Buffer(uintptr_t address, int64_t size, std::shared_ptr<MemoryManager> mm,
         std::shared_ptr<Buffer> parent = NULLPTR)
      : Buffer(reinterpret_cast<const uint8_t*>(address), size, std::move(mm),
               std::move(parent)) {}

  /// \brief Construct from string_view without copying memory
  ///
  /// The buffer refers to the same bytes as `data` and has the same length. It is
  /// built through the (pointer, size) constructor.
  ///
  /// \param[in] data a string_view object
  ///
  /// \note The memory viewed by data must not be deallocated in the lifetime of the
  /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere
  explicit Buffer(std::string_view data)
      : Buffer(reinterpret_cast<const uint8_t*>(data.data()),
               static_cast<int64_t>(data.size())) {}

  /// Virtual so that instances of subclasses can be destroyed through a Buffer pointer.
  virtual ~Buffer() = default;

  /// \brief Construct a view on a region of another buffer, keeping that buffer alive
  ///
  /// The new buffer is an offset into data that is owned by another buffer (the
  /// parent). A reference to the parent is stored, so that we are able to retain a
  /// valid pointer to the data even after other shared_ptr's to the parent buffer
  /// have been destroyed. The parent's memory manager is set on the new buffer.
  ///
  /// This method makes no assertions about alignment or padding of the buffer but
  /// in general we expect buffers to be aligned and padded to 64 bytes.  In the future
  /// we might add utility methods to help determine if a buffer satisfies this contract.
  ///
  /// \param[in] parent the buffer owning the memory; it is dereferenced
  /// unconditionally, so it must not be null
  /// \param[in] offset offset in bytes from the start of the parent's data
  /// \param[in] size size of the new buffer in bytes
  Buffer(const std::shared_ptr<Buffer>& parent, const int64_t offset, const int64_t size)
      : Buffer(parent->data_ + offset, size) {
    // Hold a reference to the parent for as long as this buffer exists
    parent_ = parent;
    // Replace the default CPU memory manager set by the delegated constructor
    SetMemoryManager(parent->memory_manager_);
  }

  /// \brief Read the byte located `i` bytes after the start of the buffer's data
  ///
  /// The stored data pointer is dereferenced directly; no bounds check is made.
  uint8_t operator[](std::size_t i) const { return *(data_ + i); }

  /// \brief Construct a new std::string with a hexadecimal representation of the buffer.
  /// \return the hexadecimal representation, as a new std::string
  std::string ToHexString();

  /// \brief Compare this buffer with another one over a limited number of bytes
  ///
  /// Return true if both buffers are the same size and contain the same bytes
  /// up to the number of compared bytes
  ///
  /// \param[in] other the buffer to compare against
  /// \param[in] nbytes the number of bytes to compare
  bool Equals(const Buffer& other, int64_t nbytes) const;

  /// \brief Compare this buffer with another one
  ///
  /// Return true if both buffers are the same size and contain the same bytes
  ///
  /// \param[in] other the buffer to compare against
  bool Equals(const Buffer& other) const;

  /// \brief Copy a section of the buffer into a new Buffer.
  ///
  /// \param[in] start where the section begins (presumably a byte offset into
  /// this buffer; confirm against the implementation)
  /// \param[in] nbytes the number of bytes in the section
  /// \param[in] pool the memory pool handed to the copy; defaults to
  /// default_memory_pool()
  /// \return the new Buffer wrapped in a Result
  Result<std::shared_ptr<Buffer>> CopySlice(
      const int64_t start, const int64_t nbytes,
      MemoryPool* pool = default_memory_pool()) const;

  /// Zero bytes in padding, i.e. bytes between size_ and capacity_.
  void ZeroPadding() {
#ifndef NDEBUG
    CheckMutable();
#endif
    // A zero-capacity buffer can have a null data pointer
    if (capacity_ != 0) {
      memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
    }
  }

  /// \brief Construct an immutable buffer that takes ownership of the contents
  /// of an std::string (without copying it).
  ///
  /// \param[in] data a string to own; it is taken by value, so callers can move
  /// their string into the call
  /// \return a new Buffer instance
  static std::shared_ptr<Buffer> FromString(std::string data);

  /// \brief Construct an immutable buffer that takes ownership of the contents
  /// of an std::vector (without copying it). Only vectors of TrivialType objects
  /// (integers, floating point numbers, ...) can be wrapped by this function.
  ///
  /// \param[in] vec a vector to own
  /// \return a new Buffer instance
  template <typename T>
  static std::shared_ptr<Buffer> FromVector(std::vector<T> vec) {
    static_assert(std::is_trivial_v<T>,
                  "Buffer::FromVector can only wrap vectors of trivial objects");

    if (vec.empty()) {
      return std::shared_ptr<Buffer>{new Buffer()};
    }

    auto* data = reinterpret_cast<uint8_t*>(vec.data());
    auto size_in_bytes = static_cast<int64_t>(vec.size() * sizeof(T));
    return std::shared_ptr<Buffer>{
        new Buffer{data, size_in_bytes},
        // Keep the vector's buffer alive inside the shared_ptr's destructor until after
        // we have deleted the Buffer. Note we can't use this trick in FromString since
        // std::string's data is inline for short strings so moving invalidates pointers
        // into the string's buffer.
        [vec = std::move(vec)](Buffer* buffer) { delete buffer; }};
  }

  /// \brief Create a buffer that references a typed C array, without copying it
  ///
  /// The byte size of the buffer is `sizeof(T) * length`.
  ///
  /// \param[in] data the typed memory as C array
  /// \param[in] length the number of values in the array
  /// \return a new shared_ptr<Buffer>
  template <typename T, typename SizeType = int64_t>
  static std::shared_ptr<Buffer> Wrap(const T* data, SizeType length) {
    const auto* bytes = reinterpret_cast<const uint8_t*>(data);
    const auto num_bytes = static_cast<int64_t>(sizeof(T) * length);
    return std::make_shared<Buffer>(bytes, num_bytes);
  }

  /// \brief Create a buffer that references the elements of an std::vector,
  /// without copying them
  ///
  /// The byte size of the buffer is `sizeof(T) * data.size()`.
  ///
  /// \param[in] data the vector to be referenced. If this vector is changed,
  /// the buffer may become invalid
  /// \return a new shared_ptr<Buffer>
  template <typename T>
  static std::shared_ptr<Buffer> Wrap(const std::vector<T>& data) {
    const auto* bytes = reinterpret_cast<const uint8_t*>(data.data());
    const auto num_bytes = static_cast<int64_t>(sizeof(T) * data.size());
    return std::make_shared<Buffer>(bytes, num_bytes);
  }

  /// \brief Copy buffer contents into a new std::string
  /// \return a std::string holding a copy of the buffer contents
  /// \note Can throw std::bad_alloc if buffer is large
  std::string ToString() const;

  /// \brief View buffer contents as a std::string_view
  ///
  /// The view starts at the stored data pointer and spans `size_` characters;
  /// nothing is copied.
  /// \return std::string_view
  explicit operator std::string_view() const {
    const auto* chars = reinterpret_cast<const char*>(data_);
    return std::string_view(chars, static_cast<size_t>(size_));
  }

  /// \brief Return a read-only pointer to the buffer's data
  ///
  /// The buffer has to be a CPU buffer (`is_cpu()` is true). When it is not,
  /// the CPU check fails in builds where NDEBUG is not defined, and a null
  /// pointer is returned otherwise.
  ///
  /// To get the buffer's data address regardless of its device, call `address()`.
  const uint8_t* data() const {
#ifndef NDEBUG
    CheckCPU();
#endif
    if (ARROW_PREDICT_TRUE(is_cpu_)) {
      return data_;
    }
    return NULLPTR;
  }

  /// \brief Return the pointer obtained from `data()`, reinterpreted as `const T*`
  ///
  /// The buffer has to be a CPU buffer (`is_cpu()` is true).
  /// Otherwise, an assertion may fail or a null pointer may be returned.
  template <typename T>
  const T* data_as() const {
    const uint8_t* bytes = data();
    return reinterpret_cast<const T*>(bytes);
  }

  /// \brief Return the buffer's data as a span of `const T`
  ///
  /// The span holds `size() / sizeof(T)` values (integer division), starting at
  /// the pointer returned by `data_as<T>()`.
  template <typename T>
  util::span<const T> span_as() const {
    const auto num_values = static_cast<size_t>(size() / sizeof(T));
    return util::span(data_as<T>(), num_values);
  }

  /// \brief Return a writable pointer to the buffer's data
  ///
  /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
  /// are true). When it is not, the CPU/mutability checks fail in builds where
  /// NDEBUG is not defined, and a null pointer is returned otherwise.
  ///
  /// To get the buffer's mutable data address regardless of its device, call
  /// `mutable_address()`.
  uint8_t* mutable_data() {
#ifndef NDEBUG
    CheckCPU();
    CheckMutable();
#endif
    if (ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_)) {
      return const_cast<uint8_t*>(data_);
    }
    return NULLPTR;
  }

  /// \brief Return the pointer obtained from `mutable_data()`, reinterpreted as `T*`
  ///
  /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
  /// are true).  Otherwise, an assertion may fail or a null pointer may
  /// be returned.
  template <typename T>
  T* mutable_data_as() {
    uint8_t* bytes = mutable_data();
    return reinterpret_cast<T*>(bytes);
  }

  /// \brief Return the buffer's mutable data as a span of `T`
  ///
  /// The span holds `size() / sizeof(T)` values (integer division), starting at
  /// the pointer returned by `mutable_data_as<T>()`.
  template <typename T>
  util::span<T> mutable_span_as() {
    const auto num_values = static_cast<size_t>(size() / sizeof(T));
    return util::span(mutable_data_as<T>(), num_values);
  }

  /// \brief Return the device address of the buffer's data
  ///
  /// The stored data pointer is converted to an integer as is; unlike `data()`,
  /// no CPU check is made here.
  uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }

  /// \brief Return a writable device address to the buffer's data
  ///
  /// The buffer has to be a mutable buffer (`is_mutable()` is true). When it is
  /// not, the mutability check fails in builds where NDEBUG is not defined, and
  /// 0 is returned otherwise.
  uintptr_t mutable_address() const {
#ifndef NDEBUG
    CheckMutable();
#endif
    if (ARROW_PREDICT_TRUE(is_mutable_)) {
      return reinterpret_cast<uintptr_t>(data_);
    }
    return 0;
  }

  /// \brief Return the buffer's size in bytes
  ///
  /// This is the number of bytes that might have valid data; it never exceeds
  /// `capacity()`.
  int64_t size() const { return size_; }

  /// \brief Return the buffer's capacity (number of allocated bytes)
  ///
  /// The capacity is always greater than or equal to `size()`.
  int64_t capacity() const { return capacity_; }

  /// \brief Whether the buffer is directly CPU-accessible
  ///
  /// If this function returns true, you can read directly from the buffer's
  /// `data()` pointer.  Otherwise, you'll have to `View()` or `Copy()` it
  /// (`data()` returns a null pointer for non-CPU buffers).
  bool is_cpu() const { return is_cpu_; }

  /// \brief Whether the buffer is mutable
  ///
  /// If this function returns true, you are allowed to modify buffer contents
  /// using the pointer returned by `mutable_data()` or `mutable_address()`.
  /// Buffers created by the constructors of this base class are not mutable.
  bool is_mutable() const { return is_mutable_; }

  /// \brief Return the device reported by the buffer's memory manager
  const std::shared_ptr<Device>& device() const { return memory_manager_->device(); }

  /// \brief Return the memory manager associated with this buffer
  const std::shared_ptr<MemoryManager>& memory_manager() const { return memory_manager_; }

  /// \brief Return the device allocation type of this buffer
  ///
  /// This is the type assigned when the memory manager was set, unless a device
  /// type override was given at construction.
  DeviceAllocationType device_type() const { return device_type_; }

  /// \brief Return the parent buffer; may be null when no parent was set
  std::shared_ptr<Buffer> parent() const { return parent_; }

  /// \brief Get a RandomAccessFile for reading a buffer
  ///
  /// The returned file object reads from this buffer's underlying memory.
  ///
  /// \return the file object wrapped in a Result
  static Result<std::shared_ptr<io::RandomAccessFile>> GetReader(std::shared_ptr<Buffer>);

  /// \brief Get an OutputStream for writing to a buffer
  ///
  /// The buffer must be mutable.  The returned stream object writes into the buffer's
  /// underlying memory (but it won't resize it).
  ///
  /// \return the stream object wrapped in a Result
  static Result<std::shared_ptr<io::OutputStream>> GetWriter(std::shared_ptr<Buffer>);

  /// \brief Copy buffer
  ///
  /// The buffer contents will be copied into a new buffer allocated by the
  /// given MemoryManager.  This function supports cross-device copies.
  ///
  /// \param[in] source the buffer whose contents are copied
  /// \param[in] to the memory manager that allocates the new buffer
  /// \return the new buffer wrapped in a Result
  static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
                                              const std::shared_ptr<MemoryManager>& to);

  /// \brief Copy a non-owned buffer
  ///
  /// This is useful for cases where the source memory area is externally managed
  /// (its lifetime not tied to the source Buffer), otherwise please use Copy().
  ///
  /// \param[in] source the buffer whose contents are copied
  /// \param[in] to the destination memory manager
  /// \return the new buffer, as a unique_ptr wrapped in a Result
  static Result<std::unique_ptr<Buffer>> CopyNonOwned(
      const Buffer& source, const std::shared_ptr<MemoryManager>& to);

  /// \brief View buffer
  ///
  /// Return a Buffer that reflects this buffer, seen potentially from another
  /// device, without making an explicit copy of the contents.  The underlying
  /// mechanism is typically implemented by the kernel or device driver, and may
Loading ...