Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev259 

/ include / arrow / array / builder_binary.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup binary-builders
///
/// @{

// ----------------------------------------------------------------------
// Binary and String

/// \brief Shared implementation of the builders for variable-length binary-like
/// arrays, which accumulate an offsets buffer and a contiguous value data buffer
///
/// \tparam TYPE the Arrow type class being built; its `offset_type` fixes the
/// width of the stored offsets and hence memory_limit()
template <typename TYPE>
class BaseBinaryBuilder
    : public ArrayBuilder,
      public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// \brief Construct a builder whose buffers are allocated from `pool` with the
  /// given `alignment`
  explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        offsets_builder_(pool, alignment),
        value_data_builder_(pool, alignment) {}

  /// \brief Construct from a type and a memory pool
  ///
  /// The `type` argument is not used: the constructor simply delegates to the
  /// pool-only constructor.
  BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
      : BaseBinaryBuilder(pool) {}

  /// \brief Append one value, copying `length` bytes starting at `value`
  ///
  /// A non-positive `length` appends an empty value without reading `value`.
  /// If an error is returned, no offset, data or validity bit has been appended,
  /// so the builder can keep being used.
  ///
  /// \return CapacityError if the value data would grow beyond memory_limit()
  Status Append(const uint8_t* value, offset_type length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    // The new value starts where the value data currently ends. Capture that
    // position now, but only record it once every fallible step has succeeded:
    // writing the offset first would leave a stray entry behind on failure and
    // shift every subsequently appended value by one slot.
    const auto value_start = static_cast<offset_type>(value_data_builder_.length());
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
    }

    offsets_builder_.UnsafeAppend(value_start);
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append one value given as a `char` pointer and a length
  Status Append(const char* value, offset_type length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  /// \brief Append one value given as a string view
  ///
  /// \return CapacityError if the view is too large to be described by
  /// `offset_type`, or if the value data would grow beyond memory_limit()
  Status Append(std::string_view value) {
    // Guard the narrowing cast below against silent truncation.
    ARROW_RETURN_NOT_OK(ValidateValueSize(value.size()));
    return Append(value.data(), static_cast<offset_type>(value.size()));
  }

  /// Extend the last appended value by appending more data at the end
  ///
  /// Unlike Append, this does not create a new offset.
  Status ExtendCurrent(const uint8_t* value, offset_type length) {
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
    }
    return Status::OK();
  }

  /// \brief Extend the last appended value with the bytes of a string view
  ///
  /// \return CapacityError if the view is too large to be described by
  /// `offset_type`, or if the value data would grow beyond memory_limit()
  Status ExtendCurrent(std::string_view value) {
    // Guard the narrowing cast below against silent truncation.
    ARROW_RETURN_NOT_OK(ValidateValueSize(value.size()));
    return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
                         static_cast<offset_type>(value.size()));
  }

  /// \brief Append `length` null entries
  ///
  /// Every new entry gets the current end of the value data as its offset, so
  /// the nulls occupy no value bytes.
  Status AppendNulls(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  /// \brief Append a single null entry
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendNextOffset();
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append a single valid, zero-length value
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendNextOffset();
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append `length` valid, zero-length values
  Status AppendEmptyValues(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  /// \brief Append without checking capacity
  ///
  /// Offsets and data should have been presized using Reserve() and
  /// ReserveData(), respectively.
  void UnsafeAppend(const uint8_t* value, offset_type length) {
    UnsafeAppendNextOffset();
    value_data_builder_.UnsafeAppend(value, length);
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a `char` buffer without checking capacity
  void UnsafeAppend(const char* value, offset_type length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  /// \brief Append a std::string without checking capacity
  ///
  /// The size is narrowed to `offset_type` without validation.
  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
  }

  /// \brief Append a string view without checking capacity
  ///
  /// The size is narrowed to `offset_type` without validation.
  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
  }

  /// Like ExtendCurrent, but do not check capacity
  void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
    value_data_builder_.UnsafeAppend(value, length);
  }

  /// Like ExtendCurrent(std::string_view), but do not check capacity
  void UnsafeExtendCurrent(std::string_view value) {
    UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
                        static_cast<offset_type>(value.size()));
  }

  /// \brief Append a null entry without checking capacity
  void UnsafeAppendNull() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(false);
  }

  /// \brief Append a valid, zero-length value without checking capacity
  void UnsafeAppendEmptyValue() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a sequence of strings in one shot.
  ///
  /// Value data capacity is reserved for the combined size of all strings,
  /// including those that `valid_bytes` marks as null; the bytes of null
  /// entries are not copied.
  ///
  /// \param[in] values a vector of strings
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const std::vector<std::string>& values,
                      const uint8_t* valid_bytes = NULLPTR) {
    std::size_t total_length = std::accumulate(
        values.begin(), values.end(), 0ULL,
        [](uint64_t sum, const std::string& str) { return sum + str.size(); });
    ARROW_RETURN_NOT_OK(Reserve(values.size()));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));

    if (valid_bytes != NULLPTR) {
      for (std::size_t i = 0; i < values.size(); ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          value_data_builder_.UnsafeAppend(
              reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
        }
      }
    } else {
      for (const auto& value : values) {
        UnsafeAppendNextOffset();
        value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
                                         value.size());
      }
    }

    UnsafeAppendToBitmap(valid_bytes, values.size());
    return Status::OK();
  }

  /// \brief Append a sequence of nul-terminated strings in one shot.
  ///        If one of the values is NULL, it is processed as a null
  ///        value even if the corresponding valid_bytes entry is 1.
  ///
  /// \param[in] values a contiguous C array of nul-terminated char *
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const char** values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    // First pass: measure every non-NULL string so that all capacity can be
    // reserved up front and the copies below can use the Unsafe* methods.
    std::size_t total_length = 0;
    std::vector<std::size_t> value_lengths(length);
    bool have_null_value = false;
    for (int64_t i = 0; i < length; ++i) {
      if (values[i] != NULLPTR) {
        auto value_length = strlen(values[i]);
        value_lengths[i] = value_length;
        total_length += value_length;
      } else {
        have_null_value = true;
      }
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));

    if (valid_bytes) {
      // Validity bits are copied from `valid_bytes` in runs. A run is flushed
      // early whenever an entry is flagged valid but its pointer is NULL, so
      // that this entry can be recorded as null instead.
      int64_t valid_bytes_offset = 0;
      for (int64_t i = 0; i < length; ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          if (values[i]) {
            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                             value_lengths[i]);
          } else {
            UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
                                 i - valid_bytes_offset);
            UnsafeAppendToBitmap(false);
            valid_bytes_offset = i + 1;
          }
        }
      }
      // Flush the validity bits of the final run.
      UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
    } else {
      if (have_null_value) {
        // No validity input was given: derive it from the NULL pointers.
        std::vector<uint8_t> valid_vector(length, 0);
        for (int64_t i = 0; i < length; ++i) {
          UnsafeAppendNextOffset();
          if (values[i]) {
            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                             value_lengths[i]);
            valid_vector[i] = 1;
          }
        }
        UnsafeAppendToBitmap(valid_vector.data(), length);
      } else {
        for (int64_t i = 0; i < length; ++i) {
          UnsafeAppendNextOffset();
          value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
                                           value_lengths[i]);
        }
        UnsafeAppendToBitmap(NULLPTR, length);
      }
    }
    return Status::OK();
  }

  /// \brief Append `length` entries of `array`, starting at logical index `offset`
  ///
  /// Reads the validity bitmap, offsets and value data from buffers 0, 1 and 2
  /// of the span and copies the referenced bytes into this builder.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    auto bitmap = array.GetValues<uint8_t>(0, 0);
    auto offsets = array.GetValues<offset_type>(1);
    auto data = array.GetValues<uint8_t>(2, 0);
    auto total_length = offsets[offset + length] - offsets[offset];
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ReserveData(total_length));
    for (int64_t i = 0; i < length; i++) {
      // A missing bitmap means that every entry is valid.
      if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
        const offset_type start = offsets[offset + i];
        const offset_type end = offsets[offset + i + 1];
        UnsafeAppend(data + start, end - start);
      } else {
        UnsafeAppendNull();
      }
    }
    return Status::OK();
  }

  /// \brief Reset the base builder state together with the offsets and value
  /// data builders
  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_data_builder_.Reset();
  }

  /// \brief Check that `new_bytes` more bytes of value data still fit
  ///
  /// \return CapacityError if the value data would grow beyond memory_limit()
  Status ValidateOverflow(int64_t new_bytes) {
    const int64_t current_bytes = value_data_builder_.length();
    // Compare against the remaining headroom instead of forming
    // `current_bytes + new_bytes`, which is undefined behavior when the signed
    // sum overflows int64_t.
    if (ARROW_PREDICT_FALSE(new_bytes > memory_limit() - current_bytes)) {
      // The true sum exceeds memory_limit() here, hence it is positive, and it
      // is below 2^64, so unsigned arithmetic reports it exactly.
      const uint64_t new_size =
          static_cast<uint64_t>(current_bytes) + static_cast<uint64_t>(new_bytes);
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    }
    return Status::OK();
  }

  /// \brief Resize the builder to hold `capacity` entries
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    // One more than requested for offsets
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
    return ArrayBuilder::Resize(capacity);
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return value_data_builder_.Reserve(elements);
  }

  /// \brief Assemble the accumulated buffers into an ArrayData and reset the
  /// builder
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    // Write final offset (values length)
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // These buffers' padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
    ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
                           null_count_, 0);
    Reset();
    return Status::OK();
  }

  /// \return data pointer of the value data builder
  const uint8_t* value_data() const { return value_data_builder_.data(); }
  /// \return size of values buffer so far
  int64_t value_data_length() const { return value_data_builder_.length(); }
  /// \return capacity of values buffer
  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }

  /// \return data pointer of the offsets builder
  const offset_type* offsets_data() const { return offsets_builder_.data(); }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
    const offset_type* offsets = offsets_builder_.data();
    const auto offset = offsets[i];
    if (i == (length_ - 1)) {
      // The closing offset of the last value is only written by
      // FinishInternal(), so its end is the current end of the value data.
      *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
    } else {
      *out_length = offsets[i + 1] - offset;
    }
    return value_data_builder_.data() + offset;
  }

  /// \return the start offset recorded for entry `i`
  offset_type offset(int64_t i) const { return offsets_data()[i]; }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const {
    offset_type value_length;
    const uint8_t* value_data = GetValue(i, &value_length);
    return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
  }

  /// \return the maximum number of value data bytes this builder accepts
  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

 protected:
  TypedBufferBuilder<offset_type> offsets_builder_;
  TypedBufferBuilder<uint8_t> value_data_builder_;

  /// \brief Append the current value data length as the next offset
  Status AppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
  }

  /// \brief Like AppendNextOffset(), but without checking capacity
  void UnsafeAppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
  }

  /// \brief Reject sizes that cannot be represented as `offset_type`
  ///
  /// The std::string_view overloads narrow `size_t` to `offset_type`; without
  /// this check an oversized view would be silently truncated, or turned into a
  /// negative length and appended as an empty value.
  static Status ValidateValueSize(std::size_t size) {
    constexpr auto kMaxLength =
        static_cast<uint64_t>(std::numeric_limits<offset_type>::max());
    if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(size) > kMaxLength)) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", size);
    }
    return Status::OK();
  }
};

/// \class BinaryBuilder
/// \brief Builder class for variable-length binary data
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
 public:
  // Reuse the constructors of the base builder as-is.
  using BaseBinaryBuilder::BaseBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \return the data type of the arrays built by this class, i.e. binary()
  std::shared_ptr<DataType> type() const override {
    return binary();
  }

  /// \brief Finish the builder and store the result in `out` as a BinaryArray
  ///
  /// Forwards to FinishTyped().
  Status Finish(std::shared_ptr<BinaryArray>* out) {
    return FinishTyped(out);
  }
};

/// \class StringBuilder
/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 public:
  // Reuse the constructors of BinaryBuilder as-is.
  using BinaryBuilder::BinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \return the data type of the arrays built by this class, i.e. utf8()
  std::shared_ptr<DataType> type() const override {
    return utf8();
  }

  /// \brief Finish the builder and store the result in `out` as a StringArray
  ///
  /// Forwards to FinishTyped().
  Status Finish(std::shared_ptr<StringArray>* out) {
    return FinishTyped(out);
  }
};

/// \class LargeBinaryBuilder
/// \brief Builder class for large variable-length binary data
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
 public:
  // Reuse the constructors of the base builder as-is.
  using BaseBinaryBuilder::BaseBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \return the data type of the arrays built by this class, i.e. large_binary()
  std::shared_ptr<DataType> type() const override {
    return large_binary();
  }

  /// \brief Finish the builder and store the result in `out` as a
  /// LargeBinaryArray
  ///
  /// Forwards to FinishTyped().
  Status Finish(std::shared_ptr<LargeBinaryArray>* out) {
    return FinishTyped(out);
  }
};

/// \class LargeStringBuilder
/// \brief Builder class for large UTF8 strings
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
 public:
  // Reuse the constructors of LargeBinaryBuilder as-is.
  using LargeBinaryBuilder::LargeBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \return the data type of the arrays built by this class, i.e. large_utf8()
  std::shared_ptr<DataType> type() const override {
    return large_utf8();
  }

  /// \brief Finish the builder and store the result in `out` as a
  /// LargeStringArray
  ///
  /// Forwards to FinishTyped().
  Status Finish(std::shared_ptr<LargeStringArray>* out) {
    return FinishTyped(out);
  }
};

// ----------------------------------------------------------------------
// BinaryViewBuilder, StringViewBuilder
//
// These builders do not support building raw pointer view arrays.

namespace internal {

// We allocate medium-sized memory chunks and accumulate data in those, which
// may result in some waste if there are many large-ish strings. If a string
// comes along that does not fit into the current block, we allocate a new
// block and write into that.
//
// Later we can implement optimizations to continue filling underfull blocks
// after encountering a large string that required allocating a new block.
/// \brief Accumulates the out-of-line bytes of view values in a sequence of
/// separately allocated blocks
class ARROW_EXPORT StringHeapBuilder {
 public:
  static constexpr int64_t kDefaultBlocksize = 32 << 10;  // 32KB

  /// \brief Construct an empty heap; blocks are allocated from `pool` with the
  /// given `alignment`
  StringHeapBuilder(MemoryPool* pool, int64_t alignment)
      : pool_(pool), alignment_(alignment) {}

  /// \brief Set the default size of blocks allocated from now on
  ///
  /// Reserve() never allocates a block larger than ValueSizeLimit(), whatever
  /// value is set here.
  void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }

  using c_type = BinaryViewType::c_type;

  /// \brief Build the view for a value, copying its bytes into the current
  /// block when they do not fit inline
  ///
  /// Values of at most BinaryViewType::kInlineSize bytes are returned as inline
  /// views and do not touch the blocks.
  ///
  /// \tparam Safe if true, capacity is reserved first and a Result is
  /// returned; if false, the caller must already have reserved enough capacity
  /// with Reserve()
  template <bool Safe>
  std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
                                                          int64_t length) {
    if (length <= BinaryViewType::kInlineSize) {
      return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
    }

    if constexpr (Safe) {
      ARROW_RETURN_NOT_OK(Reserve(length));
    }

    // The view refers to the most recently allocated block and to the write
    // position inside it.
    auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
                                         static_cast<int32_t>(blocks_.size() - 1),
                                         current_offset_);

    memcpy(current_out_buffer_, value, static_cast<size_t>(length));
    current_out_buffer_ += length;
    current_remaining_bytes_ -= length;
    current_offset_ += static_cast<int32_t>(length);
    return v;
  }

  /// \return the largest value size, in bytes, accepted by Reserve()
  static constexpr int64_t ValueSizeLimit() {
    return std::numeric_limits<int32_t>::max();
  }

  /// \brief Ensure that the indicated number of bytes can be appended via
  /// UnsafeAppend operations without the need to allocate more memory
  ///
  /// If the current block is too small, it is finished and a new block is
  /// allocated.
  ///
  /// \return CapacityError if `num_bytes` exceeds ValueSizeLimit()
  Status Reserve(int64_t num_bytes) {
    if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
      return Status::CapacityError(
          "BinaryView or StringView elements cannot reference "
          "strings larger than 2GB");
    }
    if (num_bytes > current_remaining_bytes_) {
      ARROW_RETURN_NOT_OK(FinishLastBlock());
      // Positions inside a block are tracked with the 32-bit current_offset_,
      // so a block must never be larger than ValueSizeLimit(): cap the
      // configured block size accordingly. `num_bytes` was checked above.
      const int64_t default_block_size = std::min(blocksize_, ValueSizeLimit());
      current_remaining_bytes_ = std::max(num_bytes, default_block_size);
      ARROW_ASSIGN_OR_RAISE(
          std::shared_ptr<ResizableBuffer> new_block,
          AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
      current_offset_ = 0;
      current_out_buffer_ = new_block->mutable_data();
      blocks_.emplace_back(std::move(new_block));
    }
    return Status::OK();
  }

  /// \brief Drop all blocks and return to the freshly constructed write state
  ///
  /// The configured block size is kept.
  void Reset() {
    current_offset_ = 0;
    current_out_buffer_ = NULLPTR;
    current_remaining_bytes_ = 0;
    blocks_.clear();
  }

  /// \return the number of bytes still available in the current block
  int64_t current_remaining_bytes() const { return current_remaining_bytes_; }

  /// \brief Finish the last block, if any, and hand over all blocks
  ///
  /// The write position is cleared, so the next Reserve() starts a new block.
  Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
    if (!blocks_.empty()) {
      ARROW_RETURN_NOT_OK(FinishLastBlock());
    }
    current_offset_ = 0;
    current_out_buffer_ = NULLPTR;
    current_remaining_bytes_ = 0;
    return std::move(blocks_);
  }

 private:
  /// \brief Shrink the last block to the bytes actually written
  ///
  /// Does nothing when the current block has no unused bytes, which is also the
  /// case before the first block is allocated.
  Status FinishLastBlock() {
    if (current_remaining_bytes_ > 0) {
      // Avoid leaking uninitialized bytes from the allocator
      ARROW_RETURN_NOT_OK(
          blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
                                 /*shrink_to_fit=*/true));
      blocks_.back()->ZeroPadding();
    }
    return Status::OK();
  }

  MemoryPool* pool_;
  int64_t alignment_;
  int64_t blocksize_ = kDefaultBlocksize;
  std::vector<std::shared_ptr<ResizableBuffer>> blocks_;

  // Write position inside the last block, as an offset and as a pointer
  int32_t current_offset_ = 0;
  uint8_t* current_out_buffer_ = NULLPTR;
  // Unused bytes left in the last block
  int64_t current_remaining_bytes_ = 0;
};

}  // namespace internal

/// \brief Builder class for binary view arrays
///
/// Short values are stored inline in the view; longer values are copied into
/// data buffers managed by an internal::StringHeapBuilder.
class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
 public:
  using TypeClass = BinaryViewType;

  // this constructor provided for MakeBuilder compatibility
  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);

  /// \brief Construct a builder whose buffers are allocated from `pool` with the
  /// given `alignment`
  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        data_builder_(pool, alignment),
        data_heap_builder_(pool, alignment) {}

  /// Set the size for future preallocated data buffers.
  ///
  /// The default size is 32KB, so after each 32KB of string data appended to the builder
  /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
  /// frequency of allocation, or to a smaller value to lower the overhead of each
  /// allocation.
  void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }

  /// The number of bytes which can be appended to this builder without allocating another
  /// data buffer.
  int64_t current_block_bytes_remaining() const {
    return data_heap_builder_.current_remaining_bytes();
  }

  /// \brief Append one value, copying `length` bytes starting at `value`
  ///
  /// If an error is returned, neither a view nor a validity bit has been
  /// appended, so the builder can keep being used.
  Status Append(const uint8_t* value, int64_t length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    // Build the view first: it is the only fallible step left. Marking the slot
    // as valid before it succeeds would leave the validity bitmap one entry
    // ahead of the views buffer when the heap append fails.
    ARROW_ASSIGN_OR_RAISE(auto v,
                          data_heap_builder_.Append</*Safe=*/true>(value, length));
    UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(v);
    return Status::OK();
  }

  /// \brief Append one value given as a `char` pointer and a length
  Status Append(const char* value, int64_t length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  /// \brief Append one value given as a string view
  Status Append(std::string_view value) {
    return Append(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Append without checking capacity
  ///
  /// Builder should have been presized using Reserve() and ReserveData(),
  /// respectively, and the value must not be larger than 2GB
  void UnsafeAppend(const uint8_t* value, int64_t length) {
    UnsafeAppendToBitmap(true);
    auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
    data_builder_.UnsafeAppend(v);
  }

  /// \brief Append a `char` buffer without checking capacity
  void UnsafeAppend(const char* value, int64_t length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  /// \brief Append a std::string without checking capacity
  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
  }

  /// \brief Append a string view without checking capacity
  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
  }

  /// \brief Ensures there is enough allocated available capacity in the
  /// out-of-line data heap to append the indicated number of bytes without
  /// additional allocations
  Status ReserveData(int64_t length);

  /// \brief Append `length` null elements, each stored as a default view
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty element (length-0 inline string)
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element without checking capacity
  void UnsafeAppendNull() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(false);
  }

  /// \brief Append a single empty element without checking capacity
  void UnsafeAppendEmptyValue() {
    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
  /// the underlying out-of-line string memory to avoid memory lifetime issues
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  void Reset() override;

  /// \brief Resize the builder to hold at least `capacity` elements
  ///
  /// The requested capacity is raised to kMinBuilderCapacity when it is smaller.
  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \return the data type of the arrays built by this class, i.e. binary_view()
  std::shared_ptr<DataType> type() const override { return binary_view(); }

 protected:
  // Accumulates one view per appended element
  TypedBufferBuilder<BinaryViewType::c_type> data_builder_;

  // Accumulates out-of-line data in fixed-size chunks which are then attached
  // to the resulting ArrayData
  internal::StringHeapBuilder data_heap_builder_;
};

/// \brief Builder class for UTF8 string view arrays
class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
 public:
  // Reuse the constructors of BinaryViewBuilder as-is.
  using BinaryViewBuilder::BinaryViewBuilder;

  /// \return the data type of the arrays built by this class, i.e. utf8_view()
  std::shared_ptr<DataType> type() const override {
    return utf8_view();
  }
};

// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder

/// \brief Builder class for arrays whose values all have the same byte width
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment);

  /// \brief Append one value, copying byte_width() bytes starting at `value`
  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  /// \brief Append one value given as a `char` pointer
  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  /// \brief Append one value given as a string view
  ///
  /// The size of the view is only checked in debug builds (see UnsafeAppend);
  /// byte_width() bytes are copied from its data pointer regardless.
  Status Append(std::string_view view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  /// \brief Append one value given as a std::string
  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  /// \brief Append one value given as the contents of a Buffer
  Status Append(const Buffer& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  /// \brief Append one value given as the contents of a shared Buffer
  ///
  /// The pointer is dereferenced unconditionally and must not be null.
  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  /// \brief Append one value given as a fixed-size byte array
  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  /// \brief Append `length` entries of `array`, starting at logical index `offset`
  ///
  /// Forwards to AppendValues() with the value data of buffer 1, advanced to
  /// the first selected entry, and the validity bitmap of buffer 0 together
  /// with the bit offset of that entry.
  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(
        array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  /// \brief Append byte_width() bytes starting at `value` without checking
  /// capacity
  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    // With a zero byte width there is nothing to copy and `value` is not read.
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  /// \brief Append a `char` buffer without checking capacity
  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  /// \brief Append a string view without checking capacity
  ///
  /// The size of the view is passed to CheckValueSize() in debug builds only.
  void UnsafeAppend(std::string_view value) {
#ifndef NDEBUG
    // CheckValueSize() takes an int64_t: convert explicitly to that type.
    CheckValueSize(static_cast<int64_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  /// \brief Append the contents of a Buffer without checking capacity
  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }

  /// \brief Append the contents of a shared Buffer without checking capacity
  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  /// \brief Append a null entry, filled with zero bytes, without checking
  /// capacity
  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
  }

  /// \brief Check that `new_bytes` more bytes of value data still fit
  ///
  /// \return CapacityError if the value data would grow beyond memory_limit()
  Status ValidateOverflow(int64_t new_bytes) const {
    const int64_t current_bytes = byte_builder_.length();
    // Compare against the remaining headroom instead of forming
    // `current_bytes + new_bytes`: memory_limit() is close to the int64_t
    // maximum, so the signed sum could overflow, which is undefined behavior.
    if (ARROW_PREDICT_FALSE(new_bytes > memory_limit() - current_bytes)) {
      // The true sum exceeds memory_limit() here, hence it is positive, and it
      // is below 2^64, so unsigned arithmetic reports it exactly.
      const uint64_t new_size =
          static_cast<uint64_t>(current_bytes) + static_cast<uint64_t>(new_bytes);
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    }
    return Status::OK();
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  /// \brief Finish the builder and store the result in `out` as a
  /// FixedSizeBinaryArray
  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  /// \return the width, in bytes, of every value
  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const;

  /// \return the maximum number of value data bytes this builder accepts
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  /// \return the data type of the arrays built by this class, i.e.
  /// fixed_size_binary(byte_width())
  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    uint8_t* data_ptr = byte_builder_.mutable_data();
    return data_ptr + i * byte_width_;
  }

  void CheckValueSize(int64_t size);
};

/// @}

// ----------------------------------------------------------------------
// Chunked builders: build a sequence of BinaryArray or StringArray that are
// limited to a particular size (to the upper limit of 2GB)

namespace internal {

/// \brief Builds a sequence of binary arrays ("chunks"), starting a new chunk
/// whenever the current one reaches its value data or element count limit
class ARROW_EXPORT ChunkedBinaryBuilder {
 public:
  explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
                                MemoryPool* pool = default_memory_pool());

  ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
                       MemoryPool* pool = default_memory_pool());

  virtual ~ChunkedBinaryBuilder() = default;

  /// \brief Append one value, starting a new chunk first if the current chunk
  /// cannot take it
  ///
  /// A value larger than max_chunk_value_length_ is stored in an oversize chunk
  /// of its own.
  Status Append(const uint8_t* value, int32_t length) {
    if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
                            max_chunk_value_length_)) {
      if (builder_->value_data_length() == 0) {
        // The current item is larger than max_chunk_value_length_;
        // this chunk will be oversize and hold *only* this item
        ARROW_RETURN_NOT_OK(builder_->Append(value, length));
        return NextChunk();
      }
      // The current item would cause builder_->value_data_length() to exceed
      // max_chunk_value_length_, so finish this chunk and append the current item
      // to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
      return Append(value, length);
    }

    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      // The current item would cause builder_->length() to exceed max_chunk_length_, so
      // finish this chunk and append the current item to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
    }

    return builder_->Append(value, length);
  }

  /// \brief Append one value given as a string view
  ///
  /// \return CapacityError if the view is larger than what a 32-bit length can
  /// describe
  Status Append(std::string_view value) {
    // Guard the narrowing cast below: an oversized view would otherwise be
    // silently truncated or turned into a negative length.
    constexpr auto kMaxValueLength =
        static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
    if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(value.size()) > kMaxValueLength)) {
      return Status::CapacityError("cannot append a value of ", value.size(),
                                   " bytes: binary values are limited to ",
                                   kMaxValueLength, " bytes");
    }
    return Append(reinterpret_cast<const uint8_t*>(value.data()),
                  static_cast<int32_t>(value.size()));
  }

  /// \brief Append a null, starting a new chunk first if the current chunk
  /// already holds max_chunk_length_ elements
  Status AppendNull() {
    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      ARROW_RETURN_NOT_OK(NextChunk());
    }
    return builder_->AppendNull();
  }

  Status Reserve(int64_t values);

  virtual Status Finish(ArrayVector* out);

 protected:
  Status NextChunk();

  // maximum total character data size per chunk
  int64_t max_chunk_value_length_;

  // maximum elements allowed per chunk
  int64_t max_chunk_length_ = kListMaximumElements;

  // when Reserve() would cause builder_ to exceed its max_chunk_length_,
  // add to extra_capacity_ instead and wait to reserve until the next chunk
  int64_t extra_capacity_ = 0;

  std::unique_ptr<BinaryBuilder> builder_;
  std::vector<std::shared_ptr<Array>> chunks_;
};

/// \brief Variant of ChunkedBinaryBuilder that reuses its constructors and
/// overrides Finish()
class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
 public:
  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;

  // Defined out of line. NOTE(review): presumably yields string rather than
  // binary chunks -- confirm against the implementation file.
  Status Finish(ArrayVector* out) override;
};

}  // namespace internal

}  // namespace arrow