Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev259 

/ include / arrow / array / builder_base.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>  // IWYU pragma: keep
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/array_primitive.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

namespace internal {

template <class Builder, class V>
class ArrayBuilderExtraOps {
 public:
  /// \brief Append a value from an optional or null if it has no value.
  Status AppendOrNull(const std::optional<V>& value) {
    auto* self = static_cast<Builder*>(this);
    return value.has_value() ? self->Append(*value) : self->AppendNull();
  }

  /// \brief Append a value from an optional or null if it has no value.
  ///
  /// Unsafe methods don't check existing size.
  void UnsafeAppendOrNull(const std::optional<V>& value) {
    auto* self = static_cast<Builder*>(this);
    return value.has_value() ? self->UnsafeAppend(*value) : self->UnsafeAppendNull();
  }
};

}  // namespace internal

/// \defgroup numeric-builders Concrete builder subclasses for numeric types
/// @{
/// @}

/// \defgroup temporal-builders Concrete builder subclasses for temporal types
/// @{
/// @}

/// \defgroup binary-builders Concrete builder subclasses for binary types
/// @{
/// @}

/// \defgroup nested-builders Concrete builder subclasses for nested types
/// @{
/// @}

/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
/// @{
/// @}

/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded
/// arrays
/// @{
/// @}

constexpr int64_t kMinBuilderCapacity = 1 << 5;
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;

/// Base class for all data array builders.
///
/// This class provides a facilities for incrementally building the null bitmap
/// (see Append methods) and as a side effect the current number of slots and
/// the null count.
///
/// \note Users are expected to use builders as one of the concrete types below.
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
 public:
  explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment)
      : pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {}

  ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);

  virtual ~ArrayBuilder() = default;

  /// For nested types. Since the objects are owned by this class instance, we
  /// skip shared pointers and just return a raw pointer
  ArrayBuilder* child(int i) { return children_[i].get(); }

  const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }

  int num_children() const { return static_cast<int>(children_.size()); }

  virtual int64_t length() const { return length_; }
  int64_t null_count() const { return null_count_; }
  int64_t capacity() const { return capacity_; }

  /// \brief Ensure that enough memory has been allocated to fit the indicated
  /// number of total elements in the builder, including any that have already
  /// been appended. Does not account for reallocations that may be due to
  /// variable size data, like binary values. To make space for incremental
  /// appends, use Reserve instead.
  ///
  /// \param[in] capacity the minimum number of total array values to
  ///            accommodate. Must be greater than the current capacity.
  /// \return Status
  virtual Status Resize(int64_t capacity);

  /// \brief Ensure that there is enough space allocated to append the indicated
  /// number of elements without any further reallocation. Overallocation is
  /// used in order to minimize the impact of incremental Reserve() calls.
  /// Note that additional_capacity is relative to the current number of elements
  /// rather than to the current capacity, so calls to Reserve() which are not
  /// interspersed with addition of new elements may not increase the capacity.
  ///
  /// \param[in] additional_capacity the number of additional array values
  /// \return Status
  Status Reserve(int64_t additional_capacity) {
    auto current_capacity = capacity();
    auto min_capacity = length() + additional_capacity;
    if (min_capacity <= current_capacity) return Status::OK();

    // leave growth factor up to BufferBuilder
    auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
    return Resize(new_capacity);
  }

  /// Reset the builder.
  virtual void Reset();

  /// \brief Append a null value to builder
  virtual Status AppendNull() = 0;
  /// \brief Append a number of null values to builder
  virtual Status AppendNulls(int64_t length) = 0;

  /// \brief Append a non-null value to builder
  ///
  /// The appended value is an implementation detail, but the corresponding
  /// memory slot is guaranteed to be initialized.
  /// This method is useful when appending a null value to a parent nested type.
  virtual Status AppendEmptyValue() = 0;

  /// \brief Append a number of non-null values to builder
  ///
  /// The appended values are an implementation detail, but the corresponding
  /// memory slot is guaranteed to be initialized.
  /// This method is useful when appending null values to a parent nested type.
  virtual Status AppendEmptyValues(int64_t length) = 0;

  /// \brief Append a value from a scalar
  Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
  virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
  virtual Status AppendScalars(const ScalarVector& scalars);

  /// \brief Append a range of values from an array.
  ///
  /// The given array must be the same type as the builder.
  virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array),
                                  int64_t ARROW_ARG_UNUSED(offset),
                                  int64_t ARROW_ARG_UNUSED(length)) {
    return Status::NotImplemented("AppendArraySlice for builder for ", *type());
  }

  /// \brief Return result of builder as an internal generic ArrayData
  /// object. Resets builder except for dictionary builder
  ///
  /// \param[out] out the finalized ArrayData object
  /// \return Status
  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;

  /// \brief Return result of builder as an Array object.
  ///
  /// The builder is reset except for DictionaryBuilder.
  ///
  /// \param[out] out the finalized Array object
  /// \return Status
  Status Finish(std::shared_ptr<Array>* out);

  /// \brief Return result of builder as an Array object.
  ///
  /// The builder is reset except for DictionaryBuilder.
  ///
  /// \return The finalized Array object
  Result<std::shared_ptr<Array>> Finish();

  /// \brief Return the type of the built Array
  virtual std::shared_ptr<DataType> type() const = 0;

 protected:
  /// Append to null bitmap
  Status AppendToBitmap(bool is_valid);

  /// Vector append. Treat each zero byte as a null.   If valid_bytes is null
  /// assume all of length bits are valid.
  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);

  /// Uniform append.  Append N times the same validity bit.
  Status AppendToBitmap(int64_t num_bits, bool value);

  /// Set the next length bits to not null (i.e. valid).
  Status SetNotNull(int64_t length);

  // Unsafe operations (don't check capacity/don't resize)

  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }

  // Append to null bitmap, update the length
  void UnsafeAppendToBitmap(bool is_valid) {
    null_bitmap_builder_.UnsafeAppend(is_valid);
    ++length_;
    if (!is_valid) ++null_count_;
  }

  // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
  // assume all of length bits are valid.
  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
    if (valid_bytes == NULLPTR) {
      return UnsafeSetNotNull(length);
    }
    null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
    length_ += length;
    null_count_ = null_bitmap_builder_.false_count();
  }

  // Vector append. Copy from a given bitmap. If bitmap is null assume
  // all of length bits are valid.
  void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
    if (bitmap == NULLPTR) {
      return UnsafeSetNotNull(length);
    }
    null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
    length_ += length;
    null_count_ = null_bitmap_builder_.false_count();
  }

  // Append the same validity value a given number of times.
  void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
    if (value) {
      UnsafeSetNotNull(num_bits);
    } else {
      UnsafeSetNull(num_bits);
    }
  }

  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);

  // Set the next validity bits to not null (i.e. valid).
  void UnsafeSetNotNull(int64_t length);

  // Set the next validity bits to null (i.e. invalid).
  void UnsafeSetNull(int64_t length);

  static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);

  /// \brief Finish to an array of the specified ArrayType
  template <typename ArrayType>
  Status FinishTyped(std::shared_ptr<ArrayType>* out) {
    std::shared_ptr<Array> out_untyped;
    ARROW_RETURN_NOT_OK(Finish(&out_untyped));
    *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
    return Status::OK();
  }

  // Check the requested capacity for validity
  Status CheckCapacity(int64_t new_capacity) {
    if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
      return Status::Invalid(
          "Resize capacity must be positive (requested: ", new_capacity, ")");
    }

    if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
      return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
                             ", current length: ", length_, ")");
    }

    return Status::OK();
  }

  // Check for array type
  Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
                        const Array& array, const char* message);
  Status CheckArrayType(Type::type expected_type, const Array& array,
                        const char* message);

  MemoryPool* pool_;
  int64_t alignment_;

  TypedBufferBuilder<bool> null_bitmap_builder_;
  int64_t null_count_ = 0;

  // Array length, so far. Also, the index of the next element to be added
  int64_t length_ = 0;
  int64_t capacity_ = 0;

  // Child value array builders. These are owned by this class
  std::vector<std::shared_ptr<ArrayBuilder>> children_;

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};

/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the data type to create the builder for
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                   std::unique_ptr<ArrayBuilder>* out);

inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> out;
  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
  return out;
}

/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type, where any top-level or nested dictionary builders return the
/// exact index type specified by the type.
ARROW_EXPORT
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                             std::unique_ptr<ArrayBuilder>* out);

inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> out;
  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
  return out;
}

/// \brief Construct an empty DictionaryBuilder initialized optionally
/// with a preexisting dictionary
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the dictionary type to create the builder for
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                             const std::shared_ptr<Array>& dictionary,
                             std::unique_ptr<ArrayBuilder>* out);

inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
    MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> out;
  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
  return out;
}

}  // namespace arrow