Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev246 

/ src / arrow / python / numpy_to_arrow.cc

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Functions for pandas conversion via NumPy

#include "arrow/python/numpy_to_arrow.h"
#include "arrow/python/numpy_interop.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_generate.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/string.h"
#include "arrow/util/utf8.h"
#include "arrow/visit_type_inline.h"

#include "arrow/compute/api_scalar.h"

#include "arrow/python/common.h"
#include "arrow/python/datetime.h"
#include "arrow/python/helpers.h"
#include "arrow/python/iterators.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/numpy_internal.h"
#include "arrow/python/python_to_arrow.h"
#include "arrow/python/type_traits.h"
#include "arrow/python/vendored/pythoncapi_compat.h"

namespace arrow {

using internal::checked_cast;
using internal::CopyBitmap;
using internal::GenerateBitsUnrolled;

namespace py {

using internal::NumPyTypeSize;

// ----------------------------------------------------------------------
// Conversion utilities

namespace {

// Allocates a resizable validity bitmap able to hold `length` bits and clears
// every byte of it, so that all slots initially read as null.
//
// \param[in] pool memory pool used for the allocation
// \param[in] length number of bits (array slots) the bitmap must cover
// \param[out] out receives the newly allocated, zero-filled buffer
// \return an error Status if the allocation fails, OK otherwise
Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
                          std::shared_ptr<ResizableBuffer>* out) {
  const int64_t bitmap_nbytes = bit_util::BytesForBits(length);
  ARROW_ASSIGN_OR_RAISE(auto bitmap, AllocateResizableBuffer(bitmap_nbytes, pool));

  // Padding zeroed by AllocateResizableBuffer; the payload bytes are cleared here
  uint8_t* bitmap_bytes = bitmap->mutable_data();
  memset(bitmap_bytes, 0, static_cast<size_t>(bitmap_nbytes));

  *out = std::move(bitmap);
  return Status::OK();
}

// ----------------------------------------------------------------------
// Conversion from NumPy-in-Pandas to Arrow null bitmap

// Scans a 1-D NumPy array of type code TYPE and sets the validity bit of every
// element that is not a null sentinel according to npy_traits<TYPE>::isnull.
//
// Bits belonging to null elements are left untouched, so the caller is
// expected to pass a bitmap whose bits are already cleared.
//
// \param[in] arr the NumPy array to scan
// \param[in,out] bitmap validity bitmap with at least one bit per element
// \return the number of null elements found
template <int TYPE>
inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
  using traits = internal::npy_traits<TYPE>;
  using T = typename traits::value_type;

  int64_t null_count = 0;

  Ndarray1DIndexer<T> values(arr);
  // The index must be 64-bit: a plain `int` counter would overflow (undefined
  // behaviour) on arrays holding more than INT_MAX elements.
  for (int64_t i = 0; i < values.size(); ++i) {
    if (traits::isnull(values[i])) {
      ++null_count;
    } else {
      bit_util::SetBit(bitmap, i);
    }
  }

  return null_count;
}

// Visitor that derives an Arrow validity bitmap from the null sentinels
// (e.g. NaT, or pandas-style sentinels) found in a NumPy array.
class NumPyNullsConverter {
 public:
  /// Compute the null bitmap and null count for the given array.
  ///
  /// The bitmap is only allocated when the array's type can carry null
  /// sentinels at all; otherwise *out_null_bitmap_ is set to a null pointer
  /// and *out_null_count to 0.
  static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas,
                        std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
                        int64_t* out_null_count) {
    NumPyNullsConverter visitor(pool, arr, from_pandas);
    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &visitor));
    *out_null_bitmap_ = visitor.null_bitmap_;
    *out_null_count = visitor.null_count_;
    return Status::OK();
  }

  /// Per-dtype visitor hook invoked by VisitNumpyArrayInline.
  template <int TYPE>
  Status Visit(PyArrayObject* arr) {
    using traits = internal::npy_traits<TYPE>;

    // NumPy's NaT is always treated as null, whatever `from_pandas` says
    const bool is_temporal = (TYPE == NPY_DATETIME) || (TYPE == NPY_TIMEDELTA);
    // pandas's null sentinels are only observed on request
    const bool observes_pandas_nulls = from_pandas_ && traits::supports_nulls;

    if (!is_temporal && !observes_pandas_nulls) {
      // No sentinel can occur: leave the bitmap unallocated
      return Status::OK();
    }

    RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
    null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
    return Status::OK();
  }

 protected:
  NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas)
      : pool_(pool),
        arr_(arr),
        from_pandas_(from_pandas),
        null_bitmap_data_(nullptr),
        null_count_(0) {}

  MemoryPool* pool_;
  PyArrayObject* arr_;
  bool from_pandas_;
  std::shared_ptr<ResizableBuffer> null_bitmap_;
  uint8_t* null_bitmap_data_;
  int64_t null_count_;
};

// Translates a NumPy mask array into an Arrow validity bitmap.
//
// A non-zero mask value marks the slot as null (bit cleared); a zero mask
// value marks it as valid (bit set).
//
// \param[in] mask mask array, read as one uint8 value per slot
// \param[in] length number of slots to process
// \param[out] bitmap validity bitmap with room for at least `length` bits
// \return the null count, or -1 if `mask` is not a NumPy array
int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
  int64_t null_count = 0;

  if (!PyArray_Check(mask)) return -1;

  Ndarray1DIndexer<uint8_t> mask_values(mask);
  // `length` is 64-bit, so the index must be too: an `int` counter would
  // overflow (undefined behaviour) for masks longer than INT_MAX.
  for (int64_t i = 0; i < length; ++i) {
    if (mask_values[i]) {
      ++null_count;
      bit_util::ClearBit(bitmap, i);
    } else {
      bit_util::SetBit(bitmap, i);
    }
  }
  return null_count;
}

}  // namespace

// ----------------------------------------------------------------------
// Conversion from NumPy arrays (possibly originating from pandas) to Arrow
// format. Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for
// that

// Type visitor that converts a single NumPy array (plus an optional mask
// array) into one or more Arrow arrays of a requested Arrow type. The
// converted chunks are accumulated in out_arrays_ and exposed via result().
class NumPyConverter {
 public:
  // \param pool memory pool used for all allocations
  // \param arr the NumPy array to convert; it is reinterpreted as
  //   PyArrayObject* without any check and no reference is taken here
  // \param mo optional mask object; nullptr and Py_None both mean "no mask"
  // \param type the target Arrow type
  // \param from_pandas whether pandas null sentinels should be observed
  // \param cast_options options forwarded to compute::Cast when a cast is needed
  NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
                 const std::shared_ptr<DataType>& type, bool from_pandas,
                 const compute::CastOptions& cast_options = compute::CastOptions())
      : pool_(pool),
        type_(type),
        arr_(reinterpret_cast<PyArrayObject*>(arr)),
        dtype_(PyArray_DESCR(arr_)),
        mask_(nullptr),
        from_pandas_(from_pandas),
        cast_options_(cast_options),
        null_bitmap_data_(nullptr),
        null_count_(0) {
    if (mo != nullptr && mo != Py_None) {
      mask_ = reinterpret_cast<PyArrayObject*>(mo);
    }
    // Cache the array geometry; the stride is the one of the first dimension
    length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
    itemsize_ = static_cast<int64_t>(PyArray_ITEMSIZE(arr_));
    stride_ = static_cast<int64_t>(PyArray_STRIDES(arr_)[0]);
  }

  // True when consecutive elements are not exactly one item size apart
  bool is_strided() const { return itemsize_ != stride_; }

  // Run the conversion; on success the chunks are available through result()
  Status Convert();

  // The Arrow arrays produced by Convert()
  const ArrayVector& result() const { return out_arrays_; }

  // Primitive C types are converted through the generic native path
  template <typename T>
  enable_if_primitive_ctype<T, Status> Visit(const T& type) {
    return VisitNative<T>();
  }

  // Half floats go through the UInt16Type native path
  Status Visit(const HalfFloatType& type) { return VisitNative<UInt16Type>(); }

  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
  Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
  Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
  // Time types go through the native path of their integer storage type
  Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
  Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
  Status Visit(const DurationType& type) { return VisitNative<DurationType>(); }

  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }

  // NumPy ascii string arrays
  Status Visit(const BinaryType& type);

  // NumPy unicode arrays
  Status Visit(const StringType& type);

  Status Visit(const StructType& type);

  Status Visit(const FixedSizeBinaryType& type);

  // Default case
  Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); }

 protected:
  // Allocate a zeroed validity bitmap for length_ slots and cache its data pointer
  Status InitNullBitmap() {
    RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
    null_bitmap_data_ = null_bitmap_->mutable_data();
    return Status::OK();
  }

  // Called before ConvertData to ensure Numpy input buffer is in expected
  // Arrow layout
  template <typename ArrowType>
  Status PrepareInputData(std::shared_ptr<Buffer>* data);

  // ----------------------------------------------------------------------
  // Traditional visitor conversion for non-object arrays

  template <typename ArrowType>
  Status ConvertData(std::shared_ptr<Buffer>* data);

  // Finish the given builder and append the resulting array to out_arrays_
  template <typename T>
  Status PushBuilderResult(T* builder) {
    std::shared_ptr<Array> out;
    RETURN_NOT_OK(builder->Finish(&out));
    out_arrays_.emplace_back(out);
    return Status::OK();
  }

  // Wrap the given ArrayData in an Array and append it to out_arrays_
  Status PushArray(const std::shared_ptr<ArrayData>& data) {
    out_arrays_.emplace_back(MakeArray(data));
    return Status::OK();
  }

  // Generic conversion path for fixed-width types: compute the validity
  // bitmap, convert the data buffer, then assemble a single output array.
  template <typename ArrowType>
  Status VisitNative() {
    if (mask_ != nullptr) {
      // An explicit mask takes precedence over null sentinels in the values
      RETURN_NOT_OK(InitNullBitmap());
      null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
      // MaskToBitmap reports a non-array mask with -1
      if (null_count_ == -1) return Status::Invalid("Invalid mask type");
    } else {
      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
                                                 &null_count_));
    }

    std::shared_ptr<Buffer> data;
    RETURN_NOT_OK(ConvertData<ArrowType>(&data));

    // Buffer layout: validity bitmap (possibly null) followed by the values
    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
    return PushArray(arr_data);
  }

  // Build the NotImplemented status returned for unsupported target types
  Status TypeNotImplemented(std::string type_name) {
    return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
                                  "> conversion. ");
  }

  MemoryPool* pool_;
  // Target Arrow type
  std::shared_ptr<DataType> type_;
  // Input array and its dtype descriptor
  PyArrayObject* arr_;
  PyArray_Descr* dtype_;
  // Optional mask array; nullptr when no mask was supplied
  PyArrayObject* mask_;
  // Geometry of arr_, cached by the constructor
  int64_t length_;
  int64_t stride_;
  int64_t itemsize_;

  bool from_pandas_;
  compute::CastOptions cast_options_;

  // Used in visitor pattern
  ArrayVector out_arrays_;

  // Validity bitmap state shared between the visitor methods
  std::shared_ptr<ResizableBuffer> null_bitmap_;
  uint8_t* null_bitmap_data_;
  int64_t null_count_;
};

// Entry point of the conversion. Only 1-dimensional arrays are accepted.
// Object arrays are delegated to ConvertPySequence; every other dtype is
// dispatched on the target Arrow type, which must then be non-null.
Status NumPyConverter::Convert() {
  if (PyArray_NDIM(arr_) != 1) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  const bool is_object_array = (dtype_->type_num == NPY_OBJECT);
  if (!is_object_array) {
    if (type_ == nullptr) {
      return Status::Invalid("Must pass data type for non-object arrays");
    }
    // Visit the type to perform conversion
    return VisitTypeInline(*type_, this);
  }

  // An object array is converted like a normal Python sequence
  PyConversionOptions sequence_options;
  sequence_options.type = type_;
  sequence_options.from_pandas = from_pandas_;

  PyObject* values_obj = reinterpret_cast<PyObject*>(arr_);
  PyObject* mask_obj = reinterpret_cast<PyObject*>(mask_);
  ARROW_ASSIGN_OR_RAISE(auto converted,
                        ConvertPySequence(values_obj, mask_obj, sequence_options, pool_));
  out_arrays_ = converted->chunks();
  return Status::OK();
}

namespace {

// Casts a values buffer from `in_type` to `out_type` using compute::Cast.
//
// The input buffer and validity bitmap are wrapped in a temporary array, the
// cast is executed on `pool`, and the values buffer (buffer index 1) of the
// result is stored in *out.
Status CastBuffer(const std::shared_ptr<DataType>& in_type,
                  const std::shared_ptr<Buffer>& input, const int64_t length,
                  const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
                  const std::shared_ptr<DataType>& out_type,
                  const compute::CastOptions& cast_options, MemoryPool* pool,
                  std::shared_ptr<Buffer>* out) {
  // Wrap the raw buffers so that the compute layer can consume them
  auto source_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
  compute::ExecContext exec_context(pool);
  const auto source_array = MakeArray(source_data);

  ARROW_ASSIGN_OR_RAISE(
      std::shared_ptr<Array> converted,
      compute::Cast(*source_array, out_type, cast_options, &exec_context));

  const auto& converted_buffers = converted->data()->buffers;
  *out = converted_buffers[1];
  return Status::OK();
}

// Converts `length` values of FromType held in `input` into a freshly
// allocated buffer of ToType, using static_cast on each value (no range
// checking is performed).
template <typename FromType, typename ToType>
Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
                        std::shared_ptr<Buffer>* out) {
  ARROW_ASSIGN_OR_RAISE(auto converted, AllocateBuffer(sizeof(ToType) * length, pool));

  const FromType* source = reinterpret_cast<const FromType*>(input.data());
  ToType* target = reinterpret_cast<ToType*>(converted->mutable_data());
  for (int64_t i = 0; i < length; ++i) {
    target[i] = static_cast<ToType>(source[i]);
  }

  // `input` may be the buffer currently held by *out, so only replace it now
  *out = std::move(converted);
  return Status::OK();
}

// Gathers `length` values of type T located `stride` bytes apart, starting at
// `input_data`, into the contiguous array `output_data`. Each value is copied
// with memcpy, so the source positions need not be aligned for T.
template <typename T>
void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride,
                         T* output_data) {
  // Passing input_data as non-const is a concession to PyObject*
  for (int64_t i = 0; i < length; ++i) {
    const int8_t* source = input_data + i * stride;
    memcpy(&output_data[i], source, sizeof(T));
  }
}

// Gathers `length` values from `input_data`, taking every `stride`-th element
// (the stride is expressed in elements of T, not bytes), into the contiguous
// array `output_data`.
template <typename T>
void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) {
  // Passing input_data as non-const is a concession to PyObject*
  for (int64_t i = 0; i < length; ++i) {
    const int64_t source_index = i * stride;
    output_data[i] = input_data[source_index];
  }
}

// Visitor that copies the elements of a strided NumPy array into a newly
// allocated contiguous buffer.
class NumPyStridedConverter {
 public:
  // Copy `length` elements of `arr` into a new contiguous buffer stored in *out
  static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
                        std::shared_ptr<Buffer>* out) {
    NumPyStridedConverter visitor(arr, length, pool);
    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &visitor));
    *out = visitor.buffer_;
    return Status::OK();
  }

  // Per-dtype visitor hook invoked by VisitNumpyArrayInline
  template <int TYPE>
  Status Visit(PyArrayObject* arr) {
    using traits = internal::npy_traits<TYPE>;
    using T = typename traits::value_type;

    ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_));
    T* destination = reinterpret_cast<T*>(buffer_->mutable_data());

    const int64_t byte_stride = PyArray_STRIDES(arr)[0];
    // ARROW-16013: sizeof(T) is converted to a signed int64 before being used
    // as a divisor, otherwise the modulo and division below would be carried
    // out unsigned. Tests cannot catch this without ubsan, since common signed
    // overflow behavior and sizeof(T) currently always being a power of two
    // here let CopyStridedNatural still produce correct results.
    const int64_t item_bytes = sizeof(T);

    const bool stride_is_whole_items = (byte_stride % item_bytes) == 0;
    if (!stride_is_whole_items) {
      // Elements do not sit on multiples of sizeof(T): copy them byte by byte
      CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_,
                          byte_stride, destination);
      return Status::OK();
    }

    const int64_t item_stride = byte_stride / item_bytes;
    CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_, item_stride,
                       destination);
    return Status::OK();
  }

 protected:
  NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
      : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
  PyArrayObject* arr_;
  int64_t length_;
  MemoryPool* pool_;
  std::shared_ptr<Buffer> buffer_;
};

}  // namespace

// Produces a values buffer for the input array in *data:
//  - boolean arrays are packed from one byte per value into one bit per value,
//  - strided arrays are copied into a contiguous buffer,
//  - anything else is wrapped in a NumPyBuffer without copying.
// Byte-swapped arrays are rejected with NotImplemented.
template <typename ArrowType>
inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
  if (PyArray_ISBYTESWAPPED(arr_)) {
    // TODO
    return Status::NotImplemented("Byte-swapped arrays not supported");
  }

  if (dtype_->type_num != NPY_BOOL) {
    if (!is_strided()) {
      // Can zero-copy
      *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
      return Status::OK();
    }
    RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
    return Status::OK();
  }

  // Boolean input: pack the byte-sized values into a bitmap
  const int64_t packed_nbytes = bit_util::BytesForBits(length_);
  ARROW_ASSIGN_OR_RAISE(auto packed, AllocateBuffer(packed_nbytes, pool_));

  Ndarray1DIndexer<uint8_t> bool_values(arr_);
  int64_t position = 0;
  const auto next_bit = [&bool_values, &position]() -> bool {
    const bool bit = bool_values[position] > 0;
    ++position;
    return bit;
  };
  GenerateBitsUnrolled(packed->mutable_data(), 0, length_, next_bit);

  *data = std::move(packed);
  return Status::OK();
}

// Generic data conversion: prepare the input buffer, then cast it to the
// target type whenever the Arrow type inferred from the NumPy dtype differs
// from the requested one.
template <typename ArrowType>
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
  RETURN_NOT_OK(PrepareInputData<ArrowType>(data));

  ARROW_ASSIGN_OR_RAISE(auto numpy_arrow_type, NumPyDtypeToArrow(dtype_));

  const bool already_target_type = numpy_arrow_type->Equals(*type_);
  if (already_target_type) {
    return Status::OK();
  }

  RETURN_NOT_OK(CastBuffer(numpy_arrow_type, *data, length_, null_bitmap_, null_count_,
                           type_, cast_options_, pool_, data));
  return Status::OK();
}

// Date32 specialization. datetime64[D] input is narrowed from int64 to int32
// by hand; every other input goes through the regular cast path.
template <>
inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
  RETURN_NOT_OK(PrepareInputData<Date32Type>(data));

  auto datetime_meta =
      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
  // The metadata is only inspected when the dtype really is a datetime64
  const bool is_day_resolution =
      dtype_->type_num == NPY_DATETIME && datetime_meta->meta.base == NPY_FR_D;

  if (is_day_resolution) {
    // Inbound datetime64[D] data needs to be downcasted separately here from
    // int64_t to int32_t, because this data is not supported in compute::Cast
    // TODO(wesm): How pedantic do we really want to be about checking for int32
    // overflow here?
    Status narrowing_status =
        StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
    RETURN_NOT_OK(narrowing_status);
    return Status::OK();
  }

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<DataType> source_type,
                        NumPyDtypeToArrow(dtype_));
  if (!source_type->Equals(*type_)) {
    // The null bitmap was already computed in VisitNative()
    RETURN_NOT_OK(CastBuffer(source_type, *data, length_, null_bitmap_, null_count_,
                             type_, cast_options_, pool_, data));
  }

  return Status::OK();
}

// Date64 specialization. datetime64[D] input holds day counts and is scaled
// by hand to the milliseconds stored by Date64; every other input goes
// through the regular cast path.
template <>
inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
  constexpr int64_t kMillisecondsInDay = 86400000;
  std::shared_ptr<DataType> input_type;

  RETURN_NOT_OK(PrepareInputData<Date64Type>(data));

  auto date_dtype =
      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
  // The metadata is only inspected when the dtype really is a datetime64
  if (dtype_->type_num == NPY_DATETIME && date_dtype->meta.base == NPY_FR_D) {
    // Inbound datetime64[D] data is converted separately here, from days to
    // milliseconds, because this data is not supported in compute::Cast
    ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(int64_t) * length_, pool_));

    auto in_values = reinterpret_cast<const int64_t*>((*data)->data());
    auto out_values = reinterpret_cast<int64_t*>(result->mutable_data());
    constexpr uint64_t kScale = static_cast<uint64_t>(kMillisecondsInDay);
    for (int64_t i = 0; i < length_; ++i) {
      // Multiply in unsigned arithmetic so that out-of-range inputs (notably
      // NumPy's NaT sentinel, the minimum int64 value, sitting in null slots)
      // wrap around instead of triggering signed-overflow undefined
      // behaviour. In-range values produce exactly the same result as a
      // signed multiplication.
      out_values[i] =
          static_cast<int64_t>(static_cast<uint64_t>(in_values[i]) * kScale);
    }
    *data = std::move(result);
    return Status::OK();
  }

  ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
  if (!input_type->Equals(*type_)) {
    // The null bitmap was already computed in VisitNative()
    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
                             type_, cast_options_, pool_, data));
  }

  return Status::OK();
}

// Create 16MB chunks for binary data: this value (1 << 24 bytes) is passed as
// the chunk size to the chunked binary/string builders used in this file.
constexpr int32_t kBinaryChunksize = 1 << 24;

// Converts a NumPy fixed-width bytes array into (possibly several chunks of)
// Arrow binary arrays. Each value is truncated at its first NUL byte, if any;
// slots flagged by the mask become nulls.
Status NumPyConverter::Visit(const BinaryType& type) {
  ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

  const uint8_t* const base = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  // This is annoying. NumPy allows strings to have nul-terminators, so
  // we must check for them here
  const auto append_value = [&builder, this](const uint8_t* item) {
    const size_t value_length =
        strnlen(reinterpret_cast<const char*>(item), static_cast<size_t>(itemsize_));
    return builder.Append(item, static_cast<int32_t>(value_length));
  };

  if (mask_ == nullptr) {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(append_value(base + i * stride_));
    }
  } else {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        RETURN_NOT_OK(builder.AppendNull());
        continue;
      }
      RETURN_NOT_OK(append_value(base + i * stride_));
    }
  }

  ArrayVector chunks;
  RETURN_NOT_OK(builder.Finish(&chunks));
  for (const auto& chunk : chunks) {
    RETURN_NOT_OK(PushArray(chunk->data()));
  }
  return Status::OK();
}

// Converts a NumPy fixed-width bytes array into a single Arrow
// fixed-size-binary array. The NumPy item size must equal the byte width of
// the target type; slots flagged by the mask become nulls.
Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
  const auto expected_width = type.byte_width();
  if (itemsize_ != expected_width) {
    return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ",
                           expected_width, ")");
  }

  FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(expected_width), pool_);
  const uint8_t* const base = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  if (mask_ == nullptr) {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(builder.Append(base + i * stride_));
    }
  } else {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    RETURN_NOT_OK(builder.Reserve(length_));
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        RETURN_NOT_OK(builder.AppendNull());
        continue;
      }
      RETURN_NOT_OK(builder.Append(base + i * stride_));
    }
  }

  std::shared_ptr<Array> finished;
  RETURN_NOT_OK(builder.Finish(&finished));
  return PushArray(finished->data());
}

namespace {

// NumPy unicode is UCS4/UTF32 always
constexpr int kNumPyUnicodeSize = 4;

// Appends one NumPy unicode value to `builder` as UTF-8.
//
// \param[in] data start of the UTF-32 encoded value
// \param[in] itemsize size in bytes of the NumPy item
// \param[in] byteorder byte order hint handed to PyUnicode_DecodeUTF32
// \param[in,out] builder string builder receiving the UTF-8 bytes
//
// The value is decoded and re-encoded through Python unicode objects.
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,
                   ::arrow::internal::ChunkedStringBuilder* builder) {
  // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
  // so we need to detect that here to truncate if necessary. Yep.
  const auto is_nul_code_point = [](const char* code_point) {
    return code_point[0] == '\0' && code_point[1] == '\0' && code_point[2] == '\0' &&
           code_point[3] == '\0';
  };

  Py_ssize_t num_code_points = 0;
  while (num_code_points < itemsize / kNumPyUnicodeSize) {
    if (is_nul_code_point(data + num_code_points * kNumPyUnicodeSize)) {
      break;
    }
    ++num_code_points;
  }

  OwnedRef decoded(PyUnicode_DecodeUTF32(data, num_code_points * kNumPyUnicodeSize,
                                         nullptr, &byteorder));
  RETURN_IF_PYERROR();

  OwnedRef encoded(PyUnicode_AsUTF8String(decoded.obj()));
  if (encoded.obj() == nullptr) {
    PyErr_Clear();
    return Status::Invalid("failed converting UTF32 to UTF8");
  }

  const auto* utf8_bytes =
      reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(encoded.obj()));
  const int32_t utf8_length = static_cast<int32_t>(PyBytes_GET_SIZE(encoded.obj()));
  return builder->Append(utf8_bytes, utf8_length);
}

}  // namespace

// Converts a NumPy bytes or unicode array into (possibly several chunks of)
// Arrow utf8 arrays.
//
// Bytes values must be valid UTF-8 and are appended with their full item
// size; unicode values are transcoded from UTF-32. With from_pandas set, a
// floating-point array consisting only of nulls is accepted as well and
// yields an all-null utf8 array. Any other dtype is a TypeError.
Status NumPyConverter::Visit(const StringType& type) {
  util::InitializeUTF8();

  ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

  const uint8_t* const base = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  // For Python C API, -1 is little-endian, 1 is big-endian
#if ARROW_LITTLE_ENDIAN
  // Yield little-endian from both '|' (native) and '<'
  const int byteorder = (dtype_->byteorder == '>') ? 1 : -1;
#else
  // Yield big-endian from both '|' (native) and '>'
  const int byteorder = (dtype_->byteorder == '<') ? -1 : 1;
#endif

  PyAcquireGIL gil_lock;

  const bool is_bytes_dtype = dtype_->type_num == NPY_STRING;
  const bool is_unicode_dtype = dtype_->type_num == NPY_UNICODE;

  if (!(is_bytes_dtype || is_unicode_dtype)) {
    if (from_pandas_ && dtype_->kind == 'f') {
      // in case of from_pandas=True, accept an all-NaN float array as input
      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
                                                 &null_count_));
      if (null_count_ == length_) {
        auto all_nulls = std::make_shared<NullArray>(length_);
        compute::ExecContext exec_context(pool_);
        ARROW_ASSIGN_OR_RAISE(
            std::shared_ptr<Array> null_strings,
            compute::Cast(*all_nulls, arrow::utf8(), cast_options_, &exec_context));
        out_arrays_.emplace_back(null_strings);
        return Status::OK();
      }
    }
    std::string dtype_repr;
    RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast<PyObject*>(dtype_),
                                                  &dtype_repr));
    return Status::TypeError("Expected a string or bytes dtype, got ", dtype_repr);
  }

  const auto append_value = [&](const uint8_t* item) -> Status {
    if (!is_bytes_dtype) {
      // is_unicode_dtype case
      return AppendUTF32(reinterpret_cast<const char*>(item), itemsize_, byteorder,
                         &builder);
    }
    if (ARROW_PREDICT_TRUE(util::ValidateUTF8(item, itemsize_))) {
      return builder.Append(item, static_cast<int32_t>(itemsize_));
    }
    return Status::Invalid("Encountered non-UTF8 binary value: ",
                           HexEncode(item, itemsize_));
  };

  if (mask_ == nullptr) {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(append_value(base + i * stride_));
    }
  } else {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        RETURN_NOT_OK(builder.AppendNull());
        continue;
      }
      RETURN_NOT_OK(append_value(base + i * stride_));
    }
  }

  ArrayVector chunks;
  RETURN_NOT_OK(builder.Finish(&chunks));
  for (const auto& chunk : chunks) {
    RETURN_NOT_OK(PushArray(chunk->data()));
  }
  return Status::OK();
}

// Converts a NumPy structured array into one or more Arrow struct array
// chunks.
//
// For every field of the target struct type, the matching NumPy field is
// looked up by name in the dtype's field dictionary, extracted with
// PyArray_GetField and converted by a child NumPyConverter. Because children
// may come back with different chunk layouts, the children and the struct
// validity bitmap (temporarily wrapped in a Boolean array) are rechunked
// consistently before the struct chunks are assembled.
//
// Returns TypeError if the dtype has no field dictionary, and Invalid if a
// field is missing or the mask is not a NumPy array.
Status NumPyConverter::Visit(const StructType& type) {
  std::vector<NumPyConverter> sub_converters;
  std::vector<OwnedRefNoGIL> sub_arrays;

  {
    PyAcquireGIL gil_lock;

    // Create converters for each struct type field
    if (PyDataType_FIELDS(dtype_) == NULL || !PyDict_Check(PyDataType_FIELDS(dtype_))) {
      return Status::TypeError("Expected struct array");
    }

    // Iterate by reference to avoid copying a shared_ptr per field
    for (const auto& field : type.fields()) {
      PyObject* tup;
      PyDict_GetItemStringRef(PyDataType_FIELDS(dtype_), field->name().c_str(), &tup);
      RETURN_IF_PYERROR();
      OwnedRef tupref(tup);
      if (tup == NULL) {
        return Status::Invalid("Missing field '", field->name(), "' in struct array");
      }
      // The tuple's first item is the field dtype, the second its byte offset
      PyArray_Descr* sub_dtype =
          reinterpret_cast<PyArray_Descr*>(PyTuple_GET_ITEM(tup, 0));
      DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type));
      int offset = static_cast<int>(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1)));
      RETURN_IF_PYERROR();
      Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */
      PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset);
      RETURN_IF_PYERROR();
      // sub_arrays owns the extracted field arrays for the converters below
      sub_arrays.emplace_back(sub_array);
      sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
                                  from_pandas_);
    }
  }

  std::vector<ArrayVector> groups;
  int64_t null_count = 0;

  // Compute null bitmap and store it as a Boolean Array to include it
  // in the rechunking below
  {
    if (mask_ != nullptr) {
      RETURN_NOT_OK(InitNullBitmap());
      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
      // MaskToBitmap reports a non-array mask with -1. The result lives in the
      // local `null_count`; checking the member `null_count_` (never written
      // here) would let an invalid mask go unnoticed.
      if (null_count == -1) return Status::Invalid("Invalid mask type");
    }
    groups.push_back({std::make_shared<BooleanArray>(length_, null_bitmap_)});
  }

  // Convert child data
  for (auto& converter : sub_converters) {
    RETURN_NOT_OK(converter.Convert());
    groups.push_back(converter.result());
  }
  // Ensure the different array groups are chunked consistently
  groups = ::arrow::internal::RechunkArraysConsistently(groups);

  // Make struct array chunks by combining groups
  size_t ngroups = groups.size();
  size_t nchunks = groups[0].size();
  for (size_t chunk = 0; chunk < nchunks; chunk++) {
    // First group has the null bitmaps as Boolean Arrays
    const auto& null_data = groups[0][chunk]->data();
    DCHECK_EQ(null_data->type->id(), Type::BOOL);
    DCHECK_EQ(null_data->buffers.size(), 2);
    const auto& null_buffer = null_data->buffers[1];
    // Careful: the rechunked null bitmap may have a non-zero offset
    // to its buffer, and it may not even start on a byte boundary
    int64_t null_offset = null_data->offset;
    std::shared_ptr<Buffer> fixed_null_buffer;

    if (!null_buffer) {
      // No mask was given: the struct chunk has no validity bitmap
      fixed_null_buffer = null_buffer;
    } else if (null_offset % 8 == 0) {
      // Byte-aligned offset: a slice of the parent buffer is enough
      fixed_null_buffer =
          std::make_shared<Buffer>(null_buffer,
                                   // byte offset
                                   null_offset / 8,
                                   // byte size
                                   bit_util::BytesForBits(null_data->length));
    } else {
      // Unaligned offset: the bits must be copied into a new bitmap
      ARROW_ASSIGN_OR_RAISE(
          fixed_null_buffer,
          CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length));
    }

    // Create struct array chunk and populate it. The per-chunk null count is
    // not tracked, so it is left unknown whenever the mask had any null.
    auto arr_data =
        ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0);
    arr_data->buffers.push_back(fixed_null_buffer);
    // Append child chunks
    for (size_t i = 1; i < ngroups; i++) {
      arr_data->child_data.push_back(groups[i][chunk]->data());
    }
    RETURN_NOT_OK(PushArray(arr_data));
  }

  return Status::OK();
}

// Converts a 1-dimensional NumPy array into an Arrow ChunkedArray.
//
// \param[in] pool memory pool used for allocations
// \param[in] ao the NumPy array to convert
// \param[in] mo optional mask object, handed to NumPyConverter
// \param[in] from_pandas whether pandas null sentinels should be observed
// \param[in] type the target Arrow type
// \param[in] cast_options options used when the data must be cast
// \param[out] out receives the converted chunks
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      const compute::CastOptions& cast_options,
                      std::shared_ptr<ChunkedArray>* out) {
  if (!PyArray_Check(ao)) {
    // This code path cannot be reached by Python unit tests currently so this
    // is only a sanity check.
    return Status::TypeError("Input object was not a NumPy array");
  }

  auto* ndarray = reinterpret_cast<PyArrayObject*>(ao);
  if (PyArray_NDIM(ndarray) != 1) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
  RETURN_NOT_OK(converter.Convert());

  const ArrayVector& chunks = converter.result();
  DCHECK_GT(chunks.size(), 0);
  *out = std::make_shared<ChunkedArray>(chunks);
  return Status::OK();
}

// Convenience overload: converts the NumPy array with default-constructed
// compute::CastOptions by forwarding to the overload that takes cast options.
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      std::shared_ptr<ChunkedArray>* out) {
  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
}

}  // namespace py
}  // namespace arrow