// Functions for pandas conversion via NumPy
#include "arrow/python/numpy_to_arrow.h"
#include "arrow/python/numpy_interop.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_generate.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/string.h"
#include "arrow/util/utf8.h"
#include "arrow/visit_type_inline.h"
#include "arrow/compute/api_scalar.h"
#include "arrow/python/common.h"
#include "arrow/python/datetime.h"
#include "arrow/python/helpers.h"
#include "arrow/python/iterators.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/numpy_internal.h"
#include "arrow/python/python_to_arrow.h"
#include "arrow/python/type_traits.h"
#include "arrow/python/vendored/pythoncapi_compat.h"
namespace arrow {
using internal::checked_cast;
using internal::CopyBitmap;
using internal::GenerateBitsUnrolled;
namespace py {
using internal::NumPyTypeSize;
// ----------------------------------------------------------------------
// Conversion utilities
namespace {
/// \brief Allocate a validity bitmap large enough for `length` slots, all bits cleared.
///
/// \param[in] pool memory pool the buffer is allocated from
/// \param[in] length number of bits the bitmap must be able to hold
/// \param[out] out receives the freshly allocated buffer
/// \return the allocation error if any, Status::OK() otherwise
Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
                          std::shared_ptr<ResizableBuffer>* out) {
  const int64_t null_bytes = bit_util::BytesForBits(length);
  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool));

  // Padding zeroed by AllocateResizableBuffer
  memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes));
  *out = std::move(null_bitmap);
  return Status::OK();
}
// ----------------------------------------------------------------------
// Conversion from NumPy-in-Pandas to Arrow null bitmap
/// \brief Fill a validity bitmap from the null sentinels found in a NumPy array.
///
/// A bit is set for every element that `npy_traits<TYPE>::isnull` does not
/// report as null. Bits of null elements are left untouched, so the caller
/// must pass a bitmap whose bits are already cleared.
///
/// \param[in] arr array whose elements are inspected
/// \param[out] bitmap validity bitmap to populate
/// \return the number of null elements encountered
template <int TYPE>
inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
  using traits = internal::npy_traits<TYPE>;
  using T = typename traits::value_type;

  int64_t null_count = 0;

  Ndarray1DIndexer<T> values(arr);
  // 64-bit index: the indexer size is compared against it and arrays may
  // hold more elements than an `int` can count.
  const int64_t num_values = values.size();
  for (int64_t i = 0; i < num_values; ++i) {
    if (traits::isnull(values[i])) {
      ++null_count;
    } else {
      bit_util::SetBit(bitmap, i);
    }
  }

  return null_count;
}
class NumPyNullsConverter {
/// Convert the given array's null values to a null bitmap.
/// The null bitmap is only allocated if null values are ever possible.
static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas,
std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
int64_t* out_null_count) {
NumPyNullsConverter converter(pool, arr, from_pandas);
RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
*out_null_bitmap_ = converter.null_bitmap_;
*out_null_count = converter.null_count_;
return Status::OK();
template <int TYPE>
Status Visit(PyArrayObject* arr) {
typedef internal::npy_traits<TYPE> traits;
const bool null_sentinels_possible =
// Always treat Numpy's NaT as null
// Observing pandas's null sentinels
(from_pandas_ && traits::supports_nulls);
if (null_sentinels_possible) {
RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
return Status::OK();
NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas)
: pool_(pool),
null_count_(0) {}
MemoryPool* pool_;
PyArrayObject* arr_;
bool from_pandas_;
std::shared_ptr<ResizableBuffer> null_bitmap_;
uint8_t* null_bitmap_data_;
int64_t null_count_;
/// \brief Translate a NumPy mask array into an Arrow validity bitmap.
///
/// A truthy mask element marks the slot as null (bit cleared); any other
/// element marks it as valid (bit set).
///
/// \param[in] mask mask array, read as one byte per element
/// \param[in] length number of slots to translate
/// \param[out] bitmap validity bitmap to populate
/// \return the null count, or -1 if `mask` is not a NumPy array
int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
  int64_t null_count = 0;

  if (!PyArray_Check(mask)) return -1;

  Ndarray1DIndexer<uint8_t> mask_values(mask);
  // The index must be as wide as `length`, otherwise it overflows for
  // arrays longer than INT_MAX.
  for (int64_t i = 0; i < length; ++i) {
    if (mask_values[i]) {
      ++null_count;
      bit_util::ClearBit(bitmap, i);
    } else {
      bit_util::SetBit(bitmap, i);
    }
  }
  return null_count;
}
} // namespace
// ----------------------------------------------------------------------
// Conversion from NumPy arrays (possibly originating from pandas) to Arrow
// format. Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for
// that
class NumPyConverter {
NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
const std::shared_ptr<DataType>& type, bool from_pandas,
const compute::CastOptions& cast_options = compute::CastOptions())
: pool_(pool),
null_count_(0) {
if (mo != nullptr && mo != Py_None) {
mask_ = reinterpret_cast<PyArrayObject*>(mo);
length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
itemsize_ = static_cast<int64_t>(PyArray_ITEMSIZE(arr_));
stride_ = static_cast<int64_t>(PyArray_STRIDES(arr_)[0]);
bool is_strided() const { return itemsize_ != stride_; }
Status Convert();
const ArrayVector& result() const { return out_arrays_; }
template <typename T>
enable_if_primitive_ctype<T, Status> Visit(const T& type) {
return VisitNative<T>();
Status Visit(const HalfFloatType& type) { return VisitNative<UInt16Type>(); }
Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
Status Visit(const DurationType& type) { return VisitNative<DurationType>(); }
Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
// NumPy ascii string arrays
Status Visit(const BinaryType& type);
// NumPy unicode arrays
Status Visit(const StringType& type);
Status Visit(const StructType& type);
Status Visit(const FixedSizeBinaryType& type);
// Default case
Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); }
Status InitNullBitmap() {
RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
null_bitmap_data_ = null_bitmap_->mutable_data();
return Status::OK();
// Called before ConvertData to ensure Numpy input buffer is in expected
// Arrow layout
template <typename ArrowType>
Status PrepareInputData(std::shared_ptr<Buffer>* data);
// ----------------------------------------------------------------------
// Traditional visitor conversion for non-object arrays
template <typename ArrowType>
Status ConvertData(std::shared_ptr<Buffer>* data);
template <typename T>
Status PushBuilderResult(T* builder) {
std::shared_ptr<Array> out;
return Status::OK();
Status PushArray(const std::shared_ptr<ArrayData>& data) {
return Status::OK();
template <typename ArrowType>
Status VisitNative() {
if (mask_ != nullptr) {
null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
if (null_count_ == -1) return Status::Invalid("Invalid mask type");
} else {
RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
std::shared_ptr<Buffer> data;
auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
return PushArray(arr_data);
Status TypeNotImplemented(std::string type_name) {
return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
"> conversion. ");
MemoryPool* pool_;
std::shared_ptr<DataType> type_;
PyArrayObject* arr_;
PyArray_Descr* dtype_;
PyArrayObject* mask_;
int64_t length_;
int64_t stride_;
int64_t itemsize_;
bool from_pandas_;
compute::CastOptions cast_options_;
// Used in visitor pattern
ArrayVector out_arrays_;
std::shared_ptr<ResizableBuffer> null_bitmap_;
uint8_t* null_bitmap_data_;
int64_t null_count_;
/// Convert the wrapped array. Only 1-dimensional input is accepted.
/// Object arrays are delegated to the generic Python sequence converter;
/// every other dtype requires an explicit target type and is dispatched
/// through the Visit() overloads.
Status NumPyConverter::Convert() {
  if (PyArray_NDIM(arr_) != 1) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  if (dtype_->type_num == NPY_OBJECT) {
    // If an object array, convert it like a normal Python sequence
    PyConversionOptions py_options;
    py_options.type = type_;
    py_options.from_pandas = from_pandas_;
    ARROW_ASSIGN_OR_RAISE(
        auto chunked_array,
        ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
                          reinterpret_cast<PyObject*>(mask_), py_options, pool_));
    out_arrays_ = chunked_array->chunks();
    return Status::OK();
  }

  if (type_ == nullptr) {
    return Status::Invalid("Must pass data type for non-object arrays");
  }

  // Visit the type to perform conversion
  return VisitTypeInline(*type_, this);
}
namespace {
/// \brief Cast a values buffer from `in_type` to `out_type` with compute::Cast.
///
/// The input buffer and validity bitmap are wrapped in a temporary array,
/// cast, and the values buffer (index 1) of the result is returned.
///
/// \param[in] in_type type describing `input`
/// \param[in] input values buffer
/// \param[in] length number of elements
/// \param[in] valid_bitmap validity bitmap (may be null)
/// \param[in] null_count number of nulls described by `valid_bitmap`
/// \param[in] out_type type to cast to
/// \param[in] cast_options options forwarded to compute::Cast
/// \param[in] pool memory pool used by the cast
/// \param[out] out receives the cast values buffer
Status CastBuffer(const std::shared_ptr<DataType>& in_type,
                  const std::shared_ptr<Buffer>& input, const int64_t length,
                  const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
                  const std::shared_ptr<DataType>& out_type,
                  const compute::CastOptions& cast_options, MemoryPool* pool,
                  std::shared_ptr<Buffer>* out) {
  // Must cast
  auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
  compute::ExecContext context(pool);
  ARROW_ASSIGN_OR_RAISE(
      std::shared_ptr<Array> casted_array,
      compute::Cast(*MakeArray(tmp_data), out_type, cast_options, &context));
  *out = casted_array->data()->buffers[1];
  return Status::OK();
}
/// \brief Convert `length` values element by element with static_cast.
///
/// No range checking is performed: values that do not fit in `ToType` are
/// converted exactly as static_cast would convert them.
///
/// \param[in] input buffer holding `length` values of type `FromType`
/// \param[in] length number of values to convert
/// \param[in] pool memory pool used for the output allocation
/// \param[out] out receives the new buffer of `ToType` values
template <typename FromType, typename ToType>
Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
                        std::shared_ptr<Buffer>* out) {
  ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool));

  auto in_values = reinterpret_cast<const FromType*>(input.data());
  auto out_values = reinterpret_cast<ToType*>(result->mutable_data());
  for (int64_t i = 0; i < length; ++i) {
    *out_values++ = static_cast<ToType>(*in_values++);
  }
  // Assigned only after the loop: `input` may be the buffer `*out` owns.
  *out = std::move(result);
  return Status::OK();
}
/// \brief Gather `length` values of type T spaced `stride` bytes apart.
///
/// Each value is copied with memcpy, so the source elements do not need to
/// be aligned for T.
///
/// \param[in] input_data address of the first source element
/// \param[in] length number of elements to copy
/// \param[in] stride distance in bytes between consecutive source elements
/// \param[out] output_data contiguous destination with room for `length` values
template <typename T>
void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride,
                         T* output_data) {
  // Passing input_data as non-const is a concession to PyObject*
  for (int64_t i = 0; i < length; ++i) {
    memcpy(output_data + i, input_data, sizeof(T));
    input_data += stride;
  }
}
/// \brief Gather `length` values of type T spaced `stride` elements apart.
///
/// Unlike CopyStridedBytewise, the stride is expressed in units of T and the
/// source is read through a typed pointer.
///
/// \param[in] input_data address of the first source element
/// \param[in] length number of elements to copy
/// \param[in] stride distance, in elements, between consecutive source values
/// \param[out] output_data contiguous destination with room for `length` values
template <typename T>
void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) {
  // Passing input_data as non-const is a concession to PyObject*
  int64_t j = 0;
  for (int64_t i = 0; i < length; ++i) {
    output_data[i] = input_data[j];
    j += stride;
  }
}
class NumPyStridedConverter {
static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
std::shared_ptr<Buffer>* out) {
NumPyStridedConverter converter(arr, length, pool);
RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
*out = converter.buffer_;
return Status::OK();
template <int TYPE>
Status Visit(PyArrayObject* arr) {
using traits = internal::npy_traits<TYPE>;
using T = typename traits::value_type;
ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_));
const int64_t stride = PyArray_STRIDES(arr)[0];
// ARROW-16013: convert sizeof(T) to signed int64 first, otherwise dividing by it
// would do an unsigned division. This cannot be caught by tests without ubsan, since
// common signed overflow behavior and the fact that the sizeof(T) is currently always
// a power of two here cause CopyStridedNatural to still produce correct results
const int64_t element_size = sizeof(T);
if (stride % element_size == 0) {
const int64_t stride_elements = stride / element_size;
CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_,
stride_elements, reinterpret_cast<T*>(buffer_->mutable_data()));
} else {
CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_, stride,
return Status::OK();
NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
: arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
PyArrayObject* arr_;
int64_t length_;
MemoryPool* pool_;
std::shared_ptr<Buffer> buffer_;
} // namespace
/// Bring the NumPy values into a buffer laid out the way Arrow expects.
///
/// - byte-swapped arrays are rejected;
/// - boolean arrays (one byte per value) are packed into a bitmap;
/// - strided arrays are copied into a contiguous buffer;
/// - anything else is wrapped without copying.
///
/// \param[out] data receives the prepared buffer
template <typename ArrowType>
inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
  if (PyArray_ISBYTESWAPPED(arr_)) {
    return Status::NotImplemented("Byte-swapped arrays not supported");
  }

  if (dtype_->type_num == NPY_BOOL) {
    int64_t nbytes = bit_util::BytesForBits(length_);
    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_));

    Ndarray1DIndexer<uint8_t> values(arr_);
    int64_t i = 0;
    // Any non-zero byte counts as true.
    const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
    GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);

    *data = std::move(buffer);
  } else if (is_strided()) {
    RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
  } else {
    // Can zero-copy
    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
  }

  return Status::OK();
}
/// Generic conversion step: if the Arrow type matching the NumPy dtype
/// differs from the requested type, cast the prepared buffer in place.
///
/// \param[in,out] data prepared values buffer; replaced by the cast result
template <typename ArrowType>
inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
  ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_));

  if (!input_type->Equals(*type_)) {
    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_,
                             cast_options_, pool_, data));
  }

  return Status::OK();
}
/// Date32 conversion step. datetime64[D] input is narrowed from int64 to
/// int32 directly; every other input goes through the generic cast path.
///
/// \param[in,out] data prepared values buffer; replaced by the converted result
template <>
inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
  std::shared_ptr<DataType> input_type;

  // Only dereferenced below once the dtype is known to be NPY_DATETIME.
  auto date_dtype =
      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
  if (dtype_->type_num == NPY_DATETIME) {
    // If we have inbound datetime64[D] data, this needs to be downcasted
    // separately here from int64_t to int32_t, because this data is not
    // supported in compute::Cast
    if (date_dtype->meta.base == NPY_FR_D) {
      // TODO(wesm): How pedantic do we really want to be about checking for int32
      // overflow here?
      Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
      RETURN_NOT_OK(s);
    } else {
      ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
      if (!input_type->Equals(*type_)) {
        // The null bitmap was already computed in VisitNative()
        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
                                 type_, cast_options_, pool_, data));
      }
    }
  } else {
    ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
    if (!input_type->Equals(*type_)) {
      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
                               type_, cast_options_, pool_, data));
    }
  }

  return Status::OK();
}
/// Date64 conversion step. datetime64[D] input (days) is rescaled to
/// milliseconds directly; every other input goes through the generic cast path.
///
/// \param[in,out] data prepared values buffer; replaced by the converted result
template <>
inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
  constexpr int64_t kMillisecondsInDay = 86400000;
  std::shared_ptr<DataType> input_type;

  // Only dereferenced below once the dtype is known to be NPY_DATETIME.
  auto date_dtype =
      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
  if (dtype_->type_num == NPY_DATETIME) {
    // If we have inbound datetime64[D] data, the day counts need to be
    // scaled to milliseconds separately here, because this data is not
    // supported in compute::Cast
    if (date_dtype->meta.base == NPY_FR_D) {
      ARROW_ASSIGN_OR_RAISE(auto result,
                            AllocateBuffer(sizeof(int64_t) * length_, pool_));

      auto in_values = reinterpret_cast<const int64_t*>((*data)->data());
      auto out_values = reinterpret_cast<int64_t*>(result->mutable_data());
      // NOTE(review): no overflow check on the multiplication; confirm that
      // out-of-range day counts (and the NaT sentinel) are acceptable here.
      for (int64_t i = 0; i < length_; ++i) {
        *out_values++ = kMillisecondsInDay * (*in_values++);
      }
      *data = std::move(result);
    } else {
      ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
      if (!input_type->Equals(*type_)) {
        // The null bitmap was already computed in VisitNative()
        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
                                 type_, cast_options_, pool_, data));
      }
    }
  } else {
    ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
    if (!input_type->Equals(*type_)) {
      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
                               type_, cast_options_, pool_, data));
    }
  }

  return Status::OK();
}
// Create 16MB chunks for binary data
constexpr int32_t kBinaryChunksize = 1 << 24;

/// Convert a NumPy fixed-width bytes array to (possibly chunked) Arrow binary.
/// Each item is truncated at its first NUL byte; masked slots become nulls.
Status NumPyConverter::Visit(const BinaryType& type) {
  ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);

  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  auto AppendNotNull = [&builder, this](const uint8_t* data) {
    // This is annoying. NumPy allows strings to have nul-terminators, so
    // we must check for them here
    const size_t item_size =
        strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
    return builder.Append(data, static_cast<int32_t>(item_size));
  };

  if (mask_ != nullptr) {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        RETURN_NOT_OK(builder.AppendNull());
      } else {
        RETURN_NOT_OK(AppendNotNull(data));
      }
      // Advance by the array stride, not the item size, to support strided input.
      data += stride_;
    }
  } else {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(AppendNotNull(data));
      data += stride_;
    }
  }

  ArrayVector result;
  RETURN_NOT_OK(builder.Finish(&result));
  for (auto arr : result) {
    RETURN_NOT_OK(PushArray(arr->data()));
  }
  return Status::OK();
}
/// Convert a NumPy fixed-width bytes array to Arrow fixed-size binary.
/// The NumPy item size must equal the target byte width; masked slots
/// become nulls.
Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
  auto byte_width = type.byte_width();

  if (itemsize_ != byte_width) {
    return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ",
                           byte_width, ")");
  }

  FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_);
  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  if (mask_ != nullptr) {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        RETURN_NOT_OK(builder.AppendNull());
      } else {
        RETURN_NOT_OK(builder.Append(data));
      }
      data += stride_;
    }
  } else {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(builder.Append(data));
      data += stride_;
    }
  }

  // `result` must be populated by Finish() before it is dereferenced.
  std::shared_ptr<Array> result;
  RETURN_NOT_OK(builder.Finish(&result));
  return PushArray(result->data());
}
namespace {
// NumPy unicode is UCS4/UTF32 always
constexpr int kNumPyUnicodeSize = 4;

/// \brief Append one NumPy unicode item to `builder` as UTF-8.
///
/// The item is truncated at the first all-zero code point, decoded from
/// UTF-32 with the Python C API and re-encoded as UTF-8.
///
/// \param[in] data start of the item
/// \param[in] itemsize size of the item in bytes
/// \param[in] byteorder byte order passed to PyUnicode_DecodeUTF32
/// \param[in,out] builder builder receiving the UTF-8 bytes
Status AppendUTF32(const char* data, int64_t itemsize, int byteorder,
                   ::arrow::internal::ChunkedStringBuilder* builder) {
  // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
  // so we need to detect that here to truncate if necessary. Yep.
  Py_ssize_t actual_length = 0;
  for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) {
    const char* code_point = data + actual_length * kNumPyUnicodeSize;
    if ((*code_point == '\0') && (*(code_point + 1) == '\0') &&
        (*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) {
      break;
    }
  }

  OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize,
                                             nullptr, &byteorder));
  // Surface a decoding failure instead of passing NULL on to the encoder.
  RETURN_IF_PYERROR();
  OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj()));
  if (utf8_obj.obj() == NULL) {
    // Do not leave the Python error indicator set behind the returned Status.
    PyErr_Clear();
    return Status::Invalid("failed converting UTF32 to UTF8");
  }

  const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(utf8_obj.obj()));
  return builder->Append(
      reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(utf8_obj.obj())), length);
}
} // namespace
/// Convert a NumPy bytes or unicode array to (possibly chunked) Arrow utf8.
///
/// - bytes items must already be valid UTF-8 and are appended as is;
/// - unicode items are transcoded from UTF-32;
/// - with from_pandas, an all-null float array is accepted and yields an
///   all-null string array;
/// - any other dtype is a TypeError.
/// Masked slots become nulls.
Status NumPyConverter::Visit(const StringType& type) {
  ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);

  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));

  char numpy_byteorder = dtype_->byteorder;

  // For Python C API, -1 is little-endian, 1 is big-endian
#if ARROW_LITTLE_ENDIAN
  // Yield little-endian from both '|' (native) and '<'
  int byteorder = numpy_byteorder == '>' ? 1 : -1;
#else
  // Yield big-endian from both '|' (native) and '>'
  int byteorder = numpy_byteorder == '<' ? -1 : 1;
#endif

  PyAcquireGIL gil_lock;

  const bool is_binary_type = dtype_->type_num == NPY_STRING;
  const bool is_unicode_type = dtype_->type_num == NPY_UNICODE;

  if (!is_binary_type && !is_unicode_type) {
    const bool is_float_type = dtype_->kind == 'f';
    if (from_pandas_ && is_float_type) {
      // in case of from_pandas=True, accept an all-NaN float array as input
      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
                                                 &null_count_));
      if (null_count_ == length_) {
        auto arr = std::make_shared<NullArray>(length_);
        compute::ExecContext context(pool_);
        ARROW_ASSIGN_OR_RAISE(
            std::shared_ptr<Array> out,
            compute::Cast(*arr, arrow::utf8(), cast_options_, &context));
        out_arrays_.emplace_back(out);
        return Status::OK();
      }
    }
    std::string dtype_string;
    RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast<PyObject*>(dtype_),
                                                  &dtype_string));
    return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string);
  }

  auto AppendNonNullValue = [&](const uint8_t* data) {
    if (is_binary_type) {
      if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
        return builder.Append(data, static_cast<int32_t>(itemsize_));
      } else {
        return Status::Invalid("Encountered non-UTF8 binary value: ",
                               HexEncode(data, itemsize_));
      }
    } else {
      // is_unicode_type case
      return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
                         &builder);
    }
  };

  if (mask_ != nullptr) {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        RETURN_NOT_OK(builder.AppendNull());
      } else {
        RETURN_NOT_OK(AppendNonNullValue(data));
      }
      data += stride_;
    }
  } else {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(AppendNonNullValue(data));
      data += stride_;
    }
  }

  ArrayVector result;
  RETURN_NOT_OK(builder.Finish(&result));
  for (auto arr : result) {
    RETURN_NOT_OK(PushArray(arr->data()));
  }
  return Status::OK();
}
/// Convert a NumPy structured array to Arrow struct array chunks.
///
/// Each struct field is extracted as a sub-array and converted by its own
/// NumPyConverter. Because children may come back chunked differently, the
/// validity bitmap (wrapped as a Boolean array) and all children are
/// rechunked consistently before the struct chunks are assembled.
Status NumPyConverter::Visit(const StructType& type) {
  std::vector<NumPyConverter> sub_converters;
  std::vector<OwnedRefNoGIL> sub_arrays;

  {
    PyAcquireGIL gil_lock;

    // Create converters for each struct type field
    if (PyDataType_FIELDS(dtype_) == NULL || !PyDict_Check(PyDataType_FIELDS(dtype_))) {
      return Status::TypeError("Expected struct array");
    }

    for (auto field : type.fields()) {
      PyObject* tup;
      PyDict_GetItemStringRef(PyDataType_FIELDS(dtype_), field->name().c_str(), &tup);
      RETURN_IF_PYERROR();
      OwnedRef tupref(tup);
      if (tup == NULL) {
        return Status::Invalid("Missing field '", field->name(), "' in struct array");
      }
      PyArray_Descr* sub_dtype =
          reinterpret_cast<PyArray_Descr*>(PyTuple_GET_ITEM(tup, 0));
      DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type));
      int offset = static_cast<int>(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1)));
      RETURN_IF_PYERROR();
      Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */
      PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset);
      RETURN_IF_PYERROR();
      // Keep the field view alive for as long as its converter uses it.
      sub_arrays.emplace_back(sub_array);
      sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
                                  from_pandas_);
    }
  }

  std::vector<ArrayVector> groups;
  int64_t null_count = 0;

  // Compute null bitmap and store it as a Boolean Array to include it
  // in the rechunking below
  {
    if (mask_ != nullptr) {
      // The bitmap must exist before MaskToBitmap writes into it.
      RETURN_NOT_OK(InitNullBitmap());
      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
      // Check the value just returned, not the (unrelated) member counter.
      if (null_count == -1) return Status::Invalid("Invalid mask type");
    }
    groups.push_back({std::make_shared<BooleanArray>(length_, null_bitmap_)});
  }

  // Convert child data
  for (auto& converter : sub_converters) {
    RETURN_NOT_OK(converter.Convert());
    groups.push_back(converter.result());
  }
  // Ensure the different array groups are chunked consistently
  groups = ::arrow::internal::RechunkArraysConsistently(groups);

  // Make struct array chunks by combining groups
  size_t ngroups = groups.size();
  size_t nchunks = groups[0].size();
  for (size_t chunk = 0; chunk < nchunks; chunk++) {
    // First group has the null bitmaps as Boolean Arrays
    const auto& null_data = groups[0][chunk]->data();
    DCHECK_EQ(null_data->type->id(), Type::BOOL);
    DCHECK_EQ(null_data->buffers.size(), 2);
    const auto& null_buffer = null_data->buffers[1];
    // Careful: the rechunked null bitmap may have a non-zero offset
    // to its buffer, and it may not even start on a byte boundary
    int64_t null_offset = null_data->offset;
    std::shared_ptr<Buffer> fixed_null_buffer;

    if (!null_buffer) {
      fixed_null_buffer = null_buffer;
    } else if (null_offset % 8 == 0) {
      // Byte-aligned: a zero-copy slice of the parent buffer is enough.
      fixed_null_buffer =
          std::make_shared<Buffer>(null_buffer,
                                   // byte offset
                                   null_offset / 8,
                                   // byte size
                                   bit_util::BytesForBits(null_data->length));
    } else {
      // Not byte-aligned: the bits have to be copied to start at bit 0.
      ARROW_ASSIGN_OR_RAISE(
          fixed_null_buffer,
          CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length));
    }

    // Create struct array chunk and populate it
    auto arr_data =
        ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0);
    arr_data->buffers.push_back(fixed_null_buffer);
    // Append child chunks
    for (size_t i = 1; i < ngroups; i++) {
      arr_data->child_data.push_back(groups[i][chunk]->data());
    }
    RETURN_NOT_OK(PushArray(arr_data));
  }

  return Status::OK();
}
/// \brief Convert a 1-dimensional NumPy array to an Arrow ChunkedArray.
///
/// \param[in] pool memory pool used for allocations
/// \param[in] ao the NumPy array
/// \param[in] mo optional mask (null or Py_None for no mask)
/// \param[in] from_pandas whether pandas null sentinels are honoured
/// \param[in] type target Arrow type
/// \param[in] cast_options options used when a cast is required
/// \param[out] out receives the converted data
/// \return TypeError if `ao` is not a NumPy array, Invalid if it is not
///         1-dimensional, otherwise the status of the conversion
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      const compute::CastOptions& cast_options,
                      std::shared_ptr<ChunkedArray>* out) {
  if (!PyArray_Check(ao)) {
    // This code path cannot be reached by Python unit tests currently so this
    // is only a sanity check.
    return Status::TypeError("Input object was not a NumPy array");
  }
  // Checked before constructing the converter, whose constructor reads the
  // first stride of the array.
  if (PyArray_NDIM(reinterpret_cast<PyArrayObject*>(ao)) != 1) {
    return Status::Invalid("only handle 1-dimensional arrays");
  }

  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
  RETURN_NOT_OK(converter.Convert());
  const auto& output_arrays = converter.result();
  DCHECK_GT(output_arrays.size(), 0);
  *out = std::make_shared<ChunkedArray>(output_arrays);
  return Status::OK();
}
/// \brief Convenience overload of NdarrayToArrow using default cast options.
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                      const std::shared_ptr<DataType>& type,
                      std::shared_ptr<ChunkedArray>* out) {
  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
}
} // namespace py
} // namespace arrow