src/arrow/python/inference.cc · arrow-nightlies/pyarrow

arrow-nightlies / pyarrow python

Repository URL to install this package:
Version: 19.0.0.dev246

/ src / arrow / python / inference.cc

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/python/inference.h"
#include "arrow/python/numpy_interop.h"

#include <datetime.h>

#include <algorithm>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"

#include "arrow/python/datetime.h"
#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/python/iterators.h"
#include "arrow/python/numpy_convert.h"

namespace arrow {
namespace py {
namespace {
// Assigns a tuple to interval_types_tuple containing the nametuple for
// MonthDayNanoIntervalType and if present dateutil's relativedelta and
// pandas DateOffset.
Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) {
  OwnedRef relative_delta_module;
  // These are Optional imports so swallow errors.
  OwnedRef relative_delta_type;
  // Try to import pandas to get types.
  internal::InitPandasStaticData();
  if (internal::ImportModule("dateutil.relativedelta", &relative_delta_module).ok()) {
    RETURN_NOT_OK(internal::ImportFromModule(relative_delta_module.obj(), "relativedelta",
                                             &relative_delta_type));
  }

  PyObject* date_offset_type = internal::BorrowPandasDataOffsetType();
  interval_types_tuple->reset(
      PyTuple_New(1 + (date_offset_type != nullptr ? 1 : 0) +
                  (relative_delta_type.obj() != nullptr ? 1 : 0)));
  RETURN_IF_PYERROR();
  int index = 0;
  PyTuple_SetItem(interval_types_tuple->obj(), index++,
                  internal::NewMonthDayNanoTupleType());
  RETURN_IF_PYERROR();
  if (date_offset_type != nullptr) {
    Py_XINCREF(date_offset_type);
    PyTuple_SetItem(interval_types_tuple->obj(), index++, date_offset_type);
    RETURN_IF_PYERROR();
  }
  if (relative_delta_type.obj() != nullptr) {
    PyTuple_SetItem(interval_types_tuple->obj(), index++, relative_delta_type.detach());
    RETURN_IF_PYERROR();
  }
  return Status::OK();
}

}  // namespace

#define _NUMPY_UNIFY_NOOP(DTYPE) \
  case NPY_##DTYPE:              \
    return OK;

#define _NUMPY_UNIFY_PROMOTE(DTYPE) \
  case NPY_##DTYPE:                 \
    current_type_num_ = dtype;      \
    current_dtype_ = descr;         \
    return OK;

#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE)               \
  case NPY_##DTYPE:                                            \
    current_type_num_ = NPY_##NEW_TYPE;                        \
    current_dtype_ = PyArray_DescrFromType(current_type_num_); \
    return OK;

// Form a consensus NumPy dtype to use for Arrow conversion for a
// collection of dtype objects observed one at a time
class NumPyDtypeUnifier {
 public:
  enum Action { OK, INVALID };

  NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {}

  Status InvalidMix(int new_dtype) {
    return Status::Invalid("Cannot mix NumPy dtypes ",
                           GetNumPyTypeName(current_type_num_), " and ",
                           GetNumPyTypeName(new_dtype));
  }

  int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; }

  int Observe_INT8(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_PROMOTE(INT16);
      _NUMPY_UNIFY_PROMOTE(INT32);
      _NUMPY_UNIFY_PROMOTE(INT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT32);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_INT16(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(INT8);
      _NUMPY_UNIFY_PROMOTE(INT32);
      _NUMPY_UNIFY_PROMOTE(INT64);
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_PROMOTE(FLOAT32);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_INT32(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(INT8);
      _NUMPY_UNIFY_NOOP(INT16);
      _NUMPY_UNIFY_PROMOTE(INT32);
      _NUMPY_UNIFY_PROMOTE(INT64);
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_NOOP(UINT16);
      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_INT64(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(INT8);
      _NUMPY_UNIFY_NOOP(INT16);
      _NUMPY_UNIFY_NOOP(INT32);
      _NUMPY_UNIFY_NOOP(INT64);
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_NOOP(UINT16);
      _NUMPY_UNIFY_NOOP(UINT32);
      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_UINT8(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_PROMOTE(UINT16);
      _NUMPY_UNIFY_PROMOTE(UINT32);
      _NUMPY_UNIFY_PROMOTE(UINT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT32);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_UINT16(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_PROMOTE(UINT32);
      _NUMPY_UNIFY_PROMOTE(UINT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT32);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_UINT32(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_NOOP(UINT16);
      _NUMPY_UNIFY_PROMOTE(UINT64);
      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_UINT64(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_NOOP(UINT16);
      _NUMPY_UNIFY_NOOP(UINT32);
      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_FLOAT16(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_PROMOTE(FLOAT32);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_FLOAT32(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(INT8);
      _NUMPY_UNIFY_NOOP(INT16);
      _NUMPY_UNIFY_NOOP(INT32);
      _NUMPY_UNIFY_NOOP(INT64);
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_NOOP(UINT16);
      _NUMPY_UNIFY_NOOP(UINT32);
      _NUMPY_UNIFY_NOOP(UINT64);
      _NUMPY_UNIFY_PROMOTE(FLOAT64);
      default:
        return INVALID;
    }
  }

  int Observe_FLOAT64(PyArray_Descr* descr, int dtype) {
    switch (dtype) {
      _NUMPY_UNIFY_NOOP(INT8);
      _NUMPY_UNIFY_NOOP(INT16);
      _NUMPY_UNIFY_NOOP(INT32);
      _NUMPY_UNIFY_NOOP(INT64);
      _NUMPY_UNIFY_NOOP(UINT8);
      _NUMPY_UNIFY_NOOP(UINT16);
      _NUMPY_UNIFY_NOOP(UINT32);
      _NUMPY_UNIFY_NOOP(UINT64);
      default:
        return INVALID;
    }
  }

  int Observe_DATETIME(PyArray_Descr* dtype_obj) {
    // TODO: check that units are all the same
    return OK;
  }

  Status Observe(PyArray_Descr* descr) {
    int dtype = fix_numpy_type_num(descr->type_num);

    if (current_type_num_ == -1) {
      current_dtype_ = descr;
      current_type_num_ = dtype;
      return Status::OK();
    } else if (current_type_num_ == dtype) {
      return Status::OK();
    }

#define OBSERVE_CASE(DTYPE)                 \
  case NPY_##DTYPE:                         \
    action = Observe_##DTYPE(descr, dtype); \
    break;

    int action = OK;
    switch (current_type_num_) {
      OBSERVE_CASE(BOOL);
      OBSERVE_CASE(INT8);
      OBSERVE_CASE(INT16);
      OBSERVE_CASE(INT32);
      OBSERVE_CASE(INT64);
      OBSERVE_CASE(UINT8);
      OBSERVE_CASE(UINT16);
      OBSERVE_CASE(UINT32);
      OBSERVE_CASE(UINT64);
      OBSERVE_CASE(FLOAT16);
      OBSERVE_CASE(FLOAT32);
      OBSERVE_CASE(FLOAT64);
      case NPY_DATETIME:
        action = Observe_DATETIME(descr);
        break;
      default:
        return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype));
    }

    if (action == INVALID) {
      return InvalidMix(dtype);
    }
    return Status::OK();
  }

  bool dtype_was_observed() const { return current_type_num_ != -1; }

  PyArray_Descr* current_dtype() const { return current_dtype_; }

  int current_type_num() const { return current_type_num_; }

 private:
  int current_type_num_;
  PyArray_Descr* current_dtype_;
};

class TypeInferrer {
  // A type inference visitor for Python values
 public:
  // \param validate_interval the number of elements to observe before checking
  // whether the data is mixed type or has other problems. This helps avoid
  // excess computation for each element while also making sure we "bail out"
  // early with long sequences that may have problems up front
  // \param make_unions permit mixed-type data by creating union types (not yet
  // implemented)
  explicit TypeInferrer(bool pandas_null_sentinels = false,
                        int64_t validate_interval = 100, bool make_unions = false)
      : pandas_null_sentinels_(pandas_null_sentinels),
        validate_interval_(validate_interval),
        make_unions_(make_unions),
        total_count_(0),
        none_count_(0),
        bool_count_(0),
        int_count_(0),
        date_count_(0),
        time_count_(0),
        timestamp_micro_count_(0),
        duration_count_(0),
        float_count_(0),
        binary_count_(0),
        unicode_count_(0),
        decimal_count_(0),
        list_count_(0),
        struct_count_(0),
        arrow_scalar_count_(0),
        numpy_dtype_count_(0),
        interval_count_(0),
        max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
                              std::numeric_limits<int32_t>::min()),
        decimal_type_() {
    ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_));
    ARROW_CHECK_OK(ImportPresentIntervalTypes(&interval_types_));
  }

  /// \param[in] obj a Python object in the sequence
  /// \param[out] keep_going if sufficient information has been gathered to
  /// attempt to begin converting the sequence, *keep_going will be set to true
  /// to signal to the calling visitor loop to terminate
  Status Visit(PyObject* obj, bool* keep_going) {
    ++total_count_;

    if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) {
      ++none_count_;
    } else if (PyBool_Check(obj)) {
      ++bool_count_;
      *keep_going = make_unions_;
    } else if (PyFloat_Check(obj)) {
      ++float_count_;
      *keep_going = make_unions_;
    } else if (internal::IsPyInteger(obj)) {
      ++int_count_;
    } else if (PyDateTime_Check(obj)) {
      // infer timezone from the first encountered datetime object
      if (!timestamp_micro_count_) {
        OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo"));
        if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) {
          ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj()));
        }
      }
      ++timestamp_micro_count_;
      *keep_going = make_unions_;
    } else if (PyDelta_Check(obj)) {
      ++duration_count_;
      *keep_going = make_unions_;
    } else if (PyDate_Check(obj)) {
      ++date_count_;
      *keep_going = make_unions_;
    } else if (PyTime_Check(obj)) {
      ++time_count_;
      *keep_going = make_unions_;
    } else if (internal::IsPyBinary(obj)) {
      ++binary_count_;
      *keep_going = make_unions_;
    } else if (PyUnicode_Check(obj)) {
      ++unicode_count_;
      *keep_going = make_unions_;
    } else if (arrow::py::is_scalar(obj)) {
      RETURN_NOT_OK(VisitArrowScalar(obj, keep_going));
    } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
      RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going));
    } else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) {
      RETURN_NOT_OK(VisitSet(obj, keep_going));
    } else if (has_numpy() && PyArray_Check(obj)) {
      RETURN_NOT_OK(VisitNdarray(obj, keep_going));
    } else if (PyDict_Check(obj)) {
      RETURN_NOT_OK(VisitDict(obj));
    } else if (PyList_Check(obj) ||
               (PyTuple_Check(obj) &&
                !PyObject_IsInstance(obj, PyTuple_GetItem(interval_types_.obj(), 0)))) {
      RETURN_NOT_OK(VisitList(obj, keep_going));
    } else if (PyObject_IsInstance(obj, decimal_type_.obj())) {
      RETURN_NOT_OK(max_decimal_metadata_.Update(obj));
      ++decimal_count_;
    } else if (PyObject_IsInstance(obj, interval_types_.obj())) {
      ++interval_count_;
    } else {
      return internal::InvalidValue(obj,
                                    "did not recognize Python value type when inferring "
                                    "an Arrow data type");
    }

    if (total_count_ % validate_interval_ == 0) {
      RETURN_NOT_OK(Validate());
    }

    return Status::OK();
  }

  // Infer value type from a sequence of values
  Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) {
    if (mask == nullptr || mask == Py_None) {
      return internal::VisitSequence(
          obj, /*offset=*/0,
          [this](PyObject* value, bool* keep_going) { return Visit(value, keep_going); });
    } else {
      return internal::VisitSequenceMasked(
          obj, mask, /*offset=*/0,
          [this](PyObject* value, uint8_t masked, bool* keep_going) {
            if (!masked) {
              return Visit(value, keep_going);
            } else {
              return Status::OK();
            }
          });
    }
  }

  // Infer value type from a sequence of values
  Status VisitIterable(PyObject* obj) {
    return internal::VisitIterable(obj, [this](PyObject* value, bool* keep_going) {
      return Visit(value, keep_going);
    });
  }

  Status GetType(std::shared_ptr<DataType>* out) {
    // TODO(wesm): handling forming unions
    if (make_unions_) {
      return Status::NotImplemented("Creating union types not yet supported");
    }

    RETURN_NOT_OK(Validate());

    if (arrow_scalar_count_ > 0 && arrow_scalar_count_ + none_count_ != total_count_) {
      return Status::Invalid(
          "pyarrow scalars cannot be mixed "
          "with other Python scalar values currently");
    }

    if (numpy_dtype_count_ > 0) {
      // All NumPy scalars and Nones/nulls
      if (numpy_dtype_count_ + none_count_ == total_count_) {
        return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out);
      }

      // The "bad path": data contains a mix of NumPy scalars and
      // other kinds of scalars. Note this can happen innocuously
      // because numpy.nan is not a NumPy scalar (it's a built-in
      // PyFloat)

      // TODO(ARROW-5564): Merge together type unification so this
      // hack is not necessary
      switch (numpy_unifier_.current_type_num()) {
        case NPY_BOOL:
          bool_count_ += numpy_dtype_count_;
          break;
        case NPY_INT8:
        case NPY_INT16:
        case NPY_INT32:
        case NPY_INT64:
        case NPY_UINT8:
        case NPY_UINT16:
        case NPY_UINT32:
        case NPY_UINT64:
          int_count_ += numpy_dtype_count_;
          break;
        case NPY_FLOAT32:
        case NPY_FLOAT64:
          float_count_ += numpy_dtype_count_;
          break;
        case NPY_DATETIME:
          return Status::Invalid(
              "numpy.datetime64 scalars cannot be mixed "
              "with other Python scalar values currently");
      }
    }

    if (list_count_) {
      std::shared_ptr<DataType> value_type;
      RETURN_NOT_OK(list_inferrer_->GetType(&value_type));
      *out = list(value_type);
    } else if (struct_count_) {
      RETURN_NOT_OK(GetStructType(out));
    } else if (decimal_count_) {
      if (max_decimal_metadata_.precision() > Decimal128Type::kMaxPrecision) {
        // the default constructor does not validate the precision and scale
        ARROW_ASSIGN_OR_RAISE(*out,
                              Decimal256Type::Make(max_decimal_metadata_.precision(),
                                                   max_decimal_metadata_.scale()));
      } else {
        ARROW_ASSIGN_OR_RAISE(*out,
                              Decimal128Type::Make(max_decimal_metadata_.precision(),
                                                   max_decimal_metadata_.scale()));
      }
    } else if (float_count_) {
      // Prioritize floats before integers
      *out = float64();
    } else if (int_count_) {
      *out = int64();
    } else if (date_count_) {
      *out = date32();
    } else if (time_count_) {
      *out = time64(TimeUnit::MICRO);
    } else if (timestamp_micro_count_) {
      *out = timestamp(TimeUnit::MICRO, timezone_);
    } else if (duration_count_) {
      *out = duration(TimeUnit::MICRO);
    } else if (bool_count_) {
      *out = boolean();
    } else if (binary_count_) {
      *out = binary();
    } else if (unicode_count_) {
      *out = utf8();
    } else if (interval_count_) {
      *out = month_day_nano_interval();
    } else if (arrow_scalar_count_) {
      *out = scalar_type_;
    } else {
      *out = null();
    }
    return Status::OK();
  }

  int64_t total_count() const { return total_count_; }

 protected:
  Status Validate() const {
    if (list_count_ > 0) {
      if (list_count_ + none_count_ != total_count_) {
        return Status::Invalid("cannot mix list and non-list, non-null values");
      }
      RETURN_NOT_OK(list_inferrer_->Validate());
    } else if (struct_count_ > 0) {
      if (struct_count_ + none_count_ != total_count_) {
        return Status::Invalid("cannot mix struct and non-struct, non-null values");
      }
      for (const auto& it : struct_inferrers_) {
        RETURN_NOT_OK(it.second.Validate());
      }
    }
    return Status::OK();
  }

  Status VisitArrowScalar(PyObject* obj, bool* keep_going /* unused */) {
    ARROW_ASSIGN_OR_RAISE(auto scalar, arrow::py::unwrap_scalar(obj));
    // Check that all the scalar types for the sequence are the same
    if (arrow_scalar_count_ > 0 && *scalar->type != *scalar_type_) {
      return internal::InvalidValue(obj, "cannot mix scalars with different types");
    }
    scalar_type_ = scalar->type;
    ++arrow_scalar_count_;
    return Status::OK();
  }

  Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
    // Continue visiting dtypes for now.
    // TODO(wesm): devise approach for unions
    ++numpy_dtype_count_;
    *keep_going = true;
    return numpy_unifier_.Observe(dtype);
  }

  Status VisitList(PyObject* obj, bool* keep_going /* unused */) {
    if (!list_inferrer_) {
      list_inferrer_.reset(
          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
    }
    ++list_count_;
    return list_inferrer_->VisitSequence(obj);
  }

  Status VisitSet(PyObject* obj, bool* keep_going /* unused */) {
    if (!list_inferrer_) {
      list_inferrer_.reset(
          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
    }
    ++list_count_;
    return list_inferrer_->VisitIterable(obj);
  }

  Status VisitNdarray(PyObject* obj, bool* keep_going) {
    PyArray_Descr* dtype = PyArray_DESCR(reinterpret_cast<PyArrayObject*>(obj));
    if (dtype->type_num == NPY_OBJECT) {
      return VisitList(obj, keep_going);
    }
    // Not an object array: infer child Arrow type from dtype
    if (!list_inferrer_) {
      list_inferrer_.reset(
          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
    }
    ++list_count_;

    // XXX(wesm): In ARROW-4324 I added accounting to check whether
    // all of the non-null values have NumPy dtypes, but the
    // total_count not being properly incremented here
    ++(*list_inferrer_).total_count_;
    return list_inferrer_->VisitDType(dtype, keep_going);
  }

  Status VisitDict(PyObject* obj) {
    PyObject* key_obj;
    PyObject* value_obj;
    Py_ssize_t pos = 0;

    while (PyDict_Next(obj, &pos, &key_obj, &value_obj)) {
      std::string key;
      if (PyUnicode_Check(key_obj)) {
        RETURN_NOT_OK(internal::PyUnicode_AsStdString(key_obj, &key));
      } else if (PyBytes_Check(key_obj)) {
        key = internal::PyBytes_AsStdString(key_obj);
      } else {
        return Status::TypeError("Expected dict key of type str or bytes, got '",
                                 Py_TYPE(key_obj)->tp_name, "'");
      }
      // Get or create visitor for this key
      auto it = struct_inferrers_.find(key);
      if (it == struct_inferrers_.end()) {
        it = struct_inferrers_
                 .insert(
                     std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
                                                      validate_interval_, make_unions_)))
                 .first;
      }
      TypeInferrer* visitor = &it->second;

      // We ignore termination signals from child visitors for now
      //
      // TODO(wesm): keep track of whether type inference has terminated for
      // the child visitors to avoid doing unneeded work
      bool keep_going = true;
      RETURN_NOT_OK(visitor->Visit(value_obj, &keep_going));
    }

    // We do not terminate visiting dicts since we want the union of all
    // observed keys
    ++struct_count_;
    return Status::OK();
  }

  Status GetStructType(std::shared_ptr<DataType>* out) {
    std::vector<std::shared_ptr<Field>> fields;
    for (auto&& it : struct_inferrers_) {
      std::shared_ptr<DataType> field_type;
      RETURN_NOT_OK(it.second.GetType(&field_type));
      fields.emplace_back(field(it.first, field_type));
    }
    *out = struct_(fields);
    return Status::OK();
  }

 private:
  bool pandas_null_sentinels_;
  int64_t validate_interval_;
  bool make_unions_;
  int64_t total_count_;
  int64_t none_count_;
  int64_t bool_count_;
  int64_t int_count_;
  int64_t date_count_;
  int64_t time_count_;
  int64_t timestamp_micro_count_;
  std::string timezone_;
  int64_t duration_count_;
  int64_t float_count_;
  int64_t binary_count_;
  int64_t unicode_count_;
  int64_t decimal_count_;
  int64_t list_count_;
  int64_t struct_count_;
  int64_t arrow_scalar_count_;
  int64_t numpy_dtype_count_;
  int64_t interval_count_;
  std::unique_ptr<TypeInferrer> list_inferrer_;
  std::map<std::string, TypeInferrer> struct_inferrers_;
  std::shared_ptr<DataType> scalar_type_;

  // If we observe a strongly-typed value in e.g. a NumPy array, we can store
  // it here to skip the type counting logic above
  NumPyDtypeUnifier numpy_unifier_;

  internal::DecimalMetadata max_decimal_metadata_;

  OwnedRefNoGIL decimal_type_;
  OwnedRefNoGIL interval_types_;
};

// Non-exhaustive type inference
Result<std::shared_ptr<DataType>> InferArrowType(PyObject* obj, PyObject* mask,
                                                 bool pandas_null_sentinels) {
  if (pandas_null_sentinels) {
    // ARROW-842: If pandas is not installed then null checks will be less
    // comprehensive, but that is okay.
    internal::InitPandasStaticData();
  }

  std::shared_ptr<DataType> out_type;
  TypeInferrer inferrer(pandas_null_sentinels);
  RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
  RETURN_NOT_OK(inferrer.GetType(&out_type));
  if (out_type == nullptr) {
    return Status::TypeError("Unable to determine data type");
  } else {
    return std::move(out_type);
  }
}

ARROW_PYTHON_EXPORT
bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); }

ARROW_PYTHON_EXPORT
bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); }

ARROW_PYTHON_EXPORT
bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); }

}  // namespace py
}  // namespace arrow
arrow-nightlies / pyarrow python

Version: 19.0.0.dev246

/ src / arrow / python / inference.cc

Products

About

Resources

Contact Gemfury