include/arrow/util/value_parsing.h · arrow-nightlies/pyarrow

arrow-nightlies / pyarrow python

Repository URL to install this package:
Version: 19.0.0.dev259

/ include / arrow / util / value_parsing.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// This is a private header for string-to-number parsing utilities

#pragma once

#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>

#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/config.h"
#include "arrow/util/macros.h"
#include "arrow/util/time.h"
#include "arrow/util/visibility.h"
#include "arrow/vendored/datetime.h"
#include "arrow/vendored/strptime.h"

namespace arrow {

/// \brief A virtual string to timestamp parser
class ARROW_EXPORT TimestampParser {
 public:
  virtual ~TimestampParser() = default;

  virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
                          int64_t* out,
                          bool* out_zone_offset_present = NULLPTR) const = 0;

  virtual const char* kind() const = 0;

  virtual const char* format() const;

  /// \brief Create a TimestampParser that recognizes strptime-like format strings
  static std::shared_ptr<TimestampParser> MakeStrptime(std::string format);

  /// \brief Create a TimestampParser that recognizes (locale-agnostic) ISO8601
  /// timestamps
  static std::shared_ptr<TimestampParser> MakeISO8601();
};

namespace internal {

/// \brief The entry point for conversion from strings.
///
/// Specializations of StringConverter for `ARROW_TYPE` must define:
/// - A default constructible member type `value_type` which will be yielded on a
///   successful parse.
/// - The static member function `Convert`, callable with signature
///   `(const ARROW_TYPE& t, const char* s, size_t length, value_type* out)`.
///   `Convert` returns truthy for successful parses and assigns the parsed values to
///   `*out`. Parameters required for parsing (for example a timestamp's TimeUnit)
///   are acquired from the type parameter `t`.
template <typename ARROW_TYPE, typename Enable = void>
struct StringConverter;

template <typename T>
struct is_parseable {
  template <typename U, typename = typename StringConverter<U>::value_type>
  static std::true_type Test(U*);

  template <typename U>
  static std::false_type Test(...);

  static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
};

template <typename T, typename R = void>
using enable_if_parseable = enable_if_t<is_parseable<T>::value, R>;

template <>
struct StringConverter<BooleanType> {
  using value_type = bool;

  bool Convert(const BooleanType&, const char* s, size_t length, value_type* out) {
    if (length == 1) {
      // "0" or "1"?
      if (s[0] == '0') {
        *out = false;
        return true;
      }
      if (s[0] == '1') {
        *out = true;
        return true;
      }
      return false;
    }
    if (length == 4) {
      // "true"?
      *out = true;
      return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
              (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
    }
    if (length == 5) {
      // "false"?
      *out = false;
      return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
              (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
              (s[4] == 'e' || s[4] == 'E'));
    }
    return false;
  }
};

// Ideas for faster float parsing:
// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
// - https://github.com/google/double-conversion [used here]
// - https://github.com/achan001/dtoa-fast

ARROW_EXPORT
bool StringToFloat(const char* s, size_t length, char decimal_point, float* out);

ARROW_EXPORT
bool StringToFloat(const char* s, size_t length, char decimal_point, double* out);

ARROW_EXPORT
bool StringToFloat(const char* s, size_t length, char decimal_point, uint16_t* out);

template <>
struct StringConverter<FloatType> {
  using value_type = float;

  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}

  bool Convert(const FloatType&, const char* s, size_t length, value_type* out) {
    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
  }

 private:
  const char decimal_point;
};

template <>
struct StringConverter<DoubleType> {
  using value_type = double;

  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}

  bool Convert(const DoubleType&, const char* s, size_t length, value_type* out) {
    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
  }

 private:
  const char decimal_point;
};

template <>
struct StringConverter<HalfFloatType> {
  using value_type = uint16_t;

  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}

  bool Convert(const HalfFloatType&, const char* s, size_t length, value_type* out) {
    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
  }

 private:
  const char decimal_point;
};

// NOTE: HalfFloatType would require a half<->float conversion library

inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); }

#define PARSE_UNSIGNED_ITERATION(C_TYPE)          \
  if (length > 0) {                               \
    uint8_t digit = ParseDecimalDigit(*s++);      \
    result = static_cast<C_TYPE>(result * 10U);   \
    length--;                                     \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {        \
      /* Non-digit */                             \
      return false;                               \
    }                                             \
    result = static_cast<C_TYPE>(result + digit); \
  } else {                                        \
    break;                                        \
  }

#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE)                                     \
  if (length > 0) {                                                               \
    if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    uint8_t digit = ParseDecimalDigit(*s++);                                      \
    result = static_cast<C_TYPE>(result * 10U);                                   \
    C_TYPE new_result = static_cast<C_TYPE>(result + digit);                      \
    if (ARROW_PREDICT_FALSE(--length > 0)) {                                      \
      /* Too many digits */                                                       \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(digit > 9U)) {                                        \
      /* Non-digit */                                                             \
      return false;                                                               \
    }                                                                             \
    if (ARROW_PREDICT_FALSE(new_result < result)) {                               \
      /* Overflow */                                                              \
      return false;                                                               \
    }                                                                             \
    result = new_result;                                                          \
  }

inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
  uint8_t result = 0;

  do {
    PARSE_UNSIGNED_ITERATION(uint8_t);
    PARSE_UNSIGNED_ITERATION(uint8_t);
    PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
  } while (false);
  *out = result;
  return true;
}

inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
  uint16_t result = 0;
  do {
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION(uint16_t);
    PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
  } while (false);
  *out = result;
  return true;
}

inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
  uint32_t result = 0;
  do {
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);

    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);
    PARSE_UNSIGNED_ITERATION(uint32_t);

    PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
  } while (false);
  *out = result;
  return true;
}

inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
  uint64_t result = 0;
  do {
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);
    PARSE_UNSIGNED_ITERATION(uint64_t);

    PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
  } while (false);
  *out = result;
  return true;
}

#undef PARSE_UNSIGNED_ITERATION
#undef PARSE_UNSIGNED_ITERATION_LAST

template <typename T>
bool ParseHex(const char* s, size_t length, T* out) {
  // lets make sure that the length of the string is not too big
  if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) {
    return false;
  }
  T result = 0;
  for (size_t i = 0; i < length; i++) {
    result = static_cast<T>(result << 4);
    if (s[i] >= '0' && s[i] <= '9') {
      result = static_cast<T>(result | (s[i] - '0'));
    } else if (s[i] >= 'A' && s[i] <= 'F') {
      result = static_cast<T>(result | (s[i] - 'A' + 10));
    } else if (s[i] >= 'a' && s[i] <= 'f') {
      result = static_cast<T>(result | (s[i] - 'a' + 10));
    } else {
      /* Non-digit */
      return false;
    }
  }
  *out = result;
  return true;
}

template <class ARROW_TYPE>
struct StringToUnsignedIntConverterMixin {
  using value_type = typename ARROW_TYPE::c_type;

  bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
    if (ARROW_PREDICT_FALSE(length == 0)) {
      return false;
    }
    // If it starts with 0x then its hex
    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
      length -= 2;
      s += 2;

      return ARROW_PREDICT_TRUE(ParseHex(s, length, out));
    }
    // Skip leading zeros
    while (length > 0 && *s == '0') {
      length--;
      s++;
    }
    return ParseUnsigned(s, length, out);
  }
};

template <>
struct StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {
  using StringToUnsignedIntConverterMixin<UInt8Type>::StringToUnsignedIntConverterMixin;
};

template <>
struct StringConverter<UInt16Type>
    : public StringToUnsignedIntConverterMixin<UInt16Type> {
  using StringToUnsignedIntConverterMixin<UInt16Type>::StringToUnsignedIntConverterMixin;
};

template <>
struct StringConverter<UInt32Type>
    : public StringToUnsignedIntConverterMixin<UInt32Type> {
  using StringToUnsignedIntConverterMixin<UInt32Type>::StringToUnsignedIntConverterMixin;
};

template <>
struct StringConverter<UInt64Type>
    : public StringToUnsignedIntConverterMixin<UInt64Type> {
  using StringToUnsignedIntConverterMixin<UInt64Type>::StringToUnsignedIntConverterMixin;
};

template <class ARROW_TYPE>
struct StringToSignedIntConverterMixin {
  using value_type = typename ARROW_TYPE::c_type;
  using unsigned_type = typename std::make_unsigned<value_type>::type;

  bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
    static constexpr auto max_positive =
        static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
    // Assuming two's complement
    static constexpr unsigned_type max_negative = max_positive + 1;
    bool negative = false;
    unsigned_type unsigned_value = 0;

    if (ARROW_PREDICT_FALSE(length == 0)) {
      return false;
    }
    // If it starts with 0x then its hex
    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
      length -= 2;
      s += 2;

      if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) {
        return false;
      }
      *out = static_cast<value_type>(unsigned_value);
      return true;
    }

    if (*s == '-') {
      negative = true;
      s++;
      if (--length == 0) {
        return false;
      }
    }
    // Skip leading zeros
    while (length > 0 && *s == '0') {
      length--;
      s++;
    }
    if (!ARROW_PREDICT_TRUE(ParseUnsigned(s, length, &unsigned_value))) {
      return false;
    }
    if (negative) {
      if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
        return false;
      }
      // To avoid both compiler warnings (with unsigned negation)
      // and undefined behaviour (with signed negation overflow),
      // use the expanded formula for 2's complement negation.
      *out = static_cast<value_type>(~unsigned_value + 1);
    } else {
      if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
        return false;
      }
      *out = static_cast<value_type>(unsigned_value);
    }
    return true;
  }
};

template <>
struct StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {
  using StringToSignedIntConverterMixin<Int8Type>::StringToSignedIntConverterMixin;
};

template <>
struct StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {
  using StringToSignedIntConverterMixin<Int16Type>::StringToSignedIntConverterMixin;
};

template <>
struct StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {
  using StringToSignedIntConverterMixin<Int32Type>::StringToSignedIntConverterMixin;
};

template <>
struct StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {
  using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
};

namespace detail {

// Inline-able ISO-8601 parser

using ts_type = TimestampType::c_type;

template <typename Duration>
static inline bool ParseHH(const char* s, Duration* out) {
  uint8_t hours = 0;
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(hours >= 24)) {
    return false;
  }
  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours));
  return true;
}

template <typename Duration>
static inline bool ParseHH_MM(const char* s, Duration* out) {
  uint8_t hours = 0;
  uint8_t minutes = 0;
  if (ARROW_PREDICT_FALSE(s[2] != ':')) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(hours >= 24)) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
    return false;
  }
  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
                                              std::chrono::minutes(minutes));
  return true;
}

template <typename Duration>
static inline bool ParseHHMM(const char* s, Duration* out) {
  uint8_t hours = 0;
  uint8_t minutes = 0;
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 2, 2, &minutes))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(hours >= 24)) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
    return false;
  }
  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
                                              std::chrono::minutes(minutes));
  return true;
}

template <typename Duration>
static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
  uint8_t hours = 0;
  uint8_t minutes = 0;
  uint8_t seconds = 0;
  if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(hours >= 24)) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(seconds >= 60)) {
    return false;
  }
  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
                                              std::chrono::minutes(minutes) +
                                              std::chrono::seconds(seconds));
  return true;
}

static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
                                   uint32_t* out) {
  // The decimal point has been peeled off at this point

  // Fail if number of decimal places provided exceeds what the unit can hold.
  // Calculate how many trailing decimal places are omitted for the unit
  // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
  size_t omitted = 0;
  switch (unit) {
    case TimeUnit::MILLI:
      if (ARROW_PREDICT_FALSE(length > 3)) {
        return false;
      }
      if (length < 3) {
        omitted = 3 - length;
      }
      break;
    case TimeUnit::MICRO:
      if (ARROW_PREDICT_FALSE(length > 6)) {
        return false;
      }
      if (length < 6) {
        omitted = 6 - length;
      }
      break;
    case TimeUnit::NANO:
      if (ARROW_PREDICT_FALSE(length > 9)) {
        return false;
      }
      if (length < 9) {
        omitted = 9 - length;
      }
      break;
    default:
      return false;
  }

  if (ARROW_PREDICT_TRUE(omitted == 0)) {
    return ParseUnsigned(s, length, out);
  } else {
    uint32_t subseconds = 0;
    bool success = ParseUnsigned(s, length, &subseconds);
    if (ARROW_PREDICT_TRUE(success)) {
      switch (omitted) {
        case 1:
          *out = subseconds * 10;
          break;
        case 2:
          *out = subseconds * 100;
          break;
        case 3:
          *out = subseconds * 1000;
          break;
        case 4:
          *out = subseconds * 10000;
          break;
        case 5:
          *out = subseconds * 100000;
          break;
        case 6:
          *out = subseconds * 1000000;
          break;
        case 7:
          *out = subseconds * 10000000;
          break;
        case 8:
          *out = subseconds * 100000000;
          break;
        default:
          // Impossible case
          break;
      }
      return true;
    } else {
      return false;
    }
  }
}

}  // namespace detail

template <typename Duration>
static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
  uint16_t year = 0;
  uint8_t month = 0;
  uint8_t day = 0;
  if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
    return false;
  }
  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
    return false;
  }
  arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
                                           arrow_vendored::date::month{month},
                                           arrow_vendored::date::day{day}};
  if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;

  *since_epoch = std::chrono::duration_cast<Duration>(
      arrow_vendored::date::sys_days{ymd}.time_since_epoch());
  return true;
}

static inline bool ParseTimestampISO8601(const char* s, size_t length,
                                         TimeUnit::type unit, TimestampType::c_type* out,
                                         bool* out_zone_offset_present = NULLPTR) {
  using seconds_type = std::chrono::duration<TimestampType::c_type>;

  // We allow the following zone offset formats:
  // - (none)
  // - Z
  // - [+-]HH(:?MM)?
  //
  // We allow the following formats for all units:
  // - "YYYY-MM-DD"
  // - "YYYY-MM-DD[ T]hhZ?"
  // - "YYYY-MM-DD[ T]hh:mmZ?"
  // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
  //
  // We allow the following formats for unit == MILLI, MICRO, or NANO:
  // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
  //
  // We allow the following formats for unit == MICRO, or NANO:
  // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
  //
  // We allow the following formats for unit == NANO:
  // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
  //
  // UTC is always assumed, and the DataType's timezone is ignored.
  //

  if (ARROW_PREDICT_FALSE(length < 10)) return false;

  seconds_type seconds_since_epoch;
  if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
    return false;
  }

  if (length == 10) {
    *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
    return true;
  }

  if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
    return false;
  }

  if (out_zone_offset_present) {
    *out_zone_offset_present = false;
  }

  seconds_type zone_offset(0);
  if (s[length - 1] == 'Z') {
    --length;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  } else if (s[length - 3] == '+' || s[length - 3] == '-') {
    // [+-]HH
    length -= 3;
    if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + length + 1, &zone_offset))) {
      return false;
    }
    if (s[length] == '+') zone_offset *= -1;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  } else if (s[length - 5] == '+' || s[length - 5] == '-') {
    // [+-]HHMM
    length -= 5;
    if (ARROW_PREDICT_FALSE(!detail::ParseHHMM(s + length + 1, &zone_offset))) {
      return false;
    }
    if (s[length] == '+') zone_offset *= -1;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  } else if ((s[length - 6] == '+' || s[length - 6] == '-') && (s[length - 3] == ':')) {
    // [+-]HH:MM
    length -= 6;
    if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + length + 1, &zone_offset))) {
      return false;
    }
    if (s[length] == '+') zone_offset *= -1;
    if (out_zone_offset_present) *out_zone_offset_present = true;
  }

  seconds_type seconds_since_midnight;
  switch (length) {
    case 13:  // YYYY-MM-DD[ T]hh
      if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + 11, &seconds_since_midnight))) {
        return false;
      }
      break;
    case 16:  // YYYY-MM-DD[ T]hh:mm
      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + 11, &seconds_since_midnight))) {
        return false;
      }
      break;
    case 19:  // YYYY-MM-DD[ T]hh:mm:ss
    case 21:  // YYYY-MM-DD[ T]hh:mm:ss.s
    case 22:  // YYYY-MM-DD[ T]hh:mm:ss.ss
    case 23:  // YYYY-MM-DD[ T]hh:mm:ss.sss
    case 24:  // YYYY-MM-DD[ T]hh:mm:ss.ssss
    case 25:  // YYYY-MM-DD[ T]hh:mm:ss.sssss
    case 26:  // YYYY-MM-DD[ T]hh:mm:ss.ssssss
    case 27:  // YYYY-MM-DD[ T]hh:mm:ss.sssssss
    case 28:  // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
    case 29:  // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
        return false;
      }
      break;
    default:
      return false;
  }

  seconds_since_epoch += seconds_since_midnight;
  seconds_since_epoch += zone_offset;

  if (length <= 19) {
    *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count());
    return true;
  }

  if (ARROW_PREDICT_FALSE(s[19] != '.')) {
    return false;
  }

  uint32_t subseconds = 0;
  if (ARROW_PREDICT_FALSE(
          !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
    return false;
  }

  *out = util::CastSecondsToUnit(unit, seconds_since_epoch.count()) + subseconds;
  return true;
}

#if defined(_WIN32) || defined(ARROW_WITH_MUSL)
static constexpr bool kStrptimeSupportsZone = false;
#else
static constexpr bool kStrptimeSupportsZone = true;
#endif

/// \brief Returns time since the UNIX epoch in the requested unit
static inline bool ParseTimestampStrptime(const char* buf, size_t length,
                                          const char* format, bool ignore_time_in_day,
                                          bool allow_trailing_chars, TimeUnit::type unit,
                                          int64_t* out) {
  // NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
  // The buffer may not be nul-terminated
  std::string clean_copy(buf, length);
  struct tm result;
  memset(&result, 0, sizeof(struct tm));
#ifdef _WIN32
  char* ret = arrow_strptime(clean_copy.c_str(), format, &result);
#else
  char* ret = strptime(clean_copy.c_str(), format, &result);
#endif
  if (ret == NULLPTR) {
    return false;
  }
  if (!allow_trailing_chars && static_cast<size_t>(ret - clean_copy.c_str()) != length) {
    return false;
  }
  // ignore the time part
  arrow_vendored::date::sys_seconds secs =
      arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
                                     (result.tm_mon + 1) / std::max(result.tm_mday, 1));
  if (!ignore_time_in_day) {
    secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
             std::chrono::seconds(result.tm_sec));
#ifndef _WIN32
    secs -= std::chrono::seconds(result.tm_gmtoff);
#endif
  }
  *out = util::CastSecondsToUnit(unit, secs.time_since_epoch().count());
  return true;
}

template <>
struct StringConverter<TimestampType> {
  using value_type = int64_t;

  bool Convert(const TimestampType& type, const char* s, size_t length, value_type* out) {
    return ParseTimestampISO8601(s, length, type.unit(), out);
  }
};

template <>
struct StringConverter<DurationType>
    : public StringToSignedIntConverterMixin<DurationType> {
  using StringToSignedIntConverterMixin<DurationType>::StringToSignedIntConverterMixin;
};

template <typename DATE_TYPE>
struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {
  using value_type = typename DATE_TYPE::c_type;

  using duration_type =
      typename std::conditional<std::is_same<DATE_TYPE, Date32Type>::value,
                                arrow_vendored::date::days,
                                std::chrono::milliseconds>::type;

  bool Convert(const DATE_TYPE& type, const char* s, size_t length, value_type* out) {
    if (ARROW_PREDICT_FALSE(length != 10)) {
      return false;
    }

    duration_type since_epoch;
    if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &since_epoch))) {
      return false;
    }

    *out = static_cast<value_type>(since_epoch.count());
    return true;
  }
};

template <typename TIME_TYPE>
struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
  using value_type = typename TIME_TYPE::c_type;

  // We allow the following formats for all units:
  // - "hh:mm"
  // - "hh:mm:ss"
  //
  // We allow the following formats for unit == MILLI, MICRO, or NANO:
  // - "hh:mm:ss.s{1,3}"
  //
  // We allow the following formats for unit == MICRO, or NANO:
  // - "hh:mm:ss.s{4,6}"
  //
  // We allow the following formats for unit == NANO:
  // - "hh:mm:ss.s{7,9}"

  bool Convert(const TIME_TYPE& type, const char* s, size_t length, value_type* out) {
    const auto unit = type.unit();
    std::chrono::seconds since_midnight;

    if (length == 5) {
      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
        return false;
      }
      *out =
          static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));
      return true;
    }

    if (ARROW_PREDICT_FALSE(length < 8)) {
      return false;
    }
    if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) {
      return false;
    }

    *out = static_cast<value_type>(util::CastSecondsToUnit(unit, since_midnight.count()));

    if (length == 8) {
      return true;
    }

    if (ARROW_PREDICT_FALSE(s[8] != '.')) {
      return false;
    }

    uint32_t subseconds_count = 0;
    if (ARROW_PREDICT_FALSE(
            !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
      return false;
    }

    *out += subseconds_count;
    return true;
  }
};

/// \brief Convenience wrappers around internal::StringConverter.
template <typename T>
bool ParseValue(const T& type, const char* s, size_t length,
                typename StringConverter<T>::value_type* out) {
  return StringConverter<T>{}.Convert(type, s, length, out);
}

template <typename T>
enable_if_parameter_free<T, bool> ParseValue(
    const char* s, size_t length, typename StringConverter<T>::value_type* out) {
  static T type;
  return StringConverter<T>{}.Convert(type, s, length, out);
}

}  // namespace internal
}  // namespace arrow
arrow-nightlies / pyarrow python

Version: 19.0.0.dev259

/ include / arrow / util / value_parsing.h

Products

About

Resources

Contact Gemfury