// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <string_view>
#include "parquet/platform.h"
#include "parquet/type_fwd.h"
#include "parquet/windows_fixup.h" // for OPTIONAL
// Forward declaration only: lets this header refer to arrow::util::Codec by
// name without needing the full class definition here.
namespace arrow::util {
class Codec;
} // namespace arrow::util
namespace parquet {
// ----------------------------------------------------------------------
// Metadata enums to match Thrift metadata
//
// The reason we maintain our own enums is to avoid transitive dependency on
// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
// public API. After building parquet-cpp, you should not need to include
// Thrift headers in your application. This means some boilerplate to convert
// between our types and Parquet's Thrift types.
//
// We can also add special values like NONE to distinguish between metadata
// values being set and not set. As an example, consider ConvertedType and
// CompressionCodec.
// Physical type of a column. Mirrors parquet::Type
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7,
    // Sentinel rather than a real physical type. Should always be the last
    // element, so it is expressed relative to the enumerator before it.
    UNDEFINED = FIXED_LEN_BYTE_ARRAY + 1
  };
};
// Legacy converted-type annotation. Mirrors parquet::ConvertedType
//
// Every enumerator carries its numeric value explicitly so that the gap
// before NA is visible at a glance.
struct ConvertedType {
  enum type {
    // Not a real converted type, but means no converted type is specified
    NONE = 0,
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    // Values 23 and 24 are intentionally not assigned here.
    //
    // DEPRECATED INVALID ConvertedType for all-null data.
    // Only useful for reading legacy files written out by interim Parquet C++
    // releases. For writing, always emit LogicalType::Null instead.
    // See PARQUET-1990.
    NA = 25,
    // Not a real converted type; should always be last element
    UNDEFINED = 26
  };
};
// Forward declaration of the Thrift-side LogicalType. Only the name is needed
// in this header (it appears in the FromThrift()/ToThrift() declarations), so
// the full definition is not pulled in here.
namespace format {
class LogicalType;
}
// Mirrors parquet::FieldRepetitionType
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2,
    // Always last; expressed relative to the enumerator before it.
    UNDEFINED = REPEATED + 1
  };
};
// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
// format/converter/ParquetMetadataConverter.java
// Sort order for page and column statistics. Types are associated with sort
// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
// aggregated using a sort order. As of parquet-format version 2.3.1, the
// order used to aggregate stats is always SIGNED and is not stored in the
// Parquet file. These stats are discarded for types that need unsigned.
// See PARQUET-686.
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
namespace schema {
// Decimal parameters passed alongside a legacy ConvertedType.
//
// This is a plain aggregate: declarations in this header brace-initialize it
// positionally as {isset, scale, precision} (see the {false, -1, -1} default
// arguments), so the member order must not change.
struct DecimalMetadata {
// NOTE(review): presumably true when scale/precision carry real values --
// confirm against the callers.
bool isset;
int32_t scale;
int32_t precision;
};
} // namespace schema
/// \brief Implementation of parquet.thrift LogicalType types.
///
/// Instances are handed out by the static factory functions below as
/// std::shared_ptr<const LogicalType>. The class is non-copyable and its
/// default constructor is protected.
class PARQUET_EXPORT LogicalType {
 public:
  /// \brief Enumerated kind of a logical type, as returned by type().
  struct Type {
    enum type {
      UNDEFINED = 0,  // Not a real logical type
      STRING = 1,
      MAP = 2,
      LIST = 3,
      ENUM = 4,
      DECIMAL = 5,
      DATE = 6,
      TIME = 7,
      TIMESTAMP = 8,
      INTERVAL = 9,
      INT = 10,
      NIL = 11,  // Thrift NullType: annotates data that is always null
      JSON = 12,
      BSON = 13,
      UUID = 14,
      FLOAT16 = 15,
      NONE = 16  // Not a real logical type; should always be last element
    };
  };

  /// \brief Time resolution accepted by the Time() and Timestamp() factories.
  struct TimeUnit {
    enum unit {
      UNKNOWN = 0,
      MILLIS = 1,
      MICROS = 2,
      NANOS = 3
    };
  };

  // ---------------------------------------------------------------------
  // Conversions into a LogicalType

  /// \brief If possible, return a logical type equivalent to the given legacy
  /// converted type (and decimal metadata if applicable).
  static std::shared_ptr<const LogicalType> FromConvertedType(
      const parquet::ConvertedType::type converted_type,
      const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
                                                                           -1});

  /// \brief Return the logical type represented by the Thrift intermediary object.
  static std::shared_ptr<const LogicalType> FromThrift(
      const parquet::format::LogicalType& thrift_logical_type);

  // ---------------------------------------------------------------------
  // Factories: each returns the explicitly requested logical type.

  static std::shared_ptr<const LogicalType> String();
  static std::shared_ptr<const LogicalType> Map();
  static std::shared_ptr<const LogicalType> List();
  static std::shared_ptr<const LogicalType> Enum();

  /// \brief Create a Decimal logical type; scale defaults to 0 when omitted.
  static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);

  static std::shared_ptr<const LogicalType> Date();

  /// \brief Create a Time logical type with the given UTC flag and resolution.
  static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
                                                 LogicalType::TimeUnit::unit time_unit);

  /// \brief Create a Timestamp logical type
  /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
  /// \param[in] time_unit the resolution of the timestamp
  /// \param[in] is_from_converted_type if true, the timestamp was generated
  /// by translating a legacy converted type of TIMESTAMP_MILLIS or
  /// TIMESTAMP_MICROS. Default is false.
  /// \param[in] force_set_converted_type if true, always set the
  /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
  /// metadata. Default is false
  static std::shared_ptr<const LogicalType> Timestamp(
      bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
      bool is_from_converted_type = false, bool force_set_converted_type = false);

  static std::shared_ptr<const LogicalType> Interval();

  /// \brief Create an Int logical type with the given bit width and signedness.
  static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);

  /// \brief Create a logical type for data that's always null
  ///
  /// Any physical type can be annotated with this logical type.
  static std::shared_ptr<const LogicalType> Null();

  static std::shared_ptr<const LogicalType> JSON();
  static std::shared_ptr<const LogicalType> BSON();
  static std::shared_ptr<const LogicalType> UUID();
  static std::shared_ptr<const LogicalType> Float16();

  /// \brief Create a placeholder for when no logical type is specified
  static std::shared_ptr<const LogicalType> None();

  // ---------------------------------------------------------------------
  // Compatibility checks and conversions out of a LogicalType

  /// \brief Return true if this logical type is consistent with the given underlying
  /// physical type.
  bool is_applicable(parquet::Type::type primitive_type,
                     int32_t primitive_length = -1) const;

  /// \brief Return true if this logical type is equivalent to the given legacy converted
  /// type (and decimal metadata if applicable).
  bool is_compatible(parquet::ConvertedType::type converted_type,
                     parquet::schema::DecimalMetadata converted_decimal_metadata = {
                         false, -1, -1}) const;

  /// \brief If possible, return the legacy converted type (and decimal metadata if
  /// applicable) equivalent to this logical type.
  parquet::ConvertedType::type ToConvertedType(
      parquet::schema::DecimalMetadata* out_decimal_metadata) const;

  /// \brief Return a printable representation of this logical type.
  std::string ToString() const;

  /// \brief Return a JSON representation of this logical type.
  std::string ToJSON() const;

  /// \brief Return a serializable Thrift object for this logical type.
  parquet::format::LogicalType ToThrift() const;

  /// \brief Return true if the given logical type is equivalent to this logical type.
  bool Equals(const LogicalType& other) const;

  /// \brief Return the enumerated type of this logical type.
  LogicalType::Type::type type() const;

  /// \brief Return the appropriate sort order for this logical type.
  SortOrder::type sort_order() const;

  // ---------------------------------------------------------------------
  // Type checks ...

  bool is_string() const;
  bool is_map() const;
  bool is_list() const;
  bool is_enum() const;
  bool is_decimal() const;
  bool is_date() const;
  bool is_time() const;
  bool is_timestamp() const;
  bool is_interval() const;
  bool is_int() const;
  bool is_null() const;
  bool is_JSON() const;
  bool is_BSON() const;
  bool is_UUID() const;
  bool is_float16() const;
  bool is_none() const;

  /// \brief Return true if this logical type is of a known type.
  bool is_valid() const;
  // NOTE(review): presumably the negation of is_valid() -- confirm against the
  // implementation.
  bool is_invalid() const;

  /// \brief Return true if this logical type is suitable for a schema GroupNode.
  bool is_nested() const;
  // NOTE(review): presumably the negation of is_nested() -- confirm against the
  // implementation.
  bool is_nonnested() const;

  /// \brief Return true if this logical type is included in the Thrift output for its
  /// node.
  bool is_serialized() const;

  // Non-copyable: both copy construction and copy assignment are deleted.
  LogicalType(const LogicalType&) = delete;
  LogicalType& operator=(const LogicalType&) = delete;
  virtual ~LogicalType() noexcept;

 protected:
  // Only reachable from this class and its subclasses.
  LogicalType();

  // Implementation object; Impl is only forward-declared at this point.
  class Impl;
  std::unique_ptr<const Impl> impl_;
};
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
///
/// Subclass of LogicalType that declares a single public member, the Make()
/// factory.
class PARQUET_EXPORT StringLogicalType : public LogicalType {
public:
/// \brief Return an instance, typed as a shared pointer to const LogicalType.
static std::shared_ptr<const LogicalType> Make();
private:
// Private, so code outside the class cannot default-construct it.
// NOTE(review): presumably Make() is the intended construction path -- confirm.
StringLogicalType() = default;
};
/// \brief Allowed for group nodes only.
///
/// Subclass of LogicalType that declares a single public member, the Make()
/// factory.
class PARQUET_EXPORT MapLogicalType : public LogicalType {
public:
/// \brief Return an instance, typed as a shared pointer to const LogicalType.
static std::shared_ptr<const LogicalType> Make();
private:
// Private, so code outside the class cannot default-construct it.
// NOTE(review): presumably Make() is the intended construction path -- confirm.
MapLogicalType() = default;
};
/// \brief Allowed for group nodes only.
///
/// Subclass of LogicalType that declares a single public member, the Make()
/// factory.
class PARQUET_EXPORT ListLogicalType : public LogicalType {
public:
/// \brief Return an instance, typed as a shared pointer to const LogicalType.
static std::shared_ptr<const LogicalType> Make();
private:
// Private, so code outside the class cannot default-construct it.
// NOTE(review): presumably Make() is the intended construction path -- confirm.
ListLogicalType() = default;
};
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
///
/// Subclass of LogicalType that declares a single public member, the Make()
/// factory.
class PARQUET_EXPORT EnumLogicalType : public LogicalType {
public:
/// \brief Return an instance, typed as a shared pointer to const LogicalType.
static std::shared_ptr<const LogicalType> Make();
private:
// Private, so code outside the class cannot default-construct it.
// NOTE(review): presumably Make() is the intended construction path -- confirm.
EnumLogicalType() = default;
};
/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
/// depending on the precision.
///
/// Subclass of LogicalType that adds a parameterized Make() factory and two
/// accessors.
class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
public:
/// \brief Return an instance, typed as a shared pointer to const LogicalType.
/// \param[in] precision decimal precision
/// \param[in] scale decimal scale; defaults to 0 when omitted
static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
// NOTE(review): presumably these return the values supplied to Make() --
// confirm against the implementation.
int32_t precision() const;
int32_t scale() const;
private:
// Private, so code outside the class cannot default-construct it.
DecimalLogicalType() = default;
};
/// \brief Allowed for physical type INT32.
class PARQUET_EXPORT DateLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
DateLogicalType() = default;
Loading ...