// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "parquet/encryption/type_fwd.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
// Forward declarations. In the visible part of this header the types that are
// used appear only behind pointers, references or smart pointers, so their
// full definitions are not needed here.
class ColumnDescriptor;
class EncodedStatistics;
class FileCryptoMetaData;
class Statistics;
class SchemaDescriptor;
namespace schema {
class ColumnPath;
} // namespace schema
// Expose Arrow's KeyValueMetadata under the parquet namespace.
using KeyValueMetadata = ::arrow::KeyValueMetadata;
/// \brief Describes the application (name, build and version) that wrote a
/// file.
///
/// The numeric version components are zero-initialized by default so that a
/// default-constructed ApplicationVersion never exposes indeterminate values
/// to VersionLt() / VersionEq().
class PARQUET_EXPORT ApplicationVersion {
 public:
  // Known Versions with Issues
  // NOTE(review): the names presumably refer to the issue trackers entries
  // (e.g. PARQUET-251) whose fix first shipped in the returned version --
  // confirm against the implementation.
  static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
  static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
  static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
  static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
  static const ApplicationVersion& PARQUET_CPP_10353_FIXED_VERSION();

  // Application that wrote the file. e.g. "IMPALA"
  std::string application_;
  // Build name
  std::string build_;

  // Version of the application that wrote the file, expressed as
  // (<major>.<minor>.<patch>). Unmatched parts default to 0.
  // "1.2.3" => {1, 2, 3}
  // "1.2" => {1, 2, 0}
  // "1.2-cdh5" => {1, 2, 0}
  struct {
    // Default member initializers keep the "unmatched parts default to 0"
    // contract even for `ApplicationVersion v;` (default-initialization),
    // which would otherwise leave these three ints indeterminate.
    int major = 0;
    int minor = 0;
    int patch = 0;
    // NOTE(review): the three strings below presumably hold the non-numeric
    // remainder of the version string (unparsed text, pre-release tag, build
    // info) -- confirm against the parser in the implementation file.
    std::string unknown;
    std::string pre_release;
    std::string build_info;
  } version;

  /// \brief Construct an empty version: empty strings and version 0.0.0.
  ApplicationVersion() = default;
  /// \brief Construct from a writer identification string.
  ///
  /// NOTE(review): presumably this parses a `created_by` string into
  /// application_, build_ and version -- the parsing is not visible here.
  explicit ApplicationVersion(const std::string& created_by);
  /// \brief Construct from an application name and explicit version numbers.
  ApplicationVersion(std::string application, int major, int minor, int patch);

  // Returns true if version is strictly less than other_version
  bool VersionLt(const ApplicationVersion& other_version) const;

  // Returns true if version is strictly equal with other_version
  bool VersionEq(const ApplicationVersion& other_version) const;

  // Checks if the Version has the correct statistics for a given column
  //
  // `statistics` is taken by non-const reference; whether it is modified is
  // not visible from this declaration. `sort_order` defaults to
  // SortOrder::SIGNED.
  bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
                            SortOrder::type sort_order = SortOrder::SIGNED) const;
};
/// \brief Read-only accessor for the crypto metadata of a column.
///
/// Instances can only be created through Make(); the constructor is private.
/// All state lives in the nested ColumnCryptoMetaDataImpl, which is only
/// forward-declared here.
class PARQUET_EXPORT ColumnCryptoMetaData {
public:
/// \brief Create an accessor from a raw metadata pointer.
///
/// NOTE(review): `metadata` is presumably a pointer to the underlying Thrift
/// crypto metadata struct and must outlive the returned object -- confirm
/// against the implementation.
static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
// Declared here and defined out of line, since impl_ is a unique_ptr to a
// type that is incomplete in this header.
~ColumnCryptoMetaData();
/// \brief Compare with another ColumnCryptoMetaData (semantics are defined
/// in the implementation).
bool Equals(const ColumnCryptoMetaData& other) const;
/// \brief Path of the column in the schema.
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
/// \brief Whether the column is encrypted with the footer key.
bool encrypted_with_footer_key() const;
/// \brief Key metadata of the column.
///
/// NOTE(review): presumably only meaningful when
/// encrypted_with_footer_key() is false -- confirm.
const std::string& key_metadata() const;
private:
explicit ColumnCryptoMetaData(const uint8_t* metadata);
// Implementation is hidden behind an opaque pointer.
class ColumnCryptoMetaDataImpl;
std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
};
/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
///
/// Plain aggregate with no default member initializers: a default-initialized
/// instance has indeterminate members, so all three must be set explicitly.
struct PageEncodingStats {
/// Type of the pages being described
PageType::type page_type;
/// Encoding of the pages being described
Encoding::type encoding;
/// NOTE(review): presumably the number of pages with this page_type and
/// encoding in the column chunk -- confirm against the Thrift definition.
int32_t count;
};
/// \brief Public struct for location to page index in ColumnChunkMetaData.
///
/// Both members are zero-initialized by default so that a default-constructed
/// IndexLocation never holds indeterminate values. The struct remains an
/// aggregate, so `IndexLocation{offset, length}` keeps working.
struct IndexLocation {
  /// File offset of the given index, in bytes
  int64_t offset = 0;
  /// Length of the given index, in bytes
  int32_t length = 0;
};
/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
///
/// Instances can only be created through Make(); the constructor is private.
/// All accessors are const and forward to the nested ColumnChunkMetaDataImpl,
/// which is only forward-declared in this header.
class PARQUET_EXPORT ColumnChunkMetaData {
public:
// API convenience to get a MetaData accessor
//
// `metadata` is an untyped pointer. NOTE(review): presumably it points to
// the Thrift column chunk struct and must outlive the returned object --
// confirm against the implementation.
// `writer_version` and `file_decryptor` may be null (they default to
// NULLPTR). Both ordinals default to -1. NOTE(review): the ordinals are
// presumably only needed when decrypting column metadata -- confirm.
static std::unique_ptr<ColumnChunkMetaData> Make(
const void* metadata, const ColumnDescriptor* descr,
const ReaderProperties& properties = default_reader_properties(),
const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
int16_t column_ordinal = -1,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
// Declared here and defined out of line, since impl_ is a unique_ptr to a
// type that is incomplete in this header.
~ColumnChunkMetaData();
// Compare with another ColumnChunkMetaData (semantics are defined in the
// implementation).
bool Equals(const ColumnChunkMetaData& other) const;
// Byte offset of `ColumnMetaData` in `file_path()`.
//
// Note that the meaning of this field has been inconsistent among implementations
// so its use has since been deprecated in the Parquet specification. Modern
// implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
// contained in the `ColumnChunk` struct.
int64_t file_offset() const;
// parameter is only used when a dataset is spread across multiple files
const std::string& file_path() const;
// column metadata
//
// NOTE(review): the accessors below presumably require is_metadata_set() to
// be true -- confirm against the implementation.
bool is_metadata_set() const;
Type::type type() const;
int64_t num_values() const;
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
// Whether statistics are available for this column chunk; check before
// calling statistics().
bool is_stats_set() const;
std::shared_ptr<Statistics> statistics() const;
Compression::type compression() const;
// Indicate if the ColumnChunk compression is supported by the current
// compiled parquet library.
bool can_decompress() const;
const std::vector<Encoding::type>& encodings() const;
const std::vector<PageEncodingStats>& encoding_stats() const;
// Bloom filter location, returned as std::optional.
// NOTE(review): presumably empty when the field is absent from the file
// metadata -- confirm.
std::optional<int64_t> bloom_filter_offset() const;
std::optional<int64_t> bloom_filter_length() const;
// Page offsets. Each `*_page_offset()` has a matching `has_*_page()`
// predicate, except data_page_offset().
bool has_dictionary_page() const;
int64_t dictionary_page_offset() const;
int64_t data_page_offset() const;
bool has_index_page() const;
int64_t index_page_offset() const;
int64_t total_compressed_size() const;
int64_t total_uncompressed_size() const;
// Crypto metadata of this column chunk.
// NOTE(review): the returned pointer is presumably null for columns without
// crypto metadata -- confirm before dereferencing.
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
// Locations of the column index and offset index, returned as std::optional
// (see IndexLocation for the offset/length fields).
std::optional<IndexLocation> GetColumnIndexLocation() const;
std::optional<IndexLocation> GetOffsetIndexLocation() const;
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
private:
// Note that the parameter order differs from Make(): the two ordinals come
// before `properties` here and have no default values.
explicit ColumnChunkMetaData(
const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
int16_t column_ordinal, const ReaderProperties& properties,
const ApplicationVersion* writer_version = NULLPTR,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
// PIMPL Idiom
class ColumnChunkMetaDataImpl;
std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
};
/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
///
/// Instances can only be created through Make(); the constructor is private.
/// All accessors are const and forward to the nested RowGroupMetaDataImpl,
/// which is only forward-declared in this header.
class PARQUET_EXPORT RowGroupMetaData {
public:
/// \brief Create a RowGroupMetaData from a serialized thrift message.
///
/// `writer_version` and `file_decryptor` may be null (they default to
/// NULLPTR).
static std::unique_ptr<RowGroupMetaData> Make(
const void* metadata, const SchemaDescriptor* schema,
const ReaderProperties& properties = default_reader_properties(),
const ApplicationVersion* writer_version = NULLPTR,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
// Declared here and defined out of line, since impl_ is a unique_ptr to a
// type that is incomplete in this header.
~RowGroupMetaData();
/// \brief Compare with another RowGroupMetaData (semantics are defined in
/// the implementation).
bool Equals(const RowGroupMetaData& other) const;
/// \brief The number of columns in this row group. The order must match the
/// parent's column ordering.
int num_columns() const;
/// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
///
/// WARNING, the returned object references memory location in its parent
/// (RowGroupMetaData) object. Hence, the parent must outlive the returned
/// object.
///
/// \param[in] index Ordinal of the ColumnChunkMetaData to retrieve.
///
/// \throws ParquetException if the index is out of bounds.
std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
/// \brief Number of rows in this row group.
int64_t num_rows() const;
/// \brief Total byte size of all the uncompressed column data in this row group.
int64_t total_byte_size() const;
/// \brief Total byte size of all the compressed (and potentially encrypted)
/// column data in this row group.
///
/// This information is optional and may be 0 if omitted.
int64_t total_compressed_size() const;
/// \brief Byte offset from beginning of file to first page (data or
/// dictionary) in this row group
///
/// The file_offset field that this method exposes is optional. This method
/// will return 0 if that field is not set to a meaningful value.
int64_t file_offset() const;
// Return const-pointer to make it clear that this object is not to be copied
const SchemaDescriptor* schema() const;
// Indicate if all of the RowGroup's ColumnChunks can be decompressed.
bool can_decompress() const;
// Sorting columns of the row group if any.
// Returned by value. NOTE(review): presumably an empty vector when the row
// group declares no sorting columns -- confirm.
std::vector<SortingColumn> sorting_columns() const;
private:
// Unlike Make(), `properties` has no default value here.
explicit RowGroupMetaData(
const void* metadata, const SchemaDescriptor* schema,
const ReaderProperties& properties,
const ApplicationVersion* writer_version = NULLPTR,
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
// PIMPL Idiom
class RowGroupMetaDataImpl;
std::unique_ptr<RowGroupMetaDataImpl> impl_;
};
// Forward declaration; the definition is not in the visible part of this
// header.
class FileMetaDataBuilder;
/// \brief FileMetaData is a proxy around format::FileMetaData.
class PARQUET_EXPORT FileMetaData {
public:
/// \brief Create a FileMetaData from a serialized thrift message.
static std::shared_ptr<FileMetaData> Make(
const void* serialized_metadata, uint32_t* inout_metadata_len,
const ReaderProperties& properties = default_reader_properties(),
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
~FileMetaData();
bool Equals(const FileMetaData& other) const;
/// \brief The number of parquet "leaf" columns.
///
/// Parquet thrift definition requires that nested schema elements are
/// flattened. This method returns the number of columns in the flattened
/// version.
/// For instance, if the schema looks like this :
/// 0 foo.bar
/// foo.bar.baz 0
/// foo.bar.baz2 1
/// foo.qux 2
/// 1 foo2 3
/// 2 foo3 4
/// This method will return 5, because there are 5 "leaf" fields (so 5
/// flattened fields)
int num_columns() const;
/// \brief The number of flattened schema elements.
///
/// Parquet thrift definition requires that nested schema elements are
/// flattened. This method returns the total number of elements in the
/// flattened list.
int num_schema_elements() const;
/// \brief The total number of rows.
///
/// If the FileMetaData was obtained by calling `SubSet()`, this is the total
/// number of rows in the selected row groups.
int64_t num_rows() const;
/// \brief The number of row groups in the file.
///
/// If the FileMetaData was obtained by calling `SubSet()`, this is the number
/// of selected row groups.
int num_row_groups() const;
/// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
///
/// WARNING, the returned object references memory location in its parent
/// (FileMetaData) object. Hence, the parent must outlive the returned object.
///
/// \param[in] index of the RowGroup to retrieve.
///
/// \throws ParquetException if the index is out of bound.
std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
/// \brief Return the "version" of the file
///
/// WARNING: The value returned by this method is unreliable as 1) the Parquet
/// file metadata stores the version as a single integer and 2) some producers
/// are known to always write a hardcoded value. Therefore, you cannot use
/// this value to know which features are used in the file.
ParquetVersion::type version() const;
/// \brief Return the application's user-agent string of the writer.
const std::string& created_by() const;
/// \brief Return the application's version of the writer.
const ApplicationVersion& writer_version() const;
/// \brief Size of the original thrift encoded metadata footer.
uint32_t size() const;
/// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed.
///
/// This will return false if any of the RowGroup's page is compressed with a
/// compression format which is not compiled in the current parquet library.
bool can_decompress() const;
bool is_encryption_algorithm_set() const;
EncryptionAlgorithm encryption_algorithm() const;
const std::string& footer_signing_key_metadata() const;
/// \brief Verify signature of FileMetaData when file is encrypted but footer
/// is not encrypted (plaintext footer).
Loading ...