// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <optional>
#include <vector>
#include "parquet/platform.h"
#include "parquet/type_fwd.h"
namespace parquet {
/// A structure for capturing metadata for estimating the unencoded,
/// uncompressed size of data written. This is useful for readers to estimate
/// how much memory is needed to reconstruct data in their memory model and for
/// fine-grained filter push down on nested structures (the histograms contained
/// in this structure can help determine the number of nulls at a particular
/// nesting level and maximum length of lists).
struct PARQUET_EXPORT SizeStatistics {
/// When present, there is expected to be one element corresponding to each
/// definition (i.e. size=max definition+1) where each element
/// represents the number of times the definition level was observed in the
/// data.
///
/// This field may be omitted (a.k.a. zero-length vector) if max_definition_level
/// is 0 without loss of information.
std::vector<int64_t> definition_level_histogram;
/// Same as definition_level_histogram except for repetition levels.
///
/// This field may be omitted (a.k.a. zero-length vector) if max_repetition_level
/// is 0 without loss of information.
std::vector<int64_t> repetition_level_histogram;
/// The number of physical bytes stored for BYTE_ARRAY data values assuming
/// no encoding. This is exclusive of the bytes needed to store the length of
/// each byte array. In other words, this field is equivalent to the `(size
/// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
/// written)`. To determine unencoded sizes of other types readers can use
/// schema information multiplied by the number of non-null and null values.
/// The number of null/non-null values can be inferred from the histograms
/// below.
///
/// For example, if a column chunk is dictionary-encoded with dictionary
/// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
/// then this value for that data page should be 7 (1 + 1 + 2 + 3).
///
/// This field should only be set for types that use BYTE_ARRAY as their
/// physical type.
std::optional<int64_t> unencoded_byte_array_data_bytes;
/// \brief Check if the SizeStatistics is set.
bool is_set() const {
return !repetition_level_histogram.empty() || !definition_level_histogram.empty() ||
unencoded_byte_array_data_bytes.has_value();
}
/// \brief Increment the unencoded byte array data bytes.
void IncrementUnencodedByteArrayDataBytes(int64_t value);
/// \brief Merge two SizeStatistics.
/// \throws ParquetException if SizeStatistics to merge is not compatible.
void Merge(const SizeStatistics& other);
/// \brief Validate the SizeStatistics
/// \throws ParquetException if the histograms don't have the right length,
/// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column.
void Validate(const ColumnDescriptor* descr) const;
/// \brief Reset the SizeStatistics to be empty.
void Reset();
/// \brief Make an empty SizeStatistics object for specific type.
static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr);
};
} // namespace parquet