Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev246 

/ include / parquet / size_statistics.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <vector>

#include "parquet/platform.h"
#include "parquet/type_fwd.h"

namespace parquet {

/// Metadata used to estimate the unencoded, uncompressed size of written data.
///
/// Readers can use it to estimate how much memory is required to rebuild the
/// data in their own memory model. It also supports fine-grained filter push
/// down on nested structures: the histograms held here can help determine the
/// number of nulls at a given nesting level and the maximum length of lists.
struct PARQUET_EXPORT SizeStatistics {
  /// Histogram of observed definition levels.
  ///
  /// When present, it is expected to hold one element per definition level
  /// (i.e. size = max definition level + 1); each element is the number of
  /// times that definition level was observed in the data.
  ///
  /// May be omitted (i.e. left as a zero-length vector) without loss of
  /// information when max_definition_level is 0.
  std::vector<int64_t> definition_level_histogram;

  /// Histogram of observed repetition levels; same layout as
  /// definition_level_histogram, but for repetition levels.
  ///
  /// May be omitted (i.e. left as a zero-length vector) without loss of
  /// information when max_repetition_level is 0.
  std::vector<int64_t> repetition_level_histogram;

  /// Number of physical bytes stored for BYTE_ARRAY data values, assuming no
  /// encoding, and excluding the bytes needed to store the length of each
  /// byte array. Put differently, it equals `(size of PLAIN-ENCODING the byte
  /// array values) - (4 bytes * number of values written)`.
  ///
  /// For other types, readers can derive unencoded sizes from schema
  /// information multiplied by the number of non-null and null values; those
  /// counts can be inferred from the level histograms in this structure.
  ///
  /// Example: for a dictionary-encoded column chunk with dictionary
  /// ["a", "bc", "cde"] and a data page holding the indices [0, 0, 1, 2],
  /// the value for that data page should be 7 (1 + 1 + 2 + 3).
  ///
  /// Should only be set for types whose physical type is BYTE_ARRAY.
  std::optional<int64_t> unencoded_byte_array_data_bytes;

  /// \brief Tell whether any statistic is populated.
  /// \return true if either histogram is non-empty or
  /// unencoded_byte_array_data_bytes holds a value; false otherwise.
  bool is_set() const {
    if (unencoded_byte_array_data_bytes.has_value()) {
      return true;
    }
    // Set as soon as at least one of the two histograms has entries.
    return !(definition_level_histogram.empty() && repetition_level_histogram.empty());
  }

  /// \brief Add `value` to the unencoded byte array data bytes.
  void IncrementUnencodedByteArrayDataBytes(int64_t value);

  /// \brief Merge another SizeStatistics into this one.
  /// \throws ParquetException if `other` is not compatible with this object.
  void Merge(const SizeStatistics& other);

  /// \brief Check the SizeStatistics against a column descriptor.
  /// \throws ParquetException if the histograms don't have the right length,
  /// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column.
  void Validate(const ColumnDescriptor* descr) const;

  /// \brief Reset the SizeStatistics to be empty.
  void Reset();

  /// \brief Create an empty SizeStatistics object for the given column type.
  static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr);
};

}  // namespace parquet