Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
pyarrow / include / parquet / geospatial / statistics.h
Size: Mime:
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <array>
#include <cstdint>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "parquet/platform.h"
#include "parquet/types.h"

namespace parquet::geospatial {

/// \brief Upper limit on the number of dimensions a geospatial type can carry:
/// one slot each for X, Y, Z, and M
inline constexpr int kMaxDimensions{4};

/// \brief Quiet NaN, the placeholder for bounds on which predicate pushdown cannot
/// be applied (e.g., because a writer did not provide bounds for a given dimension)
inline constexpr double kNaN{std::numeric_limits<double>::quiet_NaN()};

/// \brief Structure representing encoded statistics, in the form that is written to
/// and read from Parquet serialized metadata.
///
/// Every bound defaults to kNaN and every "present" flag defaults to false.
///
/// See the Parquet Thrift definition and GeoStatistics for the specific definition
/// of field values.
struct PARQUET_EXPORT EncodedGeoStatistics {
  // X and Y bounds share a single presence flag.
  bool xy_bounds_present = false;
  double xmin = kNaN;
  double xmax = kNaN;
  double ymin = kNaN;
  double ymax = kNaN;

  // Z bounds and their presence flag.
  bool z_bounds_present = false;
  double zmin = kNaN;
  double zmax = kNaN;

  // M bounds and their presence flag.
  bool m_bounds_present = false;
  double mmin = kNaN;
  double mmax = kNaN;

  // Geospatial type codes; there is no separate presence flag, see
  // geospatial_types_present().
  std::vector<int32_t> geospatial_types;

  /// \brief Return true if at least one geospatial type code is stored
  bool geospatial_types_present() const { return !geospatial_types.empty(); }
};

// Forward declaration only: the implementation type is not defined in this header and
// is held by GeoStatistics through a std::unique_ptr.
class GeoStatisticsImpl;

/// \brief Base type for computing geospatial column statistics while writing a file
/// or representing them when reading a file
///
/// These statistics track the minimum and maximum value (omitting NaN values) of the
/// four possible dimensions (X, Y, Z, and M) and the distinct set of geometry
/// type/dimension combinations (e.g., point XY, linestring XYZM) present in the data.
/// Any of these individual components may be "invalid": for example, when reading a
/// Parquet file, information about individual components obtained from the column
/// chunk metadata may have been missing or deemed unusable. Orthogonally,
/// any of these individual components may be "empty": for example, when using
/// GeoStatistics to accumulate bounds whilst writing, if all geometries in a column chunk
/// are null, all ranges (X, Y, Z, and M) will be empty. If all geometries in a column
/// chunk contain only XY coordinates (the most common case), the Z and M ranges will
/// be empty but the X and Y ranges will contain finite bounds. Empty ranges are
/// considered "valid" because they are known to represent exactly zero values (in
/// contrast to an invalid range, whose contents are completely unknown). These concepts
/// are all necessary for this object to accurately represent (1) accumulated or partially
/// accumulated statistics during the writing process and (2) deserialized statistics read
/// from the column chunk metadata during the reading process.
///
/// EXPERIMENTAL
class PARQUET_EXPORT GeoStatistics {
 public:
  /// \brief Construct statistics with no encoded input
  ///
  /// NOTE(review): presumably equivalent to the empty state produced by Reset();
  /// confirm against the implementation.
  GeoStatistics();

  /// \brief Construct statistics from previously-encoded statistics
  ///
  /// NOTE(review): presumably equivalent to default construction followed by
  /// Decode(encoded); confirm against the implementation.
  explicit GeoStatistics(const EncodedGeoStatistics& encoded);

  // Declared without an inline body: GeoStatisticsImpl is an incomplete type in this
  // header, so the unique_ptr member cannot be destroyed from here.
  ~GeoStatistics();

  /// \brief Return true if bounds, geometry types, and validity are identical
  bool Equals(const GeoStatistics& other) const;

  /// \brief Update these statistics based on previously calculated or decoded statistics
  ///
  /// Merging statistics with wraparound X values is not currently supported. Merging
  /// two GeoStatistics where one or both has a wraparound X range will result in these
  /// statistics having an X dimension marked as invalid.
  void Merge(const GeoStatistics& other);

  /// \brief Update these statistics based on values
  ///
  /// NOTE(review): the elements of values are presumably WKB-encoded geometries (see
  /// is_valid() and Encode(), which refer to invalid WKB); confirm against the
  /// implementation.
  ///
  /// \param values Pointer to the first of num_values elements
  /// \param num_values Number of elements to read from values
  void Update(const ByteArray* values, int64_t num_values);

  /// \brief Update these statistics based on the non-null elements of values
  ///
  /// NOTE(review): valid_bits is presumably a validity bitmap read starting at bit
  /// valid_bits_offset and covering num_spaced_values slots; confirm the exact
  /// meaning of num_spaced_values versus num_values against the implementation.
  void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
                    int64_t valid_bits_offset, int64_t num_spaced_values,
                    int64_t num_values);

  /// \brief Update these statistics based on the non-null elements of values
  ///
  /// Currently, BinaryArray and LargeBinaryArray input is supported.
  void Update(const ::arrow::Array& values);

  /// \brief Return these statistics to an empty state
  void Reset();

  /// \brief Encode the statistics for serializing to Thrift
  ///
  /// If invalid WKB was encountered or if the statistics contain NaN
  /// for any reason, Encode() will return nullopt to indicate that
  /// statistics should not be written to thrift.
  std::optional<EncodedGeoStatistics> Encode() const;

  /// \brief Returns false if invalid WKB was encountered
  bool is_valid() const;

  /// \brief Reset existing statistics and populate them from previously-encoded ones
  void Decode(const EncodedGeoStatistics& encoded);

  /// \brief Minimum values in XYZM order
  ///
  /// For dimensions where dimension_valid() is false, the value will be NaN. For
  /// dimensions where dimension_empty() is true, the value will be +Inf.
  ///
  /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
  /// case, these bounds represent the union of the intervals [xmin, Inf] and [-Inf,
  /// xmax]. This implementation does not yet generate these types of bounds but they may
  /// be encountered in statistics when reading a Parquet file.
  std::array<double, kMaxDimensions> lower_bound() const;

  /// \brief Maximum values in XYZM order
  ///
  /// For dimensions where dimension_valid() is false, the value will be NaN. For
  /// dimensions where dimension_empty() is true, the value will be -Inf.
  ///
  /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
  /// case, these bounds represent the union of the intervals [xmin, Inf] and [-Inf,
  /// xmax]. This implementation does not yet generate these types of bounds but they may
  /// be encountered in statistics when reading a Parquet file.
  std::array<double, kMaxDimensions> upper_bound() const;

  /// \brief Dimension emptiness in XYZM order
  ///
  /// True for a given dimension if and only if zero non-NaN values were encountered
  /// in that dimension and dimension_valid() is true for that dimension.
  ///
  /// When calculating statistics, zero or more of these values may be true because
  /// this implementation calculates bounds for all dimensions; however, it may be
  /// true that zero coordinates were encountered in a given dimension. For example,
  /// dimension_empty() will return four true values if Update() was not called
  /// or if Update() was called with only null values. If Update() was provided
  /// one or more geometries with X and Y dimensions but not Z or M dimensions,
  /// dimension_empty() will return false, false, true, true (X and Y were
  /// encountered; Z and M were not).
  ///
  /// For statistics read from a Parquet file, dimension_empty() will always contain
  /// false values because there is no mechanism to communicate an empty interval
  /// in the Thrift metadata.
  std::array<bool, kMaxDimensions> dimension_empty() const;

  /// \brief Dimension validity (i.e. presence) in XYZM order
  ///
  /// When calculating statistics, this will always be true because this implementation
  /// calculates statistics for all dimensions. When reading a Parquet file, one or more
  /// of these values may be false because the file may not have provided bounds for all
  /// dimensions.
  ///
  /// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the
  /// canonical values of those outputs for the dimensions where dimension_valid() is
  /// false.
  std::array<bool, kMaxDimensions> dimension_valid() const;

  /// \brief Return the geometry type codes
  ///
  /// This implementation always returns sorted output with no duplicates. When
  /// calculating statistics, a value will always be returned (although the returned
  /// vector may be empty if Update() was never called or was only called with null
  /// values). When reading a Parquet file, std::nullopt may be returned because
  /// the file may not have provided this information.
  std::optional<std::vector<int32_t>> geometry_types() const;

  /// \brief Return a string representation of these statistics
  std::string ToString() const;

 private:
  // Owning pointer to the implementation object; GeoStatisticsImpl is only
  // forward-declared in this header.
  std::unique_ptr<GeoStatisticsImpl> impl_;
};

}  // namespace parquet::geospatial