Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev251 

/ include / parquet / arrow / schema.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cassert>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"

#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

namespace parquet {

class ArrowReaderProperties;
class ArrowWriterProperties;
class WriterProperties;

namespace arrow {

/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
/// schema into a Parquet schema.
///
/// @{

PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
                            const WriterProperties& properties,
                            const ArrowWriterProperties& arrow_properties,
                            schema::NodePtr* out);

PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                const ArrowWriterProperties& arrow_properties,
                                std::shared_ptr<SchemaDescriptor>* out);

PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                std::shared_ptr<SchemaDescriptor>* out);

/// @}

/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
/// schema into an Arrow schema.
///
/// @{

PARQUET_EXPORT
::arrow::Status FromParquetSchema(
    const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
    const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
    std::shared_ptr<::arrow::Schema>* out);

PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  const ArrowReaderProperties& properties,
                                  std::shared_ptr<::arrow::Schema>* out);

PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  std::shared_ptr<::arrow::Schema>* out);

/// @}

/// \brief Bridge between an arrow::Field and parquet column indices.
struct PARQUET_EXPORT SchemaField {
  std::shared_ptr<::arrow::Field> field;
  std::vector<SchemaField> children;

  // Only set for leaf nodes
  int column_index = -1;

  parquet::internal::LevelInfo level_info;

  bool is_leaf() const { return column_index != -1; }
};

/// \brief Bridge between a parquet Schema and an arrow Schema.
///
/// Expose parquet columns as a tree structure. Useful traverse and link
/// between arrow's Schema and parquet's Schema.
struct PARQUET_EXPORT SchemaManifest {
  static ::arrow::Status Make(
      const SchemaDescriptor* schema,
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
      const ArrowReaderProperties& properties, SchemaManifest* manifest);

  const SchemaDescriptor* descr;
  std::shared_ptr<::arrow::Schema> origin_schema;
  std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
  std::vector<SchemaField> schema_fields;

  std::unordered_map<int, const SchemaField*> column_index_to_field;
  std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;

  ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
    auto it = column_index_to_field.find(column_index);
    if (it == column_index_to_field.end()) {
      return ::arrow::Status::KeyError("Column index ", column_index,
                                       " not found in schema manifest, may be malformed");
    }
    *out = it->second;
    return ::arrow::Status::OK();
  }

  const SchemaField* GetParent(const SchemaField* field) const {
    // Returns nullptr also if not found
    auto it = child_to_parent.find(field);
    if (it == child_to_parent.end()) {
      return NULLPTR;
    }
    return it->second;
  }

  /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
  /// correspond to the column root (first node below the parquet schema's root group) of
  /// each leaf referenced in column_indices.
  ///
  /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
  /// the roots are `a` and `i` (return=[0,2]).
  ///
  /// root
  /// -- a  <------
  /// -- -- b  |  |
  /// -- -- -- c  |
  /// -- -- -- d  |
  /// -- -- -- -- e
  /// -- f
  /// -- -- g
  /// -- -- -- h
  /// -- i  <---
  /// -- -- j  |
  /// -- -- -- k
  ::arrow::Result<std::vector<int>> GetFieldIndices(
      const std::vector<int>& column_indices) const {
    const schema::GroupNode* group = descr->group_node();
    std::unordered_set<int> already_added;

    std::vector<int> out;
    for (int column_idx : column_indices) {
      if (column_idx < 0 || column_idx >= descr->num_columns()) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      auto field_node = descr->GetColumnRoot(column_idx);
      auto field_idx = group->FieldIndex(*field_node);
      if (field_idx == -1) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      if (already_added.insert(field_idx).second) {
        out.push_back(field_idx);
      }
    }
    return out;
  }
};

}  // namespace arrow
}  // namespace parquet