// Learn more » Push, build, and install: RubyGems, npm packages, Python packages,
// Maven artifacts, PHP packages, Go Modules, Bower components, Debian packages,
// RPM packages, NuGet packages
//
// arrow-nightlies / pyarrow (python)
//
// Repository URL to install this package:
//
// Version: 19.0.0.dev70
//
// Path: include/parquet/file_writer.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <utility>

#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"

namespace parquet {

class ColumnWriter;

// FIXME: copied from reader-internal.cc
//
// Four-byte ASCII markers "PAR1" and "PARE".
//
// Declared `inline constexpr` rather than `static constexpr`: a `static` variable in
// a header gives every including translation unit its own internal-linkage copy,
// whereas an inline variable has a single definition shared by all of them.
//
// NOTE(review): per the Parquet format specification "PAR1" is the magic of files
// with a plaintext footer and "PARE" that of files with an encrypted footer --
// confirm against the use sites, which are not visible in this header.
inline constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
inline constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};

/// \brief Writer for a single row group.
///
/// Instances are handed out by ParquetFileWriter::AppendRowGroup and
/// ParquetFileWriter::AppendBufferedRowGroup. The class owns a Contents
/// implementation (see contents_). All member functions are only declared here;
/// their definitions live outside this header.
class PARQUET_EXPORT RowGroupWriter {
 public:
  // Abstract interface 'Contents', defined as a nested type to aid dependency
  // injection and to make test fixtures easier to create.
  // An implementation of the Contents class is defined in the .cc file.
  //
  // Keep the declaration order of the virtual functions stable: it determines
  // the vtable layout that already-compiled implementations were built against.
  struct Contents {
    virtual ~Contents() = default;
    /// Number of columns in this row group.
    virtual int num_columns() const = 0;
    /// Number of rows in this row group.
    virtual int64_t num_rows() const = 0;

    // to be used only with ParquetFileWriter::AppendRowGroup
    virtual ColumnWriter* NextColumn() = 0;
    // to be used only with ParquetFileWriter::AppendBufferedRowGroup
    virtual ColumnWriter* column(int i) = 0;

    /// Index of the column currently being written.
    virtual int current_column() const = 0;
    /// Finish this row group.
    virtual void Close() = 0;

    /// \brief total uncompressed bytes written by the page writer
    virtual int64_t total_bytes_written() const = 0;
    /// \brief total bytes still compressed but not written by the page writer
    virtual int64_t total_compressed_bytes() const = 0;
    /// \brief total compressed bytes written by the page writer
    virtual int64_t total_compressed_bytes_written() const = 0;

    /// Whether this row group is written in buffered mode.
    virtual bool buffered() const = 0;
  };

  /// Take ownership of the given Contents implementation.
  explicit RowGroupWriter(std::unique_ptr<Contents> contents);

  /// Construct a ColumnWriter for the indicated row group-relative column.
  ///
  /// To be used only with ParquetFileWriter::AppendRowGroup
  /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
  /// valid until the next call to NextColumn or Close. As the contents are
  /// directly written to the sink, once a new column is started, the contents
  /// of the previous one cannot be modified anymore.
  ColumnWriter* NextColumn();
  /// Index of currently written column. Equal to -1 if NextColumn()
  /// has not been called yet.
  ///
  /// NOTE(review): not const-qualified although Contents::current_column() is;
  /// adding const would also require changing the out-of-line definition.
  int current_column();
  /// Finish this row group.
  /// NOTE(review): presumably forwards to Contents::Close() -- confirm in the .cc file.
  void Close();

  /// Number of columns in this row group.
  int num_columns() const;

  /// Construct a ColumnWriter for the indicated row group column.
  ///
  /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
  /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
  /// valid until Close. The contents are buffered in memory and written to sink
  /// on Close
  ColumnWriter* column(int i);

  /// Number of rows that shall be written as part of this RowGroup.
  int64_t num_rows() const;

  /// \brief total uncompressed bytes written by the page writer
  int64_t total_bytes_written() const;
  /// \brief total bytes still compressed but not written by the page writer.
  /// It will always return 0 from the SerializedPageWriter.
  int64_t total_compressed_bytes() const;
  /// \brief total compressed bytes written by the page writer
  int64_t total_compressed_bytes_written() const;

  /// Returns whether the current RowGroupWriter is in the buffered mode and is created
  /// by calling ParquetFileWriter::AppendBufferedRowGroup.
  bool buffered() const;

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};

/// \brief Write `file_metadata` to `sink`.
///
/// NOTE(review): the exact byte layout produced is defined by the out-of-line
/// implementation, which is not visible in this header. How this differs from
/// WriteMetaDataFile() below must be checked there as well.
PARQUET_EXPORT
void WriteFileMetaData(const FileMetaData& file_metadata,
                       ::arrow::io::OutputStream* sink);

/// \brief Write `file_metadata` to `sink`.
///
/// NOTE(review): presumably produces a standalone metadata-only file, judging by
/// the name -- confirm against the implementation.
PARQUET_EXPORT
void WriteMetaDataFile(const FileMetaData& file_metadata,
                       ::arrow::io::OutputStream* sink);

/// \brief Write `file_metadata` to `sink` using `encryptor`.
///
/// NOTE(review): ArrowOutputStream is declared outside this header. If it is an
/// alias of ::arrow::io::OutputStream, this declaration and the one that follows
/// name the same function and this one is redundant -- confirm before removing.
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
                                ArrowOutputStream* sink,
                                const std::shared_ptr<Encryptor>& encryptor,
                                bool encrypt_footer);

/// \brief Write `file_metadata` to `sink` using `encryptor`.
///
/// \param[in] file_metadata the metadata to write
/// \param[in] sink the output stream written to
/// \param[in] encryptor defaults to NULLPTR
/// \param[in] encrypt_footer defaults to false
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
                                ::arrow::io::OutputStream* sink,
                                const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
                                bool encrypt_footer = false);
/// \brief Write `crypto_metadata` to `sink`.
PARQUET_EXPORT
void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
                             ::arrow::io::OutputStream* sink);

/// \brief Top-level writer for a Parquet file.
///
/// Row groups are started with AppendRowGroup() or AppendBufferedRowGroup().
/// The class owns a Contents implementation (see contents_). All non-inline
/// member functions are only declared here; their definitions live outside
/// this header.
class PARQUET_EXPORT ParquetFileWriter {
 public:
  // Abstract interface 'Contents', defined as a nested type to aid dependency
  // injection and to make test fixtures easier to create.
  // An implementation of the Contents class is defined in the .cc file.
  //
  // Keep the declaration order of the virtual functions and data members
  // stable: it determines the object and vtable layout that already-compiled
  // implementations were built against.
  struct Contents {
    /// Stores `key_value_metadata` and initializes the schema descriptor from
    /// `schema`.
    Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
             std::shared_ptr<const KeyValueMetadata> key_value_metadata)
        : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
      schema_.Init(std::move(schema));
    }
    // Defaulted, consistent with RowGroupWriter::Contents in this header.
    virtual ~Contents() = default;
    // Perform any cleanup associated with the file contents
    virtual void Close() = 0;

    /// Start a new row group; see ParquetFileWriter::AppendRowGroup.
    virtual RowGroupWriter* AppendRowGroup() = 0;
    /// Start a new buffered row group; see ParquetFileWriter::AppendBufferedRowGroup.
    virtual RowGroupWriter* AppendBufferedRowGroup() = 0;

    virtual int64_t num_rows() const = 0;
    virtual int num_columns() const = 0;
    virtual int num_row_groups() const = 0;

    virtual const std::shared_ptr<WriterProperties>& properties() const = 0;

    /// Returns the key-value metadata handed to the constructor.
    const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
      return key_value_metadata_;
    }

    virtual void AddKeyValueMetadata(
        const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) = 0;

    // Return const-pointer to make it clear that this object is not to be copied
    const SchemaDescriptor* schema() const { return &schema_; }

    SchemaDescriptor schema_;

    /// This should be the only place this is stored. Everything else is a const reference
    std::shared_ptr<const KeyValueMetadata> key_value_metadata_;

    /// Returns file_metadata_. This constructor does not set it, so it stays
    /// null until an implementation assigns it.
    const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
    std::shared_ptr<FileMetaData> file_metadata_;
  };

  ParquetFileWriter();
  ~ParquetFileWriter();

  /// \brief Create a ParquetFileWriter for the given sink and schema.
  ///
  /// \param[in] sink the output stream written to
  /// \param[in] schema the root node of the file schema
  /// \param[in] properties writer configuration; defaults to
  ///   default_writer_properties()
  /// \param[in] key_value_metadata optional file-level metadata; defaults to NULLPTR
  static std::unique_ptr<ParquetFileWriter> Open(
      std::shared_ptr<::arrow::io::OutputStream> sink,
      std::shared_ptr<schema::GroupNode> schema,
      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
      std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);

  /// Take ownership of the given Contents implementation.
  void Open(std::unique_ptr<Contents> contents);
  /// Finish the file.
  /// NOTE(review): presumably forwards to Contents::Close() -- confirm in the .cc file.
  void Close();

  /// Construct a RowGroupWriter with an arbitrary number of rows.
  ///
  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  RowGroupWriter* AppendRowGroup();

  /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
  /// Use this if you want to write a RowGroup based on a certain size
  ///
  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  RowGroupWriter* AppendBufferedRowGroup();

  /// \brief Add key-value metadata to the file.
  /// \param[in] key_value_metadata the metadata to add.
  /// \note This will overwrite any existing metadata with the same key(s).
  /// \throw ParquetException if Close() has been called.
  void AddKeyValueMetadata(
      const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);

  /// Number of columns.
  ///
  /// This number is fixed during the lifetime of the writer as it is determined via
  /// the schema.
  int num_columns() const;

  /// Number of rows in the row groups started so far.
  ///
  /// Changes on the addition of a new RowGroup.
  int64_t num_rows() const;

  /// Number of started RowGroups.
  int num_row_groups() const;

  /// Configuration passed to the writer, e.g. the used Parquet format version.
  const std::shared_ptr<WriterProperties>& properties() const;

  /// Returns the file schema descriptor
  const SchemaDescriptor* schema() const;

  /// Returns a column descriptor in schema
  const ColumnDescriptor* descr(int i) const;

  /// Returns the file custom metadata
  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;

  /// Returns the file metadata, only available after calling Close().
  ///
  /// NOTE(review): returns a const-qualified shared_ptr by value, which prevents
  /// callers from moving out of the result. Dropping the const would also require
  /// changing the out-of-line definition.
  const std::shared_ptr<FileMetaData> metadata() const;

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
  std::shared_ptr<FileMetaData> file_metadata_;
};

}  // namespace parquet