Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev259 

/ include / arrow / engine / substrait / serde.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// This API is EXPERIMENTAL.

#pragma once

#include <functional>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

#include "arrow/compute/type_fwd.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/engine/substrait/options.h"
#include "arrow/engine/substrait/relation.h"
#include "arrow/engine/substrait/type_fwd.h"
#include "arrow/engine/substrait/visibility.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace engine {

/// \brief Serialize an Acero Plan to a binary protobuf Substrait message
///
/// \param[in] declaration the Acero declaration to serialize.
/// This declaration is the sink relation of the Acero plan.
/// \param[in,out] ext_set the extension mapping to use; may be updated to add
/// \param[in] conversion_options options to control how the conversion is done
///
/// \return a buffer containing the protobuf serialization of the Acero relation
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Buffer>> SerializePlan(
    const acero::Declaration& declaration, ExtensionSet* ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Serialize expressions to a Substrait message
///
/// \param[in] bound_expressions the expressions to serialize.
/// \param[in] conversion_options options to control how the conversion is done
/// \param[in,out] ext_set the extension mapping to use, optional, only needed
///                        if you want to control the value of function anchors
///                        to mirror a previous serialization / deserialization.
///                        Will be updated if new functions are encountered
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Buffer>> SerializeExpressions(
    const BoundExpressions& bound_expressions,
    const ConversionOptions& conversion_options = {}, ExtensionSet* ext_set = NULLPTR);

/// Factory function type for generating the node that consumes the batches produced by
/// each toplevel Substrait relation when deserializing a Substrait Plan.
using ConsumerFactory = std::function<std::shared_ptr<acero::SinkNodeConsumer>()>;

/// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations
///
/// The output of each top-level Substrait relation will be sent to a caller supplied
/// consumer function provided by consumer_factory
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
/// message
/// \param[in] consumer_factory factory function for generating the node that consumes
/// the batches produced by each toplevel Substrait relation
/// \param[in] registry an extension-id-registry to use, or null for the default one.
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
/// Plan is returned here.
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return a vector of ExecNode declarations, one for each toplevel relation in the
/// Substrait Plan
ARROW_ENGINE_EXPORT Result<std::vector<acero::Declaration>> DeserializePlans(
    const Buffer& buf, const ConsumerFactory& consumer_factory,
    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
    const ConversionOptions& conversion_options = {});

/// \brief Deserializes a single-relation Substrait Plan message to an execution plan
///
/// The output of each top-level Substrait relation will be sent to a caller supplied
/// consumer function provided by consumer_factory
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
/// message
/// \param[in] consumer node that consumes the batches produced by each toplevel Substrait
/// relation
/// \param[in] registry an extension-id-registry to use, or null for the default one.
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
/// \param[in] conversion_options options to control how the conversion is to be done.
/// Plan is returned here.
/// \return an ExecPlan for the Substrait Plan
ARROW_ENGINE_EXPORT Result<std::shared_ptr<acero::ExecPlan>> DeserializePlan(
    const Buffer& buf, const std::shared_ptr<acero::SinkNodeConsumer>& consumer,
    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
    const ConversionOptions& conversion_options = {});

/// Factory function type for generating the write options of a node consuming the batches
/// produced by each toplevel Substrait relation when deserializing a Substrait Plan.
using WriteOptionsFactory = std::function<std::shared_ptr<dataset::WriteNodeOptions>()>;

/// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations
///
/// The output of each top-level Substrait relation will be written to a filesystem.
/// `write_options_factory` can be used to control write behavior.
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
/// message
/// \param[in] write_options_factory factory function for generating the write options of
/// a node consuming the batches produced by each toplevel Substrait relation
/// \param[in] registry an extension-id-registry to use, or null for the default one.
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
/// Plan is returned here.
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return a vector of ExecNode declarations, one for each toplevel relation in the
/// Substrait Plan
ARROW_ENGINE_EXPORT Result<std::vector<acero::Declaration>> DeserializePlans(
    const Buffer& buf, const WriteOptionsFactory& write_options_factory,
    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
    const ConversionOptions& conversion_options = {});

/// \brief Deserializes a single-relation Substrait Plan message to an execution plan
///
/// The output of the single Substrait relation will be written to a filesystem.
/// `write_options_factory` can be used to control write behavior.
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
/// message
/// \param[in] write_options write options of a node consuming the batches produced by
/// each toplevel Substrait relation
/// \param[in] registry an extension-id-registry to use, or null for the default one.
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
/// Plan is returned here.
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return an ExecPlan for the Substrait Plan
ARROW_ENGINE_EXPORT Result<std::shared_ptr<acero::ExecPlan>> DeserializePlan(
    const Buffer& buf, const std::shared_ptr<dataset::WriteNodeOptions>& write_options,
    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
    const ConversionOptions& conversion_options = {});

/// \brief Deserializes a Substrait Plan message to a Declaration
///
/// The plan will not contain any sink nodes and will be suitable for use in any
/// of the arrow::compute::DeclarationToXyz methods.
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
/// message
/// \param[in] registry an extension-id-registry to use, or null for the default one.
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
/// Plan is returned here.
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return A declaration representing the Substrait plan
ARROW_ENGINE_EXPORT Result<PlanInfo> DeserializePlan(
    const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR,
    ExtensionSet* ext_set_out = NULLPTR,
    const ConversionOptions& conversion_options = {});

/// \brief Deserialize a Substrait ExtendedExpression message to the corresponding Arrow
/// type
///
/// \param[in] buf a buffer containing the protobuf serialization of a collection of bound
/// expressions
/// \param[in] registry an extension-id-registry to use, or null for the default one
/// \param[in] conversion_options options to control how the conversion is done
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
/// message is returned here.
/// \return A collection of expressions and a common input schema they are bound to
ARROW_ENGINE_EXPORT Result<BoundExpressions> DeserializeExpressions(
    const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR,
    const ConversionOptions& conversion_options = {},
    ExtensionSet* ext_set_out = NULLPTR);

/// \brief Deserializes a Substrait Type message to the corresponding Arrow type
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Type
/// message
/// \param[in] ext_set the extension mapping to use, normally provided by the
/// surrounding Plan message
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return the corresponding Arrow data type
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<DataType>> DeserializeType(
    const Buffer& buf, const ExtensionSet& ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Serializes an Arrow type to a Substrait Type message
///
/// \param[in] type the Arrow data type to serialize
/// \param[in,out] ext_set the extension mapping to use; may be updated to add a
/// mapping for the given type
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return a buffer containing the protobuf serialization of the corresponding Substrait
/// Type message
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Buffer>> SerializeType(
    const DataType& type, ExtensionSet* ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Deserializes a Substrait NamedStruct message to an Arrow schema
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
/// NamedStruct message
/// \param[in] ext_set the extension mapping to use, normally provided by the
/// surrounding Plan message
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return the corresponding Arrow schema
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Schema>> DeserializeSchema(
    const Buffer& buf, const ExtensionSet& ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Serializes an Arrow schema to a Substrait NamedStruct message
///
/// \param[in] schema the Arrow schema to serialize
/// \param[in,out] ext_set the extension mapping to use; may be updated to add
/// mappings for the types used in the schema
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return a buffer containing the protobuf serialization of the corresponding Substrait
/// NamedStruct message
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Buffer>> SerializeSchema(
    const Schema& schema, ExtensionSet* ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Deserializes a Substrait Expression message to a compute expression
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
/// Expression message
/// \param[in] ext_set the extension mapping to use, normally provided by the
/// surrounding Plan message
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return the corresponding Arrow compute expression
ARROW_ENGINE_EXPORT
Result<compute::Expression> DeserializeExpression(
    const Buffer& buf, const ExtensionSet& ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Serializes an Arrow compute expression to a Substrait Expression message
///
/// \param[in] expr the Arrow compute expression to serialize
/// \param[in,out] ext_set the extension mapping to use; may be updated to add
/// mappings for the types used in the expression
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return a buffer containing the protobuf serialization of the corresponding Substrait
/// Expression message
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Buffer>> SerializeExpression(
    const compute::Expression& expr, ExtensionSet* ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Serialize an Acero Declaration to a binary protobuf Substrait message
///
/// \param[in] declaration the Acero declaration to serialize
/// \param[in,out] ext_set the extension mapping to use; may be updated to add
/// \param[in] conversion_options options to control how the conversion is done
///
/// \return a buffer containing the protobuf serialization of the Acero relation
ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> SerializeRelation(
    const acero::Declaration& declaration, ExtensionSet* ext_set,
    const ConversionOptions& conversion_options = {});

/// \brief Deserializes a Substrait Rel (relation) message to an ExecNode declaration
///
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
/// Rel message
/// \param[in] ext_set the extension mapping to use, normally provided by the
/// surrounding Plan message
/// \param[in] conversion_options options to control how the conversion is to be done.
/// \return the corresponding ExecNode declaration
ARROW_ENGINE_EXPORT Result<acero::Declaration> DeserializeRelation(
    const Buffer& buf, const ExtensionSet& ext_set,
    const ConversionOptions& conversion_options = {});

namespace internal {

/// \brief Checks whether two protobuf serializations of a particular Substrait message
/// type are equivalent
///
/// Note that a binary comparison of the two buffers is insufficient. One reason for this
/// is that the fields of a message can be specified in any order in the serialization.
///
/// \param[in] message_name the name of the Substrait message type to check
/// \param[in] l_buf buffer containing the first protobuf serialization to compare
/// \param[in] r_buf buffer containing the second protobuf serialization to compare
/// \return success if equivalent, failure if not
ARROW_ENGINE_EXPORT
Status CheckMessagesEquivalent(std::string_view message_name, const Buffer& l_buf,
                               const Buffer& r_buf);

/// \brief Utility function to convert a JSON serialization of a Substrait message to
/// its binary serialization
///
/// \param[in] type_name the name of the Substrait message type to convert
/// \param[in] json the JSON string to convert
/// \param[in] ignore_unknown_fields if true then unknown fields will be ignored and
///            will not cause an error
///
///            This should generally be true to allow consumption of plans from newer
///            producers but setting to false can be useful if you are testing
///            conformance to a specific Substrait version
/// \return a buffer filled with the binary protobuf serialization of message
ARROW_ENGINE_EXPORT
Result<std::shared_ptr<Buffer>> SubstraitFromJSON(std::string_view type_name,
                                                  std::string_view json,
                                                  bool ignore_unknown_fields = true);

/// \brief Utility function to convert a binary protobuf serialization of a Substrait
/// message to JSON
///
/// \param[in] type_name the name of the Substrait message type to convert
/// \param[in] buf the buffer containing the binary protobuf serialization of the message
/// \return a JSON string representing the message
ARROW_ENGINE_EXPORT
Result<std::string> SubstraitToJSON(std::string_view type_name, const Buffer& buf);

}  // namespace internal
}  // namespace engine
}  // namespace arrow