Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev70 

/ include / arrow / compute / expression.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// This API is EXPERIMENTAL.

#pragma once

#include <memory>
#include <string>
#include <utility>
#include <variant>
#include <vector>

#include "arrow/compute/type_fwd.h"
#include "arrow/datum.h"
#include "arrow/type_fwd.h"
#include "arrow/util/small_vector.h"

namespace arrow {
namespace compute {

/// \defgroup expression-core Expressions to describe data transformations
///
/// @{

/// An unbound expression which maps a single Datum to another Datum.
/// An expression is one of
/// - A literal Datum.
/// - A reference to a single (potentially nested) field of the input Datum.
/// - A call to a compute function, with arguments specified by other Expressions.
class ARROW_EXPORT Expression {
 public:
  /// Payload of a call expression: the function name, its arguments and options,
  /// plus the properties which are filled in once the expression has been bound.
  struct Call {
    std::string function_name;
    std::vector<Expression> arguments;
    std::shared_ptr<FunctionOptions> options;
    // Cached hash value. Zero-initialized so that a Call on which ComputeHash() has
    // not run yet can be copied or moved without reading an indeterminate value.
    size_t hash = 0;

    // post-Bind properties:
    std::shared_ptr<Function> function;
    const Kernel* kernel = NULLPTR;
    std::shared_ptr<KernelState> kernel_state;
    TypeHolder type;

    // Defined out of line. NOTE(review): presumably refreshes the cached `hash`
    // member above -- confirm against the implementation.
    void ComputeHash();
  };

  /// Return a string representation of this expression.
  std::string ToString() const;
  /// Return true if this expression is equal to `other`.
  bool Equals(const Expression& other) const;
  /// Return a hash value for this expression.
  size_t hash() const;
  /// Hash functor which forwards to Expression::hash().
  struct Hash {
    size_t operator()(const Expression& expr) const { return expr.hash(); }
  };

  /// Bind this expression to the given input type, looking up Kernels and field types.
  /// Some expression simplification may be performed and implicit casts will be inserted.
  /// Any state necessary for execution will be initialized and returned.
  ///
  /// The ExecContext argument is optional and defaults to NULLPTR.
  Result<Expression> Bind(const TypeHolder& in, ExecContext* = NULLPTR) const;
  Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;

  // XXX someday
  // Clone all KernelState in this bound expression. If any function referenced by this
  // expression has mutable KernelState, it is not safe to execute or apply simplification
  // passes to it (or copies of it!) from multiple threads. Cloning state produces new
  // KernelStates where necessary to ensure that Expressions may be manipulated safely
  // on multiple threads.
  // Result<ExpressionState> CloneState() const;
  // Status SetState(ExpressionState);

  /// Return true if all of an expression's field references have explicit types
  /// and all of its functions' kernels are looked up.
  bool IsBound() const;

  /// Return true if this expression is composed only of Scalar literals, field
  /// references, and calls to ScalarFunctions.
  bool IsScalarExpression() const;

  /// Return true if this expression is literal and entirely null.
  bool IsNullLiteral() const;

  /// Return true if this expression could evaluate to true. Will return true for any
  /// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any
  /// canonicalization or simplification of the expression, so even Expressions
  /// which are unsatisfiable may spuriously return `true` here. This function is
  /// intended for use in predicate pushdown where a filter expression is simplified
  /// by a guarantee, so it assumes that trying to simplify again would be redundant.
  bool IsSatisfiable() const;

  // XXX someday
  // Result<PipelineGraph> GetPipelines();

  /// Return true if this expression holds an implementation, i.e. `impl_` is non-null.
  /// A default-constructed Expression is not valid.
  bool is_valid() const { return impl_ != NULLPTR; }

  /// Access a Call or return nullptr if this expression is not a call
  const Call* call() const;
  /// Access a Datum or return nullptr if this expression is not a literal
  const Datum* literal() const;
  /// Access a FieldRef or return nullptr if this expression is not a field_ref
  const FieldRef* field_ref() const;

  /// The type to which this expression will evaluate
  const DataType* type() const;
  // XXX someday
  // NullGeneralization::type nullable() const;

  /// Payload of a field reference expression: the reference itself plus the
  /// properties which are filled in once the expression has been bound.
  struct Parameter {
    FieldRef ref;

    // post-bind properties
    TypeHolder type;
    ::arrow::internal::SmallVector<int, 2> indices;
  };
  /// Access a Parameter. NOTE(review): presumably returns nullptr when this
  /// expression is not a field reference, like the accessors above -- confirm.
  const Parameter* parameter() const;

  /// Construct an expression without an implementation; is_valid() returns false.
  Expression() = default;
  /// Construct an expression from a Call.
  explicit Expression(Call call);
  /// Construct an expression from a literal Datum.
  explicit Expression(Datum literal);
  /// Construct an expression from a Parameter.
  explicit Expression(Parameter parameter);

 private:
  // The three kinds of expression, held through a shared_ptr so that copying an
  // Expression copies the pointer rather than the variant.
  using Impl = std::variant<Datum, Parameter, Call>;
  std::shared_ptr<Impl> impl_;

  // Granted access to the private members above.
  ARROW_FRIEND_EXPORT friend bool Identical(const Expression& l, const Expression& r);
};

/// Equality comparison; forwards to Expression::Equals().
inline bool operator==(const Expression& lhs, const Expression& rhs) {
  return lhs.Equals(rhs);
}
/// Inequality comparison; the negation of the equality comparison.
inline bool operator!=(const Expression& lhs, const Expression& rhs) {
  return !(lhs == rhs);
}

/// Print `expr` to the stream pointed to by `os`.
// NOTE(review): the name and signature match GoogleTest's PrintTo customization
// point -- confirm that this is the intended use.
ARROW_EXPORT void PrintTo(const Expression& expr, std::ostream* os);

// Factories

/// Factory for a literal Expression built from the Datum `lit`.
ARROW_EXPORT
Expression literal(Datum lit);

/// Convenience overload: perfectly forwards `arg` into a Datum constructor and
/// passes the resulting Datum on to literal(Datum).
template <typename Arg>
Expression literal(Arg&& arg) {
  Datum wrapped(std::forward<Arg>(arg));
  return literal(std::move(wrapped));
}

/// Factory for a field reference Expression built from the FieldRef `ref`.
ARROW_EXPORT
Expression field_ref(FieldRef ref);

/// Factory for a call Expression.
///
/// \param function name of the compute function to call
/// \param arguments the Expressions which specify the call's arguments
/// \param options options for the function; defaults to NULLPTR
ARROW_EXPORT
Expression call(std::string function, std::vector<Expression> arguments,
                std::shared_ptr<FunctionOptions> options = NULLPTR);

/// Overload of call() which accepts the options by value. Only participates in
/// overload resolution when Options derives from FunctionOptions; the options are
/// moved into a std::shared_ptr<Options> and handed to the shared_ptr overload.
template <typename Options,
          typename = std::enable_if_t<std::is_base_of_v<FunctionOptions, Options>>>
Expression call(std::string function, std::vector<Expression> arguments,
                Options options) {
  auto shared_options = std::make_shared<Options>(std::move(options));
  return call(std::move(function), std::move(arguments), std::move(shared_options));
}

/// Collect every field referenced by `expr`, at any depth, into a list.
ARROW_EXPORT
std::vector<FieldRef> FieldsInExpression(const Expression& expr);

/// Return whether `expr` references any fields.
ARROW_EXPORT
bool ExpressionHasFieldRefs(const Expression& expr);

/// Mapping from field references to known values. Only forward-declared in this
/// header; the definition lives elsewhere.
struct ARROW_EXPORT KnownFieldValues;

/// Assemble a mapping from field references to known values. This derives known values
/// from "equal" and "is_null" Expressions referencing a field and a literal.
///
/// \param guaranteed_true_predicate the Expression from which known values are derived
/// \return the mapping, wrapped in a Result
ARROW_EXPORT
Result<KnownFieldValues> ExtractKnownFieldValues(
    const Expression& guaranteed_true_predicate);

/// @}

/// \defgroup expression-passes Functions for modification of Expressions
///
/// @{
///
/// These transform bound expressions. Some transforms utilize a guarantee, which is
/// provided as an Expression which is guaranteed to evaluate to true. The
/// guaranteed_true_predicate need not be bound, but canonicalization is currently
/// deferred to producers of guarantees. For example, in order to be recognized as a
/// guarantee on a field value, an Expression must be a call to "equal" with a field_ref
/// LHS and a literal RHS. Semantically identical Expressions (flipped arguments,
/// "is_in" with a single-element value_set, ...) will not be recognized.

/// Apply a weak canonicalization to `expr`, establishing guarantees on which
/// subsequent passes rely. Equivalent Expressions are not guaranteed to produce
/// the same canonicalized expression.
/// TODO this could be a strong canonicalization
///
/// The ExecContext argument is optional and defaults to NULLPTR.
ARROW_EXPORT
Result<Expression> Canonicalize(Expression expr, ExecContext* exec_context = NULLPTR);

/// Simplify `expr` based on its literal arguments. For example, add(null, x) will
/// always be null, so the call is replaced with a null literal. Calls whose arguments
/// are entirely literal are evaluated early.
ARROW_EXPORT
Result<Expression> FoldConstants(Expression expr);

/// Simplify `expr` by substituting the known values of the fields it references.
///
/// \param known_values the known field values to substitute
/// \param expr the expression to simplify
ARROW_EXPORT
Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
                                                Expression expr);

/// Simplify `expr` by replacing subexpressions based on a guarantee, that is, a
/// boolean expression which is guaranteed to evaluate to `true`. This is used, for
/// example, to remove redundant function calls from a filter expression or to replace
/// a reference to a constant-value field with a literal.
///
/// \param expr the expression to simplify
/// \param guaranteed_true_predicate the guarantee
ARROW_EXPORT
Result<Expression> SimplifyWithGuarantee(Expression expr,
                                         const Expression& guaranteed_true_predicate);

/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3])
///
/// This isn't usually needed and does not offer any simplification by itself. However,
/// it can be useful to normalize an expression to paths to make it simpler to work with.
///
/// \param expression the expression whose named field refs are replaced
ARROW_EXPORT Result<Expression> RemoveNamedRefs(Expression expression);

/// @}

// Execution

/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
/// RecordBatch which may have missing or incorrectly ordered columns.
/// Missing fields will be replaced with null scalars.
///
/// \param full_schema the complete schema
/// \param partial the input, which may lack columns or have them in a different order
/// \param guarantee defaults to literal(true)
// NOTE(review): how `guarantee` influences the result is not visible from this
// header -- confirm against the implementation before relying on it.
ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
                                             const Datum& partial,
                                             Expression guarantee = literal(true));

/// Execute the scalar expression `expr` against the provided state and the ExecBatch
/// `input`. `expr` must be bound.
///
/// The ExecContext argument is optional and defaults to NULLPTR.
ARROW_EXPORT
Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input,
                                      ExecContext* exec_context = NULLPTR);

/// Convenience overload for invoking `expr` against a RecordBatch.
///
/// The ExecContext argument is optional and defaults to NULLPTR.
ARROW_EXPORT
Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full_schema,
                                      const Datum& partial_input,
                                      ExecContext* exec_context = NULLPTR);

// Serialization

/// Serialize `expr` into a Buffer.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> Serialize(const Expression& expr);

/// Deserialize an Expression from `buffer`.
ARROW_EXPORT
Result<Expression> Deserialize(std::shared_ptr<Buffer> buffer);

/// \defgroup expression-convenience Helpers for convenient expression creation
///
/// @{

/// Build an Expression from a list of value Expressions and a list of names.
// NOTE(review): presumably names[i] labels values[i], so both lists are expected to
// have the same length -- confirm against the implementation.
ARROW_EXPORT Expression project(std::vector<Expression> values,
                                std::vector<std::string> names);

// Comparison helpers. Each takes a left and a right operand Expression and returns
// an Expression.
// NOTE(review): presumably each one wraps call() with the compute function of the
// same name -- confirm against the implementation.
ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);

ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);

ARROW_EXPORT Expression less(Expression lhs, Expression rhs);

ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);

ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);

ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);

// Null-check helpers, each taking a single operand Expression. `nan_is_null`
// defaults to false.
// NOTE(review): presumably `nan_is_null` selects whether NaN values are treated as
// null -- confirm against the implementation.
ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false);

ARROW_EXPORT Expression is_valid(Expression lhs);

// Logical helpers. and_ and or_ each come in a two-operand form and a form taking a
// list of operands; not_ takes a single operand.
// NOTE(review): the result of the list forms for an empty list is not visible from
// this header -- confirm against the implementation.
ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
ARROW_EXPORT Expression and_(const std::vector<Expression>& operands);
ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
ARROW_EXPORT Expression or_(const std::vector<Expression>& operands);
ARROW_EXPORT Expression not_(Expression operand);

/// @}

}  // namespace compute
}  // namespace arrow