Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

arrow-nightlies / pyarrow   python

Repository URL to install this package:

Version: 19.0.0.dev70 

/ include / arrow / compute / function.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle.

#pragma once

#include <string>
#include <utility>
#include <vector>

#include "arrow/compute/kernel.h"
#include "arrow/compute/type_fwd.h"
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace compute {

/// \addtogroup compute-functions
/// @{

/// \brief Describes how many arguments a function takes.
///
/// The factory names follow the conventions listed at
/// https://en.wikipedia.org/wiki/Arity.
struct ARROW_EXPORT Arity {
  /// \brief Arity of a function that takes no arguments
  static Arity Nullary() { return Arity{0, false}; }

  /// \brief Arity of a function that takes exactly 1 argument
  static Arity Unary() { return Arity{1, false}; }

  /// \brief Arity of a function that takes exactly 2 arguments
  static Arity Binary() { return Arity{2, false}; }

  /// \brief Arity of a function that takes exactly 3 arguments
  static Arity Ternary() { return Arity{3, false}; }

  /// \brief Arity of a function that takes a variable number of arguments
  ///
  /// \param[in] min_args the smallest number of arguments that must be
  /// supplied when the function is invoked
  static Arity VarArgs(int min_args = 0) { return Arity{min_args, true}; }

  // NOTE: Cython needs this type to be constructible without arguments, which
  // is why every parameter carries a default value.
  explicit Arity(int num_args = 0, bool is_varargs = false)
      : num_args{num_args}, is_varargs{is_varargs} {}

  /// How many arguments are required; for varargs functions this is the
  /// minimum count rather than the exact count.
  int num_args;

  /// When true, num_args is a lower bound on the number of arguments instead
  /// of an exact count.
  bool is_varargs = false;
};

/// \brief Human-readable documentation attached to a compute function.
///
/// All members are plain values; a default-constructed FunctionDoc has empty
/// strings, no argument names and `options_required == false`.
struct ARROW_EXPORT FunctionDoc {
  /// \brief A one-line summary of the function, using a verb.
  ///
  /// For example, "Add two numeric arrays or scalars".
  std::string summary;

  /// \brief A detailed description of the function, meant to follow the summary.
  std::string description;

  /// \brief Symbolic names (identifiers) for the function arguments.
  ///
  /// Some bindings may use this to generate nicer function signatures.
  std::vector<std::string> arg_names;

  // TODO add argument descriptions?

  /// \brief Name of the options class, if any.
  std::string options_class;

  /// \brief Whether options are required for function execution
  ///
  /// If false, then either the function does not have an options class
  /// or there is a usable default options value.
  ///
  /// Initialized in-class so that a default-initialized object
  /// (`FunctionDoc doc;`) never carries an indeterminate value here.
  bool options_required = false;

  /// \brief Construct an empty documentation object.
  FunctionDoc() = default;

  /// \brief Construct a documentation object from its parts.
  ///
  /// \param[in] summary one-line summary of the function
  /// \param[in] description detailed description following the summary
  /// \param[in] arg_names symbolic names of the function arguments
  /// \param[in] options_class name of the options class, empty if none
  /// \param[in] options_required whether options must be supplied to execute
  FunctionDoc(std::string summary, std::string description,
              std::vector<std::string> arg_names, std::string options_class = "",
              bool options_required = false)
      : summary(std::move(summary)),
        description(std::move(description)),
        arg_names(std::move(arg_names)),
        options_class(std::move(options_class)),
        options_required(options_required) {}

  /// \brief Return a reference to a shared FunctionDoc instance.
  ///
  /// NOTE(review): defined out of line; presumably the instance has all
  /// fields empty, as the name suggests -- confirm in the implementation.
  static const FunctionDoc& Empty();
};

/// \brief An executor of a function with a preconfigured kernel
///
/// This is an abstract interface: both Init() and Execute() are pure virtual
/// and must be provided by a concrete executor.
class ARROW_EXPORT FunctionExecutor {
 public:
  virtual ~FunctionExecutor() = default;
  /// \brief Initialize or re-initialize the preconfigured kernel
  ///
  /// This method may be called zero or more times. Depending on how
  /// the FunctionExecutor was obtained, it may already have been initialized.
  ///
  /// \param[in] options Function options to initialize with; null by default.
  /// NOTE(review): how a null value is treated is up to the implementation --
  /// presumably the function's default options are used; confirm.
  /// \param[in] exec_ctx Execution context to initialize with; null by default.
  /// NOTE(review): handling of a null context is implementation-defined; confirm.
  /// \return Status describing whether (re-)initialization succeeded
  virtual Status Init(const FunctionOptions* options = NULLPTR,
                      ExecContext* exec_ctx = NULLPTR) = 0;
  /// \brief Execute the preconfigured kernel with arguments that must fit it
  ///
  /// The method requires the arguments be castable to the preconfigured types.
  ///
  /// \param[in] args Arguments to execute the function on
  /// \param[in] length Length of arguments batch or -1 to default it. If the
  /// function has no parameters, this determines the batch length, defaulting
  /// to 0. Otherwise, if the function is scalar, this must equal the argument
  /// batch's inferred length or be -1 to default to it. This is ignored for
  /// vector functions.
  /// \return The resulting Datum, or an error wrapped in the Result
  virtual Result<Datum> Execute(const std::vector<Datum>& args, int64_t length = -1) = 0;
};

/// \brief Base class for compute functions. Function implementations contain a
/// collection of "kernels" which are implementations of the function for
/// specific argument types. Selecting a viable kernel for executing a function
/// is referred to as "dispatching".
///
/// The class is abstract (num_kernels() is pure virtual) and its constructor
/// is protected, so it can only be instantiated through subclasses.
class ARROW_EXPORT Function {
 public:
  /// \brief The kind of function, which indicates in what contexts it is
  /// valid for use.
  enum Kind {
    /// A function that performs scalar data operations on whole arrays of
    /// data. Can generally process Array or Scalar values. The size of the
    /// output will be the same as the size (or broadcasted size, in the case
    /// of mixing Array and Scalar inputs) of the input.
    SCALAR,

    /// A function with array input and output whose behavior depends on the
    /// values of the entire arrays passed, rather than the value of each scalar
    /// value.
    VECTOR,

    /// A function that computes scalar summary statistics from array input.
    SCALAR_AGGREGATE,

    /// A function that computes grouped summary statistics from array input
    /// and an array of group identifiers.
    HASH_AGGREGATE,

    /// A function that dispatches to other functions and does not contain its
    /// own kernels.
    META
  };

  virtual ~Function() = default;

  /// \brief The name of the function. The registry enforces uniqueness of names.
  const std::string& name() const { return name_; }

  /// \brief The kind of function, which indicates in what contexts it is valid
  /// for use.
  Function::Kind kind() const { return kind_; }

  /// \brief Contains the number of arguments the function requires, or if the
  /// function accepts variable numbers of arguments.
  const Arity& arity() const { return arity_; }

  /// \brief Return the function documentation
  const FunctionDoc& doc() const { return doc_; }

  /// \brief Returns the number of registered kernels for this function.
  virtual int num_kernels() const = 0;

  /// \brief Return a kernel that can execute the function given the exact
  /// argument types (without implicit type casts).
  ///
  /// NB: This function is overridden in CastFunction.
  ///
  /// \param[in] types Argument types to match against the kernels
  virtual Result<const Kernel*> DispatchExact(const std::vector<TypeHolder>& types) const;

  /// \brief Return a best-match kernel that can execute the function given the argument
  /// types, after implicit casts are applied.
  ///
  /// \param[in,out] values Argument types. An element may be modified to
  /// indicate that the returned kernel only approximately matches the input
  /// value descriptors; callers are responsible for casting inputs to the type
  /// required by the kernel.
  virtual Result<const Kernel*> DispatchBest(std::vector<TypeHolder>* values) const;

  /// \brief Get a function executor with a best-matching kernel
  ///
  /// The returned executor will by default work with the default FunctionOptions
  /// and KernelContext. If you want to change that, call `FunctionExecutor::Init`.
  ///
  /// \param[in] inputs Argument types, taken by value.
  /// NOTE(review): presumably these drive the kernel selection -- confirm in
  /// the implementation.
  virtual Result<std::shared_ptr<FunctionExecutor>> GetBestExecutor(
      std::vector<TypeHolder> inputs) const;

  /// \brief Execute the function eagerly with the passed input arguments with
  /// kernel dispatch, batch iteration, and memory allocation details taken
  /// care of.
  ///
  /// If the `options` pointer is null, then `default_options()` will be used.
  ///
  /// This function can be overridden in subclasses.
  virtual Result<Datum> Execute(const std::vector<Datum>& args,
                                const FunctionOptions* options, ExecContext* ctx) const;

  /// \brief Overload of Execute() taking the arguments as an ExecBatch.
  ///
  /// NOTE(review): presumably follows the same semantics as the
  /// std::vector<Datum> overload (including the handling of null `options`);
  /// the definition is out of line -- confirm.
  virtual Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
                                ExecContext* ctx) const;

  /// \brief Returns the default options for this function.
  ///
  /// Whatever option semantics a Function has, implementations must guarantee
  /// that default_options() is valid to pass to Execute as options.
  ///
  /// The returned pointer is the one handed to the constructor and may be null.
  const FunctionOptions* default_options() const { return default_options_; }

  /// \brief Validate this function.
  ///
  /// NOTE(review): the checks performed are defined out of line and are not
  /// visible in this header.
  virtual Status Validate() const;

  /// \brief Returns the pure property for this function.
  ///
  /// Impure functions are those that may return different results for the same
  /// input arguments. For example, a function that returns a random number is
  /// not pure. An expression containing only pure functions can be simplified by
  /// pre-evaluating any sub-expressions that have constant arguments.
  ///
  /// The base implementation reports every function as pure.
  virtual bool is_pure() const { return true; }

 protected:
  /// \brief Construct the common function state; only callable by subclasses.
  ///
  /// \param[in] name Function name (moved into the object)
  /// \param[in] kind The kind of function
  /// \param[in] arity Argument-count description (copied)
  /// \param[in] doc Function documentation (moved into the object)
  /// \param[in] default_options Default options; only the raw pointer is
  /// stored. NOTE(review): presumably the pointee must outlive the Function --
  /// confirm against callers.
  Function(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
           const FunctionOptions* default_options)
      : name_(std::move(name)),
        kind_(kind),
        arity_(arity),
        doc_(std::move(doc)),
        default_options_(default_options) {}

  /// \brief Check a number of arguments against this function's arity.
  ///
  /// NOTE(review): defined out of line; presumably returns an error Status
  /// when `num_args` is incompatible with arity_ -- confirm.
  Status CheckArity(size_t num_args) const;

  /// Function name, as returned by name()
  std::string name_;
  /// Function kind, as returned by kind()
  Function::Kind kind_;
  /// Argument-count description, as returned by arity()
  Arity arity_;
  /// Documentation, as returned by doc(); const, so fixed after construction
  const FunctionDoc doc_;
  /// Default options, as returned by default_options(); stored as a raw pointer
  const FunctionOptions* default_options_ = NULLPTR;
};

namespace detail {

/// \brief Common base for functions that store their kernels in a vector.
///
/// \tparam KernelType the concrete kernel type held in kernels_
template <typename KernelType>
class FunctionImpl : public Function {
 public:
  /// \brief Return pointers to current-available kernels for inspection
  ///
  /// The returned pointers refer to the elements of the internal kernel
  /// vector, in storage order. They stay valid only as long as that vector
  /// is not reallocated or destroyed.
  std::vector<const KernelType*> kernels() const {
    std::vector<const KernelType*> result;
    // The final size is known up front: allocate once instead of letting
    // push_back grow the vector repeatedly.
    result.reserve(kernels_.size());
    for (const auto& kernel : kernels_) {
      result.push_back(&kernel);
    }
    return result;
  }

  /// \brief Returns the number of kernels currently stored, narrowed to int.
  int num_kernels() const override { return static_cast<int>(kernels_.size()); }

 protected:
  /// \brief Forward all arguments to the Function base constructor; the
  /// kernel vector starts out empty.
  FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
               const FunctionOptions* default_options)
      : Function(std::move(name), kind, arity, std::move(doc), default_options) {}

  /// Kernels stored by value, in the order they were added
  std::vector<KernelType> kernels_;
};

/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
///
/// \param[in] func The function whose kernels are searched
/// The second (unnamed) parameter carries the argument types to look up.
/// NOTE(review): presumably an exact match without implicit casts, as the
/// name suggests -- confirm in the implementation.
ARROW_EXPORT
const Kernel* DispatchExactImpl(const Function* func, const std::vector<TypeHolder>&);

/// \brief Return an error message if no Kernel is found.
///
/// The message is returned wrapped in a Status.
///
/// \param[in] func The function for which dispatch failed
/// The second (unnamed) parameter carries the argument types that were
/// looked up. NOTE(review): presumably both are used to build the message --
/// confirm in the implementation.
ARROW_EXPORT
Status NoMatchingKernel(const Function* func, const std::vector<TypeHolder>&);

}  // namespace detail

/// \brief A function applying elementwise operations to arrays or scalars.
///
/// Because it works value by value, its results generally do not depend on
/// the order of the values in the arguments. It accepts and returns arrays
/// that are all of the same size. Such functions roughly correspond to the
/// functions used in SQL expressions.
class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
 public:
  using KernelType = ScalarKernel;

  /// \brief Construct a function of kind Function::SCALAR.
  ///
  /// \param[in] name function name
  /// \param[in] arity argument-count description
  /// \param[in] doc function documentation
  /// \param[in] default_options default options, null if not given
  /// \param[in] is_pure value later reported by is_pure(); true if not given
  ScalarFunction(std::string name, const Arity& arity, FunctionDoc doc,
                 const FunctionOptions* default_options = NULLPTR, bool is_pure = true)
      : detail::FunctionImpl<KernelType>(std::move(name), Function::SCALAR, arity,
                                         std::move(doc), default_options),
        is_pure_{is_pure} {}

  /// \brief Add a kernel built from input/output types and an exec function.
  ///
  /// The kernel gets no required state initialization, preallocation for
  /// fixed-width types, and default null handling (intersect validity bitmaps
  /// of inputs).
  Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
                   ArrayKernelExec exec, KernelInit init = NULLPTR);

  /// \brief Add a ready-made kernel (function implementation).
  ///
  /// An error is returned if the kernel's signature does not match the
  /// function's arity.
  Status AddKernel(ScalarKernel kernel);

  /// \brief Returns the pure property chosen when this function was constructed.
  bool is_pure() const override { return is_pure_; }

 private:
  /// Purity flag fixed at construction time
  const bool is_pure_;
};

/// \brief A function that executes general array operations that may yield
/// outputs of different sizes or have results that depend on the whole array
/// contents. These functions roughly correspond to the functions found in
/// non-SQL array languages like APL and its derivatives.
class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
 public:
  using KernelType = VectorKernel;

  VectorFunction(std::string name, const Arity& arity, FunctionDoc doc,
                 const FunctionOptions* default_options = NULLPTR)
      : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity,
                                           std::move(doc), default_options) {}

  /// \brief Add a simple kernel with given input/output types, no required
  /// state initialization, no data preallocation, and no preallocation of the
  /// validity bitmap.
  Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
                   ArrayKernelExec exec, KernelInit init = NULLPTR);

  /// \brief Add a kernel (function implementation). Returns error if the
  /// kernel's signature does not match the function's arity.
  Status AddKernel(VectorKernel kernel);
Loading ...