#pragma once
#include <ATen/core/boxing/OperatorKernel.h>
#include <c10/core/DispatchKeySet.h>
#include <c10/util/intrusive_ptr.h>
namespace c10 {
// Forward declarations: the declarations in this header only name these types
// (by pointer, by reference, as a friend, or as an alias argument).
struct IValue;
// The boxed calling convention passes arguments as a vector of IValue.
// NOTE(review): std::vector (here) and std::unique_ptr (in BoxedKernel below)
// are used without a direct #include of <vector> / <memory> in this header;
// presumably they arrive transitively via the includes above - confirm.
using Stack = std::vector<IValue>;
class OperatorHandle;
class KernelFunction;
// This kernel implements the behavior of falling through to the next available
// registered dispatch key. The implementation of this function is FAST: falling
// through to the next key incurs no overhead. See the cpp file for some more
// implementation notes; notably, this does NOT actually go through the
// boxing/unboxing codepath.
//
// The parameter list matches BoxedKernel::InternalBoxedKernelFunction.
TORCH_API void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
// Note [Ambiguity in AutogradOther kernel]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This error-reporting kernel is registered to the AutogradOther entry in the
// dispatch table when there is both a CompositeImplicitAutograd kernel and a
// backend kernel for ANY backend that maps to AutogradOther. To see why
// this is necessary in the AutogradOther case, it's helpful to first see
// why everything works out fine for a backend that has a reserved Autograd
// entry (see rule 2.2 in [Note] DispatchTable computation):
//
//    CPU   AutogradCPU
//    reg?  registers with...
//    -------------------------------------------------
//    y     Autograd registration takes precedence
//          over CompositeImplicitAutograd.
//          This is good, because the CPU specific backend
//          implementation is more specialized and typically better;
//          if we used the composite, we would bypass it.
//          (NB: the Autograd key is guaranteed to exist because
//          the autograd codegen requires it!)
//
//    n     CompositeImplicitAutograd takes precedence.
//          This is also good, because the Autograd
//          registration (if it exists) would try to redispatch
//          to the (non-existent) CPU implementation; by
//          using the composite, we ensure the operator
//          actually works.
//
// As you can see, when we have a specific Autograd key (AutogradCPU), we can
// decide whether or not to use the CompositeImplicitAutograd kernel or the
// Autograd kernel based on whether or not the backend kernel exists.
//
// However, for AutogradOther (which is the catchall autograd kernel for
// everything that doesn't have a specific Autograd key), we can't do this
// trick because there isn't any unique backend to peek at to disambiguate:
// backends that have implementations would prefer the Autograd kernel,
// whereas unimplemented backends would prefer CompositeImplicitAutograd. Rather
// than arbitrarily pick one or the other, we just register a kernel that raises
// an error and let the user decide how to proceed.
//
// The parameter list matches BoxedKernel::InternalBoxedKernelFunction.
TORCH_API void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
// Note [named_not_supported_kernel]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// This kernel reports an error saying that named tensors are not supported.
// It doesn't rely on the Stack, and so it is special cased in the dispatcher
// to be triggered before we attempt boxing (so we can give a good error
// message in cases when boxing is not supported). When boxing is universally
// supported this can be removed.
//
// Declared [[noreturn]]: it never returns normally to its caller.
// The parameter list matches BoxedKernel::InternalBoxedKernelFunction.
[[noreturn]] TORCH_API void named_not_supported_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
/**
* BoxedKernel is similar to a std::function storing a boxed kernel.
*
* It pairs an optional functor object (functor_) with a function pointer of
* type InternalBoxedKernelFunction (boxed_kernel_func_). Instances are created
* through the static make* factories below; member definitions live in
* BoxedKernel_impl.h, which is included at the bottom of this header.
*/
class TORCH_API BoxedKernel final {
public:
// This is how boxed kernels are actually stored
//
// Note [Plumbing Keys Through The Dispatcher]
// Benchmarks have shown that it is expensive for the dispatcher to read from thread-local storage (TLS)
// upon every dispatch call in order to compute which kernel to dispatch to.
//
// To mitigate this, we've updated the calling convention inside the dispatcher to expect every kernel that it stores
// to have a first argument of type DispatchKeySet.
//
// What are the invariants of the DispatchKeySet when it gets passed to a kernel?
// - All keys to the left of the current dispatch key have been masked out.
// (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the highest bit to be DispatchKey::Tracer)
// - All other keys that dispatcher normally would have computed through TLS + global state + op arguments
// are still in the set.
//
// Kernels can then opt into using this keyset to save the dispatcher from doing repeated work during redispatches:
// recalculating the highest-priority dispatch key, which involves reading from TLS. Instead, the kernels that opt in will
// calculate an updated DispatchKeySet directly from the old one, and pass the updated set directly into the dispatcher
// upon redispatching.
//
// This is an opt-in mechanism: Kernels can automatically opt in by setting the first argument in their signature
// to be of type DispatchKeySet. See the kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for examples.
//
// The mechanism for optionally passing that DispatchKeySet into the kernel lives in make_boxed_from_unboxed_functor.h.
// See Note [Plumbing Keys Through The Dispatcher 2] for details.
using InternalBoxedKernelFunction = void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
// This is the public API for how boxed kernels are defined
using BoxedKernelFunction = void(const OperatorHandle&, Stack*);
// Same as BoxedKernelFunction, but the kernel additionally receives the
// DispatchKeySet (see Note [Plumbing Keys Through The Dispatcher]).
using BoxedKernelFunction_withDispatchKeys = void(const OperatorHandle&, DispatchKeySet, Stack*);
// Default constructor.
// NOTE(review): presumably yields a kernel for which isValid() is false -
// confirm against BoxedKernel_impl.h.
BoxedKernel();
// Fast path for dispatch to allow not touching the boxed kernel in
// the common case where unboxed is available.
bool isValid() const;
// NOTE(review): presumably true when this kernel was produced by
// makeFallthrough() - confirm against BoxedKernel_impl.h.
bool isFallthrough() const;
/**
* Call the function with boxed arguments.
*
* Arguments are passed (and, by the boxed convention, results returned) via
* `stack`; `dispatchKeySet` is the key set plumbed through the dispatcher as
* described in Note [Plumbing Keys Through The Dispatcher].
*/
void callBoxed(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Stack* stack) const;
/**
* Create a BoxedKernel from a boxed function.
*
* Example:
*
* > void boxed_func(const OperatorHandle&, Stack* stack) {...}
* > BoxedKernel func = BoxedKernel::makeFromFunction<&boxed_func>();
*/
template<BoxedKernelFunction* func>
static BoxedKernel makeFromFunction();
/**
* Same as above, for a boxed function that also takes the DispatchKeySet.
*
* TODO: This will only be useful if we write a backend fallback that plumbs dispatch keys (currently there are none)
* See Note [Plumbing Keys Through The Dispatcher] for details.
*/
template<BoxedKernelFunction_withDispatchKeys* func>
static BoxedKernel makeFromFunction();
/**
* Create a BoxedKernel from a boxed functor.
*
* Ownership of the functor is transferred to the returned BoxedKernel.
*
* Example:
*
* > class MyFunctor final : public c10::OperatorKernel {
* > public:
* > void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
* > };
* > BoxedKernel func = BoxedKernel::makeFromFunctor(std::make_unique<MyFunctor>());
*/
template<class KernelFunctor>
static BoxedKernel makeFromFunctor(std::unique_ptr<KernelFunctor> kernelFunctor);
// Factories for the special-purpose kernels.
// NOTE(review): judging by the names, these presumably wrap
// fallthrough_kernel, ambiguous_autogradother_kernel and
// named_not_supported_kernel (declared earlier in this header) respectively -
// confirm against the definitions.
static BoxedKernel makeFallthrough();
static BoxedKernel makeAmbiguousAutogradOther();
static BoxedKernel makeNamedNotSupported();
private:
// KernelFunction is granted access to the private accessors and constructor below.
friend class KernelFunction;
// Adapters that carry the InternalBoxedKernelFunction parameter list, one per
// public function-pointer flavor.
// NOTE(review): presumably each simply forwards to `func`, dropping the
// arguments `func` does not take - confirm in BoxedKernel_impl.h.
template<BoxedKernelFunction* func>
static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, DispatchKeySet, Stack* stack);
template<BoxedKernelFunction_withDispatchKeys* func>
static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, DispatchKeySet, Stack* stack);
// Takes ownership of `functor` (received as a unique_ptr; the member that
// holds it is a c10::intrusive_ptr) together with the function pointer to store.
explicit BoxedKernel(std::unique_ptr<OperatorKernel> functor, InternalBoxedKernelFunction* boxed_kernel_func);
// Raw, non-owning accessors for the two stored members (used by the friend class).
OperatorKernel* getFunctor() const;
InternalBoxedKernelFunction* getFnPtr() const;
// Functor object associated with this kernel, held by intrusive_ptr.
c10::intrusive_ptr<OperatorKernel> functor_;
// Function pointer in the internal calling convention.
InternalBoxedKernelFunction* boxed_kernel_func_;
};
} // namespace c10
#include <ATen/core/boxing/BoxedKernel_impl.h>