#pragma once
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
#include <ostream>
#include <string>
namespace c10 {
// Semantically, each value of BackendComponent identifies a "backend" for our
// dispatch. Some functionalities that we may dispatch to are allowed to
// register different handlers for each backend. The BackendComponent is then
// used to figure out which backend implementation to dispatch to.
// In implementation terms, the backend component identifies a specific "bit" in
// a DispatchKeySet. The bits in the DispatchKeySet are split between the
// bottom ~15 "BackendComponent" bits and the remaining upper bits, which are
// assigned to functionalities. When we encounter a functionality bit that is
// known to be customizable per-backend, we also look at the lower
// BackendComponent bits and take the highest bit to determine which backend's
// implementation to use.
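// For illustration only, picking the backend from the low bits might look
// like this minimal sketch (kNumBackendBits is hypothetical; the real logic
// lives in DispatchKeySet):
//
//   uint64_t backend_bits = raw_repr & ((1ULL << kNumBackendBits) - 1);
//   // index of the highest set bit == the highest-priority backend
//   int idx = 63 - __builtin_clzll(backend_bits);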
// WARNING! If you add a new backend component to the end of this list,
// make sure you update PrivateUse3Bit. (But you shouldn't: private use
// keys should have higher precedence than all built-in keys)
// If you add a new (non-privateuse) backend here,
// make sure to add an Autograd<Backend> fallthrough kernel
// in aten/src/ATen/core/VariableFallbackKernel.cpp
#define C10_FORALL_BACKEND_COMPONENTS(_, extra) \
_(CPU, extra) \
_(CUDA, extra) \
_(HIP, extra) \
_(XLA, extra) \
_(MPS, extra) \
_(IPU, extra) \
_(XPU, extra) \
_(HPU, extra) \
_(VE, extra) \
_(Lazy, extra) \
_(Meta, extra) \
_(MTIA, extra) \
_(PrivateUse1, extra) \
_(PrivateUse2, extra) \
_(PrivateUse3, extra)
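// For illustration, this X-macro can stamp out other per-backend tables too;
// e.g. a hypothetical name array (not part of this file):
//
//   #define DEFINE_NAME(n, _) #n,
//   constexpr const char* kBackendNames[] = {
//       C10_FORALL_BACKEND_COMPONENTS(DEFINE_NAME, unused)};
//   #undef DEFINE_NAME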
// WARNING! If we add a new per-backend functionality key that has higher
// priority than Autograd, then make sure you update EndOfRuntimeBackendKeys
#define C10_FORALL_FUNCTIONALITY_KEYS(_) \
_(Dense, ) \
_(Quantized, Quantized) \
_(Sparse, Sparse) \
_(NestedTensor, NestedTensor) \
_(AutogradFunctionality, Autograd)
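// For intuition, instantiating these two X-macros together yields the
// per-backend runtime keys; the cross product looks roughly like:
//
//   Dense                 x CPU  -> DispatchKey::CPU
//   Sparse                x CUDA -> DispatchKey::SparseCUDA
//   Quantized             x CPU  -> DispatchKey::QuantizedCPU
//   AutogradFunctionality x CPU  -> DispatchKey::AutogradCPU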
enum class BackendComponent : uint8_t {
// A "backend" is colloquially used to refer to handlers for dispatch
// which actually implement the numerics of an operation in question.
//
// Due to the nature of the enum, these backends are specified in
// an ordered way, but for most backends this order is not semantically
// meaningful (e.g., it's valid to reorder these backends without changing
// semantics). The only situation when backend ordering is meaningful
// is when the backend participates in multiple dispatch with another
// backend; e.g., CPU and CUDA (CUDA must have higher priority).
// These keys don't correspond to individual kernels.
// Instead, they represent the backends that are allowed to override specific
// pieces of functionality:
// - dense kernels (e.g. DispatchKey::CPU)
// - sparse kernels (e.g. DispatchKey::SparseCPU)
// - quantized kernels (e.g. DispatchKey::QuantizedCPU)
// - autograd kernels (e.g. DispatchKey::AutogradCPU)
// We reserve space in the runtime operator table for the full cross product of
//   [backends in this enum] x [keys below that are explicitly marked as having
//   per-backend functionality]
//
// A meta tensor is a tensor without any data associated with it. (They
// have also colloquially been referred to as tensors on the "null" device).
// A meta tensor can be used to dry run operators without actually doing any
// computation, e.g., add on two meta tensors would give you another meta
// tensor with the output shape and dtype, but wouldn't actually add anything.
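// For example (a sketch using the ATen C++ API):
//
//   auto a = at::empty({2, 3}, at::kMeta);
//   auto b = at::empty({2, 3}, at::kMeta);
//   auto c = a + b;  // runs no computation; c is a meta tensor of shape (2, 3)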
InvalidBit = 0,
#define DEFINE_BACKEND_COMPONENT(n, _) n##Bit,
C10_FORALL_BACKEND_COMPONENTS(DEFINE_BACKEND_COMPONENT, unused)
#undef DEFINE_BACKEND_COMPONENT
// Define an alias to represent end of backend dispatch keys.
// If you add new backend keys after PrivateUse3, please also update it here.
EndOfBackendKeys = PrivateUse3Bit,
};
// Semantically, a dispatch key identifies a possible "level" in our
// dispatch, for which a handler may be registered. Each handler corresponds
// to a type of functionality.
//
// In implementation terms, the dispatch key identifies a specific "bit" in a
// DispatchKeySet. Higher bit indexes get handled by dispatching first (because
// we "count leading zeros" when we extract the highest priority dispatch
// key.)
//
// Note [DispatchKey Classification]
// This enum actually contains several types of keys, which are explained
// in more detail further down:
// (1) non-customizable backends (e.g. FPGA)
// (2) non-customizable functionalities (e.g. Functionalize)
// (3) functionalities that are customizable per backend
//     (e.g. Dense, Sparse, AutogradFunctionality)
// (4) per-backend instances of customizable functionalities
//     (e.g. CPU, SparseCPU, AutogradCPU)
// (5) alias keys (e.g. CompositeImplicitAutograd)
//
// Of the categories above, it's important to note:
// (a) which keys are assigned individual bits in a DispatchKeySet
// (b) which keys are assigned individual slots in the runtime operator table
// ("Runtime keys")
//
// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet.
// (1), (2) and (4) all get their own dedicated slots in the runtime operator
// table.
// See Note [DispatchKeySet Internal Representation] for more details.
//
// NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py
enum class DispatchKey : uint16_t {
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
// This is not a "real" functionality, but it exists to give us a "nullopt"
// element we can return for cases when a DispatchKeySet contains no elements.
// You can think of a more semantically accurate definition of DispatchKey as:
//
// using DispatchKey = optional<RealDispatchKey>
//
// and Undefined == nullopt. We didn't actually represent
// it this way because optional<RealDispatchKey> would take two
// words, when DispatchKey fits in sixteen bits.
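// For example (a sketch), an empty DispatchKeySet reports Undefined as its
// highest-priority key:
//
//   c10::DispatchKeySet ks;
//   TORCH_INTERNAL_ASSERT(ks.highestPriorityTypeId() == DispatchKey::Undefined);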
Undefined = 0,
// Define an alias for Undefined to represent CatchAll (long term
// this will get eliminated, but for now it's convenient)
CatchAll = Undefined,
// ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ //
// Every value in the enum (up to EndOfFunctionalityKeys)
// corresponds to an individual "functionality" that can be dispatched to.
// This is represented in the DispatchKeySet by assigning each of these enum
// values to one of the remaining (64 - len(BackendComponent)) bits.
//
// Most of these functionalities have a single handler assigned to them,
// making them "runtime keys" that map to a single slot in the runtime
// operator table.
//
// A few functionalities are allowed to be customizable per backend.
// See [Note: Per-Backend Functionality Dispatch Keys] for details.
// See [Note: Per-Backend Functionality Dispatch Keys]
Dense,
// Below are non-extensible backends.
// These are backends that currently don't have their own overrides for
// Autograd/Sparse/Quantized kernels, and we therefore don't waste space in
// the runtime operator table allocating slots for them.
// If any of these backends ever need to customize, e.g., Autograd, then we'll
// need to add a DispatchKey::*Bit for them.
// TODO: put this in BackendComponents
FPGA, // Xilinx support lives out of tree at
// https://gitlab.com/pytorch-complex/vitis_kernels
// TODO: put this in BackendComponents
// ONNX Runtime, lives out of tree at https://github.com/pytorch/ort and
// https://github.com/microsoft/onnxruntime, and is also used to test general
// backend/extension machinery in the core. cf:
// - test/cpp_extensions/ort_extension.cpp
// - test/test_torch.py
// - aten/src/ATen/test/extension_backend_test.cpp
ORT,
Vulkan, // TODO: put this in BackendComponents
Metal, // TODO: put this in BackendComponents
// See [Note: Per-Backend Functionality Dispatch Keys]
Quantized,
// This key exists to support custom RNGs; it lets you dispatch
// to a different kernel if you pass in a generator that is not a
// traditional CPUGeneratorImpl/CUDAGeneratorImpl. To make use of this
// key:
// 1) set it as a second parameter of at::Generator constructor call in
// the user-defined PRNG class.
// 2) use it as a dispatch key while registering custom kernels
// (templatized kernels specialized for user-defined PRNG class)
// intended for out of tree use; tested by aten/src/ATen/test/rng_test.cpp
CustomRNGKeyId,
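// For illustration, a sketch of the two steps above (MyGeneratorImpl and
// my_random_ are hypothetical; see rng_test.cpp for the real pattern):
//
//   // (1) the user-defined PRNG passes the key to its GeneratorImpl base:
//   struct MyGeneratorImpl : public c10::GeneratorImpl {
//     MyGeneratorImpl()
//         : GeneratorImpl(
//               at::Device(at::kCPU),
//               c10::DispatchKeySet(c10::DispatchKey::CustomRNGKeyId)) {}
//     // ...
//   };
//   // (2) custom kernels are registered under the same key:
//   TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) {
//     m.impl("random_", &my_random_);
//   }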
// TODO: Make Mkldnn a functionality key, so we can give it Meta
// support
// Here are backends which specify more specialized operators
// based on the layout of the tensor. Note that the sparse backends
// are one case where ordering matters: sparse multi-dispatches with
// the corresponding dense tensors, and must be handled before them.
MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp
// NB: not to be confused with MKLDNN, which is Caffe2 only
// See [Note: Per-Backend Functionality Dispatch Keys]
Sparse,
// TODO: Make SparseCsr a functionality key
SparseCsrCPU,
SparseCsrCUDA,
NestedTensor,
// In some situations, it is not immediately obvious what the correct
// backend for a function is, because the function in question doesn't
// have any "tensor" arguments. In this case, a BackendSelect function
// can be registered to implement the custom determination of the
// correct backend.
BackendSelect,
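// For intuition, a BackendSelect kernel for a factory op might look roughly
// like this (a sketch; the real kernels are code-generated):
//
//   at::Tensor empty_backend_select(
//       at::IntArrayRef size, at::TensorOptions options) {
//     // No tensor arguments, so compute the key from the requested options,
//     auto key = c10::computeDispatchKey(
//         options.dtype_opt(), options.layout_opt(), options.device_opt());
//     // ... then redispatch to that backend's kernel below BackendSelect.
//   }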
Python,
// Out-of-core key for Fake Tensor in torchdistx.
// See https://pytorch.org/torchdistx/latest/fake_tensor.html
// TODO: delete this in favor of Python-implemented fake tensor
Fake,
// See Note [Out-of-tree vmap+grad prototype]. The purpose of this key
// is to insert code after the "autograd subsystem" runs, so this key should
// be directly after ADInplaceOrView and all of the autograd keys.
FuncTorchDynamicLayerBackMode,
// Alias and mutation removal.
// If some backends want to opt into only alias removal or only mutation
// removal, we can consider adding separate keys dedicated to those
// individual passes.
// See Note [Functionalization Pass In Core] for details.
Functionalize,
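// For intuition, functionalization rewrites aliasing/mutating programs like:
//
//   auto b = a.view({-1});
//   b.add_(1);               // mutates a through an alias
//
// into a purely functional form, roughly (a sketch):
//
//   auto b = a.view_copy({-1});
//   b = b.add(1);
//   // ... with the mutation later "replayed" back onto a as needed.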
// The named dispatch key is set for any tensors with named dimensions.
// Although we have a dispatch key for named tensors, for historical reasons,
// this dispatch key doesn't do any of the substantive functionality for named
// tensors (though, hypothetically, it could!). At the moment, it's just
// responsible for letting us give good error messages when operations
// don't support named tensors.
//
// NB: If you ever consider moving named tensor functionality into
// this dispatch key, note that it might be necessary to add another dispatch
// key that triggers before composite operators, in case a composite operator
// has named dimension propagation that doesn't match that of its
// constituent parts.
// TODO: delete this once torchdim lands in functorch
Named,
// The Conjugate dispatch key is set for any tensors that need to perform
// conjugation.
// This is implemented at a dispatch level right before any backends run.
Conjugate,
// The Negative dispatch key is set for any tensors that need to perform
// negation.
// This is implemented at a dispatch level right before any backends run.
Negative,
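// For example (a sketch): conj() is constant-time because it only sets the
// conjugate bit, and the Conjugate key materializes it on the next op:
//
//   auto z = at::rand({2}, at::kComplexFloat);
//   auto zc = z.conj();      // O(1): flips the conjugate bit on a view
//   auto out = zc.mul(2.0);  // Conjugate kernel materializes, then redispatches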
ZeroTensor, // registered at build/aten/src/ATen/RegisterZeroTensor.cpp
// Note [ADInplaceOrView key]
// ADInplaceOrView key is used by inplace or view ops to register a kernel
// that does additional setup for future autograd computation.
//
// 1. For inplace ops this kernel does a version bump
// 2. For view ops this kernel does `as_view` setup where we properly set up
// DifferentiableViewMeta on the view tensors.
//
// For other ops it's fallthrough kernel since there's no extra
// work to do.
//
// Note [Dream: skip VariableType kernel when requires_grad=false]
//
// In an ideal world where we can skip VariableType kernel for inputs
// with requires_grad=false, instead of a fallthrough kernel, we'll
// register a kernel shown below to all functional ops as well:
// torch::Tensor my_functional_op(...) {
// {
// // Note for every op in VariableType, you need to go through
// // `AutoDispatchBelowADInplaceOrView` guard exactly once to add the
// // key to TLS excluded set. If you don't go through it at all,
// // inplace/view ops called through `at::` inside your backend
// // kernel will dispatch to ADInplaceOrView kernels and do a lot
// // of extra work.
// at::AutoDispatchBelowADInplaceOrView guard;
// return at::redispatch::my_functional_op(...);
// }
// }
// But this work is currently blocked since it adds an extra dispatch
// for all ops and its overhead is non-trivial at the model level (a few
// percent). Thus our current approach takes advantage of the fact that every
// kernel goes through the VariableType kernel first, and pulls the
// `at::AutoDispatchBelowADInplaceOrView` guard of functional ops
// up to the `VariableType` kernel. Thus we only add the extra dispatch
// to view/inplace ops to minimize its perf impact on real models.
ADInplaceOrView,
// Note [Alias Dispatch Key : Autograd]
// All backends are oblivious to autograd; autograd is handled as a
// layer which happens on top of all backends. It inspects the autograd
// metadata of all inputs, determines what autograd metadata should be
// constructed by the output, and otherwise defers to the backend to
// actually do the numeric computation. Autograd contains
// the bulk of this logic.
// Autograd is now an alias dispatch key which by default maps to all
// backend-specific autograd keys.
// Backend-specific keys allow backends to override the default kernel
// registered to the Autograd key as needed.
// For example, XLA wants to define autograd for einsum directly.
// Registering a custom autograd implementation at the XLA key won't work
// because we process Autograd before XLA: the Autograd key has higher
// priority and gets processed first. Instead, register the implementation at
// the AutogradXLA key. You generally should NOT redispatch after handling
// autograd here (since that would result in execution of the Autograd
// operator, which you're trying to skip). In AutogradXLA implementations,
// you are responsible for handling autograd yourself, or deferring to other
// operators which support autograd.
// Currently we only have backend-specific autograd keys for CPU/CUDA/XLA and
// reserved user-defined backends. All other in-tree backends share the
// AutogradOther key. We can add specific autograd key for those backends
// upon request.
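// For example, a backend-specific autograd registration might look like this
// (a sketch; xla_einsum_with_autograd is hypothetical):
//
//   TORCH_LIBRARY_IMPL(aten, AutogradXLA, m) {
//     m.impl("einsum", &xla_einsum_with_autograd);
//   }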
AutogradOther,
// See [Note: Per-Backend Functionality Dispatch Keys]
AutogradFunctionality,
// NestedTensor is an example of something that isn't a "real backend"
// (because it mostly consists of redispatching kernels)
// but it would like to override autograd functionality in C++.
// We can handle cases like this by adding an extra functionality key
// exclusively for handling autograd for NestedTensor.
// lives out of tree at
// https://github.com/pytorch/nestedtensor
AutogradNestedTensor,