#pragma once
#include <ATen/core/ivalue.h>
#include <ATen/core/operator_name.h>
#include <c10/macros/Export.h>
#include <c10/util/Optional.h>
#include <c10/util/SmallVector.h>
#include <c10/util/variant.h>
#include <array>
#include <atomic>
#include <cstring>
#include <functional>
#include <memory>
#include <ostream>
#include <string>
#include <utility>
namespace c10 {
class TORCH_API OperatorHandle;
namespace at {
// Kind of record function scope; identifies what sort of range a
// RecordFunction is tracking. The underlying type is uint8_t.
// NOTE(review): the enumerators that the per-scope comments below describe
// are not present in this view of the file; FUNCTION, USER_SCOPE and
// LITE_INTERPRETER are referenced elsewhere in this header.
enum class C10_API_ENUM RecordScope : uint8_t {
// c10/ATen ops, autograd nodes
// Functions/nodes called from the autograd
// TorchScript functions, methods
// Kernel Function dtype Tag
// Torchbind custom class,
// Generic Build Feature
// Kernel Function dtype Tag
// User defined scope (e.g. with record_function())
// Scopes for static runtime, a specialized TorchScript interpreter
// Sentinel: its value is used as the size of per-scope arrays
// (see RecordFunctionCallback::scopes_).
NUM_SCOPES, // must be the last in the list
} // namespace at
// Hash specialization so that at::RecordScope can be used as a key in
// unordered containers (e.g. the std::unordered_set accepted by
// RecordFunctionCallback::scopes()).
namespace std {
template <>
struct hash<at::RecordScope> {
// The hash is simply the enumerator's numeric value.
size_t operator()(const at::RecordScope& sc) const {
return static_cast<std::size_t>(sc);
} // namespace std
namespace at {
// String handle that either borrows a C string supplied by the caller or
// shares ownership of a std::string it was constructed from.
//
// A default-constructed StringView holds no string: str() returns nullptr.
// Comparison and streaming are null-safe: two views without a string compare
// equal, a view without a string never equals one that has a string, and
// streaming a view without a string writes nothing.
struct TORCH_API StringView {
  // Creates a view that refers to no string (str() == nullptr).
  StringView() : StringView(nullptr) {}

  // Borrows str_ptr without copying it. The view does not own the characters,
  // so they must outlive every use of this view and of its copies.
  explicit StringView(const char* str_ptr)
      : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {}

  // Takes ownership of str; copies of the view share the same std::string
  // through a shared_ptr, so the character pointer stays valid for all copies.
  explicit StringView(std::string str)
      : owned_str_ptr_(std::make_shared<std::string>(std::move(str))),
        str_ptr_(owned_str_ptr_->c_str()) {}

  // Returns the viewed C string, or nullptr if the view holds no string.
  const char* str() const {
    return str_ptr_;
  }

  // Writes the viewed string to os. Passing a null `const char*` to
  // operator<< is undefined, so a view without a string writes nothing.
  friend std::ostream& operator<<(std::ostream& os, const StringView& dt) {
    if (const char* text = dt.str()) {
      os << text;
    }
    return os;
  }

  // Compares the viewed strings by content. strcmp must not receive a null
  // pointer, so null views are handled before the character comparison.
  friend bool operator==(const StringView& lhs, const StringView& rhs) {
    const char* left = lhs.str();
    const char* right = rhs.str();
    if (left == right) {
      return true; // same buffer, or both views hold no string
    }
    if (left == nullptr || right == nullptr) {
      return false; // exactly one side holds no string
    }
    return std::strcmp(left, right) == 0;
  }

  friend bool operator!=(const StringView& lhs, const StringView& rhs) {
    return !(lhs == rhs);
  }

  // Keeps the owned string alive; null when the view only borrows.
  // Declared before str_ptr_ because str_ptr_ is initialized from it.
  std::shared_ptr<std::string> owned_str_ptr_;
  // Pointer actually handed out by str(); may be null.
  const char* str_ptr_;
};
// Soft cap on the number of callbacks expected at once; it is used as the
// inline capacity of the SmallVector-based containers in this header.
constexpr std::size_t kSoftLimitCallbacks{4};
// An abstract base class for various observer contexts that can be attached to
// the RecordFunction. Start callbacks return it by
// std::unique_ptr<ObserverContext> and end callbacks receive it as a raw
// ObserverContext* (see RecordFunctionCallback::StartCallback/EndCallback).
struct ObserverContext {
// Virtual so that derived contexts are destroyed correctly when owned through
// std::unique_ptr<ObserverContext>.
virtual ~ObserverContext() = default;
// NOTE(review): no access specifier is visible before this constructor in
// this view of the file — confirm its intended accessibility.
ObserverContext() {}
// Small-vector of callback handles, sized for the common case of at most
// kSoftLimitCallbacks entries.
using CallbackHandles = c10::SmallVector<uint64_t, kSoftLimitCallbacks>;
// Small-vector of the ObserverContext objects owned on behalf of callbacks
// (used for RecordFunction::ctx_).
using ObserverContextList =
    c10::SmallVector<std::unique_ptr<ObserverContext>, kSoftLimitCallbacks>;
// Identifier of a single RecordFunction (see RecordFunction::handle()).
using RecordFunctionHandle = uint64_t;
// Forward declaration: the callback signatures below take RecordFunction by
// reference before the struct itself is defined.
struct RecordFunction;
// PyTorch callbacks/observers API:
/**
 * RecordFunctionCallback represents a pair of callbacks to be used with
 * RecordFunction, members:
 *   start, end - the callbacks to run when entering and exiting the scope;
 *     optionally, the start callback may return an ObserverContext which will
 *     be passed to the end callback, use appropriate constructor accordingly.
 *   needs_inputs - whether the callbacks need the inputs passed from the
 *     observed function/range; NOTE: passing the inputs incurs an additional
 *     overhead;
 *   sampling_probability - if not 1.0, then the callback is
 *     probabilistically sampled to run; NOTE: start and end callbacks always
 *     run as a pair and are sampled together;
 *   scopes - types of scopes to execute the callbacks on (see RecordScope);
 *     passing empty set means the callbacks will be executed for all possible
 *     scope types;
 *   should_run - optional function that returns whether this callback should
 *     run; overwrites the effect of setting sampling_probability
 */
// Bundles a start/end callback pair with the options that control when the
// pair runs and what data it needs. The setters return *this so that calls
// can be chained.
class TORCH_API RecordFunctionCallback {
// Start callback: receives the RecordFunction and returns an owned
// ObserverContext.
using StartCallback =
std::unique_ptr<ObserverContext> (*)(const RecordFunction&);
// End callback: receives the RecordFunction and an ObserverContext pointer.
using EndCallback = void (*)(const RecordFunction&, ObserverContext*);
// This interface supports observers that require passing an ObserverContext
// between start and end callbacks.
// The end callback is optional and defaults to nullptr.
explicit RecordFunctionCallback(
StartCallback start,
EndCallback end = nullptr)
: start_(start), end_(end) {
// Sets whether the callbacks need the inputs of the observed function/range.
RecordFunctionCallback& needsInputs(bool needs_inputs) {
needs_inputs_ = needs_inputs;
return *this;
// Sets whether the callbacks need the outputs of the observed function/range.
RecordFunctionCallback& needsOutputs(bool needs_outputs) {
needs_outputs_ = needs_outputs;
return *this;
// Sets the needs_ids flag.
// NOTE(review): which ids this refers to is not visible in this header.
RecordFunctionCallback& needsIds(bool needs_ids) {
needs_ids_ = needs_ids;
return *this;
// Sets the sampling probability; the accepted range is [0.0, 1.0].
// NOTE(review): the opening of the check that consumes the condition and the
// "Invalid sampling probability" message below is not visible in this view.
RecordFunctionCallback& samplingProb(double sampling_prob) {
sampling_prob >= 0.0 && sampling_prob <= 1.0,
"Invalid sampling probability");
sampling_prob_ = sampling_prob;
return *this;
// Selects the scopes this callback runs for: every scope in the set gets its
// flag in scopes_ set to true.
// NOTE(review): the body of the else branch (empty set) is not visible in
// this view; per the class documentation an empty set means "all scopes".
RecordFunctionCallback& scopes(
const std::unordered_set<RecordScope, std::hash<RecordScope>>& scopes) {
if (!scopes.empty()) {
for (auto sc : scopes) {
scopes_[static_cast<size_t>(sc)] = true;
} else {
return *this;
// Read accessors for the options above.
bool needsInputs() const {
return needs_inputs_;
bool needsOutputs() const {
return needs_outputs_;
bool needsIds() const {
return needs_ids_;
double samplingProb() const {
return sampling_prob_;
// Returns the flag stored for scope sc, i.e. whether the callback is enabled
// for that scope.
bool checkScope(RecordScope sc) const {
return scopes_[(size_t)sc];
StartCallback start() const {
return start_;
EndCallback end() const {
return end_;
// Callback function pointers supplied at construction.
StartCallback start_;
EndCallback end_;
// Defaults to 1.0.
double sampling_prob_ = 1.0;
// One flag per RecordScope value, indexed by the scope's numeric value;
// value-initialized to all false.
std::array<bool, static_cast<size_t>(RecordScope::NUM_SCOPES)> scopes_ = {};
bool needs_inputs_ = false;
bool needs_outputs_ = false;
bool needs_ids_ = false;
// Notes:
// - two types of callbacks are provided: thread local and global
// - thread local callbacks are added/removed only for the given thread
// and are stored locally for each thread and separately from the list
// of the global callbacks
// - global callbacks are stored in a single per process list and are
// invoked by every RecordFunction, in addition to the thread local
// callbacks specific to the given thread
// - we allow the added callbacks to be sampled, by specifying a sampling
// probability for each callback pair, if the start callback is
// not picked to run, the corresponding end callback won't be called
// - a typical use case for the global callbacks is passive monitoring
// in the background (e.g. fleet-wide monitoring), without focusing on
// the specific piece of code
// - in contrast, thread local callbacks are enabled locally, on demand,
// for the specific piece of code (range) and are not sampled
// - a typical use case for thread local callbacks is profiler and code
// execution tracer
// - note, thread local callbacks are automatically propagated with
// ThreadLocalState across JIT continuations and async tasks (at::launch)
// Identifier of a registered callback, as returned by addThreadLocalCallback
// and addGlobalCallback.
using CallbackHandle = uint64_t;
// Handle value 0 is reserved to mean "no valid callback".
constexpr CallbackHandle INVALID_CALLBACK_HANDLE = 0;
// One registered callback together with its handle and its enabled flag.
//
// It is unnecessary to use atomic operations for enabling
// thread-local function callbacks. Moreover, it prevents saving to
// ThreadLocalState because std::atomic is non-copyable.
struct RecordFunctionCallbacksEntry {
  // Takes the callback by rvalue reference and moves it into the entry; a
  // named rvalue-reference parameter is an lvalue, so std::move is required
  // to actually forward it as an rvalue.
  RecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h)
      : callback_(std::move(cb)), handle_(h) {}

  RecordFunctionCallback callback_;
  // Entries start out enabled; a plain bool is used on purpose (see above).
  bool enabled_{true};
  CallbackHandle handle_;
};
// Holds pairs (callbacks, unique_id): a list of RecordFunctionCallbacksEntry
// objects, each carrying a callback, its handle and its enabled flag.
using RecordFunctionCallbacks = std::vector<RecordFunctionCallbacksEntry>;
// Generated by the callback managers to determine which functions to run.
// Carries the selected start/end pairs plus the thread id, scope and data
// requirements that a RecordFunction exposes through its accessors.
struct StepCallbacks {
StepCallbacks() = default;
// Creates an empty set of callbacks for the given thread id and scope.
StepCallbacks(uint64_t thread_id, RecordScope scope)
: thread_id_{thread_id}, scope_{scope} {}
// True when no start/end pair was selected; RecordFunction::isActive() is
// the negation of this.
bool empty() const {
return callbacks_.empty();
// Start/end function pointers of one selected callback.
struct StartEndPair {
RecordFunctionCallback::StartCallback start_;
RecordFunctionCallback::EndCallback end_;
using StartEndPairs = c10::SmallVector<StartEndPair, kSoftLimitCallbacks>;
StartEndPairs callbacks_;
// Defaults: thread id 0, FUNCTION scope, no data requirements.
uint64_t thread_id_{0};
RecordScope scope_{RecordScope::FUNCTION};
bool needs_inputs_{false};
bool needs_outputs_{false};
bool needs_ids_{false};
// Represents one observed range (function call, user scope, ...). It holds
// the StepCallbacks selected for the range, the data exposed to those
// callbacks (name/schema, sequence number, inputs, outputs, ids) and the
// ObserverContext objects created for them. Non-copyable.
struct TORCH_API RecordFunction {
// Constructs a RecordFunction for the given scope; one of the before()
// overloads is expected to be called afterwards:
// scope - record scope that this function tracks
// NOTE(review): this comment previously also described a `pre_sampled`
// parameter (pre-sampling with kLowProb probability); no such parameter
// exists in the constructors visible here.
explicit RecordFunction(RecordScope scope = RecordScope::FUNCTION);
// Constructs a RecordFunction from an already computed set of callbacks.
explicit RecordFunction(StepCallbacks&& step_callbacks);
// Records the inputs (as a non-owning ArrayRef) and then forwards to
// before(fn, current_sequence_nr).
// NOTE(review): the body of the `!isActive()` branch is not visible in this
// view of the file.
template <typename F>
void before(
F fn,
c10::ArrayRef<const c10::IValue> args,
int64_t current_sequence_nr = -1) {
if (!isActive()) {
inputs_ = args;
before(fn, current_sequence_nr);
// Overload taking the inputs as a pointer to a vector; the vector is viewed
// through an ArrayRef built from args->data() and args->size().
// NOTE(review): the call that receives this ArrayRef is not visible in this
// view of the file.
template <typename F>
void before(
F fn,
const std::vector<IValue>* args,
int64_t current_sequence_nr = -1) {
c10::ArrayRef<const c10::IValue>(args->data(), args->size()),
// Destructor calls end callbacks
virtual ~RecordFunction();
RecordFunction(const RecordFunction&) = delete;
RecordFunction& operator=(const RecordFunction&) = delete;
const char* name() const;
// Sequence number passed to before(); -1 if none was given.
int64_t seqNr() const {
return sequence_nr_;
// Returns the recorded inputs (a non-owning view).
// NOTE(review): in debug builds (NDEBUG undefined) a check on inputs_valid_
// with the message below guards this accessor; the opening of that check is
// not visible in this view of the file.
c10::ArrayRef<const IValue> inputs() const {
#ifndef NDEBUG
inputs_valid_, "Called inputs() outside RecordFunction start callback");
return inputs_;
const std::vector<c10::IValue>& outputs() const {
return outputs_;
// Stores the outputs by moving the given vector in.
void setOutputs(std::vector<c10::IValue>&& outputs) {
outputs_ = std::move(outputs);
// Stores the outputs by copying the viewed elements into a new vector.
void setOutputs(c10::ArrayRef<c10::IValue> outputs) {
outputs_ = outputs.vec();
size_t num_inputs() const;
size_t num_outputs() const;
// Retrieves the thread_id that this RecordFunction ran start callbacks with.
// Useful for writing thread safe end callbacks that may be potentially
// executed in a different thread (async ops)
uint64_t threadId() const {
return step_callbacks_.thread_id_;
// For backward functions - thread id of the corresponding forward function,
// or zero otherwise;
// used alongside with sequence number to correlate backward functions with
// the forward ones
uint64_t forwardThreadId() const {
return fwd_thread_id_;
void setForwardThreadId(uint64_t thread_id) {
fwd_thread_id_ = thread_id;
// Scope this RecordFunction was created for.
RecordScope scope() const {
return step_callbacks_.scope_;
// Returns logical thread_id for the current thread
static uint64_t currentThreadId();
// Internal functions, do not use directly;
// used in python's context manager
// before functions initialize RecordFunction members and call
// start callbacks
using schema_ref_t = std::reference_wrapper<const c10::FunctionSchema>;
void before(const char* name, int64_t sequence_nr = -1);
void before(std::string name, int64_t sequence_nr = -1);
void before(schema_ref_t schema, int64_t sequence_nr = -1);
// Sets node ID for distributed profiling
static void setDefaultNodeId(int64_t defaultNodeId);
// Gets node ID for distributed profiling
static int64_t getDefaultNodeId();
// Calls end callbacks. After end(), accessors will no longer provide useful
// results.
void end();
// Internal-only, used only to force an async event for distributed events
// profiling.
void _setAsync();
// Returns whether this RecordFunction corresponds to an async event or not.
bool isAsync() const;
// Internal-only, used to denote out variant used for Static Runtime execution
void _setStaticRuntimeOutVariant();
bool isStaticRuntimeOutVariant() const;
// Unique id of this RecordFunction (see handle_ below).
RecordFunctionHandle handle() const {
return handle_;
c10::optional<OperatorName> operator_name() const;
// This method returns a copy of the FunctionSchema and can be expensive.
c10::optional<FunctionSchema> operator_schema() const;
void setHandle(RecordFunctionHandle handle) {
handle_ = handle;
// Whether this RecordFunction runs any callbacks.
bool isActive() const {
return !step_callbacks_.empty();
// Whether any selected callback asked for inputs / outputs.
bool needsInputs() const {
return step_callbacks_.needs_inputs_;
bool needsOutputs() const {
return step_callbacks_.needs_outputs_;
int64_t debugHandle() const {
return debug_handle_;
void setDebugHandle(int64_t debug_handle) {
debug_handle_ = debug_handle;
// Marks the recorded inputs as no longer valid; the flag only exists in
// debug builds (NDEBUG undefined).
void invalidateInputs() {
#ifndef NDEBUG
inputs_valid_ = false;
void runStartCallbacks();
// Callbacks selected for this range, plus thread id, scope and requirements.
StepCallbacks step_callbacks_;
// In cases when RecordFunction might be active but we chose not to
// use the observers (e.g. operator is not observed), this boolean
// flag is used to check whether the start callbacks were called
bool called_start_callbacks_ = false;
// Debug-only flag backing the inputs() validity check.
#ifndef NDEBUG
bool inputs_valid_ = false;
// Stores various ObserverContext objects with event metadata for callbacks.
ObserverContextList ctx_;
// Either an owned name string or a reference to the function schema.
c10::variant<std::string, schema_ref_t> fn_;
int64_t sequence_nr_ = -1;
// Non-owning view of the inputs; the viewed storage belongs to the caller.
c10::ArrayRef<const IValue> inputs_;
std::vector<c10::IValue> outputs_;
// For backward functions - thread id of the forward function
uint64_t fwd_thread_id_ = 0;
// Unique id for this RecordFunction, used in callbacks to track start
// and end of ranges
RecordFunctionHandle handle_{0};
// Whether this record_function corresponds to an async event or not. Async
// events can complete in different threads or follow a future-like pattern
// of use.
bool is_async_{false};
// Debug handles are used for lazy annotation of module hierarchy
// and callstack.
// This is specifically useful for mobile runtime, where generated
// debug handles can be lazily symbolicated using debug information
int64_t debug_handle_{-1};
// Whether this RecordFunction is used for an out variant run with
// Static Runtime
bool is_static_runtime_out_variant_{false};
// Returns the StepCallbacks to use for a RecordFunction of the given scope.
TORCH_API StepCallbacks getStepCallbacks(RecordScope scope);
// Variant returning an optional.
// NOTE(review): presumably the result is empty (nullopt) when no callback
// would run for this scope — confirm against the implementation.
TORCH_API c10::optional<StepCallbacks> getStepCallbacksUnlessEmpty(
RecordScope scope);
// Helpers behind the RECORD_FUNCTION* macros. They start `guard` for `fn`
// and hand over the inputs only when guard.needsInputs() is true.
// NOTE(review): several lines of these function bodies are not visible in
// this view of the file (the call that receives the ArrayRef in the
// needsInputs() branch, and any use of `debug_handle`).
namespace detail {
// Generic version: `inputs` is any container exposing data() and size(); it
// is viewed as an ArrayRef of IValue without copying. When inputs are not
// needed, before() is called with the remaining arguments only.
template <typename Inputs, typename F, typename... Args>
void record_function_with_scope(
RecordFunction& guard,
F fn,
const Inputs& inputs,
Args&&... args) {
if (guard.needsInputs()) {
c10::ArrayRef<const c10::IValue>(inputs.data(), inputs.size()),
} else {
guard.before(fn, std::forward<Args>(args)...);
// Same as above, with an additional debug handle argument.
template <typename Inputs, typename F, typename... Args>
void record_function_with_scope_and_debug_handle(
RecordFunction& guard,
F fn,
int64_t debug_handle,
const Inputs& inputs,
Args&&... args) {
if (guard.needsInputs()) {
c10::ArrayRef<const c10::IValue>(inputs.data(), inputs.size()),
} else {
guard.before(fn, std::forward<Args>(args)...);
// Overload for inputs that already are an ArrayRef; forwards to the generic
// version with Inputs fixed to c10::ArrayRef<const c10::IValue>.
template <typename F, typename... Args>
void record_function_with_scope(
RecordFunction& guard,
F fn,
c10::ArrayRef<const c10::IValue> inputs,
Args&&... args) {
return record_function_with_scope<
c10::ArrayRef<const c10::IValue>,
Args...>(guard, std::move(fn), inputs, std::forward<Args>(args)...);
// ArrayRef overload of the debug-handle variant; forwards to the generic
// version in the same way.
template <typename F, typename... Args>
void record_function_with_scope_and_debug_handle(
RecordFunction& guard,
F fn,
int64_t debug_handle,
c10::ArrayRef<const c10::IValue> inputs,
Args&&... args) {
return record_function_with_scope_and_debug_handle<
c10::ArrayRef<const c10::IValue>,
guard, std::move(fn), debug_handle, inputs, std::forward<Args>(args)...);
} // namespace detail
// Convenience macros that declare a local at::RecordFunction named `guard`
// and, if it is active, start it for `fn`.
// NOTE(review): several macro lines (some `#define` heads and closing braces)
// are not visible in this view of the file, and most lines below belong to
// backslash continuation chains, so no comments are inserted between them.
// The optional trailing argument is the function's sequence number (seq_no).
#define RECORD_FUNCTION_WITH_SCOPE(scope, fn, inputs, ...) \
at::RecordFunction guard(scope); \
if (guard.isActive()) { \
::at::detail::record_function_with_scope( \
guard, fn, inputs, ##__VA_ARGS__); \
scope, fn, inputs, outputs, ...) \
at::RecordFunction guard(scope); \
if (guard.isActive()) { \
if (guard.needsInputs()) { \
guard.before(fn, inputs, ##__VA_ARGS__); \
} else { \
guard.before(fn, ##__VA_ARGS__); \
} \
if (guard.needsOutputs()) { \
guard.setOutputs(outputs); \
} \
#define RECORD_FUNCTION(fn, inputs, ...) \
at::RecordScope::FUNCTION, fn, inputs, ##__VA_ARGS__)
#define RECORD_FUNCTION_WITH_INPUTS_OUTPUTS(fn, inputs, outputs, ...) \
at::RecordScope::FUNCTION, fn, inputs, outputs, ##__VA_ARGS__)
// Custom user scopes in C++; similar to Python's 'with record_function("..."):'
// (uses RecordScope::USER_SCOPE and passes an empty input list).
#define RECORD_USER_SCOPE(fn) \
at::RecordScope::USER_SCOPE, fn, c10::ArrayRef<const c10::IValue>{})
// RECORD_USER_SCOPE with inputs
#define RECORD_USER_SCOPE_WITH_INPUTS(fn, inputs) \
// Helper macro to pass in debug handle that is used to
// post process events
scope, fn, debug_handle, inputs, ...) \
at::RecordFunction guard(scope); \
if (guard.isActive()) { \
::at::detail::record_function_with_scope_and_debug_handle( \
guard, fn, debug_handle, inputs, ##__VA_ARGS__); \
// Helper macros to record LITE INTERPRETER scope events with debug handles
fn, debug_handle, inputs) \
at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)
// Bookend to the RECORD_FUNCTION macros. Use this after the kernel
// launch to let the profiler bind the outputs to the op that produced
// them. Note that guard is declared by RECORD_FUNCTION so this macro
// needs to be called from the same scope as RECORD_FUNCTION.
// The outputs are copied into a new std::vector<c10::IValue>, and only when
// guard.needsOutputs() is true.
#define RECORD_OUTPUTS(outputs) \
if (guard.needsOutputs()) { \
guard.setOutputs( \
std::vector<c10::IValue>(outputs.begin(), outputs.end())); \
/**
 * addThreadLocalCallback adds a thread local callback to run with
 * RecordFunction, returns handle to use with removeCallback
 */
TORCH_API CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb);
/**
 * hasThreadLocalCallbacks returns whether there are callbacks registered
 * with addThreadLocalCallback
 */
TORCH_API bool hasThreadLocalCallbacks();
/**
 * clearThreadLocalCallbacks removes all thread local callbacks
 */
TORCH_API void clearThreadLocalCallbacks();
/**
 * addGlobalCallback adds a global callback to run with RecordFunction;
 * to be called only during the program initialization
 */
TORCH_API CallbackHandle addGlobalCallback(RecordFunctionCallback cb);
/**
 * removeCallback removes a callback given the handle returned by
 * addThreadLocalCallback or addGlobalCallback;
 * no other code can run simultaneously
 */
TORCH_API void removeCallback(CallbackHandle handle);
/**
 * Prevent the given callback from executing. If handle is invalid,
 * does nothing.
 */
TORCH_API void disableCallback(CallbackHandle handle);
/**
 * Allow the given callback, previously disabled with disableCallback, to
 * execute again. If handle is invalid, does nothing.
 */
TORCH_API void reenableCallback(CallbackHandle handle);
/**
 * hasGlobalCallbacks returns whether there are global callbacks
 * registered with addGlobalCallback
 */
TORCH_API bool hasGlobalCallbacks();
/**
 * clearGlobalCallbacks removes all global callbacks
 */
TORCH_API void clearGlobalCallbacks();
// for both thread local and global callbacks
TORCH_API bool hasCallbacks();
TORCH_API void clearCallbacks();
/**
 * enableRecordFunction enables RecordFunction thread locally
 */
TORCH_API void enableRecordFunction(bool enable = true);
/**
 * isRecordFunctionEnabled returns whether RecordFunction
 * is enabled thread locally
 */
TORCH_API bool isRecordFunctionEnabled();
// Scoped guard around the thread-local RecordFunction enabled flag.
// On construction it captures the current value of isRecordFunctionEnabled()
// in prev_value_.
// NOTE(review): the constructor and destructor bodies are not visible in
// this view of the file; presumably they apply `is_enabled` and restore
// prev_value_ respectively — confirm.
class TORCH_API RecordFunctionGuard {
explicit RecordFunctionGuard(bool is_enabled = true)
: prev_value_(isRecordFunctionEnabled()) {
// Virtual because DisableRecordFunctionGuard derives from this class.
virtual ~RecordFunctionGuard() {
// Enabled state observed when the guard was constructed.
bool prev_value_ = false;
// Convenience guard: a RecordFunctionGuard constructed with
// is_enabled == false.
class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard {
DisableRecordFunctionGuard() : RecordFunctionGuard(false) {}
~DisableRecordFunctionGuard() override = default;
// RecordFunction state kept per thread: the thread local callbacks and the
// thread-local enabled flag.
struct TORCH_API RecordFunctionTLS {
// Thread local vector of callbacks, holds pairs (callbacks, unique_id);
// must be sorted in increasing handles order
RecordFunctionCallbacks sorted_tls_callbacks_;
// RecordFunction is enabled by default.
bool tls_record_function_enabled_ = true;
// Accessors for the RecordFunctionTLS state: the getter returns a const
// reference, the setter takes the state to install by const reference.
// NOTE(review): presumably both operate on the calling thread's state —
// confirm against the implementation.
TORCH_API const RecordFunctionTLS& get_record_function_tls_();
TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls);
// Testing hook that sets a seed.
// NOTE(review): presumably this seeds the generator used for callback
// sampling — confirm against the implementation.
TORCH_API void set_record_function_seed_for_testing(uint32_t seed);
} // namespace at