#pragma once
#include <ATen/core/ivalue.h>
#include <pybind11/pybind11.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
namespace py = pybind11;
namespace torch {
namespace throughput_benchmark {
/**
 * This struct is used to provide the results of a benchmark to the caller.
 * In the future, all additional statistics should be added here.
 */
struct BenchmarkExecutionStats {
float latency_avg_ms{-1};
int64_t num_iters{-1};
};
std::ostream& operator<<(
std::ostream& os,
const BenchmarkExecutionStats& value);
/**
 * Use this struct to configure a throughput benchmark run.
 * This struct should include parameters related to threading, batching, number
 * of iterations, warm-up, etc. More configs can be added as needed.
 * The general rule here is that only things that C++ must(!) be aware of
 * should live here. If we can keep other parts in Python, we should keep them
 * there. This is typical for things that are not perf critical and don't
 * affect the execution statistics the benchmark returns.
 */
struct BenchmarkConfig {
public:
// Calling threads are those threads that are calling into a module in
// parallel.
int num_calling_threads{1};
// Worker threads are not supported yet. This is just a placeholder indicating
// that we plan to support some form of multi-threaded forward calls. We may
// change this setting in the future to support different intra- and inter-op
// parallelism, which is not available in PyTorch yet.
int num_worker_threads{1};
// Warmup iterations are used to make sure we run a module a few times before
// actually measuring things. This way we avoid cold caches and any other
// similar problems.
int num_warmup_iters{1};
// Number of iterations the benchmark should run for. This number is separate
// from the warmup iterations.
int64_t num_iters{100};
// If set, the autograd profiler will be enabled, i.e. the following guard
// would be created before the main benchmark loop (but after the warmup):
// RecordProfile guard(profiler_output_path);
std::string profiler_output_path{""};
};
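// A minimal configuration sketch (illustrative only; the values below are
// assumptions for the example, not recommended defaults):
//
//   BenchmarkConfig config;
//   config.num_calling_threads = 4;  // emulate 4 concurrent callers
//   config.num_warmup_iters = 10;    // run a few iterations before measuring
//   config.num_iters = 1000;         // measured iterations
//   // Optionally enable the autograd profiler (hypothetical output path):
//   // config.profiler_output_path = "/tmp/throughput_trace";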
namespace detail {
/**
 * A helper class that abstracts over the different kinds of models whose
 * throughput we test.
 */
template <class Input, class Output, class Model>
class BenchmarkHelper {
public:
BenchmarkHelper();
// NOLINTNEXTLINE(modernize-pass-by-value)
explicit BenchmarkHelper(Model model) : model_(model), initialized_(true) {}
// This method is to be used by the benchmark() method.
// Note that there is no result. This way we don't have to call this under the
// GIL even when running in nn.Module mode. Otherwise the destructor of the
// result would race with Python.
void runOnce(Input&&) const;
// This method is to be used when calling from Python directly
Output runOnce(py::args&&, py::kwargs&&) const;
// Aggregate inputs in the format the Model expects in order to avoid further
// conversions at benchmark time
void addInput(py::args&&, py::kwargs&&);
void addInput(Input&&);
BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const;
bool initialized() const {
return initialized_;
}
// Destructor doesn't require the GIL because it is going to be executed on
// the Python thread
std::vector<Input> inputs_;
Model model_;
bool initialized_{false};
};
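// A rough sketch of how benchmark() can drive this helper (illustrative
// pseudocode, not the actual implementation; `helper` and the loop below are
// assumptions for the example). Each calling thread clones an input and
// invokes the GIL-free runOnce() overload:
//
//   for (int64_t i = 0; i < config.num_iters; ++i) {
//     // cloneInput (declared below) gives each thread its own copy
//     helper.runOnce(cloneInput(helper.inputs_[i % helper.inputs_.size()]));
//   }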
struct C10_HIDDEN ModuleInput {
ModuleInput(ModuleInput&& other) = default;
ModuleInput(const ModuleInput&) = delete;
ModuleInput& operator=(const ModuleInput& other) = delete;
ModuleInput& operator=(ModuleInput&& other) = delete;
ModuleInput(py::args&& args, py::kwargs&& kwargs)
: args(std::move(args)), kwargs(std::move(kwargs)) {}
py::args args;
py::kwargs kwargs;
};
typedef py::object ModuleOutput;
typedef std::vector<at::IValue> ScriptModuleInput;
typedef at::IValue ScriptModuleOutput;
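// Clones a single input example. The benchmark uses this so that each calling
// thread can run on its own copy of the inputs rather than sharing mutable
// state across threads (see the sketch after BenchmarkHelper above).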
template <class Input>
Input cloneInput(const Input& input);
typedef BenchmarkHelper<ScriptModuleInput, at::IValue, jit::Module>
ScriptModuleBenchmark;
template <>
inline BenchmarkHelper<ScriptModuleInput, at::IValue, jit::Module>::
BenchmarkHelper()
: model_("Module", std::make_shared<jit::CompilationUnit>()),
initialized_(false) {}
typedef BenchmarkHelper<ModuleInput, py::object, py::object> ModuleBenchmark;
template <>
inline BenchmarkHelper<ModuleInput, py::object, py::object>::BenchmarkHelper()
: initialized_(false) {}
template <>
void ScriptModuleBenchmark::runOnce(ScriptModuleInput&& input) const;
template <>
ScriptModuleOutput ScriptModuleBenchmark::runOnce(
py::args&& args,
py::kwargs&& kwargs) const;
template <>
void ModuleBenchmark::runOnce(ModuleInput&& input) const;
template <>
ModuleOutput ModuleBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs)
const;
template <>
void ScriptModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs);
template <>
void ScriptModuleBenchmark::addInput(ScriptModuleInput&& input);
template <>
void ModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs);
} // namespace detail
/**
 * This class is a small C++ component responsible for executing a PyTorch
 * module under an inference-server-like load. It can emulate multiple calling
 * threads for a single provided module. In the future we plan to enhance this
 * component to support inter- and intra-op parallelism as well as multiple
 * models running in a single process.
 *
 * For currently available configurations refer to the BenchmarkConfig
 * documentation.
 *
 * The class supports working with either an nn.Module or a ScriptModule.
 * Under the hood it just dispatches to the corresponding specialization of
 * class BenchmarkHelper<Input, Output, Model>.
 */
class C10_HIDDEN ThroughputBenchmark {
public:
explicit ThroughputBenchmark(jit::Module module);
explicit ThroughputBenchmark(py::object module);
// Add one more input example. This input example should be in the exact
// format the module under test expects. It is the responsibility of the module
// to perform any such format checks; the benchmark doesn't perform any
// validation of its own
void addInput(py::args args, py::kwargs kwargs);
// Equivalent to just running the model directly on the given input
py::object runOnce(py::args&& args, py::kwargs&& kwargs);
// The main method of the class: it performs a multi-threaded benchmark.
// It returns a BenchmarkExecutionStats object with a lot of useful statistics
// about runtime execution. We can enhance this class in the future to provide
// more information to the user
BenchmarkExecutionStats benchmark(const BenchmarkConfig& config) const;
private:
detail::ScriptModuleBenchmark script_module_;
detail::ModuleBenchmark module_;
};
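// A minimal end-to-end sketch (illustrative; the model path and config values
// below are assumptions for the example, not part of this API's contract):
//
//   torch::jit::Module m = torch::jit::load("model.pt");  // hypothetical file
//   ThroughputBenchmark bench(m);
//   // Inputs are added as (args, kwargs) pairs; calling this from C++ goes
//   // through pybind11 and therefore requires holding the GIL:
//   //   bench.addInput(std::move(args), std::move(kwargs));
//   BenchmarkConfig config;
//   config.num_calling_threads = 4;
//   auto stats = bench.benchmark(config);
//   std::cout << stats << std::endl;  // uses operator<< declared above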
} // namespace throughput_benchmark
} // namespace torch
#include <torch/csrc/utils/throughput_benchmark-inl.h>