#pragma once
#include <unordered_map>
#include "onnx/onnx_pb.h"
#include "c10/util/Exception.h"
#include "c10/util/SmallVector.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/onnx/onnxifi_graph_info.h"
#include "caffe2/onnx/onnxifi_init.h"
#include "caffe2/opt/shape_info.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"
namespace caffe2 {
namespace details {
/// Provides slicing info for the outputs. All the vector members should be of
/// the same size as the number of outputs of the Onnxifi op.
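/// `begins` and `ends` hold per-output slice boundaries used when shrinking
/// outputs from the max batch size down to the real batch size; `fast_path`
/// marks outputs whose adjustment can skip the slicing step (inferred from
/// how the op consumes these fields, not stated explicitly here).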
struct OutputReshapeInfo {
std::vector<Tensor> begins;
std::vector<Tensor> ends;
std::vector<bool> fast_path;
};
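/// Shape and type metadata for a single tensor, built from either a
/// TensorProto or a QTensorProto. For quantized tensors, `quantized` is set
/// and the quantization axis, scales and biases are recorded as well.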
struct TensorInfo {
std::vector<uint64_t> dims;
uint64_t onnxifi_type;
bool quantized;
uint32_t quantizationAxis;
uint64_t quantizationParams;
std::vector<float> scales;
std::vector<int32_t> biases;
explicit TensorInfo(const TensorProto& t);
explicit TensorInfo(const QTensorProto& t);
TensorInfo(TensorInfo&&) = default;
TensorInfo& operator=(TensorInfo&&) = default;
};
} // namespace details
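// OnnxifiOp offloads a subgraph, passed in as a serialized ONNX model (or a
// Caffe2 NetDef, see the constructor), to an ONNXIFI backend. The backend and
// graph are created once per "model_id:net_pos" key and cached in a shared
// backend-graph map so that later instantiations of the same op reuse them.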
template <typename Context>
class OnnxifiOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
explicit OnnxifiOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
use_onnx_(this->template GetSingleArgument<int>("use_onnx", 0)),
use_glow_aot_(this->template GetSingleArgument<int>("use_glow_aot", 0)),
max_batch_size_(
this->template GetSingleArgument<int>("max_batch_size", 0)),
max_seq_size_(this->template GetSingleArgument<int>("max_seq_size", 0)),
timeout_(this->template GetSingleArgument<int>("timeout", 0)),
nominal_batch_idx_(
this->template GetSingleArgument<int>("nominal_batch_idx", 0)),
use_passed_output_shapes_(this->template GetSingleArgument<int>("use_passed_output_shapes", 0)),
adjust_quantized_offset_(this->template GetSingleArgument<int>(
"adjust_quantized_offset",
128)) {
lib_ = onnx::initOnnxifiLibrary();
backend_graph_map_ptr_ = onnx::getOnnxBackendGraphMap();
CAFFE_ENFORCE(lib_, "Cannot initialize ONNXIFI library");
auto onnx_model_str =
this->template GetSingleArgument<std::string>("onnx_model", "");
CAFFE_ENFORCE(!onnx_model_str.empty(), "onnx_model cannot be empty");
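// Note: when use_onnx_ is false, the "onnx_model" argument actually carries a
// serialized Caffe2 NetDef; in the Glow AOT case the NetDef is taken from the
// separate "netdef_str" argument instead.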
if (use_glow_aot_) {
auto netdef_str =
this->template GetSingleArgument<std::string>("netdef_str", "");
CAFFE_ENFORCE(ParseProtoFromLargeString(netdef_str, &netdef_));
} else if (!use_onnx_) {
CAFFE_ENFORCE(ParseProtoFromLargeString(onnx_model_str, &netdef_));
}
// Set up input/output descriptor templates
input_names_ =
this->template GetRepeatedArgument<std::string>("input_names");
output_names_ =
this->template GetRepeatedArgument<std::string>("output_names");
CAFFE_ENFORCE_EQ(input_names_.size(), operator_def.input_size());
CAFFE_ENFORCE_EQ(output_names_.size(), operator_def.output_size());
for (const auto& input : input_names_) {
input_desc_.push_back(onnxTensorDescriptorV1());
input_desc_.back().name = input.c_str();
}
all_offsets_.reserve(ws->Blobs().size());
all_scales_.reserve(ws->Blobs().size());
input_shapes_.resize(input_names_.size());
output_shapes_max_bs_.resize(output_names_.size());
quantized_outputs_.resize(output_names_.size(), false);
int output_idx = 0;
ArgumentHelper helper(operator_def);
auto output_shape_info =
helper.GetRepeatedArgument<TensorProto>("output_shape_info");
auto output_qshape_info =
helper.GetRepeatedArgument<QTensorProto>("output_qshape_info");
std::unordered_map<std::string, TensorProto> output_shape_map;
for (const auto& info : output_shape_info) {
output_shape_map.emplace(info.name(), info);
}
std::unordered_map<std::string, QTensorProto> output_qshape_map;
for (const auto& info : output_qshape_info) {
output_qshape_map.emplace(info.name(), info);
}
bool has_quantized_output = false;
for (const auto& output : output_names_) {
output_desc_.push_back(onnxTensorDescriptorV1());
output_desc_.back().name = output.c_str();
// For each output, try to get its size hint
const auto it = output_shape_map.find(output);
if (it != output_shape_map.end()) {
output_shape_hints_.emplace(
output_idx, details::TensorInfo(it->second));
} else {
const auto qit = output_qshape_map.find(output);
if (qit != output_qshape_map.end()) {
output_shape_hints_.emplace(
output_idx, details::TensorInfo(qit->second));
quantized_outputs_[output_idx] = true;
has_quantized_output = true;
}
}
++output_idx;
}
if (!has_quantized_output) {
adjust_quantized_offset_ = 0;
}
LOG(INFO) << "use_onnx_=" << use_onnx_
<< ", use_glow_aot_=" << use_glow_aot_
<< ", use_passed_output_shapes_=" << use_passed_output_shapes_;
if (use_passed_output_shapes_) {
// Populate output_shapes_per_bs_
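// output_shapes_per_bs_[bs] holds one shape per output for every batch size
// bs in [1, max_batch_size_).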
for (int bs = 1; bs < max_batch_size_; ++bs) {
auto output_shapes_tp = helper.GetRepeatedArgument<TensorProto>("output_shapes_bs_" + caffe2::to_string(bs));
auto output_qshapes_tp = helper.GetRepeatedArgument<TensorProto>("output_qshapes_bs_" + caffe2::to_string(bs));
CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size());
std::unordered_map<std::string, details::TensorInfo> name_to_shape;
for (const auto& output_shape_tp : output_shapes_tp) {
name_to_shape.emplace(output_shape_tp.name(), details::TensorInfo{output_shape_tp});
}
for (const auto& output_qshape_tp : output_qshapes_tp) {
name_to_shape.emplace(output_qshape_tp.name(), details::TensorInfo{output_qshape_tp});
}
for (output_idx = 0; output_idx < output_names_.size(); ++output_idx) {
auto it = name_to_shape.find(output_names_[output_idx]);
CAFFE_ENFORCE(it != name_to_shape.end());
output_shapes_per_bs_[bs].push_back({});
auto &output_shapes = output_shapes_per_bs_[bs].back();
std::copy(it->second.dims.cbegin(), it->second.dims.cend(), std::back_inserter(output_shapes));
}
}
}
// Get output resizing hints
adjust_output_batch_ =
this->template GetSingleArgument<int>("adjust_output_batch", 0);
// Encode arguments starting with "custom_" and pass them to the backend
std::vector<uint64_t> property_pointers;
std::vector<int64_t> int_args;
std::vector<float> float_args;
buildPropertyList(operator_def, &property_pointers, &int_args, &float_args);
// Initialize the backend if it has not already been created. When we
// initialize the backend, we get the weights (initializers) from the
// workspace and offload them onto the backend. This should be done only
// once. Subsequent calls of this function with the same model id should
// find a cached backend, so there is no need to repeat the above process.
buildBackendAndGraph(ws, property_pointers, onnx_model_str);
}
~OnnxifiOp() {
backend_graph_shared_ptr_.reset();
backend_graph_map_ptr_->remove(op_id_string_);
#ifdef ONNXIFI_ENABLE_EXT
traces_.reset();
#endif
}
bool RunOnDevice() override;
void setEnableTracing(bool b) {
enable_tracing_ = b;
}
#ifdef ONNXIFI_ENABLE_EXT
std::shared_ptr<onnxTraceEventList> traces() const {
return traces_;
}
#endif
private:
// The second argument is a cache vector to avoid repeated reallocation.
// It exists purely because we use int64_t for c2::tensor dims but uint64_t
// for onnxDescriptor dims, which is not ideal. Maybe we should just use
// int64_t.
void setOutputShapeAndType(
int output_idx,
c10::SmallVector<int64_t, 4>& tensor_dims_int64);
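// The base implementation passes no backend properties (only the
// ONNXIFI_BACKEND_PROPERTY_NONE terminator); the int/float argument lists are
// unused here.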
void buildPropertyList(
const OperatorDef& /* unused */,
std::vector<uint64_t>* property_list,
std::vector<int64_t>* /* unused */,
std::vector<float>* /* unused */) {
property_list->push_back(ONNXIFI_BACKEND_PROPERTY_NONE);
}
void buildBackendAndGraph(
Workspace* ws,
const std::vector<uint64_t>& property_pointers,
const std::string& onnx_model_str) {
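// The cache key for the shared backend graph is "<model_id>:<net_pos>".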
op_id_string_ =
this->template GetSingleArgument<std::string>("model_id", "") + ":" +
this->template GetSingleArgument<std::string>("net_pos", "");
auto initializers =
this->template GetRepeatedArgument<std::string>("initializers");
// Build the Onnxifi engine
auto backend_index =
this->template GetSingleArgument<int>("backend_id", use_onnx_ ? 1 : 0);
// If using Glow AOT, override the backend_id to 1, since it uses a custom
// ONNX format, and that's the id we use for the ONNX backend.
if (use_glow_aot_) {
backend_index = 1;
}
auto creator = [this,
ws,
property_pointers,
backend_index,
&onnx_model_str,
&initializers]() {
std::vector<onnxBackendID> backend_ids;
size_t num_backends{0};
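// Per the ONNXIFI convention, querying with a null buffer fills in the
// backend count and returns ONNXIFI_STATUS_FALLBACK rather than SUCCESS.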
CAFFE_ENFORCE_EQ(
lib_->onnxGetBackendIDs(nullptr, &num_backends),
ONNXIFI_STATUS_FALLBACK);
CAFFE_ENFORCE_GT(
num_backends, 0, "At least 1 onnxifi backend should be available");
CAFFE_ENFORCE_LT(
backend_index,
num_backends,
"Backend idx out of bound: ",
backend_index,
", #backends: ",
num_backends);
backend_ids.resize(num_backends);
CAFFE_ENFORCE_EQ(
lib_->onnxGetBackendIDs(backend_ids.data(), &num_backends),
ONNXIFI_STATUS_SUCCESS);
onnxBackendID backend_id = backend_ids[backend_index];
onnxBackend backend{nullptr};
CAFFE_ENFORCE_EQ(
lib_->onnxInitBackend(backend_id, property_pointers.data(), &backend),
ONNXIFI_STATUS_SUCCESS);
// Release unused backend ids.
for (size_t i = 0; i < num_backends; ++i) {
if (i == backend_index) {
continue;
}
lib_->onnxReleaseBackendID(backend_ids[i]);
}
// Get weights
std::vector<std::string> weight_names;
std::vector<std::vector<uint64_t>> weight_shapes;
auto weight_descs = buildInitializationList(
ws,
initializers,
&weight_names,
&weight_shapes,
&all_scales_,
&all_offsets_);
// Extra weight shapes
std::unordered_map<std::string, ShapeInfo> weight_shape_info;
for (size_t i = 0; i < weight_names.size(); ++i) {
TensorShape shape;
const auto& shape0 = weight_shapes[i];
for (const auto d : shape0) {
shape.add_dims(d);
}
weight_shape_info[weight_names[i]] = ShapeInfo(
std::vector<TensorBoundShape::DimType>(
shape0.size(), TensorBoundShape_DimType_CONSTANT),
std::move(shape));
}
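// If the workspace provides a "__DEFERRED_BLOB_READER__" blob, it is
// forwarded to onnxInitGraph so the backend can read weights from it.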
Blob* deferred_blob_reader = nullptr;
if (ws->HasBlob("__DEFERRED_BLOB_READER__")) {
deferred_blob_reader = ws->GetBlob("__DEFERRED_BLOB_READER__");
}
onnxGraph graph{nullptr};
static const uint64_t auxPropertiesListAOT[] = {
ONNXIFI_OPTIMIZATION_AOT, ONNXIFI_GRAPH_PROPERTY_NONE};
auto ret = lib_->onnxInitGraph(
backend,
use_glow_aot_ ? auxPropertiesListAOT : nullptr,
onnx_model_str.size(),
(const void*)(onnx_model_str.c_str()),
weight_descs.size(),
weight_descs.data(),
&graph,
static_cast<uint32_t>(max_seq_size_),
deferred_blob_reader);
if (ret != ONNXIFI_STATUS_SUCCESS) {
if (ret == ONNXIFI_STATUS_FATAL_ERROR) {
C10_THROW_ERROR(
OnnxfiBackendSystemError, "Fatal error during onnxInitGraph");
} else {
CAFFE_THROW("onnxInitGraph failed");
}
}
return std::make_shared<onnx::BackendGraphInfo>(
backend_id, backend, graph, lib_, std::move(weight_shape_info));
};
backend_graph_shared_ptr_ =
backend_graph_map_ptr_->insert(op_id_string_, creator);
backend_id_ = backend_graph_shared_ptr_->backend_id;
backend_ = backend_graph_shared_ptr_->backend;
graph_ = backend_graph_shared_ptr_->graph;
input_shape_info_ = backend_graph_shared_ptr_->weight_shape_info;
getExtFunctionPointers();
}
/// Set up function pointers if onnxifi_ext is enabled
void getExtFunctionPointers() {
#ifdef ONNXIFI_ENABLE_EXT
union {
onnxExtensionFunctionPointer p;
decltype(onnxSetIOAndRunGraphPointer_) set;
decltype(onnxReleaseTraceEventsPointer_) release;