#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <iostream>
#include "c10/core/thread_pool.h"
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/image/transform_gpu.h"
#include "caffe2/operators/prefetch_op.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/cast.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Forward declaration of the CUDA context type. The complete type is
// presumably supplied by a CUDA-specific header wherever it is actually
// needed -- TODO confirm.
class CUDAContext;
// ImageInputOp: a prefetching operator (derives from PrefetchOperator<Context>)
// that is configured entirely through operator arguments read in its
// constructor. It keeps prefetched image / label / additional-output tensors,
// the image transformation settings (scale, crop, mirror, color jitter,
// mean/std) and a thread pool used for parsing and decoding.
//
// NOTE(review): data members are initialized in declaration order, and the
// constructor's initializer list relies on it (thread_pool_ is built from
// num_decode_threads_). Do not reorder members without checking the
// constructor.
template <class Context>
class ImageInputOp final : public PrefetchOperator<Context> {
// Kind of label stored with each image (operator argument "label_type").
enum LABEL_TYPE {
// Single integer label for multi-class classification.
SINGLE_LABEL = 0,
// Sparse active label indices for multi-label classification.
MULTI_LABEL_SPARSE = 1,
// Dense label embedding vector for label embedding regression.
MULTI_LABEL_DENSE = 2,
// Sparse active label indices with per-label weights for multi-label
// classification.
MULTI_LABEL_WEIGHTED_SPARSE = 3,
// Single integer label for multi-class classification with weighted
// sampling.
SINGLE_LABEL_WEIGHTED = 4,
// An array of floating numbers representing a dense embedding.
// It is useful for model distillation.
EMBEDDING_LABEL = 5,
};
// Scale jitter mode (operator argument "scale_jitter_type").
// INCEPTION_STYLE: Random crop with size 8% - 100% image area and aspect
// ratio in [3/4, 4/3]. Reference: GoogleNet paper
enum SCALE_JITTER_TYPE {
NO_SCALE_JITTER = 0,
INCEPTION_STYLE = 1
// TODO(zyan3): ResNet-style random scale jitter
};
public:
using OperatorBase::OutputSize;
using PrefetchOperator<Context>::context_;
using PrefetchOperator<Context>::prefetch_thread_;
// Reads all operator arguments from operator_def and validates them
// (defined below the class).
explicit ImageInputOp(const OperatorDef& operator_def, Workspace* ws);
// Calls the base class Finalize() before the members of this class are
// destroyed.
~ImageInputOp() {
PrefetchOperator<Context>::Finalize();
}
// PrefetchOperator hooks; their definitions are not part of this class body.
bool Prefetch() override;
bool CopyPrefetched() override;
private:
// Bounding box description. The constructor fills the default instance from
// the "bounding_ymin" / "bounding_xmin" / "bounding_height" /
// "bounding_width" arguments (each defaulting to -1) and sets `valid` to true
// only when all four values are non-negative.
using BoundingBox = struct {
bool valid;
int ymin;
int xmin;
int height;
int width;
};
// Structure to store per-image information
// This can be modified by the DecodeAnd* so needs
// to be privatized per launch.
using PerImageArg = struct { BoundingBox bounding_params; };
// Declared here, defined elsewhere. Judging by the signature, it extracts the
// image (into *img), label and per-image info for batch entry `item_id` from
// a serialized DB value -- TODO confirm against the definition.
// NOTE(review): std::mt19937 (and std::atomic further down) are used without
// a direct #include of <random> / <atomic> in this header; presumably they
// arrive transitively -- worth confirming.
bool GetImageAndLabelAndInfoFromDBValue(
const string& value,
cv::Mat* img,
PerImageArg& info,
int item_id,
std::mt19937* randgen);
// Declared here, defined elsewhere. Writes float image data for item
// `item_id`; `thread_index` presumably selects the per-thread RNG -- TODO
// confirm.
void DecodeAndTransform(
const std::string& value,
float* image_data,
int item_id,
const int channels,
std::size_t thread_index);
// Declared here, defined elsewhere. Same parameters as DecodeAndTransform but
// writes uint8_t image data.
void DecodeAndTransposeOnly(
const std::string& value,
uint8_t* image_data,
int item_id,
const int channels,
std::size_t thread_index);
// Declared here, defined elsewhere.
bool ApplyTransformOnGPU(
const std::vector<std::int64_t>& dims,
const c10::Device& type);
// DB reader created by the constructor only for the old operator format
// (operator_def has no inputs); reader_ then points at it. Otherwise reader_
// starts as nullptr in the constructor.
unique_ptr<db::DBReader> owned_reader_;
const db::DBReader* reader_;
// Prefetched tensors, plus counterparts whose names suggest they live on the
// target device -- TODO confirm where they are filled.
Tensor prefetched_image_;
Tensor prefetched_label_;
vector<Tensor> prefetched_additional_outputs_;
Tensor prefetched_image_on_device_;
Tensor prefetched_label_on_device_;
vector<Tensor> prefetched_additional_outputs_on_device_;
// Default parameters for images
PerImageArg default_arg_;
// Argument "batch_size" (default 0); the constructor enforces it is > 0.
int batch_size_;
// Argument "label_type" (default 0, i.e. SINGLE_LABEL).
LABEL_TYPE label_type_;
// Argument "num_labels" (default 0); must be > 0 for every label type other
// than SINGLE_LABEL and SINGLE_LABEL_WEIGHTED.
int num_labels_;
// Argument "color" (default 1): treat input as color rather than grayscale.
bool color_;
// Argument "color_jitter" (default 0) and its strength arguments
// "img_saturation", "img_brightness", "img_contrast" (default 0.4 each).
bool color_jitter_;
float img_saturation_;
float img_brightness_;
float img_contrast_;
// Argument "color_lighting" (default 0) and "color_lighting_std"
// (default 0.1). The eigenvectors / eigenvalues are hard-coded PCA values
// filled in by the constructor.
bool color_lighting_;
float color_lighting_std_;
std::vector<std::vector<float>> color_lighting_eigvecs_;
std::vector<float> color_lighting_eigvals_;
// Argument "scale_jitter_type" (default 0, i.e. NO_SCALE_JITTER).
SCALE_JITTER_TYPE scale_jitter_type_;
// Argument "scale" (default -1). Exactly one of scale_ and minsize_ must be
// positive, and that value must be no smaller than crop_.
int scale_;
// Minsize is similar to scale except that it will only
// force the image to scale up if it is too small. In other words,
// it ensures that both dimensions of the image are at least minsize_
// Argument "minsize" (default -1); overwritten with random_scale_[0] when
// random scaling is enabled.
int minsize_;
// Argument "warp" (default 0).
bool warp_;
// Argument "crop" (default -1); the constructor enforces it is > 0.
int crop_;
// Per-channel mean / std, from "mean_per_channel" / "std_per_channel",
// falling back to the scalar "mean" (default 0) / "std" (default 1). Must
// have equal size of 1 or 3; a size of 1 is extended to 3 by repeating the
// value.
std::vector<float> mean_;
std::vector<float> std_;
// Presumably device-side copies of mean_ / std_ for the GPU transform path,
// with mean_std_copied_ recording whether the copy happened -- TODO confirm.
Tensor mean_gpu_;
Tensor std_gpu_;
// Argument "mirror" (default 0).
bool mirror_;
// Argument OpSchema::Arg_IsTest (default 0).
bool is_test_;
// Argument "use_caffe_datum" (default 0). When set, only SINGLE_LABEL or
// SINGLE_LABEL_WEIGHTED is accepted and the operator must have exactly 2
// outputs.
bool use_caffe_datum_;
// Argument "use_gpu_transform" (default 0).
bool gpu_transform_;
bool mean_std_copied_ = false;
// thread pool for parse + decode
// num_decode_threads_ comes from argument "decode_threads" (default 4) and
// must stay declared before thread_pool_, which is constructed from it.
int num_decode_threads_;
// Set by the constructor to 3 for MULTI_LABEL_WEIGHTED_SPARSE and
// SINGLE_LABEL_WEIGHTED, otherwise 2.
int additional_inputs_offset_;
// Set by the constructor to OutputSize() - 2.
int additional_inputs_count_;
// Argument "output_sizes"; when empty it defaults to 1 for each of the
// OutputSize() - 2 additional outputs, otherwise it must have exactly that
// many entries.
std::vector<int> additional_output_sizes_;
std::shared_ptr<TaskThreadPool> thread_pool_;
// Output type for GPU transform path
// Read from argument "output_type" via cast::GetCastDataType.
TensorProto_DataType output_type_;
// random minsize
// Argument "random_scale" as [scale_min, scale_max] (default {-1, -1}).
// random_scaling_ is false when either entry is -1, true otherwise.
vector<int> random_scale_;
bool random_scaling_;
// Working variables
// Presumably one random generator per decode thread -- TODO confirm.
std::vector<std::mt19937> randgen_per_thread_;
// number of exceptions produced by opencv while reading image data
std::atomic<long> num_decode_errors_in_batch_{0};
// opencv exceptions tolerance
// Argument "max_decode_error_ratio" (default 1.0).
float max_decode_error_ratio_;
};
template <class Context>
ImageInputOp<Context>::ImageInputOp(
const OperatorDef& operator_def,
Workspace* ws)
: PrefetchOperator<Context>(operator_def, ws),
reader_(nullptr),
batch_size_(
OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
label_type_(static_cast<LABEL_TYPE>(
OperatorBase::template GetSingleArgument<int>("label_type", 0))),
num_labels_(
OperatorBase::template GetSingleArgument<int>("num_labels", 0)),
color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
color_jitter_(
OperatorBase::template GetSingleArgument<int>("color_jitter", 0)),
img_saturation_(OperatorBase::template GetSingleArgument<float>(
"img_saturation",
0.4)),
img_brightness_(OperatorBase::template GetSingleArgument<float>(
"img_brightness",
0.4)),
img_contrast_(
OperatorBase::template GetSingleArgument<float>("img_contrast", 0.4)),
color_lighting_(
OperatorBase::template GetSingleArgument<int>("color_lighting", 0)),
color_lighting_std_(OperatorBase::template GetSingleArgument<float>(
"color_lighting_std",
0.1)),
scale_jitter_type_(static_cast<SCALE_JITTER_TYPE>(
OperatorBase::template GetSingleArgument<int>(
"scale_jitter_type",
0))),
scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
minsize_(OperatorBase::template GetSingleArgument<int>("minsize", -1)),
warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)),
is_test_(OperatorBase::template GetSingleArgument<int>(
OpSchema::Arg_IsTest,
0)),
use_caffe_datum_(
OperatorBase::template GetSingleArgument<int>("use_caffe_datum", 0)),
gpu_transform_(OperatorBase::template GetSingleArgument<int>(
"use_gpu_transform",
0)),
num_decode_threads_(
OperatorBase::template GetSingleArgument<int>("decode_threads", 4)),
additional_output_sizes_(
OperatorBase::template GetRepeatedArgument<int>("output_sizes", {})),
thread_pool_(std::make_shared<TaskThreadPool>(num_decode_threads_)),
// output type only supported with CUDA and use_gpu_transform for now
output_type_(
cast::GetCastDataType(ArgumentHelper(operator_def), "output_type")),
random_scale_(OperatorBase::template GetRepeatedArgument<int>(
"random_scale",
{-1, -1})),
max_decode_error_ratio_(OperatorBase::template GetSingleArgument<float>(
"max_decode_error_ratio",
1.0)) {
if ((random_scale_[0] == -1) || (random_scale_[1] == -1)) {
random_scaling_ = false;
} else {
random_scaling_ = true;
minsize_ = random_scale_[0];
}
mean_ = OperatorBase::template GetRepeatedArgument<float>(
"mean_per_channel",
{OperatorBase::template GetSingleArgument<float>("mean", 0.)});
std_ = OperatorBase::template GetRepeatedArgument<float>(
"std_per_channel",
{OperatorBase::template GetSingleArgument<float>("std", 1.)});
if (additional_output_sizes_.size() == 0) {
additional_output_sizes_ = std::vector<int>(OutputSize() - 2, 1);
} else {
CAFFE_ENFORCE(
additional_output_sizes_.size() == OutputSize() - 2,
"If the output sizes are specified, they must be specified for all "
"additional outputs");
}
additional_inputs_count_ = OutputSize() - 2;
default_arg_.bounding_params = {
false,
OperatorBase::template GetSingleArgument<int>("bounding_ymin", -1),
OperatorBase::template GetSingleArgument<int>("bounding_xmin", -1),
OperatorBase::template GetSingleArgument<int>("bounding_height", -1),
OperatorBase::template GetSingleArgument<int>("bounding_width", -1),
};
if (operator_def.input_size() == 0) {
LOG(ERROR) << "You are using an old ImageInputOp format that creates "
"a local db reader. Consider moving to the new style "
"that takes in a DBReader blob instead.";
string db_name = OperatorBase::template GetSingleArgument<string>("db", "");
CAFFE_ENFORCE_GT(db_name.size(), 0, "Must specify a db name.");
owned_reader_.reset(new db::DBReader(
OperatorBase::template GetSingleArgument<string>("db_type", "leveldb"),
db_name));
reader_ = owned_reader_.get();
}
// hard-coded PCA eigenvectors and eigenvalues, based on RGB channel order
color_lighting_eigvecs_.push_back(
std::vector<float>{-144.7125f, 183.396f, 102.2295f});
color_lighting_eigvecs_.push_back(
std::vector<float>{-148.104f, -1.1475f, -207.57f});
color_lighting_eigvecs_.push_back(
std::vector<float>{-148.818f, -177.174f, 107.1765f});
color_lighting_eigvals_ = std::vector<float>{0.2175f, 0.0188f, 0.0045f};
CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be nonnegative.");
if (use_caffe_datum_) {
CAFFE_ENFORCE(
label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED,
"Caffe datum only supports single integer label");
}
if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) {
CAFFE_ENFORCE_GT(
num_labels_,
0,
"Number of labels must be set for using either sparse label indices or dense label embedding.");
}
if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE ||
label_type_ == SINGLE_LABEL_WEIGHTED) {
additional_inputs_offset_ = 3;
} else {
additional_inputs_offset_ = 2;
}
CAFFE_ENFORCE(
(scale_ > 0) != (minsize_ > 0),
"Must provide one and only one of scaling or minsize");
CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value.");
CAFFE_ENFORCE_GE(
scale_ > 0 ? scale_ : minsize_,
crop_,
"The scale/minsize value must be no smaller than the crop value.");
CAFFE_ENFORCE_EQ(
mean_.size(),
std_.size(),
"The mean and std. dev vectors must be of the same size.");
CAFFE_ENFORCE(
mean_.size() == 1 || mean_.size() == 3,
"The mean and std. dev vectors must be of size 1 or 3");
CAFFE_ENFORCE(
!use_caffe_datum_ || OutputSize() == 2,
"There can only be 2 outputs if the Caffe datum format is used");
CAFFE_ENFORCE(
random_scale_.size() == 2, "Must provide [scale_min, scale_max]");
CAFFE_ENFORCE_GE(
random_scale_[1],
random_scale_[0],
"random scale must provide a range [min, max]");
if (default_arg_.bounding_params.ymin < 0 ||
default_arg_.bounding_params.xmin < 0 ||
default_arg_.bounding_params.height < 0 ||
default_arg_.bounding_params.width < 0) {
default_arg_.bounding_params.valid = false;
} else {
default_arg_.bounding_params.valid = true;
}
if (mean_.size() == 1) {
// We are going to extend to 3 using the first value
mean_.resize(3, mean_[0]);
std_.resize(3, std_[0]);
}
LOG(INFO) << "Creating an image input op with the following setting: ";
LOG(INFO) << " Using " << num_decode_threads_ << " CPU threads;";
if (gpu_transform_) {
LOG(INFO) << " Performing transformation on GPU";
}
LOG(INFO) << " Outputting in batches of " << batch_size_ << " images;";
LOG(INFO) << " Treating input image as "
<< (color_ ? "color " : "grayscale ") << "image;";
if (default_arg_.bounding_params.valid) {
Loading ...