Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

neilisaac / torch   python

Repository URL to install this package:

/ include / caffe2 / image / image_input_op.h


#ifndef CAFFE2_IMAGE_IMAGE_INPUT_OP_H_
#define CAFFE2_IMAGE_IMAGE_INPUT_OP_H_

#include <opencv2/opencv.hpp>

#include <algorithm>
#include <iostream>

#include "c10/core/thread_pool.h"
#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/image/transform_gpu.h"
#include "caffe2/operators/prefetch_op.h"
#include "caffe2/proto/caffe2_legacy.pb.h"
#include "caffe2/utils/cast.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

class CUDAContext;

// Prefetching image input operator. It derives from PrefetchOperator<Context>
// and holds a db::DBReader pointer together with prefetch tensors for images,
// labels and any additional outputs (each in a plain and an *_on_device_
// variant). All configuration members are filled from operator arguments by
// the constructor.
template <class Context>
class ImageInputOp final : public PrefetchOperator<Context> {
  // Label encodings, selected through the "label_type" operator argument:
  //   SINGLE_LABEL: single integer label for multi-class classification.
  //   MULTI_LABEL_SPARSE: sparse active label indices for multi-label
  //     classification.
  //   MULTI_LABEL_DENSE: dense label embedding vector for label embedding
  //     regression.
  //   MULTI_LABEL_WEIGHTED_SPARSE: sparse active label indices with per-label
  //     weights for multi-label classification.
  //   SINGLE_LABEL_WEIGHTED: single integer label for multi-class
  //     classification with weighted sampling.
  //   EMBEDDING_LABEL: an array of floating numbers representing a dense
  //     embedding. It is useful for model distillation.
  enum LABEL_TYPE {
    SINGLE_LABEL = 0,
    MULTI_LABEL_SPARSE = 1,
    MULTI_LABEL_DENSE = 2,
    MULTI_LABEL_WEIGHTED_SPARSE = 3,
    SINGLE_LABEL_WEIGHTED = 4,
    EMBEDDING_LABEL = 5,
  };

  // Scale jitter modes, selected through the "scale_jitter_type" argument:
  //   NO_SCALE_JITTER: no random scale jitter.
  //   INCEPTION_STYLE: Random crop with size 8% - 100% image area and aspect
  //     ratio in [3/4, 4/3]. Reference: GoogleNet paper
  enum SCALE_JITTER_TYPE {
    NO_SCALE_JITTER = 0,
    INCEPTION_STYLE = 1
    // TODO(zyan3): ResNet-style random scale jitter
  };

 public:
  using OperatorBase::OutputSize;
  using PrefetchOperator<Context>::context_;
  using PrefetchOperator<Context>::prefetch_thread_;
  // Reads every setting from the operator arguments and validates them.
  explicit ImageInputOp(const OperatorDef& operator_def, Workspace* ws);
  // Calls the base class Finalize() from the destructor body, i.e. before any
  // member of this class is destroyed.
  // NOTE(review): presumably this stops the prefetch thread so it cannot touch
  // members during destruction -- confirm against PrefetchOperator.
  ~ImageInputOp() {
    PrefetchOperator<Context>::Finalize();
  }

  // PrefetchOperator hooks; their definitions are not part of this
  // declaration.
  bool Prefetch() override;
  bool CopyPrefetched() override;

 private:
  // Bounding box described by its top-left corner (ymin, xmin) and its extent
  // (height, width). For the default box the constructor sets `valid` to true
  // only when all four values are non-negative.
  using BoundingBox = struct {
    bool valid;
    int ymin;
    int xmin;
    int height;
    int width;
  };

  // Structure to store per-image information
  // This can be modified by the DecodeAnd* so needs
  // to be privatized per launch.
  using PerImageArg = struct { BoundingBox bounding_params; };

  // Helpers operating on one DB value / one batch item. Only the signatures
  // are visible here.
  // NOTE(review): `item_id` looks like the index of the item within the batch
  // and `thread_index` like the index into randgen_per_thread_ -- confirm
  // against the definitions.
  bool GetImageAndLabelAndInfoFromDBValue(
      const string& value,
      cv::Mat* img,
      PerImageArg& info,
      int item_id,
      std::mt19937* randgen);
  void DecodeAndTransform(
      const std::string& value,
      float* image_data,
      int item_id,
      const int channels,
      std::size_t thread_index);
  void DecodeAndTransposeOnly(
      const std::string& value,
      uint8_t* image_data,
      int item_id,
      const int channels,
      std::size_t thread_index);
  bool ApplyTransformOnGPU(
      const std::vector<std::int64_t>& dims,
      const c10::Device& type);

  // Reader owned by this op. The constructor creates it only for the legacy
  // format where the operator has no inputs and the "db" / "db_type"
  // arguments name the database; reader_ then points at it.
  unique_ptr<db::DBReader> owned_reader_;
  // Non-owning reader pointer; initialized to nullptr by the constructor
  // unless the legacy path above is taken.
  const db::DBReader* reader_;
  Tensor prefetched_image_;
  Tensor prefetched_label_;
  vector<Tensor> prefetched_additional_outputs_;
  Tensor prefetched_image_on_device_;
  Tensor prefetched_label_on_device_;
  vector<Tensor> prefetched_additional_outputs_on_device_;
  // Default parameters for images
  PerImageArg default_arg_;
  // "batch_size" argument; the constructor enforces that it is > 0.
  int batch_size_;
  // "label_type" argument (see LABEL_TYPE).
  LABEL_TYPE label_type_;
  // "num_labels" argument; must be > 0 for every label type other than
  // SINGLE_LABEL and SINGLE_LABEL_WEIGHTED.
  int num_labels_;

  // "color" argument (default 1); false means grayscale input.
  bool color_;
  // "color_jitter" argument and its "img_saturation" / "img_brightness" /
  // "img_contrast" parameters (each defaulting to 0.4).
  bool color_jitter_;
  float img_saturation_;
  float img_brightness_;
  float img_contrast_;
  // "color_lighting" argument and its "color_lighting_std" parameter
  // (default 0.1). The eigenvectors / eigenvalues are hard-coded PCA values
  // filled in by the constructor.
  bool color_lighting_;
  float color_lighting_std_;
  std::vector<std::vector<float>> color_lighting_eigvecs_;
  std::vector<float> color_lighting_eigvals_;
  // "scale_jitter_type" argument (see SCALE_JITTER_TYPE).
  SCALE_JITTER_TYPE scale_jitter_type_;
  // "scale" argument (default -1). Exactly one of scale_ and minsize_ must be
  // positive, and that value must be at least crop_.
  int scale_;
  // Minsize is similar to scale except that it will only
  // force the image to scale up if it is too small. In other words,
  // it ensures that both dimensions of the image are at least minsize_
  int minsize_;
  // "warp" argument (default 0).
  bool warp_;
  // "crop" argument; the constructor enforces that it is > 0.
  int crop_;
  // Per-channel mean and standard deviation ("mean_per_channel" /
  // "std_per_channel", falling back to the scalar "mean" / "std" arguments).
  // Both must have the same size, 1 or 3; a single value is extended to three
  // entries by the constructor.
  std::vector<float> mean_;
  std::vector<float> std_;
  Tensor mean_gpu_;
  Tensor std_gpu_;
  // "mirror" argument (default 0).
  bool mirror_;
  // OpSchema::Arg_IsTest argument (default 0).
  bool is_test_;
  // "use_caffe_datum" argument; only allowed with the single-label types and
  // exactly two outputs.
  bool use_caffe_datum_;
  // "use_gpu_transform" argument (default 0).
  bool gpu_transform_;
  // NOTE(review): presumably records whether mean_ / std_ have already been
  // copied into mean_gpu_ / std_gpu_ -- confirm against ApplyTransformOnGPU.
  bool mean_std_copied_ = false;

  // thread pool for parse + decode
  // num_decode_threads_ ("decode_threads", default 4) must stay declared
  // before thread_pool_: members are initialized in declaration order and the
  // pool is constructed from this value.
  int num_decode_threads_;
  // 3 for MULTI_LABEL_WEIGHTED_SPARSE and SINGLE_LABEL_WEIGHTED, 2 otherwise.
  int additional_inputs_offset_;
  // Number of outputs beyond the first two (OutputSize() - 2).
  int additional_inputs_count_;
  // "output_sizes" argument; when empty the constructor uses 1 for every
  // additional output, otherwise one entry per additional output is required.
  std::vector<int> additional_output_sizes_;
  std::shared_ptr<TaskThreadPool> thread_pool_;

  // Output type for GPU transform path
  // (read from the "output_type" argument).
  TensorProto_DataType output_type_;

  // random minsize
  // random_scale_ is the "random_scale" argument, a [min, max] pair that
  // defaults to {-1, -1}. random_scaling_ is true when neither entry is -1,
  // in which case the constructor also sets minsize_ to random_scale_[0].
  vector<int> random_scale_;
  bool random_scaling_;

  // Working variables
  std::vector<std::mt19937> randgen_per_thread_;

  // number of exceptions produced by opencv while reading image data
  std::atomic<long> num_decode_errors_in_batch_{0};
  // opencv exceptions tolerance
  // ("max_decode_error_ratio" argument, default 1.0).
  float max_decode_error_ratio_;
};

template <class Context>
ImageInputOp<Context>::ImageInputOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : PrefetchOperator<Context>(operator_def, ws),
      reader_(nullptr),
      batch_size_(
          OperatorBase::template GetSingleArgument<int>("batch_size", 0)),
      label_type_(static_cast<LABEL_TYPE>(
          OperatorBase::template GetSingleArgument<int>("label_type", 0))),
      num_labels_(
          OperatorBase::template GetSingleArgument<int>("num_labels", 0)),
      color_(OperatorBase::template GetSingleArgument<int>("color", 1)),
      color_jitter_(
          OperatorBase::template GetSingleArgument<int>("color_jitter", 0)),
      img_saturation_(OperatorBase::template GetSingleArgument<float>(
          "img_saturation",
          0.4)),
      img_brightness_(OperatorBase::template GetSingleArgument<float>(
          "img_brightness",
          0.4)),
      img_contrast_(
          OperatorBase::template GetSingleArgument<float>("img_contrast", 0.4)),
      color_lighting_(
          OperatorBase::template GetSingleArgument<int>("color_lighting", 0)),
      color_lighting_std_(OperatorBase::template GetSingleArgument<float>(
          "color_lighting_std",
          0.1)),
      scale_jitter_type_(static_cast<SCALE_JITTER_TYPE>(
          OperatorBase::template GetSingleArgument<int>(
              "scale_jitter_type",
              0))),
      scale_(OperatorBase::template GetSingleArgument<int>("scale", -1)),
      minsize_(OperatorBase::template GetSingleArgument<int>("minsize", -1)),
      warp_(OperatorBase::template GetSingleArgument<int>("warp", 0)),
      crop_(OperatorBase::template GetSingleArgument<int>("crop", -1)),
      mirror_(OperatorBase::template GetSingleArgument<int>("mirror", 0)),
      is_test_(OperatorBase::template GetSingleArgument<int>(
          OpSchema::Arg_IsTest,
          0)),
      use_caffe_datum_(
          OperatorBase::template GetSingleArgument<int>("use_caffe_datum", 0)),
      gpu_transform_(OperatorBase::template GetSingleArgument<int>(
          "use_gpu_transform",
          0)),
      num_decode_threads_(
          OperatorBase::template GetSingleArgument<int>("decode_threads", 4)),
      additional_output_sizes_(
          OperatorBase::template GetRepeatedArgument<int>("output_sizes", {})),
      thread_pool_(std::make_shared<TaskThreadPool>(num_decode_threads_)),
      // output type only supported with CUDA and use_gpu_transform for now
      output_type_(
          cast::GetCastDataType(ArgumentHelper(operator_def), "output_type")),
      random_scale_(OperatorBase::template GetRepeatedArgument<int>(
          "random_scale",
          {-1, -1})),
      max_decode_error_ratio_(OperatorBase::template GetSingleArgument<float>(
          "max_decode_error_ratio",
          1.0)) {
  if ((random_scale_[0] == -1) || (random_scale_[1] == -1)) {
    random_scaling_ = false;
  } else {
    random_scaling_ = true;
    minsize_ = random_scale_[0];
  }

  mean_ = OperatorBase::template GetRepeatedArgument<float>(
      "mean_per_channel",
      {OperatorBase::template GetSingleArgument<float>("mean", 0.)});

  std_ = OperatorBase::template GetRepeatedArgument<float>(
      "std_per_channel",
      {OperatorBase::template GetSingleArgument<float>("std", 1.)});

  if (additional_output_sizes_.size() == 0) {
    additional_output_sizes_ = std::vector<int>(OutputSize() - 2, 1);
  } else {
    CAFFE_ENFORCE(
        additional_output_sizes_.size() == OutputSize() - 2,
        "If the output sizes are specified, they must be specified for all "
        "additional outputs");
  }
  additional_inputs_count_ = OutputSize() - 2;

  default_arg_.bounding_params = {
      false,
      OperatorBase::template GetSingleArgument<int>("bounding_ymin", -1),
      OperatorBase::template GetSingleArgument<int>("bounding_xmin", -1),
      OperatorBase::template GetSingleArgument<int>("bounding_height", -1),
      OperatorBase::template GetSingleArgument<int>("bounding_width", -1),
  };

  if (operator_def.input_size() == 0) {
    LOG(ERROR) << "You are using an old ImageInputOp format that creates "
                  "a local db reader. Consider moving to the new style "
                  "that takes in a DBReader blob instead.";
    string db_name = OperatorBase::template GetSingleArgument<string>("db", "");
    CAFFE_ENFORCE_GT(db_name.size(), 0, "Must specify a db name.");
    owned_reader_.reset(new db::DBReader(
        OperatorBase::template GetSingleArgument<string>("db_type", "leveldb"),
        db_name));
    reader_ = owned_reader_.get();
  }

  // hard-coded PCA eigenvectors and eigenvalues, based on RBG channel order
  color_lighting_eigvecs_.push_back(
      std::vector<float>{-144.7125f, 183.396f, 102.2295f});
  color_lighting_eigvecs_.push_back(
      std::vector<float>{-148.104f, -1.1475f, -207.57f});
  color_lighting_eigvecs_.push_back(
      std::vector<float>{-148.818f, -177.174f, 107.1765f});

  color_lighting_eigvals_ = std::vector<float>{0.2175f, 0.0188f, 0.0045f};

  CAFFE_ENFORCE_GT(batch_size_, 0, "Batch size should be nonnegative.");
  if (use_caffe_datum_) {
    CAFFE_ENFORCE(
        label_type_ == SINGLE_LABEL || label_type_ == SINGLE_LABEL_WEIGHTED,
        "Caffe datum only supports single integer label");
  }
  if (label_type_ != SINGLE_LABEL && label_type_ != SINGLE_LABEL_WEIGHTED) {
    CAFFE_ENFORCE_GT(
        num_labels_,
        0,
        "Number of labels must be set for using either sparse label indices or dense label embedding.");
  }
  if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE ||
      label_type_ == SINGLE_LABEL_WEIGHTED) {
    additional_inputs_offset_ = 3;
  } else {
    additional_inputs_offset_ = 2;
  }
  CAFFE_ENFORCE(
      (scale_ > 0) != (minsize_ > 0),
      "Must provide one and only one of scaling or minsize");
  CAFFE_ENFORCE_GT(crop_, 0, "Must provide the cropping value.");
  CAFFE_ENFORCE_GE(
      scale_ > 0 ? scale_ : minsize_,
      crop_,
      "The scale/minsize value must be no smaller than the crop value.");

  CAFFE_ENFORCE_EQ(
      mean_.size(),
      std_.size(),
      "The mean and std. dev vectors must be of the same size.");
  CAFFE_ENFORCE(
      mean_.size() == 1 || mean_.size() == 3,
      "The mean and std. dev vectors must be of size 1 or 3");
  CAFFE_ENFORCE(
      !use_caffe_datum_ || OutputSize() == 2,
      "There can only be 2 outputs if the Caffe datum format is used");

  CAFFE_ENFORCE(
      random_scale_.size() == 2, "Must provide [scale_min, scale_max]");
  CAFFE_ENFORCE_GE(
      random_scale_[1],
      random_scale_[0],
      "random scale must provide a range [min, max]");

  if (default_arg_.bounding_params.ymin < 0 ||
      default_arg_.bounding_params.xmin < 0 ||
      default_arg_.bounding_params.height < 0 ||
      default_arg_.bounding_params.width < 0) {
    default_arg_.bounding_params.valid = false;
  } else {
    default_arg_.bounding_params.valid = true;
  }

  if (mean_.size() == 1) {
    // We are going to extend to 3 using the first value
    mean_.resize(3, mean_[0]);
    std_.resize(3, std_[0]);
  }

  LOG(INFO) << "Creating an image input op with the following setting: ";
  LOG(INFO) << "    Using " << num_decode_threads_ << " CPU threads;";
  if (gpu_transform_) {
    LOG(INFO) << "    Performing transformation on GPU";
  }
  LOG(INFO) << "    Outputting in batches of " << batch_size_ << " images;";
  LOG(INFO) << "    Treating input image as "
            << (color_ ? "color " : "grayscale ") << "image;";
  if (default_arg_.bounding_params.valid) {
Loading ...