// include/caffe2/operators/reducer_functors.h


#ifndef CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_
#define CAFFE2_OPERATORS_REDUCER_FUNCTORS_H_

#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <limits>

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

////////////////////////////////////////////////////////////////////////////////
// Range reducers: can exploit the fact that the input segment is contiguous
// and provide a specialized implementation
////////////////////////////////////////////////////////////////////////////////
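
// Data layout shared by the range reducers below: the input holds `blocks`
// consecutive blocks of `block_size` elements each, so element j of block i
// lives at in[i * block_size + j]. A reducer collapses the block dimension
// and writes a single block of `block_size` outputs.
//
// Illustrative example (not part of the library): with block_size = 2,
// blocks = 3 and in = {1, 2, 3, 4, 5, 6}, the Sum reducer produces
// out = {1 + 3 + 5, 2 + 4 + 6} = {9, 12}.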

// Put forward and backward in the same template?
template <typename T, class Context>
class SumRangeReducer;
template <typename T, class Context>
class SumRangeReducerGradient;

template <typename T>
class SumRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    // Do we need to go through the wrapper in math.h?
    EigenVectorMap<T> out_vec(out, block_size);
    out_vec = ConstEigenMatrixMap<T>(in, block_size, blocks).rowwise().sum();
  }
};
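
// The Eigen expression above views `in` as a column-major matrix with
// block_size rows and `blocks` columns (one column per block), so the
// row-wise sum reduces across blocks. A minimal scalar sketch of the same
// computation, shown for reference only:
//
//   for (int64_t j = 0; j < block_size; ++j) {
//     T sum = 0;
//     for (int64_t i = 0; i < blocks; ++i) {
//       sum += in[i * block_size + j];
//     }
//     out[j] = sum;
//   }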

template <typename T, class Context>
class SumRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad,
      T* data_grad,
      const T* /*data_in*/, // unused
      const T* /*data_out*/, // unused
      Context* context) {
    // Is there an op that does this more cleverly, with a minimal number of
    // memcpy calls?
    for (int64_t i = 0; i < blocks; ++i) {
      context->template CopySameDevice<T>(
          block_size, segment_grad, data_grad + block_size * i);
    }
  }
};
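
// Since d(sum_i x_i) / d x_i = 1 for every i, the gradient of a summed
// segment is just the segment gradient broadcast to every block, which is
// what the per-block copies above implement.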

struct SumRangeReducerDef {
  template <typename T, class Context>
  using Reducer = SumRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
};

// Put forward and backward in the same template?
template <typename T, class Context>
class LogSumExpRangeReducer;
template <typename T, class Context>
class LogSumExpRangeReducerGradient;

template <typename T>
class LogSumExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
  T r{1};
};
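
// Forward formula: out[j] = log(sum_i exp(in[i * block_size + j])). The
// per-column maximum is subtracted before exponentiation and added back
// after the log; this leaves the result unchanged but avoids overflow for
// large inputs (the standard log-sum-exp stabilization).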

template <typename T, class Context>
class LogSumExpRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset);
      }
    }
  }
};
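
// Gradient derivation: with out[j] = log(sum_k exp(in[k * block_size + j])),
//   d out[j] / d in[i * block_size + j]
//     = exp(in[i * block_size + j]) / sum_k exp(in[k * block_size + j])
//     = exp(in[i * block_size + j] - out[j]),
// so each input gradient is the output gradient scaled by
// exp(data_in - data_out), exactly as computed above.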

struct LogSumExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogSumExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogSumExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogSumExp";
  static constexpr const char* doc =
      "LogSumExp computes the element-wise log of the sum of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;

template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      scaled_exp_sum /= blocks;
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
};
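
// Identical to LogSumExp except that the stabilized sum of exponentials is
// divided by `blocks` before taking the log, i.e.
// out[j] = log((1 / blocks) * sum_i exp(in[i * block_size + j])).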

template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
      }
    }
  }
};
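
// Gradient derivation: with
//   out[j] = log((1 / blocks) * sum_k exp(in[k * block_size + j])),
// we get
//   d out[j] / d in[i * block_size + j]
//     = exp(in[i * block_size + j]) / sum_k exp(in[k * block_size + j])
//     = exp(in[i * block_size + j] - out[j]) / blocks,
// which matches the expression above.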

struct LogMeanExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogMeanExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogMeanExp";
  static constexpr const char* doc =
      "LogMeanExp computes the element-wise log of the mean of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class MeanRangeReducer;
template <typename T, class Context>
class MeanRangeReducerGradient;

template <typename T>
class MeanRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T avg_value = 0;
      for (int i = 0; i < blocks; ++i) {
        avg_value += in[i * block_size + j] / blocks;
      }
      *(out++) = avg_value;
    }
  }
};

template <typename T, class Context>
class MeanRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* /*data_in*/, // I
      const T* /*data_out*/, // O
      Context* /*context*/) {
    const auto in_grad = 1.0 / blocks;
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * in_grad;
      }
    }
  }
};
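
// Every input element contributes to the mean with weight 1 / blocks, so
// each position in the segment receives the same 1 / blocks fraction of the
// output gradient, independent of the input values.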

struct MeanRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MeanRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computation is done element-wise, so that each element of the "
      "output slice corresponds to the average value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks.";
};

template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;

template <typename T>
class MaxRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      *(out++) = max_value;
    }
  }
};

template <typename T, class Context>
class MaxRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    std::memset(
        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T out = data_out[j];
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        if (out == data_in[idx]) {
          data_grad[idx] = out_grad;
        }
      }
    }
  }
};
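
// The gradient buffer is zero-initialized and the output gradient is then
// routed only to positions whose input value equals the segment maximum; all
// other positions keep a zero gradient.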

struct MaxRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MaxRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computation is done element-wise, so that each element of the "
      "output slice corresponds to the max value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks. This implementation imitates torch nn.Max operator. "
      "If the maximum value occurs more than once, the operator will return "
      "the first occurrence of value. When computing the gradient using the "
      "backward propagation, the gradient input corresponding to the first "
      "occurrence of the maximum value will be used.";
};

////////////////////////////////////////////////////////////////////////////////
// Incremental reducers: consume elements one by one
////////////////////////////////////////////////////////////////////////////////
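
// Unlike the range reducers above, incremental reducers do not assume that a
// segment occupies a contiguous range of the input: callers feed them one
// block at a time, which makes them suitable for segment ops whose members
// may be scattered across (or gathered from) the input tensor.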

// Base implementation; everything can be overridden
class BaseReducer {
 public:
  static constexpr int kInputCount = 1;

  struct Meta {
    int64_t block_size;
    vector<int64_t> block_shape;
    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void computeMeta(at::IntArrayRef dims, size_t skip_dims) {
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim ? size_from_dim_(skip_dims, dims)
                             : size_from_dim_(dims.size() - skip_dims, dims);
    }
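    // computeMeta (above) derives the per-block geometry: with
    // first_dim == true the leading skip_dims dimensions are stripped, so
    // block_shape is dims[skip_dims:] and block_size is the product of the
    // remaining dimensions; with first_dim == false the trailing skip_dims
    // dimensions are dropped from block_shape and block_size is the product
    // of those trailing dimensions.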

    void observeInput(int input, const Tensor& value, int skip_dims) {