// conv_transpose_op_mobile_impl.h is the templated implementation of the
// conv_transpose_op_mobile.h file.
#ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
#include "caffe2/core/common.h"
#ifdef C10_MOBILE
#include "caffe2/core/logging.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_transpose_op_mobile.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/math/utils.h"
C10_DECLARE_bool(caffe2_force_shared_col_buffer);
namespace caffe2 {
template <typename T, typename Context>
void runTileContiguous(
int tileId,
int N,
int M,
int H,
int W,
int outputH,
int outputW,
int C,
int kernelH,
int kernelW,
int strideH,
int strideW,
int padT,
const T* filterData,
const T* Xdata,
T* colBufferData,
T* Ydata,
Context* context) {
// The tile size is exactly the length of a single row
int tileSize = W;
auto kernelDataSize = C * kernelH * kernelW;
auto currentTileStart = tileSize * tileId;
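// Each tile is one input row: currentTileStart is the offset of row tileId
// within every H x W channel plane of Xdata.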
// gemm tile
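// colBuffer[kernelDataSize x tileSize] = filter^T * X_tile:
// the filter is stored as M x (C * kernelH * kernelW) and transposed here;
// the B operand is the selected row of every input channel, so its leading
// dimension is the channel stride H * W.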
math::GemmEx<T, Context>(
CblasTrans,
CblasNoTrans,
kernelDataSize,
tileSize,
M,
1,
filterData,
kernelDataSize,
Xdata + currentTileStart,
H * W,
0,
colBufferData,
tileSize,
context);
// col2im tile
// We assume that there is no padding in the columns (padL and padR
// == 0).
// FIXME: it is actually possible for us to handle padding, figure
// out how to adjust the bounds
// We write into Y in a de-interleaved fashion; in other words,
// every column (mod strideW) == 0 together in one block,
// every column (mod strideW) == 1 in another,
// ... and so on.
int colBlockSize = (W + kernelW / strideW);
int numColBlocks = strideW;
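// For example, with strideW == 2 an output row [y0 y1 y2 y3 ...] is held here
// as [y0 y2 y4 ...][y1 y3 y5 ...]; each of the numColBlocks (== strideW)
// blocks spans colBlockSize slots. reinterleaveRows() below restores the
// natural ordering and adds the bias.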
for (int c = 0; c < kernelDataSize; ++c) {
int w_offset = c % kernelW;
int h_offset = (c / kernelW) % kernelH;
int c_im = c / kernelH / kernelW;
// Each row is a separate tile that we handle. First determine the
// row into which we are writing the output.
// We can properly handle padding for the rows.
int rowY = tileId * strideH - padT + h_offset;
// If this row is out of bounds, then skip it
if (!math::utils::IsAGeZeroAndALtB(rowY, outputH)) {
continue;
}
// FIXME: we don't actually handle a dynamic padL > 0
constexpr int kPadL = 0;
int colOffsetStart = -kPadL + w_offset;
int colBlockY = colOffsetStart % strideW;
// However, within a block we may not start writing at offset
// 0. The offset at which we begin writing is determined by
// colOffsetStart
int colWithinBlockOffsetY = colOffsetStart / strideW;
// So, this is where we begin reading/writing in Y
int colY = colBlockY * colBlockSize + colWithinBlockOffsetY;
// This is the complete offset into Y from the start
// Each row has strideW blocks of size colBlockSize
int offsetY = rowY * colBlockSize * numColBlocks + colY;
T* colBufferPointer = colBufferData + c * tileSize;
T* yPointer =
Ydata + c_im * outputH * (colBlockSize * numColBlocks) + offsetY;
int b = 0;
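// Consecutive col buffer entries in this row map to output columns
// w_offset, w_offset + strideW, w_offset + 2 * strideW, ..., which occupy
// consecutive slots inside block colBlockY, so accumulating into Y is a
// plain elementwise add. b walks those tileSize entries: the NEON paths
// below consume 16 and then 4 floats per iteration, and the scalar loop
// handles the remainder.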
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// We vectorize the loop within the row
{
constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 4;
int limit = (tileSize / kUnroll) * kUnroll;
for (; b < limit; b += kUnroll) {
float32x4_t cb0 = vld1q_f32(colBufferPointer + 0);
float32x4_t cb1 = vld1q_f32(colBufferPointer + 4);
float32x4_t cb2 = vld1q_f32(colBufferPointer + 8);
float32x4_t cb3 = vld1q_f32(colBufferPointer + 12);
float32x4_t y0 = vld1q_f32(yPointer + 0);
float32x4_t y1 = vld1q_f32(yPointer + 4);
float32x4_t y2 = vld1q_f32(yPointer + 8);
float32x4_t y3 = vld1q_f32(yPointer + 12);
y0 = vaddq_f32(y0, cb0);
y1 = vaddq_f32(y1, cb1);
y2 = vaddq_f32(y2, cb2);
y3 = vaddq_f32(y3, cb3);
vst1q_f32(yPointer + 0, y0);
vst1q_f32(yPointer + 4, y1);
vst1q_f32(yPointer + 8, y2);
vst1q_f32(yPointer + 12, y3);
colBufferPointer += kUnroll;
yPointer += kUnroll;
}
}
{
constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
int limit = (tileSize / kUnroll) * kUnroll;
for (; b < limit; b += kUnroll) {
float32x4_t cb0 = vld1q_f32(colBufferPointer);
float32x4_t y0 = vld1q_f32(yPointer);
y0 = vaddq_f32(y0, cb0);
vst1q_f32(yPointer, y0);
colBufferPointer += kUnroll;
yPointer += kUnroll;
}
}
#endif
// Handle un-vectorizable epilogue
for (; b < tileSize; ++b) {
*yPointer += *colBufferPointer;
++yPointer;
++colBufferPointer;
}
}
}
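// StoreInterleaved<float, N> writes one value from each of N sources (or four
// values from each of N NEON vectors) to memory in interleaved order; it is
// used by reinterleaveRows below to undo the per-stride column blocking
// produced by runTileContiguous.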
template <typename T, int N>
struct StoreInterleaved {};
template <>
struct StoreInterleaved<float, 1> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
inline static void store(float* p, float32x4_t v[1]) {
vst1q_f32(p, v[0]);
}
#endif
inline static void store(float* p, float v[1]) {
p[0] = v[0];
}
};
template <>
struct StoreInterleaved<float, 2> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
inline static void store(float* p, float32x4_t v[2]) {
float32x4x2_t x = {{v[0], v[1]}};
vst2q_f32(p, x);
}
#endif
inline static void store(float* p, float v[2]) {
p[0] = v[0];
p[1] = v[1];
}
};
template <>
struct StoreInterleaved<float, 3> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
inline static void store(float* p, float32x4_t v[3]) {
float32x4x3_t x = {{v[0], v[1], v[2]}};
vst3q_f32(p, x);
}
#endif
inline static void store(float* p, float v[3]) {
p[0] = v[0];
p[1] = v[1];
p[2] = v[2];
}
};
template <>
struct StoreInterleaved<float, 4> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
inline static void store(float* p, float32x4_t v[4]) {
float32x4x4_t x = {{v[0], v[1], v[2], v[3]}};
vst4q_f32(p, x);
}
#endif
inline static void store(float* p, float v[4]) {
p[0] = v[0];
p[1] = v[1];
p[2] = v[2];
p[3] = v[3];
}
};
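// reinterleaveRows converts one (channel, row) pair of the intermediate
// buffer produced by runTileContiguous back into the natural output layout:
// it reads the kStrideW de-interleaved column blocks, interleaves them,
// adds the per-channel bias, and writes outputW contiguous values to dst.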
template <int kStrideW>
void reinterleaveRows(
const float* src,
const float* bias,
int c,
int h,
float* dst,
int outputC,
int outputH,
int outputW,
int inputW,
int kernelW,
int strideW,
int adjH) {
// Each row in src is of the form:
// [w mod strideW == 0 elements]...[w mod strideW == strideW - 1
// elements]
// We need to re-interleave the values and write them in the output
int colBlockSize = inputW + kernelW / kStrideW;
int noAdjOutputW = (inputW - 1) * kStrideW + kernelW;
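// Width of the output before the `adj` adjustment columns; those trailing
// columns receive only the bias (see the final loop below).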
int point = c * outputH + h;
src += point * colBlockSize * kStrideW;
dst += point * outputW;
float b = bias ? bias[c] : 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t biasV = vdupq_n_f32(b);
#endif
int w = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 2;
int limit = ((inputW - 1) / kUnroll) * kUnroll;
for (; w < limit; w += kUnroll) {
// We need to interleave in terms of kStrideW units
float32x4_t v0[kStrideW];
float32x4_t v1[kStrideW];
for (int i = 0; i < kStrideW; ++i) {
v0[i] = vld1q_f32(src + i * colBlockSize);
v1[i] = vld1q_f32(src + i * colBlockSize + 4);
}
// add per-channel bias
for (int i = 0; i < kStrideW; ++i) {
v0[i] = vaddq_f32(v0[i], biasV);
v1[i] = vaddq_f32(v1[i], biasV);
}
// Write interleaved into the output
StoreInterleaved<float, kStrideW>::store(dst + 0 * kStrideW, v0);
StoreInterleaved<float, kStrideW>::store(dst + 4 * kStrideW, v1);
src += kUnroll;
dst += kUnroll * kStrideW;
}
#endif
// Handle non-vectorizable remainder
for (; w < inputW - 1; ++w) {
float v[kStrideW];
for (int i = 0; i < kStrideW; ++i) {
v[i] = src[i * colBlockSize];
}
// add per-channel bias
for (int i = 0; i < kStrideW; ++i) {
v[i] += b;
}
// Write interleaved into the output
StoreInterleaved<float, kStrideW>::store(dst, v);
src += 1;
dst += kStrideW;
}
// The loop above covered output columns [0, (inputW - 1) * strideW).
// Handle the remaining kernelW columns of the un-adjusted output here.
int outputPoint = (inputW - 1) * kStrideW;
int block = 0;
// Output width may include adjustment into which we don't
// write; ignore it
while (outputPoint < noAdjOutputW) {
float v = src[block * colBlockSize];
dst[0] = v + b;
++outputPoint;
dst += 1;
++block;
if (block >= kStrideW) {
block = 0;
src += 1;
}
}
// Remainder of the buffer comprised of just the `adj` must have
// bias added
for (; outputPoint < outputW; ++outputPoint) {
dst[0] = b;
dst += 1;
}
}