#pragma once
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]
#include <ATen/cpu/vec256/intrinsics.h>
#include <ATen/cpu/vec256/vec256_base.h>
#include <c10/macros/Macros.h>
namespace at {
namespace vec256 {
namespace {
#ifdef CPU_CAPABILITY_AVX2
// Common base for the AVX2 integer Vec256<T> specializations below. Wraps the
// raw 256-bit register and converts implicitly to __m256i so instances can be
// passed straight to intrinsics.
struct Vec256i {
protected:
  __m256i values;
  // Bitwise NOT (xor with all-ones). Used to synthesize !=, <=, >= from the
  // eq/gt compare intrinsics, since AVX2 has no ne/le/ge integer compares.
  static inline __m256i invert(const __m256i& v) {
    const auto ones = _mm256_set1_epi64x(-1);
    return _mm256_xor_si256(ones, v);
  }
public:
  Vec256i() {}
  Vec256i(__m256i v) : values(v) {}
  operator __m256i() const {
    return values;
  }
};
#else
struct Vec256i {};  // dummy definition to make Vec256i always defined
#endif // CPU_CAPABILITY_AVX2
#ifdef CPU_CAPABILITY_AVX2
// 4 x int64_t lanes in one 256-bit register.
template <>
class Vec256<int64_t> : public Vec256i {
private:
  // Declaration only -- per the note at the top of this header, static data
  // must be defined in a translation unit not compiled with AVX.
  static const Vec256<int64_t> ones;
public:
  using value_type = int64_t;
  // Number of int64_t lanes held by this vector.
  static constexpr int size() {
    return 4;
  }
  using Vec256i::Vec256i;
  Vec256() {}
  // Broadcast a single scalar to all four lanes.
  Vec256(int64_t v) { values = _mm256_set1_epi64x(v); }
  // Lane-wise construction; val1 lands in the lowest lane (setr = reversed set).
  Vec256(int64_t val1, int64_t val2, int64_t val3, int64_t val4) {
    values = _mm256_setr_epi64x(val1, val2, val3, val4);
  }
  // Compile-time blend: lane i comes from b when bit i of `mask` is set,
  // otherwise from a. AVX2 lacks a 64-bit blend-by-immediate, so this goes
  // through an aligned stack buffer with per-lane extracts.
  template <int64_t mask>
  static Vec256<int64_t> blend(Vec256<int64_t> a, Vec256<int64_t> b) {
    __at_align32__ int64_t tmp_values[size()];
    a.store(tmp_values);
    if (mask & 0x01)
      tmp_values[0] = _mm256_extract_epi64(b.values, 0);
    if (mask & 0x02)
      tmp_values[1] = _mm256_extract_epi64(b.values, 1);
    if (mask & 0x04)
      tmp_values[2] = _mm256_extract_epi64(b.values, 2);
    if (mask & 0x08)
      tmp_values[3] = _mm256_extract_epi64(b.values, 3);
    return loadu(tmp_values);
  }
  // Runtime blend. Selects per byte, so each lane of `mask` must be all-ones
  // or all-zeros -- exactly what the comparison operators below produce.
  static Vec256<int64_t> blendv(const Vec256<int64_t>& a, const Vec256<int64_t>& b,
                                const Vec256<int64_t>& mask) {
    return _mm256_blendv_epi8(a.values, b.values, mask.values);
  }
  // Returns [base, base+step, base+2*step, base+3*step].
  template <typename step_t>
  static Vec256<int64_t> arange(int64_t base = 0, step_t step = static_cast<step_t>(1)) {
    return Vec256<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
  }
  // Returns a vector whose first `count` lanes come from b and the rest from a.
  static Vec256<int64_t>
  set(Vec256<int64_t> a, Vec256<int64_t> b, int64_t count = size()) {
    switch (count) {
      case 0:
        return a;
      case 1:
        return blend<1>(a, b);
      case 2:
        return blend<3>(a, b);
      case 3:
        return blend<7>(a, b);
    }
    return b;
  }
  // Unaligned full-width load.
  static Vec256<int64_t> loadu(const void* ptr) {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
  }
  // Partial load: reads only `count` elements from ptr; remaining lanes are zero.
  static Vec256<int64_t> loadu(const void* ptr, int64_t count) {
    __at_align32__ int64_t tmp_values[size()];
    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
    // instructions while a loop would be compiled to one instruction.
    for (auto i = 0; i < size(); ++i) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
    return loadu(tmp_values);
  }
  // Store `count` elements to (possibly unaligned) ptr; partial stores bounce
  // through an aligned stack buffer.
  void store(void* ptr, int count = size()) const {
    if (count == size()) {
      // ptr need not to be aligned here. See
      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
    } else if (count > 0) {
      __at_align32__ int64_t tmp_values[size()];
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
    }
  }
  const int64_t& operator[](int idx) const = delete;
  int64_t& operator[](int idx) = delete;
  // |x| without a 64-bit abs intrinsic: for negative lanes the compare yields
  // all-ones, so the xor flips all bits and subtracting -1 adds 1 -- i.e.
  // two's-complement negation. Non-negative lanes pass through unchanged.
  Vec256<int64_t> abs() const {
    auto zero = _mm256_set1_epi64x(0);
    auto is_larger = _mm256_cmpgt_epi64(zero, values);
    auto inverse = _mm256_xor_si256(values, is_larger);
    return _mm256_sub_epi64(inverse, is_larger);
  }
  // real/imag/conj make integer vectors usable in code templated over
  // complex-capable types: real part is the value itself, imaginary part is 0.
  Vec256<int64_t> real() const {
    return *this;
  }
  Vec256<int64_t> imag() const {
    return _mm256_set1_epi64x(0);
  }
  Vec256<int64_t> conj() const {
    return *this;
  }
  Vec256<int64_t> frac() const;
  Vec256<int64_t> neg() const;
  // Comparisons return all-ones per true lane and all-zeros per false lane
  // (blendv-compatible masks). !=, <=, >= are derived by inverting ==/> .
  Vec256<int64_t> operator==(const Vec256<int64_t>& other) const {
    return _mm256_cmpeq_epi64(values, other.values);
  }
  Vec256<int64_t> operator!=(const Vec256<int64_t>& other) const {
    return invert(_mm256_cmpeq_epi64(values, other.values));
  }
  Vec256<int64_t> operator<(const Vec256<int64_t>& other) const {
    return _mm256_cmpgt_epi64(other.values, values);
  }
  Vec256<int64_t> operator<=(const Vec256<int64_t>& other) const {
    return invert(_mm256_cmpgt_epi64(values, other.values));
  }
  Vec256<int64_t> operator>(const Vec256<int64_t>& other) const {
    return _mm256_cmpgt_epi64(values, other.values);
  }
  Vec256<int64_t> operator>=(const Vec256<int64_t>& other) const {
    return invert(_mm256_cmpgt_epi64(other.values, values));
  }
  // 0/1-valued versions of the comparisons (defined out of line).
  Vec256<int64_t> eq(const Vec256<int64_t>& other) const;
  Vec256<int64_t> ne(const Vec256<int64_t>& other) const;
  Vec256<int64_t> gt(const Vec256<int64_t>& other) const;
  Vec256<int64_t> ge(const Vec256<int64_t>& other) const;
  Vec256<int64_t> lt(const Vec256<int64_t>& other) const;
  Vec256<int64_t> le(const Vec256<int64_t>& other) const;
};
// 8 x int32_t lanes in one 256-bit register.
template <>
class Vec256<int32_t> : public Vec256i {
private:
  // Declaration only -- per the note at the top of this header, static data
  // must be defined in a translation unit not compiled with AVX.
  static const Vec256<int32_t> ones;
public:
  using value_type = int32_t;
  // Number of int32_t lanes held by this vector.
  static constexpr int size() {
    return 8;
  }
  using Vec256i::Vec256i;
  Vec256() {}
  // Broadcast a single scalar to all eight lanes.
  Vec256(int32_t v) { values = _mm256_set1_epi32(v); }
  // Lane-wise construction; val1 lands in the lowest lane (setr = reversed set).
  Vec256(int32_t val1, int32_t val2, int32_t val3, int32_t val4,
         int32_t val5, int32_t val6, int32_t val7, int32_t val8) {
    values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8);
  }
  // Compile-time blend: lane i comes from b when bit i of `mask` is set,
  // otherwise from a. Maps directly onto the 32-bit blend-by-immediate.
  template <int64_t mask>
  static Vec256<int32_t> blend(Vec256<int32_t> a, Vec256<int32_t> b) {
    return _mm256_blend_epi32(a, b, mask);
  }
  // Runtime blend. Selects per byte, so each lane of `mask` must be all-ones
  // or all-zeros -- exactly what the comparison operators below produce.
  static Vec256<int32_t> blendv(const Vec256<int32_t>& a, const Vec256<int32_t>& b,
                                const Vec256<int32_t>& mask) {
    return _mm256_blendv_epi8(a.values, b.values, mask.values);
  }
  // Returns [base, base+step, ..., base+7*step].
  template <typename step_t>
  static Vec256<int32_t> arange(int32_t base = 0, step_t step = static_cast<step_t>(1)) {
    return Vec256<int32_t>(
      base,            base +     step, base + 2 * step, base + 3 * step,
      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
  }
  // Returns a vector whose first `count` lanes come from b and the rest from a.
  static Vec256<int32_t>
  set(Vec256<int32_t> a, Vec256<int32_t> b, int32_t count = size()) {
    switch (count) {
      case 0:
        return a;
      case 1:
        return blend<1>(a, b);
      case 2:
        return blend<3>(a, b);
      case 3:
        return blend<7>(a, b);
      case 4:
        return blend<15>(a, b);
      case 5:
        return blend<31>(a, b);
      case 6:
        return blend<63>(a, b);
      case 7:
        return blend<127>(a, b);
    }
    return b;
  }
  // Unaligned full-width load.
  static Vec256<int32_t> loadu(const void* ptr) {
    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
  }
  // Partial load: reads only `count` elements from ptr; remaining lanes are zero.
  static Vec256<int32_t> loadu(const void* ptr, int32_t count) {
    __at_align32__ int32_t tmp_values[size()];
    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
    // instructions while a loop would be compiled to one instruction.
    for (auto i = 0; i < size(); ++i) {
      tmp_values[i] = 0;
    }
    std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
    return loadu(tmp_values);
  }
  // Store `count` elements to (possibly unaligned) ptr; partial stores bounce
  // through an aligned stack buffer.
  void store(void* ptr, int count = size()) const {
    if (count == size()) {
      // ptr need not to be aligned here. See
      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
    } else if (count > 0) {
      __at_align32__ int32_t tmp_values[size()];
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
      std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
    }
  }
  // Debug helper: print all lanes (lowest lane first) to stdout.
  void dump() const {
    // Copy the register to memory through a store intrinsic instead of
    // reading it through a casted pointer, which would violate strict
    // aliasing. Also use a signed index to match size()'s return type.
    __at_align32__ value_type tmp_values[size()];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
    for (int i = 0; i < size(); ++i) {
      std::cout << tmp_values[i] << " ";
    }
    std::cout << std::endl;
  }
  const int32_t& operator[](int idx) const = delete;
  int32_t& operator[](int idx) = delete;
  // Lane-wise absolute value (single AVX2 instruction for 32-bit lanes).
  Vec256<int32_t> abs() const {
    return _mm256_abs_epi32(values);
  }
  // real/imag/conj make integer vectors usable in code templated over
  // complex-capable types: real part is the value itself, imaginary part is 0.
  Vec256<int32_t> real() const {
    return *this;
  }
  Vec256<int32_t> imag() const {
    return _mm256_set1_epi32(0);
  }
  Vec256<int32_t> conj() const {
    return *this;
  }
  Vec256<int32_t> frac() const;
  Vec256<int32_t> neg() const;
  // Comparisons return all-ones per true lane and all-zeros per false lane
  // (blendv-compatible masks). !=, <=, >= are derived by inverting ==/> .
  Vec256<int32_t> operator==(const Vec256<int32_t>& other) const {
    return _mm256_cmpeq_epi32(values, other.values);
  }
  Vec256<int32_t> operator!=(const Vec256<int32_t>& other) const {
    return invert(_mm256_cmpeq_epi32(values, other.values));
  }
  Vec256<int32_t> operator<(const Vec256<int32_t>& other) const {
    return _mm256_cmpgt_epi32(other.values, values);
  }
  Vec256<int32_t> operator<=(const Vec256<int32_t>& other) const {
    return invert(_mm256_cmpgt_epi32(values, other.values));
  }
  Vec256<int32_t> operator>(const Vec256<int32_t>& other) const {
    return _mm256_cmpgt_epi32(values, other.values);
  }
  Vec256<int32_t> operator>=(const Vec256<int32_t>& other) const {
    return invert(_mm256_cmpgt_epi32(other.values, values));
  }
  // 0/1-valued versions of the comparisons (defined out of line).
  Vec256<int32_t> eq(const Vec256<int32_t>& other) const;
  Vec256<int32_t> ne(const Vec256<int32_t>& other) const;
  Vec256<int32_t> gt(const Vec256<int32_t>& other) const;
  Vec256<int32_t> ge(const Vec256<int32_t>& other) const;
  Vec256<int32_t> lt(const Vec256<int32_t>& other) const;
  Vec256<int32_t> le(const Vec256<int32_t>& other) const;
};
// Vectorized int32 -> float conversion. Since int32_t and float have the same
// size, both source and destination advance by Vec256<int32_t>::size()
// elements per iteration; the trailing n % size() elements are converted
// scalar-wise.
template <>
inline void convert(const int32_t *src, float *dst, int64_t n) {
  int64_t i;
  // int32_t and float have same size
#ifndef _MSC_VER
# pragma unroll
#endif
  for (i = 0; i <= (n - Vec256<int32_t>::size()); i += Vec256<int32_t>::size()) {
    auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
    auto output_vec = _mm256_cvtepi32_ps(input_vec);
    // dst is already a float*; no cast needed for the store.
    _mm256_storeu_ps(dst + i, output_vec);
  }
#ifndef _MSC_VER
# pragma unroll
#endif
  // Scalar tail.
  for (; i < n; i++) {
    dst[i] = static_cast<float>(src[i]);
  }
}
// Vectorized int32 -> double conversion. An int32_t is half the size of a
// double, so each iteration loads a 128-bit chunk of four ints and widens it
// into a 256-bit vector of four doubles (stepping by Vec256<double>::size());
// the trailing elements are converted scalar-wise.
template <>
inline void convert(const int32_t *src, double *dst, int64_t n) {
  int64_t i;
  // int32_t has half the size of double
#ifndef _MSC_VER
# pragma unroll
#endif
  for (i = 0; i <= (n - Vec256<double>::size()); i += Vec256<double>::size()) {
    auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
    auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
    // dst is already a double*; no cast needed for the store.
    _mm256_storeu_pd(dst + i, output_vec);
  }
#ifndef _MSC_VER
# pragma unroll
#endif
  // Scalar tail.
  for (; i < n; i++) {
    dst[i] = static_cast<double>(src[i]);
  }
}
template <>
class Vec256<int16_t> : public Vec256i {
private:
static const Vec256<int16_t> ones;
public:
using value_type = int16_t;
static constexpr int size() {
return 16;
}
using Vec256i::Vec256i;
Vec256() {}
Vec256(int16_t v) { values = _mm256_set1_epi16(v); }
Vec256(int16_t val1, int16_t val2, int16_t val3, int16_t val4,
int16_t val5, int16_t val6, int16_t val7, int16_t val8,
int16_t val9, int16_t val10, int16_t val11, int16_t val12,
int16_t val13, int16_t val14, int16_t val15, int16_t val16) {
values = _mm256_setr_epi16(val1, val2, val3, val4, val5, val6, val7, val8,
val9, val10, val11, val12, val13, val14, val15, val16);
}
Loading ...