#pragma once
#include <cuda.h>
#include <c10/util/Half.h>
#include <c10/util/BFloat16.h>
#include <c10/util/complex.h>
#include <ATen/NumericUtils.h>
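// AtomicFPOp implements a compare-and-swap (CAS) loop for floating-point types
// that lack a native hardware atomic: the machine word containing the value is
// read, the user-supplied functor is applied to the current value, and
// atomicCAS retries until no other thread has modified the word in between.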
template <typename T>
struct AtomicFPOp;
template <>
struct AtomicFPOp<at::Half> {
template <typename func_t>
inline __device__ at::Half operator() (at::Half *address, at::Half val, const func_t& func) {
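    // at::Half is 16 bits wide, but atomicCAS operates on 32-bit words.
    // Round the address down to the enclosing 4-byte boundary and update
    // either the low or the high half of that word, selected by bit 1 of
    // the original address.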
unsigned int * address_as_ui =
(unsigned int *) ((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
at::Half hsum;
do {
assumed = old;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
hsum = func(hsum, val);
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
return hsum;
}
};
template <>
struct AtomicFPOp<at::BFloat16> {
template <typename func_t>
inline __device__ at::BFloat16 operator() (at::BFloat16 *address, at::BFloat16 val, const func_t& func) {
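    // Same 16-bit-value-in-a-32-bit-word CAS scheme as the at::Half
    // specialization above.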
unsigned int * address_as_ui =
(unsigned int *) ((char *)address - ((size_t)address & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
at::BFloat16 bsum;
do {
assumed = old;
bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
bsum = func(bsum, val);
old = (size_t)address & 2 ? (old & 0xffff) | (bsum.x << 16) : (old & 0xffff0000) | bsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    return bsum;
}
};
template <>
struct AtomicFPOp<double> {
template <typename func_t>
inline __device__ double operator() (double * address, double val, const func_t& func) {
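    // There is no floating-point atomicCAS, so reinterpret the double as an
    // unsigned long long and CAS on its bit pattern. The functor receives the
    // current bit pattern (assumed) and must return the new bit pattern.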
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, func(val, assumed));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
};
template <typename T, size_t n>
struct AtomicAddIntegerImpl;
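// AtomicAddIntegerImpl provides atomic integer addition for 1-, 2-, 4- and
// 8-byte types. Sub-word types (1 and 2 bytes) are updated through a CAS loop
// on the aligned 32-bit word that contains them.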
template<typename T>
struct AtomicAddIntegerImpl<T, 1> {
inline __device__ void operator()(T *address, T val) {
size_t offset = (size_t)address & 3;
uint32_t * address_as_ui = (uint32_t *)((char *)address - offset);
uint32_t old = *address_as_ui;
uint32_t shift = offset * 8;
uint32_t old_byte;
uint32_t newval;
uint32_t assumed;
do {
assumed = old;
old_byte = (old >> shift) & 0xff;
// preserve size in initial cast. Casting directly to uint32_t pads
// negative signed values with 1's (e.g. signed -1 = unsigned ~0).
newval = static_cast<uint8_t>(val + static_cast<T>(old_byte));
newval = (old & ~(0x000000ff << shift)) | (newval << shift);
old = atomicCAS(address_as_ui, assumed, newval);
} while (assumed != old);
}
};
template<typename T>
struct AtomicAddIntegerImpl<T, 2> {
inline __device__ void operator()(T *address, T val) {
size_t offset = (size_t)address & 2;
uint32_t * address_as_ui = (uint32_t *)((char *)address - offset);
bool is_32_align = offset;
uint32_t old = *address_as_ui;
uint32_t old_bytes;
uint32_t newval;
uint32_t assumed;
do {
assumed = old;
old_bytes = is_32_align ? old >> 16 : old & 0xffff;
// preserve size in initial cast. Casting directly to uint32_t pads
// negative signed values with 1's (e.g. signed -1 = unsigned ~0).
newval = static_cast<uint16_t>(val + static_cast<T>(old_bytes));
newval = is_32_align ? (old & 0xffff) | (newval << 16) : (old & 0xffff0000) | newval;
old = atomicCAS(address_as_ui, assumed, newval);
} while (assumed != old);
}
};
template<typename T>
struct AtomicAddIntegerImpl<T, 4> {
inline __device__ void operator()(T *address, T val) {
uint32_t * address_as_ui = (uint32_t *) (address);
uint32_t old = *address_as_ui;
uint32_t newval;
uint32_t assumed;
do {
assumed = old;
newval = static_cast<uint32_t>(val + static_cast<T>(old));
old = atomicCAS(address_as_ui, assumed, newval);
} while (assumed != old);
}
};
template<typename T>
struct AtomicAddIntegerImpl<T, 8> {
inline __device__ void operator()(T *address, T val) {
unsigned long long * address_as_ui = (unsigned long long *) (address);
unsigned long long old = *address_as_ui;
unsigned long long newval;
unsigned long long assumed;
do {
assumed = old;
newval = static_cast<uint64_t>(val + static_cast<T>(old));
old = atomicCAS(address_as_ui, assumed, newval);
} while (assumed != old);
}
};
static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
AtomicAddIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
AtomicAddIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
AtomicAddIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
return atomicAdd(address, val);
}
static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
#if defined(USE_ROCM)
__atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#else
AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
#endif
}
static inline __device__ void gpuAtomicAdd(bool *address, bool val) {
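  // Boolean "addition" saturates at true, so it reduces to a logical OR with
  // the previous value. Note that this read-modify-write is not itself atomic.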
  *address = *address || val;
}
static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
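  // Use the hardware __half atomicAdd only when the toolkit and architecture
  // support it (CUDA >= 10.0 and sm_70+, per the condition below); otherwise
  // fall back to the CAS-based AtomicFPOp above.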
#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
return AtomicFPOp<at::Half>()(address, val,
[](at::Half hsum, at::Half val) {
return hsum + val;
});
#else
return atomicAdd(reinterpret_cast<__half*>(address), val);
#endif
}
static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
return AtomicFPOp<at::BFloat16>()(address, val,
[](at::BFloat16 bsum, at::BFloat16 val) {
return bsum + val;
});
}
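/* Usage sketch (illustrative only, not part of this header; the kernel and
 * buffer names below are made up): many threads may add into the same output
 * slot, so a plain store would lose updates, while gpuAtomicAdd makes the
 * read-modify-write safe.
 *
 *   __global__ void accumulate(at::Half* out, const at::Half* in, int n) {
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n) {
 *       gpuAtomicAdd(&out[i % 128], in[i]);  // out has at least 128 elements
 *     }
 *   }
 */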
#if defined(CUDA_VERSION) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// From the CUDA C Programming Guide
static inline __device__ double atomicAdd(double* address, double val)
#if defined(__clang__) && defined(__CUDA__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wgcc-compat"
__attribute__((enable_if(true, "")))
#pragma GCC diagnostic pop
#endif
{
return AtomicFPOp<double>()(address, val,
[](double val, unsigned long long int assumed) {
return __double_as_longlong(val + __longlong_as_double(assumed));
});
}
#elif defined(USE_ROCM) || !(defined(__CUDA_ARCH__) && (defined(CUDA_VERSION) && CUDA_VERSION < 8000))
/* Note [hip-clang differences to hcc]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * The upcoming hip-clang compiler for ROCm differs from hcc in a few details.
 * It exports the __HIP__ macro, so we can differentiate between hcc and
 * hip-clang. In the code below, hcc only gained support for atomicAdd on
 * double after work week 18312, whereas hip-clang has supported it from its
 * first version. In general, the code-visible differences between hip-clang
 * and hcc will be minimal.
 */
#if defined(__HIP_PLATFORM_HCC__) && __hcc_workweek__ < 18312 && !__HIP__
// This needs to be defined for the host side pass
static inline __device__ double atomicAdd(double *address, double val) { }
#endif
#endif
static inline __device__ double gpuAtomicAdd(double *address, double val) {
return atomicAdd(address, val);
}
static inline __device__ float gpuAtomicAdd(float *address, float val) {
return atomicAdd(address, val);
}
template<typename T>
static inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::complex<T> val) {
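  // The real and imaginary parts are updated with two independent atomics;
  // the complex value as a whole is not updated atomically.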
gpuAtomicAdd(&address->real_, val.real_);
gpuAtomicAdd(&address->imag_, val.imag_);
}
/* Note [gpuAtomicAdd vs atomicAdd]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * Some extensions such as torchvision call atomicAdd() directly and need
 * support for data types that the CUDA libraries do not provide. Only for
 * these do we continue to provide atomicAdd overloads.
 */
static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
return gpuAtomicAdd(address, val);
}
static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
return gpuAtomicAdd(address, val);
}
static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
gpuAtomicAdd(address, val);
}
static inline __device__ void atomicAdd(int8_t *address, int8_t val) {
gpuAtomicAdd(address, val);
}
static inline __device__ void atomicAdd(int16_t *address, int16_t val) {
gpuAtomicAdd(address, val);
}
static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
gpuAtomicAdd(address, val);
}
static inline __device__ void atomicAdd(bool *address, bool val) {
gpuAtomicAdd(address, val);
}
/* Note [explicitly non-returning atomics]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* AMD's MI100 (gfx908) provides an optimized fp32 atomicAdd, exposed via atomicAddNoRet().
* Due to compiler limitations, callers must opt-in to guarantee the optimized instruction.
* This non-returning atomicAddNoRet cannot be used to implement the returning atomicAdd,
* therefore we need a new API 'gpuAtomicAddNoReturn'.
*/
template<typename T>
static inline __device__ void gpuAtomicAddNoReturn(c10::complex<T> *address, c10::complex<T> val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); }
static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); }
/* Special case fp32 atomic. */
#if defined(USE_ROCM)
static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); }
#else
static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); }
#endif
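/* Usage sketch (illustrative only; grad_buffer and idx are hypothetical
 * names): prefer gpuAtomicAddNoReturn when the old value is not needed, e.g.
 * when scattering contributions into a float buffer, so ROCm builds can emit
 * the optimized non-returning instruction.
 *
 *   gpuAtomicAddNoReturn(&grad_buffer[idx], val);
 */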
// Atomic multiplication implementation.
inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) {
return AtomicFPOp<at::Half>()(address, val,
[](at::Half bsum, at::Half val) {
return bsum * val;
});
}
inline __device__ at::BFloat16 gpuAtomicMul(at::BFloat16 * address, at::BFloat16 val) {
return AtomicFPOp<at::BFloat16>()(address, val,
[](at::BFloat16 bsum, at::BFloat16 val) {
return bsum * val;
});
}
inline __device__ double gpuAtomicMul(double * address, double val) {
return AtomicFPOp<double>()(address, val,
[](double val, unsigned long long int assumed) {
return __double_as_longlong(val * __longlong_as_double(assumed));
});
}
// Don't use a templated function for this: there is no float AtomicFPOp
// specialization (float addition uses the CUDA built-in atomicAdd), so write
// the CAS loop directly.
inline __device__ float gpuAtomicMul (float * address, float val) {
unsigned int* address_as_ull = (unsigned int*)address;
unsigned int old = *address_as_ull;
unsigned int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__float_as_int(val *
__int_as_float(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __int_as_float(old);
}
// Atomic maximum implementation.
template <typename T>
__host__ __device__ T safe_max(T a, T b) {
#if defined(__HIPCC__)
// TODO: remove this special case for HIP when issue is fixed:
// https://github.com/ROCm-Developer-Tools/HIP/issues/2209
T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max<T>(a, b));
#else
T max = at::_isnan(b) ? b : std::max<T>(a, b);
#endif
return max;
}
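// safe_max/safe_min propagate NaN from either argument: if a is NaN,
// std::max/std::min already return a (all comparisons with NaN are false);
// the explicit check makes sure a NaN in b also propagates.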
inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) {
return AtomicFPOp<at::Half>()(address, val,
[](at::Half bsum, at::Half val) {
return safe_max(bsum, val);
});
}
inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) {
return AtomicFPOp<at::BFloat16>()(address, val,
[](at::BFloat16 bsum, at::BFloat16 val) {
return safe_max(bsum, val);
});
}
inline __device__ double gpuAtomicMax(double * address, double val) {
return AtomicFPOp<double>()(address, val,
[](double val, unsigned long long int assumed) {
return __double_as_longlong(safe_max(val, __longlong_as_double(assumed)));
});
}
// As with gpuAtomicMul above, there is no float AtomicFPOp specialization,
// so write the CAS loop directly.
inline __device__ float gpuAtomicMax(float * address, float val) {
unsigned int* address_as_ull = (unsigned int*)address;
unsigned int old = *address_as_ull;
unsigned int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__float_as_int(safe_max(val, __int_as_float(assumed))));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __int_as_float(old);
}
// Atomic minimum implementation.
template <typename T>
__host__ __device__ T safe_min(T a, T b) {
#if defined(__HIPCC__)
// TODO: remove this special case for HIP when issue is fixed:
// https://github.com/ROCm-Developer-Tools/HIP/issues/2209
T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min<T>(a, b));
#else
T min = at::_isnan(b) ? b : std::min<T>(a, b);
#endif
return min;
}
inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) {
return AtomicFPOp<at::Half>()(address, val,
[](at::Half bsum, at::Half val) {
return safe_min(bsum, val);
});
}
inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) {
return AtomicFPOp<at::BFloat16>()(address, val,
[](at::BFloat16 bsum, at::BFloat16 val) {
return safe_min(bsum, val);
});
}
inline __device__ double gpuAtomicMin(double * address, double val) {
return AtomicFPOp<double>()(address, val,
[](double val, unsigned long long int assumed) {
return __double_as_longlong(safe_min(val, __longlong_as_double(assumed)));
});
}
// As with gpuAtomicMul above, there is no float AtomicFPOp specialization,
// so write the CAS loop directly.
inline __device__ float gpuAtomicMin(float * address, float val) {
unsigned int* address_as_ull = (unsigned int*)address;
unsigned int old = *address_as_ull;
unsigned int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__float_as_int(safe_min(val, __int_as_float(assumed))));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __int_as_float(old);
}
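/* Usage sketch (illustrative only; the kernel and buffer names are made up):
 * computing a global running maximum across all elements.
 *
 *   __global__ void running_max(float* out, const float* in, int n) {
 *     int i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < n) {
 *       gpuAtomicMax(out, in[i]);  // *out should be initialized, e.g. to -INFINITY
 *     }
 *   }
 */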