CUDA Programming: Optimizing atomicAdd for Float16 Precision

Word embeddings are a key component of NLP models. In the backward pass of the embedding operator, different threads write gradients to the same output location. To handle this kind of write conflict, efficient multithreaded computation calls for a lock-free design, and NVIDIA GPUs natively support atomic operations such as atomicAdd.
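As a minimal sketch of where these conflicts come from (the kernel name, shapes, and parameters below are my own illustration, not taken from any particular framework): the embedding backward pass scatter-adds each output-gradient row into the weight-gradient row selected by its token index, so whenever two positions share the same token, their threads collide on the same weight row and need atomicAdd.

#include <cuda_runtime.h>

// Hypothetical embedding backward: grad_output is [num_tokens, dim],
// indices is [num_tokens], grad_weight is [vocab_size, dim].
// Threads handling different tokens that map to the same vocabulary id
// write into the same grad_weight row, hence the atomicAdd.
__global__ void embedding_backward_naive(const float* grad_output,
                                         const int* indices,
                                         float* grad_weight,
                                         int num_tokens, int dim) {
    int t = blockIdx.x;          // one block per token position
    int d = threadIdx.x;         // one thread per embedding dimension
    if (t < num_tokens && d < dim) {
        int row = indices[t];    // several t may share the same row
        atomicAdd(&grad_weight[row * dim + d], grad_output[t * dim + d]);
    }
}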

When we use lower precision (e.g., FP16) to quantize and accelerate a model for training or inference, the CUDA code ends up issuing atomic instructions such as atomicAdd(half*, half). Hardware support for this instruction is still poor, so we rely on memory-alignment tricks to use the more efficient atomicAdd(half2*, half2) instruction instead. This article is a case study in using software to compensate for a hardware limitation.

The atomicAdd instruction

The declarations below are taken from the CUDA C Programming Guide. The supported data types are:

int atomicAdd(int* address, int val);
unsigned int atomicAdd(unsigned int* address,
                       unsigned int val);
unsigned long long int atomicAdd(unsigned long long int* address,
                                 unsigned long long int val);
float atomicAdd(float* address, float val);
double atomicAdd(double* address, double val);
__half2 atomicAdd(__half2 *address, __half2 val);
__half atomicAdd(__half *address, __half val);
__nv_bfloat162 atomicAdd(__nv_bfloat162 *address, __nv_bfloat162 val);
__nv_bfloat16 atomicAdd(__nv_bfloat16 *address, __nv_bfloat16 val);
float2 atomicAdd(float2* address, float2 val);
float4 atomicAdd(float4* address, float4 val);

The NVIDIA forums have a discussion of the performance degradation of half-precision atomicAdd. The test shown in the answers there is somewhat misleading, though: accumulating a very large number of values makes the half-precision sum stall long before the end, because once the running sum is large, each small addend is smaller than half the spacing between representable FP16 values and simply rounds away. So when testing locally, keep the number of additions under about 1000 so that the half and half2 accumulators stay accurate; with too many additions, the half-precision sum just gets stuck at a value like 2048 or 4096.
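To see this rounding stall concretely, here is a small host-side sketch (my own illustration, separate from the benchmark below): repeatedly adding 1.0 to an FP16-rounded accumulator stops making progress at 2048, because the spacing between representable half values there is 2 and the tie rounds back down.

// nvcc -o sat sat.cu
#include <cuda_fp16.h>
#include <cstdio>

int main() {
    // Add 1.0 per step, but round the running sum to half precision each time
    // (mimicking an FP16 accumulator). Above 2048 the FP16 spacing is 2, so
    // sum + 1 is a tie that rounds back down and the sum stops growing.
    float sum = 0.0f;
    for (int n = 0; n < 10000; ++n)
        sum = __half2float(__float2half(sum + 1.0f));
    printf("FP16-accumulated sum of 10000 x 1.0 = %f\n", sum);  // 2048, not 10000
    return 0;
}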

float      half       half2
2.6e-05s   0.00048s   1.9e-05s

As the numbers show, half2 is the fastest and plain half is by far the slowest, more than an order of magnitude behind. The benchmark code used for these measurements:

// nvcc -o a a.cu -arch=sm_86 -DUSE_FLOAT
// nvcc -o a a.cu -arch=sm_86

#include <cuda_fp16.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

// const int nTPB = 128;
// const size_t nBLK = 1048576ULL;
const int nTPB = 10;
const size_t nBLK = 100ULL;
const size_t ds = nBLK*nTPB;
#ifndef USE_FLOAT
using ft = half;
#else
using ft = float;
#endif

// every thread adds its input element into the same output location
__global__ void k(const ft * __restrict__ i, ft * __restrict__ o){
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  atomicAdd(o, i[idx]);
}

// even-numbered threads pack two adjacent half values into a half2 and issue
// a single vectorized atomicAdd (this kernel is a no-op in the float build)
__global__ void k2(const ft * __restrict__ i, ft * __restrict__ o){
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (((idx & 1) == 0) && (idx + 1 < ds)) {
#ifndef USE_FLOAT
    half2 i2 = make_half2(i[idx], i[idx + 1]);
    atomicAdd(reinterpret_cast<half2*>(o), i2);
#endif
  }
}

int main(){

  ft *i, *o, *hi, *o2;
  cudaMalloc(&i, ds*sizeof(ft));
  cudaMalloc(&o, 2*sizeof(ft));
  cudaMalloc(&o2, 2*sizeof(ft));
  cudaMemset(o, 0, 2*sizeof(ft));     // zero both two-element accumulators
  cudaMemset(o2, 0, 2*sizeof(ft));
  hi = (ft *)malloc(ds*sizeof(ft));
  // alternate 0.2 / 0.1 so the final sums stay small enough for FP16
  for (size_t idx = 0; idx < ds; idx++) {
#ifndef USE_FLOAT
    hi[idx] = __float2half((idx & 1) ? 0.1f : 0.2f);
#else
    hi[idx] = (idx & 1) ? 0.1f : 0.2f;
#endif
  }

  // scalar atomicAdd: half or float depending on the build
  cudaMemcpy(i, hi, ds*sizeof(ft), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  k<<<nBLK, nTPB>>>(i, o);   // warm-up
  cudaDeviceSynchronize();
  unsigned long long dt = dtime_usec(0);
  k<<<nBLK, nTPB>>>(i, o);
  cudaDeviceSynchronize();

  ft* out = (ft *)malloc(2*sizeof(ft));
  cudaMemcpy(out, o, 2*sizeof(ft), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  dt = dtime_usec(dt);
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) std::cout << "Duration: " << dt/(float)USECPSEC << "s" << std::endl;
  else std::cout << "Error: " << cudaGetErrorString(err) << std::endl;

  // vectorized half2 atomicAdd
  cudaMemcpy(i, hi, ds*sizeof(ft), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  k2<<<nBLK, nTPB>>>(i, o2);   // warm-up
  cudaDeviceSynchronize();
  dt = dtime_usec(0);
  k2<<<nBLK, nTPB>>>(i, o2);
  cudaDeviceSynchronize();

  cudaMemcpy(out, o2, 2*sizeof(ft), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  dt = dtime_usec(dt);
  err = cudaGetLastError();
  if (err == cudaSuccess) std::cout << "Duration: " << dt/(float)USECPSEC << "s" << std::endl;
  else std::cout << "Error: " << cudaGetErrorString(err) << std::endl;
}

A fastAtomicAdd implementation for half precision

The idea is to align the address and operate on a contiguous two-element region; when that is not possible, we fall back to the plain half atomic add. For a pair of adjacent half values, we accumulate into the target element and pad its neighbor with 0, so the neighbor is left unchanged and the result is unaffected. low_byte tells us whether the target address is the lower half of an aligned __half2, which decides whether the 0 padding goes after or before the target value:

#include <cuda_fp16.h>
#include <cstdint>
#include <type_traits>

template <typename scalar_t, typename index_t,
          typename std::enable_if<std::is_same<half, scalar_t>::value>::type* = nullptr>
__device__ __forceinline__ void fast_atomic_add(scalar_t *address, index_t index,
                                                const index_t numel, scalar_t value) {
#if ( \
    (defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
    (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
  // Old toolkit or pre-Volta hardware: fall back to a scalar half atomic add
  // (atomic_add stands for a generic, e.g. CAS-based, half add helper).
  atomic_add(
      reinterpret_cast<half*>(address) + index,
      static_cast<half>(value));
#else
  __half* target_addr = reinterpret_cast<__half*>(address + index);
  // true when the target is the low element of an aligned __half2
  bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);

  if (low_byte && (index + 1 < numel)) {
    // target is the low element: pad the high element with 0
    __half2 val2 = make_half2(value, __float2half(0.0f));
    atomicAdd(reinterpret_cast<__half2*>(target_addr), val2);
  } else if (!low_byte && (index > 0)) {
    // target is the high element: pad the low element with 0 and
    // step the address back to the aligned __half2
    __half2 val2 = make_half2(__float2half(0.0f), value);
    atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), val2);
  } else {
    // boundary cases (no safe neighbor): scalar fallback
    atomic_add(
        reinterpret_cast<half*>(address) + index,
        static_cast<half>(value));
  }
#endif
}
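As a hedged usage sketch (the kernel and its parameters are my own illustration, and it assumes a scalar atomic_add fallback is defined somewhere), the naive embedding-backward kernel from the beginning of this article could call fast_atomic_add instead of a plain half atomicAdd:

// Hypothetical half-precision embedding backward using fast_atomic_add.
// grad_weight has vocab_size * dim elements; the flat offset and the total
// element count let fast_atomic_add choose the vectorized or fallback path.
__global__ void embedding_backward_half(const half* grad_output,
                                        const int* indices,
                                        half* grad_weight,
                                        int num_tokens, int dim, int vocab_size) {
    int t = blockIdx.x;
    int d = threadIdx.x;
    if (t < num_tokens && d < dim) {
        int offset = indices[t] * dim + d;
        fast_atomic_add(grad_weight, offset, vocab_size * dim, grad_output[t * dim + d]);
    }
}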

For the underlying rationale and a more complete version, see the PyTorch implementation.