CUDA编程优化Float16精度的atomicAdd

NLP中词嵌入（embedding）是非常重要的一环，embedding算子在反向计算时不同的线程会写入同一个结果，面对这类写冲突问题，高效的多线程计算必然要无锁化设计。而NVIDIA的加速卡本身支持了atomicAdd等原子计算。

当我们使用较低精度在一些训推任务中量化加速模型时（如：FP16精度），CUDA底层会调用诸如 atomicAdd(half*, half) 这类原子指令。但是当前的硬件对这条指令的支持不够好，所以我们需要借助一些内存对齐的技术以应用效率更高的 atomicAdd(half2*, half2) 指令。本文重点讲述如何利用软件来弥补硬件的缺陷这样的实例。

atomicAdd指令

下面的声明摘自 CUDA C Programming Guide。已经支持的数据类型如下：

int atomicAdd(int* address, int val);
unsigned int atomicAdd(unsigned int* address,
                       unsigned int val);
unsigned long long int atomicAdd(unsigned long long int* address,
                                 unsigned long long int val);
float atomicAdd(float* address, float val);
double atomicAdd(double* address, double val);
__half2 atomicAdd(__half2 *address, __half2 val);
__half atomicAdd(__half *address, __half val);
__nv_bfloat162 atomicAdd(__nv_bfloat162 *address, __nv_bfloat162 val);
__nv_bfloat16 atomicAdd(__nv_bfloat16 *address, __nv_bfloat16 val);
float2 atomicAdd(float2* address, float2 val);
float4 atomicAdd(float4* address, float4 val);

NVIDIA的论坛里有人讨论过half精度atomicAdd性能劣化的问题。但是回答中给到的测试是有一定的误导性的，实际上大量的累积计算会导致half精度早早就溢出了。所以可以本地测试一下1000次以内的，保证half和half2的累加值不溢出。过大时，half精度可能会停留在2048或者4096的这样的数字。

float	half	half2
2.6e-05s	0.00048s	1.9e-0.5s

可以看到，half2最快，half最慢。

// nvcc -o a a.cu -arch=sm_86 -DUSE_FLOAT
// nvcc -o a a.cu -arch=sm_86 

#include <cuda_fp16.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

// const int nTPB = 128;
// const size_t nBLK = 1048576ULL;
const int nTPB = 10;
const size_t nBLK = 100ULL;
const size_t ds = nBLK*nTPB;
#ifndef USE_FLOAT
using ft = half;
#else
using ft = float;
#endif
__global__ void k(const ft * __restrict__ i, ft * __restrict__ o){
    size_t idx = blockIdx.x * blockDim.x+threadIdx.x;
    atomicAdd(o, i[idx]);        
}

__global__ void k2(const ft * __restrict__ i, ft * __restrict__ o){
    size_t idx = blockIdx.x*blockDim.x+threadIdx.x ;
    if(((idx & 1) == 0) && (idx + 1 < ds)) {
        #ifdef USE_FLOAT
        #else
            half2 i2 = make_half2(i[idx], i[idx + 1]);
            atomicAdd(reinterpret_cast<half2*>(o), i2);
        #endif
    }
}

int main(){

  ft *i, *o, *hi, *o2;
  cudaMalloc(&i, ds*sizeof(ft));
  cudaMalloc(&o,  2*sizeof(ft));
  cudaMalloc(&o2,  2*sizeof(ft));
  cudaMemset(o, 0, sizeof(ft));
  hi = (ft *)malloc(ds*sizeof(ft));
  for (size_t i = 0; i < ds; i++) {
    if (i & 1) {
  #ifndef USE_FLOAT
      hi[i] = __float2half(0.1f);
  #else
      hi[i] = 0.1f;
  #endif
    } else {
  #ifndef USE_FLOAT
      hi[i] = __float2half(0.2f);
  #else
      hi[i] = 0.2f;
  #endif
    }
  }

  cudaMemcpy(i, hi, ds*sizeof(ft), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  k<<<nBLK, nTPB>>>(i, o); // warm-up
  cudaDeviceSynchronize();
  unsigned long long dt = dtime_usec(0);
  k<<<nBLK, nTPB>>>(i, o);
  cudaDeviceSynchronize();

  ft* out = (ft *)malloc(2*sizeof(ft));
  cudaMemcpy(out, o, 2*sizeof(ft), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  dt = dtime_usec(dt);
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) std::cout << "Duration: " << dt/(float)USECPSEC << "s" << std::endl;
  else std::cout << "Error: " << cudaGetErrorString(err) << std::endl;


  cudaMemcpy(i, hi, ds*sizeof(ft), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  k2<<<nBLK, nTPB>>>(i, o2); // warm-up
  cudaDeviceSynchronize();
  dt = dtime_usec(0);
  k2<<<nBLK, nTPB>>>(i, o2);
  cudaDeviceSynchronize();

  cudaMemcpy(out, o2, 2*sizeof(ft), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();

  
  dt = dtime_usec(dt);
  err = cudaGetLastError();
  if (err == cudaSuccess) std::cout << "Duration: " << dt/(float)USECPSEC << "s" << std::endl;
  else std::cout << "Error: " << cudaGetErrorString(err) << std::endl;
}

half精度的fastAtomicAdd实现

首先我们要对齐内存，然后对一个连续的内存区间做操作，如果不是连续的那就会退化到 half 类型的原子加操作。对于两两相连的half值，我们对于目标值做累加，另一个紧邻的值补0对齐，这样就不会影响计算结果。low_byte 用来判断是否对齐，根据对齐与否来决定补0的位置是前面还是后面：

template < 
  typename scalar_t, typename index_t, typename std::enable_if<std::is_same<half, scalar_t>::value>::type* = nullptr>
__device__ __forceinline__ void fast_atomic_add(scalar_t *address, index_t index, const index_t numel, scalar_t value) {
#if (                                                    \
    (defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \
    (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
  atomic_add(
      reinterpret_cast<half*>(address) + index,
      static_cast<half>(value));
#else
  __half* target_addr = reinterpret_cast<__half*>(address + index);
  bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);

  if (low_byte && (index + 1 < numel)) {
    __half zero = __float2half(0.0f);
    __half2 val2 = make_half2(value, zero);
    atomicAdd(reinterpret_cast<__half2*>(target_addr), val2);
  } else if (!low_byte && (index > 0)) {
    __half zero = __float2half(0.0f);
    __half2 val2 = make_half2(zero, value);
    atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), val2);
  } else {
    atomic_add(
      reinterpret_cast<half*>(address) + index,
      static_cast<half>(value));
  }
#endif
}

这个部分的原理与更完整的版本可以参考 PyTorch 的实现。

Z.Y. ☯ Cosmos

Powered by Disqus, scientific surfing for access

CUDA编程优化Float16精度的atomicAdd

atomicAdd指令

half精度的fastAtomicAdd实现