1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
|
#include <cuda_fp16.h> #include <iostream> #include <time.h> #include <sys/time.h> #define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start=0){ timeval tv; gettimeofday(&tv, 0); return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start; }
const int nTPB = 10; const size_t nBLK = 100ULL; const size_t ds = nBLK*nTPB; #ifndef USE_FLOAT using ft = half; #else using ft = float; #endif __global__ void k(const ft * __restrict__ i, ft * __restrict__ o){ size_t idx = blockIdx.x * blockDim.x+threadIdx.x; atomicAdd(o, i[idx]); }
__global__ void k2(const ft * __restrict__ i, ft * __restrict__ o){ size_t idx = blockIdx.x*blockDim.x+threadIdx.x ; if(((idx & 1) == 0) && (idx + 1 < ds)) { #ifdef USE_FLOAT #else half2 i2 = make_half2(i[idx], i[idx + 1]); atomicAdd(reinterpret_cast<half2*>(o), i2); #endif } }
int main(){
ft *i, *o, *hi, *o2; cudaMalloc(&i, ds*sizeof(ft)); cudaMalloc(&o, 2*sizeof(ft)); cudaMalloc(&o2, 2*sizeof(ft)); cudaMemset(o, 0, sizeof(ft)); hi = (ft *)malloc(ds*sizeof(ft)); for (size_t i = 0; i < ds; i++) { if (i & 1) { #ifndef USE_FLOAT hi[i] = __float2half(0.1f); #else hi[i] = 0.1f; #endif } else { #ifndef USE_FLOAT hi[i] = __float2half(0.2f); #else hi[i] = 0.2f; #endif } }
cudaMemcpy(i, hi, ds*sizeof(ft), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); k<<<nBLK, nTPB>>>(i, o); cudaDeviceSynchronize(); unsigned long long dt = dtime_usec(0); k<<<nBLK, nTPB>>>(i, o); cudaDeviceSynchronize();
ft* out = (ft *)malloc(2*sizeof(ft)); cudaMemcpy(out, o, 2*sizeof(ft), cudaMemcpyDeviceToHost); cudaDeviceSynchronize();
dt = dtime_usec(dt); cudaError_t err = cudaGetLastError(); if (err == cudaSuccess) std::cout << "Duration: " << dt/(float)USECPSEC << "s" << std::endl; else std::cout << "Error: " << cudaGetErrorString(err) << std::endl;
cudaMemcpy(i, hi, ds*sizeof(ft), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); k2<<<nBLK, nTPB>>>(i, o2); cudaDeviceSynchronize(); dt = dtime_usec(0); k2<<<nBLK, nTPB>>>(i, o2); cudaDeviceSynchronize();
cudaMemcpy(out, o2, 2*sizeof(ft), cudaMemcpyDeviceToHost); cudaDeviceSynchronize();
dt = dtime_usec(dt); err = cudaGetLastError(); if (err == cudaSuccess) std::cout << "Duration: " << dt/(float)USECPSEC << "s" << std::endl; else std::cout << "Error: " << cudaGetErrorString(err) << std::endl; }
|