素人によるCUDAのお勉強。6
CUDAプログラムの実行時間の計測。sys/time.hのgettimeofdayを使って、CPU側からGPUカーネルの実行時間を計測します。グリッドの次元（ブロック数）には上限があるため、1ブロックあたりのスレッド数を1024と大きめに取り、ブロック数は要素数から切り上げ除算で求めています。
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

/*
 * Evaluate a CUDA runtime call; if it does not return cudaSuccess, print the
 * source location, the numeric error code and its description, then exit(1).
 * Wrapped in do/while(0) so that `CHECK(x);` behaves as a single statement,
 * including inside an unbraced if/else.
 */
#define CHECK(call)                                                   \
    do {                                                              \
        const cudaError_t error = (call);                             \
        if (error != cudaSuccess) {                                   \
            printf("Error: %s:%d, ", __FILE__, __LINE__);             \
            printf("code:%d, reason: %s\n", (int)error,               \
                   cudaGetErrorString(error));                        \
            exit(1);                                                  \
        }                                                             \
    } while (0)

/*
 * Compare the host reference result against the result copied back from the
 * GPU, element by element.
 *
 * hostRef: array of N floats computed on the host.
 * gpuRef:  array of N floats copied back from the device.
 * N:       number of elements to compare.
 *
 * Prints the first mismatching element (and stops) or "Arrays match".
 */
void checkResult(float *hostRef, float *gpuRef, const int N) {
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++) {
        /* fabs on doubles: keeps the comparison in floating point regardless
         * of which abs() overloads happen to be visible. */
        if (fabs((double)hostRef[i] - (double)gpuRef[i]) > epsilon) {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n",
                   hostRef[i], gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match\n\n");
}

/*
 * Fill ip[0..size) with pseudo-random values in the range [0.0, 25.5].
 *
 * The generator is seeded from the wall clock only on the first call.
 * Reseeding on every call would restart rand() from the same seed whenever
 * two calls land in the same second, producing identical arrays.
 */
void initialData(float *ip, int size) {
    static bool seeded = false;
    if (!seeded) {
        srand((unsigned)time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/*
 * Host reference implementation: C[i] = A[i] + B[i] for i in [0, N).
 */
void sumArrayOnHost(float *A, float *B, float *C, const int N) {
    for (int idx = 0; idx < N; idx++) {
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Element-wise vector addition kernel: C[i] = A[i] + B[i].
 *
 * Each thread handles the single element at index
 * blockIdx.x * blockDim.x + threadIdx.x, so the launch is expected to be
 * one-dimensional with at least N threads in total. Threads whose index is
 * >= N do nothing.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

/*
 * Return the current wall-clock time in seconds (microsecond resolution),
 * as reported by gettimeofday().
 */
double cpuSecond() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

/*
 * Add two vectors of 2^24 floats on the host and on device 0, time the kernel
 * from the host with cpuSecond(), and verify that both results agree.
 */
int main(int argc, char **argv) {
    printf("%s Starting..\n", argv[0]);

    /* Select and report the device. */
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    /* Problem size. */
    int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);
    size_t nBytes = (size_t)nElem * sizeof(float);

    /* Host buffers. */
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        fprintf(stderr, "Error: host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        exit(1);
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    /* Reference result computed on the host. */
    sumArrayOnHost(h_A, h_B, hostRef, nElem);

    /* Device buffers and input transfer. */
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));

    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    /* 1D launch: 1024 threads per block, enough blocks to cover nElem. */
    int iLen = 1024;
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);

    /* The launch returns immediately, so the clock is stopped only after
     * cudaDeviceSynchronize() has waited for the kernel to finish. Error
     * codes are captured here and examined after the clock is stopped. */
    double iStart = cpuSecond();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    double iElaps = cpuSecond() - iStart;

    CHECK(launchStatus); /* invalid launch configuration, etc. */
    CHECK(syncStatus);   /* errors raised while the kernel was running */

    printf("sumArraysOnGPU <<<%u,%u>>> Time elapsed %f sec\n",
           grid.x, block.x, iElaps);

    /* Fetch the GPU result and compare it with the host reference. */
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkResult(hostRef, gpuRef, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return (0);
}
実行結果
Using Device 0: GeForce GTX 760 Vector size 16777216 sumArraysOnGPU <<<16384,1024>>> Time elapsed 0.001660 sec Arrays match
nvprofを使って、GPUの計算時間を計測する。
>> nvprof ./a.out ./a.out Starting.. ==1520== NVPROF is profiling process 1520, command: ./a.out Using Device 0: GeForce GTX 760 Vector size 16777216 sumArraysOnGPU <<<16384,1024>>> Time elapsed 0.001708 sec Arrays match ==1520== Profiling application: ./a.out ==1520== Profiling result: Time(%) Time Calls Avg Min Max Name 71.32% 17.215ms 2 8.6075ms 8.3067ms 8.9083ms [CUDA memcpy HtoD] 21.96% 5.3010ms 1 5.3010ms 5.3010ms 5.3010ms [CUDA memcpy DtoH] 6.72% 1.6217ms 1 1.6217ms 1.6217ms 1.6217ms sumArraysOnGPU(float*, float*, float*, int) ==1520== API calls: Time(%) Time Calls Avg Min Max Name 67.49% 53.244ms 3 17.748ms 97.150us 53.029ms cudaMalloc 28.89% 22.789ms 3 7.5963ms 5.3890ms 8.9853ms cudaMemcpy 2.08% 1.6446ms 1 1.6446ms 1.6446ms 1.6446ms cudaDeviceSynchronize 0.54% 424.36us 83 5.1120us 908ns 163.78us cuDeviceGetAttribute 0.43% 338.17us 1 338.17us 338.17us 338.17us cudaGetDeviceProperties 0.37% 291.45us 3 97.149us 73.543us 139.61us cudaFree 0.06% 48.121us 1 48.121us 48.121us 48.121us cuDeviceTotalMem 0.05% 40.159us 1 40.159us 40.159us 40.159us cudaLaunch 0.05% 39.251us 1 39.251us 39.251us 39.251us cuDeviceGetName 0.02% 18.927us 1 18.927us 18.927us 18.927us cudaSetDevice 0.01% 5.3080us 2 2.6540us 1.1180us 4.1900us cuDeviceGetCount 0.01% 3.9830us 4 995ns 699ns 1.6070us cudaSetupArgument 0.00% 2.4450us 2 1.2220us 1.0480us 1.3970us cuDeviceGet