おいも貴婦人ブログ

生物系博士課程満期退学をしたAIエンジニアのブログ。

素人によるCUDAのお勉強。6

CUDAプログラムの実行時間の計測。sys/time.hのgettimeofdayを使って、CPU側からGPUカーネルの実行時間を計測します。グリッドの次元には上限があるため、1ブロックあたりのスレッド数を1024にしてグリッドサイズを抑えています。

#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <sys/time.h>

// CHECK(call): evaluate a CUDA runtime call exactly once and abort the program
// if it did not return cudaSuccess.
//
// On failure the source location, the numeric error code and the text from
// cudaGetErrorString() are written to stderr, then the process exits with
// status 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement (safe inside an un-braced if/else). The argument is
// parenthesized, and the local is named err_ to avoid shadowing a caller
// variable called `error`.
#define CHECK(call)\
do {\
    const cudaError_t err_ = (call);\
    if (err_ != cudaSuccess)\
        {\
            fprintf(stderr, "Error: %s:%d, ",__FILE__,__LINE__);\
            fprintf(stderr, "code:%d, reason: %s\n", (int)err_, cudaGetErrorString(err_));\
            exit(1);\
        }\
} while (0)

/*
 * Compare two float arrays element by element and report the outcome on stdout.
 *
 * hostRef : reference values
 * gpuRef  : values to verify against the reference
 * N       : number of elements to compare (nothing is compared when N <= 0)
 *
 * Prints "Arrays match" when every pair differs by at most epsilon; otherwise
 * prints the first mismatching pair with its index and stops scanning.
 * Neither array is modified.
 */
void checkResult(float *hostRef, float *gpuRef, const int N){
    const double epsilon = 1.0E-8;
    bool match = true;
    for(int i=0;i<N;i++){
        /* Use the floating-point fabs on doubles: an integer abs() would
           truncate any difference below 1.0 to zero and hide real mismatches. */
        const double diff = fabs((double)hostRef[i] - (double)gpuRef[i]);
        if(diff>epsilon){
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n",hostRef[i],gpuRef[i],i);
            break;
        }
    }
    if(match)printf("Arrays match\n\n");
}

/*
 * Fill ip[0..size-1] with pseudo-random values in the range 0.0 .. 25.5
 * (the low 8 bits of rand(), divided by 10).
 *
 * The generator is seeded from the wall clock only on the first call.
 * Re-seeding on every call would give two calls made within the same second
 * the same seed, and therefore identical array contents.
 *
 * Not thread-safe: relies on rand() and a function-local static flag.
 */
void initialData(float *ip,int size){
    static int seeded = 0;
    if(!seeded){
        srand((unsigned) time(NULL));
        seeded = 1;
    }

    for(int i=0;i<size;i++){
        ip[i] = (float)( rand() & 0xFF)/10.0f;
    }
}

/*
 * Element-wise addition on the CPU: C[k] = A[k] + B[k] for k in [0, N).
 * Elements are processed in ascending order; nothing is written when N <= 0.
 */
void sumArrayOnHost(float *A, float *B, float *C, const int N){
    const float *lhs = A;
    const float *rhs = B;
    float *out = C;
    int remaining;

    for(remaining = N; remaining > 0; --remaining){
        *out++ = *lhs++ + *rhs++;
    }
}

// Element-wise vector addition kernel: each thread handles at most one element,
// writing C[gid] = A[gid] + B[gid].
//
// The element index is built from the x components of blockIdx, blockDim and
// threadIdx only. Threads whose index is N or greater exit without touching
// memory, so the launch may cover more threads than there are elements.
__global__ void sumArraysOnGPU(float *A, float *B,float *C,const int N){
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if(gid >= N) return;
    C[gid] = A[gid] + B[gid];
}

/*
 * Return the current wall-clock time from gettimeofday() as seconds in a
 * double (whole seconds plus the microsecond part scaled by 1e-6).
 * Subtract two readings to get an elapsed interval.
 */
double cpuSecond(){
    struct timeval now;
    gettimeofday(&now,NULL);

    const double wholeSeconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec*1.e-6;
    return (wholeSeconds + fraction);
}

/*
 * Vector-addition timing demo.
 *
 * Adds two float vectors of 1<<24 elements on the host and on the GPU,
 * measures the kernel with the host wall clock (launch + device
 * synchronization) and verifies the GPU result against the host result.
 *
 * Every CUDA runtime call is wrapped in CHECK so that a failure stops the
 * program at the call that caused it, not at some later unrelated point.
 */
int main(int argc, char **argv){
    printf("%s Starting..\n",argv[0]);

    // Select device 0 and report its name.
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp,dev));
    printf("Using Device %d: %s\n",dev,deviceProp.name);
    CHECK(cudaSetDevice(dev));

    int nElem = 1<<24;
    printf("Vector size %d\n",nElem);

    size_t nBytes = nElem * sizeof(float);

    // Host buffers: two inputs, the host reference result and the GPU result.
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);
    if(h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL){
        fprintf(stderr,"Error: host allocation of %lu bytes failed\n",(unsigned long)nBytes);
        exit(1);
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef,0,nBytes);
    memset(gpuRef,0,nBytes);

    // Reference result computed on the CPU.
    sumArrayOnHost(h_A,h_B,hostRef,nElem);

    // Device buffers and input upload.
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A,nBytes));
    CHECK(cudaMalloc((float **)&d_B,nBytes));
    CHECK(cudaMalloc((float **)&d_C,nBytes));

    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // 1D launch: iLen threads per block, enough blocks to cover nElem (ceil-div).
    int iLen = 1024;
    dim3 block (iLen);
    dim3 grid ((nElem+block.x-1)/block.x);

    double iStart,iElaps;

    iStart = cpuSecond();
    sumArraysOnGPU <<<grid,block>>>(d_A,d_B,d_C,nElem);
    // A launch returns no status itself: fetch launch-configuration errors
    // here; errors raised while the kernel runs surface at the synchronize.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = cpuSecond() - iStart;
    // grid.x and block.x are unsigned, hence %u.
    printf("sumArraysOnGPU <<<%u,%u>>> Time elapsed %f sec\n",grid.x,block.x,iElaps);

    CHECK(cudaMemcpy(gpuRef,d_C,nBytes,cudaMemcpyDeviceToHost));

    checkResult(hostRef,gpuRef,nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}

実行結果

Using Device 0: GeForce GTX 760
Vector size 16777216
sumArraysOnGPU <<<16384,1024>>> Time elapsed 0.001660 sec
Arrays match

nvprofを使って、GPUの計算時間を計測する。

>> nvprof ./a.out
./a.out Starting..
==1520== NVPROF is profiling process 1520, command: ./a.out
Using Device 0: GeForce GTX 760
Vector size 16777216
sumArraysOnGPU <<<16384,1024>>> Time elapsed 0.001708 sec
Arrays match

==1520== Profiling application: ./a.out
==1520== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 71.32%  17.215ms         2  8.6075ms  8.3067ms  8.9083ms  [CUDA memcpy HtoD]
 21.96%  5.3010ms         1  5.3010ms  5.3010ms  5.3010ms  [CUDA memcpy DtoH]
  6.72%  1.6217ms         1  1.6217ms  1.6217ms  1.6217ms  sumArraysOnGPU(float*, float*, float*, int)

==1520== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
 67.49%  53.244ms         3  17.748ms  97.150us  53.029ms  cudaMalloc
 28.89%  22.789ms         3  7.5963ms  5.3890ms  8.9853ms  cudaMemcpy
  2.08%  1.6446ms         1  1.6446ms  1.6446ms  1.6446ms  cudaDeviceSynchronize
  0.54%  424.36us        83  5.1120us     908ns  163.78us  cuDeviceGetAttribute
  0.43%  338.17us         1  338.17us  338.17us  338.17us  cudaGetDeviceProperties
  0.37%  291.45us         3  97.149us  73.543us  139.61us  cudaFree
  0.06%  48.121us         1  48.121us  48.121us  48.121us  cuDeviceTotalMem
  0.05%  40.159us         1  40.159us  40.159us  40.159us  cudaLaunch
  0.05%  39.251us         1  39.251us  39.251us  39.251us  cuDeviceGetName
  0.02%  18.927us         1  18.927us  18.927us  18.927us  cudaSetDevice
  0.01%  5.3080us         2  2.6540us  1.1180us  4.1900us  cuDeviceGetCount
  0.01%  3.9830us         4     995ns     699ns  1.6070us  cudaSetupArgument
  0.00%  2.4450us         2  1.2220us  1.0480us  1.3970us  cuDeviceGet