続々:2次元イジングモデル
CUDAで下手くそなプログラミングを書いてみました。対象は前回から引き続き2次元イジングモデルです。2次元の行列サイズは1024x1024と2048x2048、ループ回数は1000です。コード中のMT.hは乱数を生成するためにメルセンヌツイスターを使用しているからです。早速結果から。
環境
結果
CUDAのプログラムは書き換える必要がまだまだあります。プログラムが完成したら、コードのレビューをしたいと思います。CUDAプログラム
- リダクションの関数を書く
- cudaMemcpyを減らす。
- 最適なblock,gridを探す。
- データの転送効率を計測する。(Host to Device)
#include <stdio.h> #include <cuda_runtime.h> #include "MT.h" #include <math.h> #define UP 0 #define DOWN 1 #define LEFT 2 #define RIGHT 3 #define CHECK(call) {\ cudaError_t error = call; \ if( cudaSuccess != error){ \ printf("Error:%d,%s\n",__LINE__,__FILE__);\ printf("code:%d,reason:%s",error,cudaGetErrorString(error));\ }\ } typedef struct tmp { int value; int neighbors[4]; // up,down,left,right } spin; /// functions int setneighbors(spin *spins,int row,int col,int point); /// --- __global__ void cudacalcE(spin *spins,int *E,int size){ int xdx = threadIdx.x + blockIdx.x * blockDim.x; int ydx = threadIdx.y + blockIdx.y * blockDim.y; int ndx = xdx + ydx * gridDim.x * blockDim.x; int sum = 0; if(ndx < size){ sum -= spins[ndx].neighbors[UP]*spins[ndx].value; sum -= spins[ndx].neighbors[DOWN]*spins[ndx].value; sum -= spins[ndx].neighbors[LEFT]*spins[ndx].value; sum -= spins[ndx].neighbors[RIGHT]*spins[ndx].value; E[ndx] = sum; } } __global__ void reduction(int *E,int *Eo,int size){ unsigned int tid = threadIdx.x; unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; int *idata = E + blockIdx.x * blockDim.x; int stride; if(idx >= size) return; for(stride = blockDim.x/2;stride > 0; stride>>=1){ if(tid <stride) idata[tid] += idata[tid+stride]; __syncthreads(); } if(tid == 0) Eo[blockIdx.x] = idata[0]; } __global__ void dcopy(spin *dst,spin *src,int size){ int xdx = threadIdx.x + blockIdx.x * blockDim.x; int ydx = threadIdx.y + blockIdx.y * blockDim.y; int i = xdx + ydx * gridDim.x * blockDim.x; if(i < size){ dst[i].value=src[i].value; dst[i].neighbors[UP]=src[i].neighbors[UP]; dst[i].neighbors[DOWN]=src[i].neighbors[DOWN]; dst[i].neighbors[LEFT]=src[i].neighbors[LEFT]; dst[i].neighbors[RIGHT]=src[i].neighbors[RIGHT]; } } int copy(spin *dst,spin *src,int size){ int i; for(i=0;i<size;i++){ dst[i].value=src[i].value; dst[i].neighbors[UP]=src[i].neighbors[UP]; dst[i].neighbors[DOWN]=src[i].neighbors[DOWN]; dst[i].neighbors[LEFT]=src[i].neighbors[LEFT]; dst[i].neighbors[RIGHT]=src[i].neighbors[RIGHT]; } return(0); } int easyshow(spin *spins, int row, int col){ int i,j; int value; for(i=0;i<row;i++){ for(j=0;j<col;j++){ value = spins[i*row+j].value; if(value<0) printf("-"); else printf("+"); } puts(""); } return(0); } int show(spin *spins, int row, int col){ int i,j; for(i=0;i<row;i++){ for(j=0;j<col;j++) printf(" %3d ",spins[i*row+j].neighbors[UP]); puts(""); for(j=0;j<row;j++){ printf("%3d%3d%3d " ,spins[i*row+j].neighbors[LEFT] ,spins[i*row+j].value ,spins[i*row+j].neighbors[RIGHT]); } puts(""); for(j=0;j<row;j++) printf(" %3d ",spins[i*row+j].neighbors[DOWN]); puts(""); } return(0); } int init(spin *spins, int row, int col){ int i; int sign[] = {-1,1}; for(i=0;i<row*col;i++) spins[i].value = sign[genrand_int32()%2]; for(i=0;i<row*col;i++) setneighbors(spins,row,col,i); return(0); } int setneighbors(spin *spins,int row,int col,int point){ int up=0, down=0, left=0, right=0; up = point - row; if(up < 0) up = row * (col - 1) + point ; down = point + row; if(down >= row * col) down = down % (row * col); if(point%col == 0) left = point + row - 1; else left = point - 1; if((point+1)%col == 0) right = point - row + 1; else right = point + 1; // neighbors[4] = up,down,left,right spins[up].neighbors[DOWN] = spins[point].value; spins[down].neighbors[UP] = spins[point].value; spins[left].neighbors[RIGHT] = spins[point].value; spins[right].neighbors[LEFT] = spins[point].value; return(0); } void MC(spin *spins,int row,int col,int nstep){ int i,j; int mp; int size = row*col; size_t dsize = sizeof(int)*size; size_t spinsize = sizeof(spin)*size; int Epre=0,Eafter=0; float prob; float T = 0.5; float invT = 1/T; int *h_E = (int *) malloc(dsize); int *d_E,*d_Eo; int blocksize = 1<<5; dim3 block(blocksize,blocksize); dim3 grid((block.x+row-1)/block.x,(block.x+col-1)/block.y); spin *dspins,*tspins,*dtspins; tspins = (spin *) malloc(spinsize); cudaMalloc((spin **)&dspins,spinsize); cudaMalloc((spin **)&dtspins,spinsize); cudaMalloc((int **)&d_E,dsize); cudaMalloc((int **)&d_Eo,dsize); //show(spins,row,col); cudaMemcpy(dspins,spins,spinsize,cudaMemcpyHostToDevice); //easyshow(spins,row,col); puts("---"); for(i=0;i<nstep;i++){ Epre = 0; Eafter = 0; mp = genrand_int32()%size; copy(tspins,spins,size); tspins[mp].value = -tspins[mp].value; setneighbors(tspins,row,col,mp); cudaMemcpy(dtspins,tspins,spinsize,cudaMemcpyHostToDevice); cudacalcE<<<grid,block>>>(dspins,d_E,size); //reduction<<<regrid,reblock>>>(d_E,d_Eo,size); cudaMemcpy(h_E,d_E,dsize,cudaMemcpyDeviceToHost); for(j=0;j<size;j++) Epre+=h_E[j]; cudacalcE<<<grid,block>>>(dtspins,d_E,size); //reduction<<<grid,1>>>(d_E,d_Eo,size); cudaMemcpy(h_E,d_E,dsize,cudaMemcpyDeviceToHost); for(j=0;j<size;j++) Eafter+=h_E[j]; if (Eafter <= Epre){ cudaMemcpy(dspins,dtspins,spinsize,cudaMemcpyDeviceToDevice); cudaMemcpy(spins,dspins,spinsize,cudaMemcpyDeviceToHost); } else{ prob = genrand_real1(); //printf("%lf,%lf\n",prob,exp(-(Eafter-Epre)*invT)); if(prob < exp(-(Eafter-Epre)*invT)){ cudaMemcpy(dspins,dtspins,spinsize,cudaMemcpyDeviceToDevice); cudaMemcpy(spins,dspins,spinsize,cudaMemcpyDeviceToHost); } } } cudaMemcpy(spins,dspins,spinsize,cudaMemcpyDeviceToHost); //easyshow(spins,row,col); free(tspins); cudaFree(dspins); cudaFree(dtspins); cudaFree(d_E); cudaFree(d_Eo); cudaDeviceReset(); } int main(void){ int bsize = 11; int row = 1<<bsize; int col = 1<<bsize; int size = row * col; int nstep = 1000; int seed = 1; init_genrand(seed); spin *spins; spins = (spin *) malloc(sizeof(spin)*size); init(spins,row,col); MC(spins,row,col,nstep); printf("@@ row = %d,col = %d,matrix = (%d,%d) size = %d\n",row,col,row,col,row*col); printf("@@ steps = %d\n",nstep); free(spins); return(0); }