横扫千军 发表于 2013-5-2 23:02
我也不明白了。坐等他人回复吧。
谢谢
类似的还有下面这样的代码:我在 NVS 4200M 上测试过多次,改过许多参数,运行速度基本都在 10 GFLOPS 以下;而在 C2070 上也只有 50 多 GFLOPS,与理论峰值相差太远。
而这些 kernel 本身又极其简单,不存在访存瓶颈(memory-bound),计算上也应该能较好地发挥 GPU 的性能,但实测结果却差了很多。(不过提供这些代码的网页上倒有许多不错的测试结果,基本都能达到理论峰值的 90%。)
- #define LOOP (100000)
/* Launch configuration of the benchmark: a 1-D grid of BLOCKS blocks,
 * each with THPB threads, i.e. BLOCKS * THPB threads in total. */
#define BLOCKS  (200)   /* number of thread blocks in the grid */
#define THPB    (256)   /* threads per block */
/*
 * new_ker -- arithmetic micro-benchmark kernel.
 *
 * Every thread runs LOOP iterations of one multiply-add chain
 * (acc = acc * mul + mul), where each iteration consumes the result of the
 * previous one, and then stores its final value at x[global thread index].
 *
 * Launch layout: 1-D grid of 1-D blocks (only the .x components are used).
 * Precondition: x must point to at least gridDim.x * blockDim.x floats;
 * the kernel performs no bounds check.
 */
__global__ void new_ker(float *x)
{
    /* Flat global thread index for a 1-D launch. */
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    const float mul = -1.0f;
    float acc = 1.0f;

    /* Serial dependency chain: one multiply and one add per iteration. */
    for (int iter = 0; iter < LOOP; ++iter) {
        acc = acc * mul + mul;
    }

    /* The store keeps the loop from being eliminated as dead code. */
    x[tid] = acc;
}
/*
 * CUDA_CHECK -- evaluate a CUDA runtime call and abort with a diagnostic
 * (file, line, error string) if it does not return cudaSuccess.
 */
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/*
 * Benchmark driver: launches new_ker once on a BLOCKS x THPB grid, times it
 * with CUDA events, copies the per-thread results back, and prints the
 * elapsed time in milliseconds followed by the achieved GFLOPS.
 *
 * Returns 0 on success; exits with a non-zero status if the host allocation
 * or any checked CUDA call fails.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /* One float per launched thread. The device buffer must be this large:
     * the kernel writes x[index] for every thread in the grid, and the
     * copy-back below reads the same number of elements. */
    const size_t count = (size_t)THPB * (size_t)BLOCKS;
    const size_t bytes = sizeof(float) * count;

    float *x = (float *)malloc(bytes);
    if (x == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)bytes);
        return 1;
    }

    float *dx = NULL;
    cudaEvent_t new_start, new_stop;
    float elapsed = 0.0f;
    double gflops;

    /* Profiler range markers are left unchecked on purpose: they are only
     * meaningful when a profiler is attached (NOTE(review): older toolkits
     * may report a non-success status otherwise -- confirm). */
    cudaProfilerStart();

    CUDA_CHECK(cudaMalloc((void **)&dx, bytes));

    //ILP=1
    CUDA_CHECK(cudaEventCreate(&new_start));
    CUDA_CHECK(cudaEventCreate(&new_stop));
    printf("Kernel1:\n");

    CUDA_CHECK(cudaEventRecord(new_start, 0));
    new_ker<<<BLOCKS, THPB>>>(dx);
    CUDA_CHECK(cudaEventRecord(new_stop, 0));
    /* A bad launch configuration is only reported through
     * cudaGetLastError(); without this check a failed launch would be
     * "timed" as if it had run. */
    CUDA_CHECK(cudaGetLastError());
    /* Errors raised while the kernel executes surface here. */
    CUDA_CHECK(cudaEventSynchronize(new_stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed, new_start, new_stop));

    CUDA_CHECK(cudaMemcpy(x, dx, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventDestroy(new_start));
    CUDA_CHECK(cudaEventDestroy(new_stop));
    CUDA_CHECK(cudaFree(dx));

    /* Close the profiling range before tearing the device down. */
    cudaProfilerStop();
    CUDA_CHECK(cudaDeviceReset());

    printf("\t%f\n", elapsed);
    /* elapsed is in milliseconds. Each loop iteration is one multiply plus
     * one add (2 flops) per thread, so
     *   GFLOPS = 2 * LOOP * threads / (elapsed * 1e-3 s) / 1e9
     *          = 2e-6 * LOOP * threads / elapsed. */
    gflops = 2.0e-6 * ((double)(LOOP) * (double)count / (double)elapsed);
    printf("\t%f\n", gflops);

    free(x);
    return 0;
}
复制代码 |