本帖最后由 lzhbch 于 2013-9-10 15:14 编辑
先描述一下自己的环境,win7 x64系统,VS2005, 650M
问题1:
同样一段代码,我用默认的CUDA 5.0 Runtime工程来跑与我把CUDA sample中的工程替换成同样的代码来跑,执行效率差距很大,不知道是什么原因造成的?sample使用的工程是simpleStreams工程。
首先是在CUDA 5.0 Runtime工程中的运行结果
然后是sample工程中的运行结果
所使用的程序来自CUDA的官方文章:How to Overlap Data Transfers in CUDA C/C++
代码为:
// Convenience wrapper for CUDA Runtime API calls: reports failures to
// stderr and passes the error code through so it can wrap any call inline.
//
// Fix: the original compiled the entire check away outside DEBUG/_DEBUG
// builds, silently ignoring every runtime error in release. The error is
// now always reported; only the hard assert remains debug-only.
inline
cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
#if defined(DEBUG) || defined(_DEBUG)
    assert(result == cudaSuccess);
#endif
  }
  return result;
}
// Adds sqrtf(sin(x)^2 + cos(x)^2) — i.e. 1.0f up to rounding — to each
// element this launch covers; `offset` selects which chunk of the array
// a multi-stream caller is processing. One element per thread.
// NOTE(review): there is no `i < n` guard — the caller must launch exactly
// count/blockDim blocks so every computed index stays in bounds.
__global__ void kernel(float *a, int offset)
{
  int idx = offset + blockIdx.x * blockDim.x + threadIdx.x;
  float x = (float)idx;
  float sinX = sinf(x);
  float cosX = cosf(x);
  a[idx] += sqrtf(sinX * sinX + cosX * cosX);
}
// Returns the largest absolute deviation from 1.0f over a[0..n-1].
// The kernel adds sqrt(sin^2+cos^2) == 1 to zero-initialized data, so the
// expected result is ~0. Returns 0.0f for n <= 0 (empty input).
float maxError(float *a, int n)
{
  float maxE = 0.0f;
  for (int i = 0; i < n; i++) {
    // fabsf keeps the computation in single precision; the original used
    // fabs, which promoted every element to double and back.
    float error = fabsf(a[i] - 1.0f);
    if (error > maxE) maxE = error;
  }
  return maxE;
}
// Benchmarks three host<->device transfer strategies over the same data:
//   1. sequential:  blocking copy -> kernel -> blocking copy
//   2. async V1:    per-stream {H2D, kernel, D2H} interleaved in one loop
//   3. async V2:    all H2D copies, then all kernels, then all D2H copies
// Based on NVIDIA's "How to Overlap Data Transfers in CUDA C/C++" sample.
// Usage: prog [deviceId]
//
// Fixes vs. original: kernel launches are now followed by
// checkCuda(cudaGetLastError()) — launch-configuration errors are otherwise
// silently dropped — and the unused dummyEvent was removed.
int main(int argc, char **argv)
{
  const int blockSize = 256, nStreams = 4;
  const int n = 4 * 1024 * blockSize * nStreams; // exact multiple of blockSize*nStreams
  const int streamSize = n / nStreams;           // elements per stream
  const int streamBytes = streamSize * sizeof(float);
  const int bytes = n * sizeof(float);

  // Select the device, optionally from the command line.
  int devId = 0;
  if (argc > 1) devId = atoi(argv[1]);
  cudaDeviceProp prop;
  checkCuda( cudaGetDeviceProperties(&prop, devId) );
  printf("Device : %s\n", prop.name);
  checkCuda( cudaSetDevice(devId) );

  // Pinned host memory is required for cudaMemcpyAsync to actually overlap.
  float *a, *d_a;
  checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned
  checkCuda( cudaMalloc((void**)&d_a, bytes) );   // device
  float ms; // elapsed time in milliseconds

  // Timing events and one stream per chunk.
  cudaEvent_t startEvent, stopEvent;
  cudaStream_t stream[nStreams];
  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );
  for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamCreate(&stream[i]) );

  // ---- baseline: sequential transfer and execute ----
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent, 0) );
  checkCuda( cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) );
  kernel<<<n/blockSize, blockSize>>>(d_a, 0);
  checkCuda( cudaGetLastError() ); // surface launch-config errors
  checkCuda( cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for sequential transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));

  // ---- asynchronous version 1: loop over {copy, kernel, copy} ----
  // NOTE(review): on devices with a single copy engine (e.g. GeForce 650M,
  // "1 copy engine" in deviceQuery) the interleaved H2D/D2H issue order here
  // can serialize on that one engine and prevent copy/kernel overlap — this
  // is expected hardware behavior, not a code bug. Verify with the profiler.
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent, 0) );
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
                               streamBytes, cudaMemcpyHostToDevice,
                               stream[i]) );
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    checkCuda( cudaGetLastError() );
    checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
                               streamBytes, cudaMemcpyDeviceToHost,
                               stream[i]) );
  }
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));

  // ---- asynchronous version 2: batched copies around batched kernels ----
  memset(a, 0, bytes);
  checkCuda( cudaEventRecord(startEvent, 0) );
  for (int i = 0; i < nStreams; ++i)
  {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
                               streamBytes, cudaMemcpyHostToDevice,
                               stream[i]) );
  }
  for (int i = 0; i < nStreams; ++i)
  {
    int offset = i * streamSize;
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    checkCuda( cudaGetLastError() );
  }
  for (int i = 0; i < nStreams; ++i)
  {
    int offset = i * streamSize;
    checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
                               streamBytes, cudaMemcpyDeviceToHost,
                               stream[i]) );
  }
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );
  checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
  printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
  printf("  max error: %e\n", maxError(a, n));

  // ---- cleanup ----
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
  for (int i = 0; i < nStreams; ++i)
    checkCuda( cudaStreamDestroy(stream[i]) );
  cudaFree(d_a);
  cudaFreeHost(a);
  return 0;
}
复制代码
问题2:
根据问题1中的代码内容,本来是想学习一下stream的并行执行的,结果发现我的机器kernel执行的时候有overlap,但是数据的copy却不能与kernel并发,不知道是什么原因造成的。
以下是我用sample工程执行的profiler图
问题比较多,原本我认为是因为我的工程属性造成数据传输无法并行,所以才使用sample中的工程来执行的,结果发现sample中的工程也无法并发传输,然后又发现程序运行的效率差距很大,其中两个程序都没有使用fast-math选项,结果一个问题变成两个了。卡的属性应当是支持并发传输的,deviceQuery.exe的执行结果中 Concurrent copy and kernel execution: Yes with 1 copy engine(s)。麻烦论坛里的高手帮忙解惑,谢谢了!
|