ice 发表于 2013-11-19 10:26
LZ您好:
请您提供您的具体实现的写法,以便分析。 - #include<stdio.h>
- #include<cuda_runtime.h>
- #include<omp.h>
- #include<time.h>
- #include<opencv2/core/core.hpp>
- #include<iostream>
- __global__ void
- vectorAdd(const float *A,const float *B, float *C,int numElements)
- {
- int i = blockDim.x * blockIdx.x + threadIdx.x;
- if (i < numElements)
- {
- C[i] = A[i] * B[i];
- }
- }
- int main(void)
- {
-
- cudaError_t err = cudaSuccess;
- int numElements = 5000000;
- size_t size = numElements * sizeof(float);
- printf("[Vector addition of %d elements]\n", numElements);
- float *h_a = (float *)malloc(size);
- float *h_b = (float *)malloc(size);
- float *h_c = (float *)malloc(size);
- for(int i = 0; i < numElements;++i)
- {
- h_a[i] = rand()/(float)RAND_MAX;
- h_b[i] = rand()/(float)RAND_MAX;
- }
- float *d_a = NULL;
- err = cudaMalloc((void**)&d_a,size);
- float *d_b = NULL;
- err = cudaMalloc((void **)&d_b,size);
-
- float *d_c = NULL;
- err = cudaMalloc((void**)&d_c,size);
- printf("Copy input data from the host memory to the CUDA device\n");
- err = cudaMemcpy(d_a,h_a,size,cudaMemcpyHostToDevice);
- err = cudaMemcpy(d_b,h_b,size,cudaMemcpyHostToDevice);
- /////////////////////////////////////////////////////////
- float *h_a1 = (float *)malloc(size);
- float *h_b1 = (float *)malloc(size);
- float *h_c1 = (float *)malloc(size);
- for(int i = 0; i < numElements;++i)
- {
- h_a1[i] = rand()/(float)RAND_MAX;
- h_b1[i] = rand()/(float)RAND_MAX;
- }
- float *d_a1 = NULL;
- err = cudaMalloc((void**)&d_a1,size);
- float *d_b1 = NULL;
- err = cudaMalloc((void **)&d_b1,size);
-
- float *d_c1 = NULL;
- err = cudaMalloc((void**)&d_c1,size);
- printf("Copy input data from the host memory to the CUDA device\n");
- err = cudaMemcpy(d_a1,h_a1,size,cudaMemcpyHostToDevice);
- err = cudaMemcpy(d_b1,h_b1,size,cudaMemcpyHostToDevice);
- /////////////////////////////////////////////////////////
- int threadsPerBlock = 256;
- int blocksPerGrid = (numElements + threadsPerBlock - 1)/threadsPerBlock;
- //long t1 = clock();
- int64 t = cv::getTickCount();
- #pragma omp parallel sections
- {
- #pragma omp section
- {
- printf("section1 threadid = %d\n",omp_get_thread_num());
- printf("cuda kernel launch with %d blocks of %d threads\n",blocksPerGrid,threadsPerBlock);
- //vectorAdd<<blocksPerGrid,threadsPerBlock>>(d_a,d_b,d_c,numElements);
- vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, numElements);
- }
- #pragma omp section
- {
- printf("section2 threadid = %d\n",omp_get_thread_num());
- printf("cuda kernel launch with %d blocks of %d threads\n",blocksPerGrid,threadsPerBlock);
- //vectorAdd<<blocksPerGrid,threadsPerBlock>>(d_a,d_b,d_c,numElements);
- vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a1, d_b1, d_c1, numElements);
- }
- }
- //long t2 = clock();
- //printf("cost time %ld\n",t2-t1);
- std::cout<<((cv::getTickCount() - t) / cv::getTickFrequency())*1000 << " msec"<<std::endl;
- err = cudaGetLastError();
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- // Copy the device result vector in device memory to the host result vector
- // in host memory.
- printf("Copy output data from the CUDA device to the host memory\n");
-
- err =cudaMemcpy(h_c,d_c,size,cudaMemcpyDeviceToHost);
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- // Verify that the result vector is correct
- for (int i = 0; i < numElements; ++i)
- {
- if (fabs(h_a[i] *h_b[i] - h_c[i]) > 1e-5)
- {
- fprintf(stderr, "Result verification failed at element %d!\n", i);
- exit(EXIT_FAILURE);
- }
- }
- printf("Test PASSED\n");
- err = cudaFree(d_a);
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- err = cudaFree(d_b);
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- err = cudaFree(d_c);
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- // Free host memory
- free(h_a);
- free(h_b);
- free(h_c);
- // Reset the device and exit
- err = cudaDeviceReset();
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
- printf("Done\n");
- return 0;
- }
复制代码 你看我这种思路 可以实现 多个线程同时访问gpu么 直接把sdk中向量相加的例子 改的 |