点积运算(x1,x2,x3,x4......)•(y1,y2,y3,y4......)=x1y1+x2y2+x3y3+x4y4+...... CUDA代码如下:
#include <stdio.h>
#include <stdlib.h>  // BUG FIX: main() calls malloc/free, which were never declared
#define imin(a,b) (a<b?a:b)
const int N = 33 * 1024;          // number of elements in each input vector
const int threadsPerBlock = 256;  // threads per block; must be a power of two for the tree reduction
// Blocks in the grid: enough to cover N at one element per thread, capped at 32
// (the kernel's grid-stride loop handles any leftover elements).
const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
// Per-block partial dot product of a and b (each of length N).
// Each block writes its partial sum to c[blockIdx.x]; the host finishes
// the reduction over the gridDim.x partial results.
// Precondition: blockDim.x == threadsPerBlock and is a power of two.
__global__ void dot(float *a, float *b, float *c)
{
    // Block-local scratch for the reduction; not visible to other blocks.
    __shared__ float cache[threadsPerBlock];
    int lane = threadIdx.x;

    // Grid-stride loop: every thread accumulates the products of the
    // elements it owns (one every gridDim.x * blockDim.x positions),
    // so the kernel is correct for any N relative to the launch size.
    float sum = 0;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N;
         idx += blockDim.x * gridDim.x)
    {
        sum += a[idx] * b[idx];
    }
    cache[lane] = sum;

    // Every thread's write to cache must land before any thread reads it.
    __syncthreads();

    // Tree reduction in shared memory: halve the active range each pass.
    // This is why blockDim.x must be a power of two. The barrier sits
    // outside the divergent if, so all threads in the block reach it.
    for (int half = blockDim.x / 2; half > 0; half /= 2)
    {
        if (lane < half)
            cache[lane] += cache[lane + half];
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (lane == 0)
        c[blockIdx.x] = cache[0];
}
int main(void)
{
    float *a, *b, *partial_c, c;
    float *dev_a, *dev_b, *dev_partial_c;

    // Allocate host memory.
    a = (float*)malloc(N * sizeof(float));
    b = (float*)malloc(N * sizeof(float));
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    if (!a || !b || !partial_c) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Allocate device memory.
    cudaMalloc((void**)&dev_a, N * sizeof(float));
    cudaMalloc((void**)&dev_b, N * sizeof(float));
    cudaMalloc((void**)&dev_partial_c, blocksPerGrid * sizeof(float));

    // Fill the host arrays.
    // BUG FIX: the original wrote `a = i; b = i*2;`, assigning to the
    // pointers themselves instead of the elements a[i] / b[i].
    for (int i = 0; i < N; i++)
    {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    // Copy the inputs to the GPU.
    cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice);

    dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
    // Launch-configuration errors do not surface from the <<<>>> expression
    // itself; check explicitly. The blocking memcpy below synchronizes and
    // surfaces any asynchronous execution error.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Copy the per-block partial sums back to the CPU.
    cudaMemcpy(partial_c, dev_partial_c, blocksPerGrid * sizeof(float),
               cudaMemcpyDeviceToHost);

    // Finish the reduction on the CPU.
    // BUG FIX: the original accumulated the pointer (`c += partial_c;`)
    // instead of the element partial_c[i].
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++)
    {
        c += partial_c[i];
    }

    // Closed form: sum_{i=0}^{N-1} i*(2i) = 2 * sum of squares up to N-1.
#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
    printf("Does GPU value %.6g = %.6g?\n", c, 2 * sum_squares((float)(N - 1)));

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_partial_c);
    free(a);
    free(b);
    free(partial_c);
    return 0;
}
为什么要用 temp += a[tid]*b[tid]，而不是 temp = a[tid]*b[tid]？因为 N（33*1024 个元素）大于线程总数（最多 32*256 = 8192 个线程），每个线程要通过 while 循环以 blockDim.x*gridDim.x 的步长处理多个元素，所以必须用 += 把自己负责的所有乘积累加到 temp 中；如果用 =，temp 只会保留最后一次迭代的乘积，前面的结果全部被覆盖丢失。