/****************************************************************
** t1_t2: element-wise kernel over `row * col` elements that are
** indexed as one flat, row-major array (element (i, j) lives at
** i * col + j).
**
**   t1 = D1 .* conj(H1) + D2 .* conj(H2)
**   t2 = abs(H1).^2 + abs(H2).^2
**
** NOTE(review): the original header described t2 as including a
** "+ gama" term, but this kernel has no gama input and has never
** added it. Confirm whether gama is applied elsewhere.
**
** Parameters:
**   H1_d, H2_d, D1_d, D2_d  inputs, row * col elements each
**   t1_d                    complex output, row * col elements
**   t2_d                    real output, row * col elements
**   row, col                matrix dimensions; nothing is done if
**                           either is <= 0
**
** Launch layout: any 1D/2D/3D grid and block shape is accepted.
** Threads are linearised (threadIdx.x fastest) and walk the flat
** array with a grid-sized stride, so neighbouring threads of a block
** touch neighbouring elements and every element is processed even
** when the grid is smaller than the matrix. No shared memory is
** used: each element is only reused by the thread that owns it, so
** operands are loaded once into locals instead.
*****************************************************************/
__global__ void t1_t2(compx* H1_d,compx* H2_d,compx* D1_d,compx* D2_d,compx* t1_d,double *t2_d,int row,int col)
{
    if (row <= 0 || col <= 0)
        return;

    // Element count and indices are kept in size_t so that large
    // matrices do not overflow 32-bit int arithmetic.
    const size_t total = (size_t)row * (size_t)col;

    // Linear thread id inside the block (x fastest) and linear block id
    // inside the grid, so consecutive threads map to consecutive elements.
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y * blockDim.z;
    const size_t threadInBlock =
        threadIdx.x + (size_t)blockDim.x * (threadIdx.y + (size_t)blockDim.y * threadIdx.z);
    const size_t blockInGrid =
        blockIdx.x + (size_t)gridDim.x * (blockIdx.y + (size_t)gridDim.y * blockIdx.z);
    const size_t stride = threadsPerBlock * gridDim.x * gridDim.y * gridDim.z;

    for (size_t idx = blockInGrid * threadsPerBlock + threadInBlock; idx < total; idx += stride)
    {
        // Read every operand exactly once. All loads happen before any
        // store, so the result is also correct if t1_d aliases an input.
        const compx h1 = H1_d[idx];
        const compx h2 = H2_d[idx];
        const compx d1 = D1_d[idx];
        const compx d2 = D2_d[idx];

        // D .* conj(H): real = Dr*Hr + Di*Hi, imag = Hr*Di - Dr*Hi.
        // The evaluation order matches the original expressions.
        t1_d[idx].real = (d1.real * h1.real + d1.imag * h1.imag) +
                         d2.real * h2.real + d2.imag * h2.imag;
        t1_d[idx].imag = (h1.real * d1.imag - d1.real * h1.imag) +
                         h2.real * d2.imag - d2.real * h2.imag;

        // |H1|^2 + |H2|^2
        t2_d[idx] = (h1.real * h1.real + h1.imag * h1.imag) +
                    (h2.real * h2.real + h2.imag * h2.imag);
    }
}
上面是我的 kernel,其中 D1、H1 等数据都被重复使用,我想把它们放到 shared memory 中,可是 shared memory 只有 48 KB。上面这个 kernel 有没有好的方法可以优化呢?
|