玫瑰幻想 发表于 2013-11-12 22:08
错误代码呢?
请配合点。
不好意思。kernel 代码如下:
// Strided global-memory bandwidth kernel: every thread loads 16 floats from
// global memory into shared memory, waits at a block barrier, and then stores
// the same 16 floats back to the addresses they were read from.
//
// Launch requirements:
//   - 1-D grid, exactly 256 threads per block (the shared-memory indexing
//     below hard-codes a 256-thread block).
//   - 16 KB of static shared memory per block (4096 floats).
//
// Addressing (A is reinterpreted as a flat float array):
//   - each block owns a run of 16 (= 1 << burst) consecutive floats that
//     starts at float offset blockIdx.x * 16;
//   - thread bits [3:0] select the float inside that run;
//   - thread bits [7:4] select one of 16 strides of (1 << (log_n - 7)) floats;
//   - the 16 accesses made by a thread are (1 << (log_n - 3)) floats apart.
//   The largest float index touched by a block is therefore
//     blockIdx.x * 16 + 15 + (15 << (log_n - 7)) + 15 * (1 << (log_n - 3)),
//   and the caller must size the buffer and the grid so that this stays in
//   bounds; the kernel performs no bounds check of its own.
//
// Parameters:
//   A            device buffer, accessed through a float* cast.
//   numElements  not read by this kernel.
//                NOTE(review): presumably the element count of A; the unit
//                (complex vs. float) is not visible here, so it is not used
//                for bounds checking - confirm against the caller.
//   log_n        sets the strides above; must be >= 7 so the shift counts are
//                non-negative. NOTE(review): looks like log2 of the number of
//                complex elements - confirm against the caller.
__global__ void ddr_stride_16_bandwidth(complex *A, int numElements, int log_n) // 256 threads per block.
{
    const int kThreadsPerBlock = 256;  // block size assumed by the shared-memory layout
    const int kRowsPerThread   = 16;   // loads/stores issued by each thread
    const int burst            = 4;    // log2 of consecutive floats owned by a block

    // Low `burst` bits of the thread index. The previous expression
    // (2 << burst) - 1 evaluated to 31 and let tid bit 4 leak into the low
    // field even though t_7_4 already accounts for it, which shifted every
    // odd-t_7_4 thread by 16 floats (into the next block's run, and past the
    // covered range for the highest block).
    const int burst_mask = (1 << burst) - 1;

    // log_n is identical for every thread, so this early exit is uniform
    // across the block and cannot strand threads at the barrier below.
    // Without it, log_n < 7 would produce a negative (undefined) shift count.
    if (log_n < 7)
        return;

    const int bid = blockIdx.x;
    const int tid = threadIdx.x;

    const int p_bid = bid << burst;        // first float of this block's run
    const int t_3_0 = tid & burst_mask;    // position inside the run
    const int t_7_4 = tid >> burst;        // which of the 16 strides
    const int p_mem = p_bid + (t_7_4 << (log_n - 7)) + t_3_0;
    const int inc_256 = 1 << (log_n - 3);  // distance between a thread's successive accesses

    float *p_A = (float *)A;
    __shared__ float s_2k_mem[kThreadsPerBlock * kRowsPerThread];

    // Global -> shared. Each thread fills its own column tid, tid+256, ...
#pragma unroll
    for (int row = 0; row < kRowsPerThread; ++row)
        s_2k_mem[tid + row * kThreadsPerBlock] = p_A[p_mem + inc_256 * row];

    // Separate the load phase from the store phase for the whole block.
    __syncthreads();

    // Shared -> global, back to exactly the addresses that were read.
#pragma unroll
    for (int row = 0; row < kRowsPerThread; ++row)
        p_A[p_mem + inc_256 * row] = s_2k_mem[tid + row * kThreadsPerBlock];

    // No trailing barrier: nothing follows the stores, so a __syncthreads()
    // right before kernel exit would protect nothing.
}
|