// Returns the SQUARED Euclidean distance between dataset rows z_d and i_d_d
// of the row-major matrix GpuArray_d (AttributeN_d columns per row).
// The caller (filterData_D) applies sqrtf to the result.
//
// BUG FIX: the original copied row z_d into dynamic shared memory
// (extern __shared__ float CurrentRow[]). That array is shared by the WHOLE
// block, while z_d differs per thread — so every thread raced to overwrite
// the same shared array, producing nondeterministic results (exactly the
// symptom described below: correct before CurrentRow[] was added, wrong
// after). The in-loop __syncthreads() was also reached from divergent
// control flow (threads leave the caller's grid-stride loop at different
// trip counts), which is undefined behavior. Per-thread data belongs in
// registers / global memory, not shared memory, so both are removed.
__device__ float compute_d( float *GpuArray_d, int AttributeN_d, int z_d, int i_d_d)
{
    const float *rowZ = GpuArray_d + (size_t)z_d   * AttributeN_d;
    const float *rowI = GpuArray_d + (size_t)i_d_d * AttributeN_d;
    float tempdist_d = 0.0f;
    for (int j = 0; j < AttributeN_d; j++)
    {
        // diff*diff instead of powf(x, 2.0): avoids a transcendental call
        // and the implicit double promotion of the 2.0 literal.
        float diff = rowZ[j] - rowI[j];
        tempdist_d += diff * diff;
    }
    return tempdist_d;
}
// Grid-stride kernel: for every dataset row z, writes the Euclidean distance
// between row z and the query row i_d into tempArrary_d[z].
// Expected launch: <<<BLOCK_NUM, THREAD_NUM>>>; time_d is currently unused
// (the clock() instrumentation is commented out).
__global__ void filterData_D(float *GpuArray, int Dataset_Dimension_d, int AttributeN_d, float *tempArrary_d, int i_d,clock_t *time_d)
{
    const int globalId = blockIdx.x * THREAD_NUM + threadIdx.x;
    const int stride   = BLOCK_NUM * THREAD_NUM;
    //if(threadIdx.x == 0) time_d[blockIdx.x] = clock();
    for (int z = globalId; z < Dataset_Dimension_d; z += stride)
    {
        // compute_d returns the squared distance; take the root here.
        float sqDist = compute_d(GpuArray, AttributeN_d, z, i_d);
        tempArrary_d[z] = sqrtf(sqDist);
    }
}
这是调用 KERNEL 的部分：
filterData_D <<< BLOCK_NUM, THREAD_NUM,49152,0>>>( gpudata, E_Dataset_Dimension, E_AttributeN, tempArrary_H, i, E_time );
AttributeN_d=168;
在 compute_d 里定义并给 CurrentRow[] 赋值之前，程序运行正常；加入 CurrentRow[] 的定义和赋值之后再运行，每次给出的结果都不一样。