没有问题啊，这是我的代码，看看！
// Host-side driver for the sumOfSquares kernel:
//   1. fill a host array with 0 .. DATA_SIZE-1,
//   2. allocate device buffers for the input, one int result and one clock_t,
//   3. copy the input to the device and launch the kernel,
//   4. copy the clock_t value back into htimer and release the device buffers.
int *gpu_input = 0;
int n;
// NOTE(review): if this fragment lives inside a function, l_input is an
// automatic array; a large DATA_SIZE could exceed the stack -- confirm.
int l_input[DATA_SIZE];
for (n = 0; n < DATA_SIZE; n++)
{
    l_input[n] = n;
}
int *gpu_iResult = 0;
clock_t *gpu_timerResult = 0;
clock_t htimer;

// Device allocations: DATA_SIZE ints of input, one int and one clock_t of output.
CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_input, sizeof(int) * DATA_SIZE));
CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_iResult, sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc((void**)&gpu_timerResult, sizeof(clock_t)));
CUDA_SAFE_CALL(cudaMemcpy(gpu_input, l_input, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));

// Launch with 1 block of 1 thread and 0 bytes of dynamic shared memory.
// The launch token must be written as "<<<" with no embedded spaces.
sumOfSquares<<<1, 1, 0>>>(gpu_input, gpu_iResult, gpu_timerResult);
// A kernel launch returns no status itself; query it explicitly so a bad
// launch is reported here rather than by the copy that follows.
CUDA_SAFE_CALL(cudaGetLastError());

// Blocking copy on the default stream: it completes only after the kernel has finished.
CUDA_SAFE_CALL(cudaMemcpy(&htimer, gpu_timerResult, sizeof(clock_t), cudaMemcpyDeviceToHost));

// NOTE(review): gpu_iResult is released below without being copied back to
// the host in this fragment; presumably the kernel stores its result there --
// confirm whether a device-to-host copy of it is intended.
CUDA_SAFE_CALL(cudaFree(gpu_input));
CUDA_SAFE_CALL(cudaFree(gpu_iResult));
CUDA_SAFE_CALL(cudaFree(gpu_timerResult));