clock() is not accurate enough.
Use CUDA events to measure the execution time of kernels or other CUDA operations (memcpy, etc.):
// Prepare: create the two events that will bracket the timed region.
// NOTE(review): the cudaError_t results of the runtime calls below are not
// checked, and the launch is not followed by cudaGetLastError() -- confirm
// whether the surrounding code needs error handling.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Start record: enqueue the start event on stream 0 (the default stream).
cudaEventRecord(start, 0);
// Do something on GPU: this launch (no stream argument, so also the default
// stream) is the work being timed.
MyKernel<<<dimGrid, dimBlock>>>(input_data, output_data);
// Stop event: enqueue the stop event on the same stream, after the kernel.
cudaEventRecord(stop, 0);
// Wait on the host for the stop event before querying the elapsed time.
cudaEventSynchronize(stop);
float elapsedTime;
// Time between the two events (in milliseconds, per the CUDA Runtime API
// documentation for cudaEventElapsedTime).
cudaEventElapsedTime(&elapsedTime, start, stop); // that's our time!
// Clean up: release both events.
cudaEventDestroy(start);
cudaEventDestroy(stop);
See CUDA Programming Guide, section 3.2.7.6
How about calling the clock() function in every CUDA thread to record start and end times, and storing them in an array laid out so that you can tell from the array indices which thread started and stopped at which time, like the following:
// Records one pair of clock() readings per thread into ts.
// The thread with flat index i = blockIdx.x * blockDim.x + threadIdx.x writes
// its start reading to ts[2 * i] and its stop reading to ts[2 * i + 1].
// No bounds check is performed, so ts must hold two entries for every
// launched thread.
__global__ void kclock(unsigned int *ts) {
    // First reading, taken before the region being measured.
    unsigned int t_begin = clock();

    // Code we need to measure should go here.

    // Second reading, taken after the region being measured.
    unsigned int t_end = clock();

    // Each thread owns two consecutive slots: [start, stop].
    unsigned int *slot = ts + (blockIdx.x * blockDim.x + threadIdx.x) * 2;
    slot[0] = t_begin;
    slot[1] = t_end;
}
Then use this array to find the minimum start time and the maximum stop time for the block you are interested in. For example, you can compute the range of indices in the time array that corresponds to block (0, 0) and take the min/max over that range to get the block's execution time.
I think long long int clock64() is what you are looking for?
See the CUDA Programming Guide, C Language Extensions, section B.11.
精彩评论