Here is an output of Compute Visual Profiler for my kernel on GT 440:
- Kernel details: Grid size: [100 1 1], Block size: [256 1 1]
- Register Ratio: 0.84375 ( 27开发者_运维百科648 / 32768 ) [35 registers per thread]
- Shared Memory Ratio: 0.336914 ( 16560 / 49152 ) [5520 bytes per Block]
- Active Blocks per SM: 3 (Maximum Active Blocks per SM: 8)
- Active threads per SM: 768 (Maximum Active threads per SM: 1536)
- Potential Occupancy: 0.5 ( 24 / 48 )
- Occupancy limiting factor: Registers
Please, pay your attention to the bullets marked bold. Kernel execution time is 121195 us
.
I reduced a number of registers per thread by moving some local variables to the shared memory. The Compute Visual Profiler output became:
- Kernel details: Grid size: [100 1 1], Block size: [256 1 1]
- Register Ratio: 1 ( 32768 / 32768 ) [30 registers per thread]
- Shared Memory Ratio: 0.451823 ( 22208 / 49152 ) [5552 bytes per Block]
- Active Blocks per SM: 4 (Maximum Active Blocks per SM: 8)
- Active threads per SM: 1024 (Maximum Active threads per SM: 1536)
- Potential Occupancy: 0.666667 ( 32 / 48 )
- Occupancy limiting factor: Registers
Hence, now 4
blocks are simultaneously executed on a single SM versus 3
blocks in the previous version. However, the execution time is 115756 us
, which is almost the same! Why? Aren't the blocks totally independent being executed on different CUDA cores?
You are implicitly assuming that higher occupancy automatically translates into higher performance. That is most often not the case.
The NVIDIA architecture needs a certain number of active warps per MP in order to hide the instruction pipeline latency of the GPU. On your Fermi based card, that requirement translates to a minimum occupancy of about 30%. Aiming for higher occupancies than that minimum will not necessarily result in higher throughput, as the latency bottleneck can have moved to another part of the GPU. Your entry level GPU doesn't have a lot of memory bandwidth, and it is quite possible that 3 blocks per MP is sufficient to make you code memory bandwidth limited, in which case increasing the number of blocks won't have any effect on performance (it might even go down because of increased memory controller contention and cache misses). Further, you said you spilled variables to shared memory in order to reduce the register foot print of the kernel. On Fermi, shared memory only has about 1000 Gb/s of bandwidth, compared to about 8000 Gb/s for registers (see the link below for the microbenchmarking results which demonstrate this). So you have moved variables to slower memory, which may also have a negative effect on performance, offsetting any benefit which high occupancy affords.
If you have not already seen it, I highly recommend Vasily Volkov's presentation from GTC 2010 "Better performance at lower occupancy" (pdf). Here is it shown how exploiting instruction level parallelism can increase GPU throughput to very high levels at very, very low levels of occupancy.
talonmies has already answered your question, so I just want to share a code inspired by the first part of V. Volkov's presentation mentioned in the answer above.
This is the code:
#include<stdio.h>
#define N_ITERATIONS 8192
//#define DEBUG
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x ;
if (tid < N) {
int a = d_a[tid];
int b = d_b[tid];
int c = d_c[tid];
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a = a * b + c;
}
d_a[tid] = a;
}
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/2];
int b2 = d_b[tid+N/2];
int c2 = d_c[tid+N/2];
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
}
d_a[tid] = a1;
d_a[tid+N/2] = a2;
}
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/4) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/4];
int b2 = d_b[tid+N/4];
int c2 = d_c[tid+N/4];
int a3 = d_a[tid+N/2];
int b3 = d_b[tid+N/2];
int c3 = d_c[tid+N/2];
int a4 = d_a[tid+3*N/4];
int b4 = d_b[tid+3*N/4];
int c4 = d_c[tid+3*N/4];
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
a3 = a3 * b3 + c3;
a4 = a4 * b4 + c4;
}
d_a[tid] = a1;
d_a[tid+N/4] = a2;
d_a[tid+N/2] = a3;
d_a[tid+3*N/4] = a4;
}
}
/********/
/* MAIN */
/********/
void main() {
const int N = 1024;
int *h_a = (int*)malloc(N*sizeof(int));
int *h_a_result_host = (int*)malloc(N*sizeof(int));
int *h_a_result_device = (int*)malloc(N*sizeof(int));
int *h_b = (int*)malloc(N*sizeof(int));
int *h_c = (int*)malloc(N*sizeof(int));
for (int i=0; i<N; i++) {
h_a[i] = 2;
h_b[i] = 1;
h_c[i] = 2;
h_a_result_host[i] = h_a[i];
for(unsigned int k = 0; k < N_ITERATIONS; k++) {
h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
}
}
int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));
// --- Creating events for timing
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/***********/
/* KERNEL0 */
/***********/
cudaEventRecord(start, 0);
kernel0<<<1, N>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
/***********/
/* KERNEL1 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(start, 0);
kernel1<<<1, N/2>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
/***********/
/* KERNEL2 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(start, 0);
kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
cudaDeviceReset();
}
On my GeForce GT540M, the result is
kernel0 GFlops = 21.069281 Occupancy = 66%
kernel1 GFlops = 21.183354 Occupancy = 33%
kernel2 GFlops = 21.224517 Occupancy = 16.7%
which means that kernels with lower occupancy can still exhibit high performance, if Instruction Level Parallelism (ILP) is exploited.
精彩评论