I am trying to simulate matrix multiplication in cuda C. Everything is correct except the output.
This is my program:
开发者_开发知识库#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define N 4
#define TILE_WIDTH 2
__global__ void MatMul(int*A, int* B, int* C) {
int sum;
int idx = threadIdx.x;
int idy = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int k ,uidx , uidy , i;
uidx = bx*TILE_WIDTH + idx;
uidy = by*TILE_WIDTH + idy;
sum = 0;
// Allocating memory in shared memory
__shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
__shared__ int temp2[TILE_WIDTH][TILE_WIDTH];
//copying the data to shared memory
for( i =0;i<N/TILE_WIDTH; i++)
{
temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N];
temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx];
__syncthreads();
// multiplying matrices in shared memory
for(k=0 ; k < TILE_WIDTH;k++) {
sum = sum + temp1[idy][k]*temp2[k][idx];
}
}
// synchronizing the threads
__syncthreads();
C[uidy*N + uidx] = sum;
}
int main( void ) {
int a[N][N], b[N][N], c[N][N]; //host copies of a,b,c
int *dev_a, *dev_b, *dev_c; //device copies of a,b,c
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );
// fill the matrices 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
for (int j=0; j < N; j++) {
a[i][j] = j+3;
b[i][j] = i+6;
}
}
//copy above a,b values to device
cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
// Prepare timer
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//start record
cudaEventRecord(start, 0);
// Kernel invocation with N threads
dim3 dimGrid(2,2,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);
//stop record
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//this is operation time
cudaEventElapsedTime(&time, start, stop);
//clean up
cudaEventDestroy(start);
cudaEventDestroy(stop);
//copy result to host
cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );
//output..
for (int i=0; i < N; i++){
for (int j=0; j < N; j++)
printf( "%d ", a[i][j]);
printf (" ");
for (int j=0; j < N; j++)
printf( "%d ", b[i][j]);
printf (" = ");
for (int j=0; j < N; j++)
printf( "%d ", c[i][j]);
printf ("\n");
}
//free the allocated memory in device
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
printf("\n multiplication done!!!\n");
printf("\n");
printf(" time elapsed in ms=%f\n",time);
getch();
return 0;
}
And this is my output:
3 4 5 6 6 6 6 6 108 108 115 115
3 4 5 6 7 7 7 7 108 108 115 115
3 4 5 6 8 8 8 8 108 108 115 115
3 4 5 6 9 9 9 9 108 108 115 115
It is showing wrong values. Please tell me any error in my program. I'm very new to CUDA C.
While I don't know what is wrong with your program, I think you should be able to diagnose it better using simpler matrices. Have you tried multiplying two Identity matrices? Or filled with all 1s. Repeated tests with various simple matrices should demonstrate how the cells are being combined.
Ultimately, I think you'll find a problem with the way you use TILE_WIDTH, but I cannot be sure.
This should fix it (in the i loop):
temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];
精彩评论