Here's a simple program:
void multiply(const int* v_in, const int* w_in, int n_v, int n_w, int* w_out)
{
    for (int i = 0; i < n_w; i++)
    {
        int sum = 0;
        for (int j = 0; j < n_v; j++)
            sum += (w_in[i] * v_in[j]) >> 1;
        w_out[i] = sum;
    }
}
Presume n_v, n_w ~ 10^6. Clearly, there are at least a dozen equivalent ways to do this in CUDA, differing in how the (n_v*n_w) operations are subdivided into threads, with and without shared memory... Which way should, theoretically speaking, be the fastest?
The simplest approach:
__global__ void multiply(const int* v_in, const int* w_in, int n_v, int n_w, int* w_out)
{
    extern __shared__ int v[];               // dynamic shared memory, blockDim.x ints
    for (int i = threadIdx.x; i < n_w; i += blockDim.x)
    {
        int w = w_in[i];                     // coalesced load
        int sum = 0;
        // Assumes n_v and n_w are multiples of blockDim.x, so every thread
        // runs the same number of iterations and reaches each barrier.
        for (int j = 0; j < n_v; j += blockDim.x)
        {
            v[threadIdx.x] = v_in[j + threadIdx.x];  // stage a tile of v_in in shared memory
            __syncthreads();                 // tile must be complete before anyone reads it
            for (int k = 0; k < blockDim.x; ++k)
                sum += (w * v[k]) >> 1;      // all threads read the same v[k]: broadcast, no bank conflicts
            __syncthreads();                 // ouch: can't overwrite the tile while others still read it
        }
        w_out[i] = sum;                      // coalesced store
    }
}
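For completeness, a minimal host-side launch might look like the sketch below. It follows from the assumptions above: a single block (matching the block-local indexing in the kernel) and a block size that divides both n_v and n_w. The device pointers d_v, d_w, and d_out are hypothetical names, assumed to be already allocated and populated via cudaMalloc/cudaMemcpy.

// Minimal launch sketch: one block of BLOCK threads, with a dynamic
// shared-memory tile of BLOCK ints (the third launch parameter is what
// sizes the kernel's extern __shared__ array).
// d_v, d_w, d_out: hypothetical device pointers, already set up.
const int BLOCK = 256;                       // must divide n_v and n_w here
multiply<<<1, BLOCK, BLOCK * sizeof(int)>>>(d_v, d_w, n_v, n_w, d_out);
cudaDeviceSynchronize();                     // wait for the kernel to finish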