CUDA "unspecified launch failure" accessing memory_问答_开发者

what i'm trying to do is very simple. each thread reads sub-array from global array stored in the global memory. then it do some calculations and store the result in static array. at last the output is stored back in another array in global memory when i comment the line which writes the static array to the global array the kernel run.as shown in the code. any ideas?

GPU kernel :

#ifndef _TEMPLATE_KERNEL_H_
#define _TEMPLATE_KERNEL_H_

#include <stdio.h>

__device__  void
DecompressBlockGPU(unsigned char *compressed_block,unsigned char *compressed_size,
                    int array_length,unsigned char *decompressed_block)
{       
    int j = 0;

    for(int i = 0 ; i < array_length ;i++)
    {
        for(int idx = 0 ; idx < compressed_size[i]; idx++)
        {
            decompressed_block[j] = compressed_block[i];
            j++;
        }
    }
}
__global__ void

gpu_test(unsigned char *compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,
        unsigned char *output, int BlockSize,int BlockWidth,int BlockHeight,
        int cols,int xTB,int yTB,int xTH,int yTH,unsigned char *aux_array)
{
    int x_max = xBlocks ;
    int y_max = yBlocks ;

    int x_block = blockIdx.x ; 
    int y_block = blockIdx.y ;

    x_max = gridDim.x*blockDim.x ;
    y_max = gridDim.y*blockDim.y ;

    x_block = (blockIdx.x*xTH); 
    y_block = (blockIdx.y*yTH);
    int x_block1 = x_block + threadIdx.x;
    int y_block1 = y_block + threadIdx.y;

    int block_idx = y_block1*xBlocks + x_block1;
    unsigned char *temp_ptr = compressed_data + OffsetsArray[block_idx];        
    int *array_length = (int *)temp_ptr;
    unsigned char *compressed_size = compressed_data + OffsetsArray[block_idx] + 
                               array_length[0] +sizeof(int)/sizeof(unsigned char);
    unsigned char *compressed_block = compressed_data + OffsetsArray[block_idx] + 
                               sizeof(int)/sizeof(unsigned char);

    aux_array = aux_array + (BlockWidth+2)*(BlockHeight+2)*block_idx;
    aux_array[block_idx]=array_length[0];

    unsigned char decompressed_block[72];
    unsigned char extracted_block[32];

    DecompressBlockGPU(compressed_block,compressed_size,array_length[0],
                             &decompressed_block[0]);

    if(block_idx == 0)
    {
        for(int i=0;i<16;i++) aux_array[i]= decompressed_block[i]; //fails  
        for(int i=16;i<16*36;i++) aux_array[i]=1;//works
    }
}
#endif

CPU functions :

unsigned char *runGPU(unsigned char *d_compressed_data,int *OffsetsArray,int xBlocks,int yBlocks,unsigned char *h_output)
{


    printf("xBlocks =%d yBlocks =%d  \n",xBlocks,yBlocks);



    int xTB = 4;
    int yTB = 4;
    int xTH = 1;
    int yTH = 1; 



    unsigned char *d_output;
    unsigned char *d_aux_array;
    unsigned char *h_aux_array;

    int mem_size = image_len*sizeof(unsigned char);
    int big_mem_size = sizeof(unsigned char)*xBlocks*yBlocks*(BlockWidth+2)*(BlockHeight+2);

    cutilSafeCall( cudaMalloc( (void**) &d_output, mem_size));
    cutilSafeCall( cudaMalloc( (void**) &d_aux_array,big_mem_size));
    h_aux_array = (unsigned char *)malloc(big_mem_size);


    float time = 0;
    float totalTime = 0;
    cudaEvent_t start_event4, stop_event4;
    cutilSafeCall( cudaEventCreate(&start_event4) );
    cutilSafeCall( cudaEventCreate(&stop_event4) );
    cutilSafeCall( cudaEventRecord(start_event4, 0) );

    dim3 grid(xTB,yTB, 1);
    dim3 threads( xTH, yTH, 1);

    gpu_test<<<grid,threads>&开发者_运维问答gt;>(d_compressed_data,OffsetsArray,xBlocks,yBlocks,d_output,BlockSize,BlockWidth,BlockHeight,cols,xTB,yTB,xTH,yTH,d_aux_array);
    cudaThreadSynchronize();

    cutilSafeCall( cudaEventRecord(stop_event4, 0) );
    cutilSafeCall( cudaEventSynchronize(stop_event4) );
    time = 0;
    cutilSafeCall( cudaEventElapsedTime(&time, start_event4, stop_event4));
    totalTime += time;
    totalTime /= (1.0e3 * 1);
    shrLogEx(LOGBOTH | MASTER, 0, "GPU decompression Time = %.5f \n",totalTime); 

    cutilSafeCall(cudaMemcpy(h_output,d_output, mem_size, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(h_aux_array,d_aux_array, big_mem_size, cudaMemcpyDeviceToHost));


    cudaFree(d_output);
    cudaFree(d_aux_array);

    return h_aux_array;

}

is it clear now ?(after editing)

Try running your program through cuda-memcheck (or enable memory checking if you are using Parallel Nsight).