I have problem with passing structure to kernel local memory. Here is the kernel kode:
typedef struct data {
unsigned long wId; // group_id
unsigned long iId[1]; // global_item_id
} DATA;
__kernel void tKernel(__global DATA *x, __local DATA tmp) {
int wd = get_work_dim();
// x dimension
int xGrId = get_group_id(0);
int xLId = get_local_id(0);
int xGlId = get_global_id(0);
x += xGrId;
x->wId = tmp.wId;
x->iId[xLId] = ++tmp.wId;
}
Here is the host code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define GLOBAL_ITEM_SIZE (1)
#define LOCAL_ITEM_SIZE (1)
#define MAX_SOURCE_SIZE (0x100000)
typedef struct data {
unsigned long wId;
unsigned long iId[LOCAL_ITEM_SIZE];
} DATA;
int main() {
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_mem cmPinnedBufOut = NULL;
DATA *cDataOut = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t group_size = GLOBAL_ITEM_SIZE / LOCAL_ITEM_SIZE;
FILE *fp;
const char fileName[] = "./kernel.cl";
size_t source_size;
char *source_str;
/* Load kernel source file */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(EXIT_FAILURE);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue with measurment of preformance */
commandQueue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create memory object */
cmPinnedBufOut = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, group_size * sizeof(DATA), NULL, &ret);
cDataOut = (DATA *)malloc(group_size * sizeof(DATA));
/* Create kernel program from source file */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
assert(ret == CL_SUCCESS);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("\nFail to build the program\n");
char buffer[10240];
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
printf("%s\n", buffer);
exit(EXIT_FAILURE);
}
/* Create data parallel OpenCL kernel */
kernel = clCreateKernel(program, "tKernel", &ret);
assert(ret == CL_SUCCESS);
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&cmPinnedBufOut);
assert(ret == CL_SUCCESS);
DATA tmp;
tmp.wId = 66;
ret = clSetKernelArg(kernel, 1, sizeof(DATA), &tmp);
assert(ret == CL_SUCCESS);
size_t global_item_size = GLOBAL_ITEM_SIZE;
size_t local_item_size = LOCAL_ITEM_SIZE;
/* Execute OpenCL kernel as data parallel */
ret = clEnqueueNDRangeKernel(comm开发者_JAVA百科andQueue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (ret == CL_INVALID_WORK_GROUP_SIZE) {
printf("Invalid work group size: error when compute group size: %lu/%lu", global_item_size, local_item_size);
exit(EXIT_FAILURE);
}
assert(ret == CL_SUCCESS);
/* Transfer result to host */
ret = clEnqueueReadBuffer(commandQueue, cmPinnedBufOut, CL_TRUE, 0, group_size * sizeof(DATA), cDataOut, 0, NULL, NULL);
assert(ret == CL_SUCCESS);
/* Display Results */
for (int i = 0; i < group_size; i++) {
printf("%d: -> group_id %lu ~> work_item_ids: ", i, cDataOut[i].wId);
for (int j = 0; j < LOCAL_ITEM_SIZE; j++)
printf("%2lu, ", cDataOut[i].iId[j]);
printf("\n");
}
printf("\n");
/* Finalization */
ret = clFlush(commandQueue);
ret = clFinish(commandQueue); // blockink function, wait until all queue cmd are finished
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
free(source_str);
return 0;
}
So I expected as result 0: -> group_id 66 ~> work_item_ids: 67,
But I get 0: -> group_id 0 ~> work_item_ids: 1, From this I conclude that the hh structure with the number 66 was not reading correctly by the kernel. I try to put this same way on integer number and this works perfectly. So my question is, am I doing something wrong, or there isn't way to copy data structure from host to device local memory, or is there another way to doing this?The clSetKernelArg
for __local
buffers only specifies the size, and the pointer must be 0. See OpenCL spec 5.7.2. There is no way you can initialize local memory from the host.
精彩评论