Indeed, you're right. Just now I noticed the break command before the barrier, that's irreducible control-flow. Yongjia, can you change your kernel so all threads execute the barriers? Victor On Wed, Jan 22, 2014 at 11:12 AM, Tom Stellard <tom@xxxxxxxxxxxx> wrote: > On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote: >> From: Yongjia Zhang <Zhang_Yong_jia@xxxxxxx> >> >> This is a better way to accomplish the box-blur cl operation by using ocl's >> local memory from the opencv source code. It use the local shared memory to >> reduce global memory access, which significantly reduces the kernel's processing >> time by 70 percent compared to the original one. Because of the barriers and >> local worksize limitation, processing with a radius larger than 110 becomes >> slower than original algorithm, so I keep the original kernels in order to deal >> with box-blur with radius larger than 110. >> All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU. >> >> Signed-off-by: Yongjia Zhang <yongjia.zhang@xxxxxxxxx> >> --- >> opencl/box-blur.cl | 66 +++++++++++++++++++++++++ >> opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++ >> operations/common/box-blur.c | 115 ++++++++++++++++++++++++++----------------- >> 3 files changed, 201 insertions(+), 46 deletions(-) >> >> diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl >> index e99bea4..a1da9de 100644 >> --- a/opencl/box-blur.cl >> +++ b/opencl/box-blur.cl >> @@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux, >> out[out_index] = mean / (float)(2 * radius + 1); >> } >> } >> + >> +__kernel void kernel_box_blur_fast(const __global float4 *in, >> + __global float4 *out, >> + __local float4 *column_sum, >> + const int width, >> + const int height, >> + const int radius, >> + const int size) >> +{ >> + const int local_id0 = get_local_id(0); >> + const int twice_radius = 2 * radius; >> + const int in_width = twice_radius + width; >> + const int in_height = twice_radius + height; >> + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); >> + int column_index_start,column_index_end; >> + int y = get_global_id(1) * size; >> + const int out_x = get_group_id(0) >> + * ( get_local_size(0) - twice_radius ) + local_id0 - radius; >> + const int in_x = out_x + radius; >> + int tmp_size = size; >> + int tmp_index = 0; >> + float4 tmp_sum = (float4)0.0f; >> + float4 total_sum = (float4)0.0f; >> + if( in_x < in_width ) >> + { >> + column_index_start = y; >> + column_index_end = y + twice_radius; >> + for( int i=0; i<twice_radius+1; ++i ) >> + tmp_sum+=in[(y+i)*in_width+in_x]; >> + column_sum[local_id0] = tmp_sum; >> + } >> + >> + barrier( CLK_LOCAL_MEM_FENCE ); >> + >> + while(1) >> + { >> + if( out_x < width ) >> + { >> + if( local_id0 >= radius >> + && local_id0 < get_local_size(0) - radius ) >> + { >> + total_sum = (float4)0.0f; >> + for( int i=0; i<twice_radius+1; ++i ) >> + total_sum += column_sum[local_id0-radius+i]; >> + out[y*width+out_x] = total_sum/area; >> + } >> + } >> + if( --tmp_size ==0 || y == height - 1 ) >> + break; >> + >> + barrier( CLK_LOCAL_MEM_FENCE ); > > Is this barrier call guaranteed to be executed by all threads? If not, > then this will produce undefined behavior. > > -Tom > >> + >> + ++y; >> + if( in_x < in_width ) >> + { >> + tmp_sum = column_sum[local_id0]; >> + tmp_sum -= in[(column_index_start)*in_width+in_x]; >> + tmp_sum += in[(column_index_end+1)*in_width+in_x]; >> + ++column_index_start; >> + ++column_index_end; >> + column_sum[local_id0] = tmp_sum; >> + } >> + >> + barrier( CLK_LOCAL_MEM_FENCE ); >> + } >> +} >> diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h >> index bfed601..8f6aa81 100644 >> --- a/opencl/box-blur.cl.h >> +++ b/opencl/box-blur.cl.h >> @@ -44,4 +44,70 @@ static const char* box_blur_cl_source = >> " out[out_index] = mean / (float)(2 * radius + 1); \n" >> " } \n" >> "} \n" >> +" \n" >> +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n" >> +" __global float4 *out, \n" >> +" __local float4 *column_sum, \n" >> +" const int width, \n" >> +" const int height, \n" >> +" const int radius, \n" >> +" const int size) \n" >> +"{ \n" >> +" const int local_id0 = get_local_id(0); \n" >> +" const int twice_radius = 2 * radius; \n" >> +" const int in_width = twice_radius + width; \n" >> +" const int in_height = twice_radius + height; \n" >> +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n" >> +" int column_index_start,column_index_end; \n" >> +" int y = get_global_id(1) * size; \n" >> +" const int out_x = get_group_id(0) \n" >> +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n" >> +" const int in_x = out_x + radius; \n" >> +" int tmp_size = size; \n" >> +" int tmp_index = 0; \n" >> +" float4 tmp_sum = (float4)0.0f; \n" >> +" float4 total_sum = (float4)0.0f; \n" >> +" if( in_x < in_width ) \n" >> +" { \n" >> +" column_index_start = y; \n" >> +" column_index_end = y + twice_radius; \n" >> +" for( int i=0; i<twice_radius+1; ++i ) \n" >> +" tmp_sum+=in[(y+i)*in_width+in_x]; \n" >> +" column_sum[local_id0] = tmp_sum; \n" >> +" } \n" >> +" \n" >> +" barrier( CLK_LOCAL_MEM_FENCE ); \n" >> +" \n" >> +" while(1) \n" >> +" { \n" >> +" if( out_x < width ) \n" >> +" { \n" >> +" if( local_id0 >= radius \n" >> +" && local_id0 < get_local_size(0) - radius ) \n" >> +" { \n" >> +" total_sum = (float4)0.0f; \n" >> +" for( int i=0; i<twice_radius+1; ++i ) \n" >> +" total_sum += column_sum[local_id0-radius+i]; \n" >> +" out[y*width+out_x] = total_sum/area; \n" >> +" } \n" >> +" } \n" >> +" if( --tmp_size ==0 || y == height - 1 ) \n" >> +" break; \n" >> +" \n" >> +" barrier( CLK_LOCAL_MEM_FENCE ); \n" >> +" \n" >> +" ++y; \n" >> +" if( in_x < in_width ) \n" >> +" { \n" >> +" tmp_sum = column_sum[local_id0]; \n" >> +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n" >> +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n" >> +" ++column_index_start; \n" >> +" ++column_index_end; \n" >> +" column_sum[local_id0] = tmp_sum; \n" >> +" } \n" >> +" \n" >> +" barrier( CLK_LOCAL_MEM_FENCE ); \n" >> +" } \n" >> +"} \n" >> ; >> diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c >> index afc19ea..cb77ec0 100644 >> --- a/operations/common/box-blur.c >> +++ b/operations/common/box-blur.c >> @@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation) >> #include "buffer/gegl-buffer-cl-iterator.h" >> >> #include "opencl/box-blur.cl.h" >> - >> static GeglClRunData *cl_data = NULL; >> - >> static gboolean >> cl_box_blur (cl_mem in_tex, >> cl_mem aux_tex, >> @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex, >> gint radius) >> { >> cl_int cl_err = 0; >> - size_t global_ws_hor[2], global_ws_ver[2]; >> - size_t local_ws_hor[2], local_ws_ver[2]; >> - >> + size_t global_ws_hor[2], global_ws_ver[2], global_ws[2]; >> + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2]; >> + size_t step_size ; >> if (!cl_data) >> { >> - const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL}; >> + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL}; >> cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name); >> } >> >> if (!cl_data) >> return TRUE; >> - >> - local_ws_hor[0] = 1; >> - local_ws_hor[1] = 256; >> - global_ws_hor[0] = roi->height + 2 * radius; >> - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; >> - >> - local_ws_ver[0] = 1; >> - local_ws_ver[1] = 256; >> - global_ws_ver[0] = roi->height; >> - global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; >> - >> - >> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], >> - sizeof(cl_mem), (void*)&in_tex, >> - sizeof(cl_mem), (void*)&aux_tex, >> - sizeof(cl_int), (void*)&roi->width, >> - sizeof(cl_int), (void*)&radius, >> - NULL); >> - CL_CHECK; >> - >> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >> - cl_data->kernel[0], 2, >> - NULL, global_ws_hor, local_ws_hor, >> - 0, NULL, NULL); >> - CL_CHECK; >> - >> - >> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], >> - sizeof(cl_mem), (void*)&aux_tex, >> - sizeof(cl_mem), (void*)&out_tex, >> - sizeof(cl_int), (void*)&roi->width, >> - sizeof(cl_int), (void*)&radius, >> - NULL); >> - CL_CHECK; >> - >> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >> - cl_data->kernel[1], 2, >> - NULL, global_ws_ver, local_ws_ver, >> - 0, NULL, NULL); >> - CL_CHECK; >> + step_size = 64; >> + local_ws[0]=256; >> + local_ws[1]=1; >> + >> + >> + if( radius <=110 ) >> + { >> + global_ws[0] = (roi->width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0]; >> + global_ws[1] = (roi->height + step_size - 1) / step_size; >> + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2], >> + sizeof(cl_mem), (void *)&in_tex, >> + sizeof(cl_mem), (void *)&out_tex, >> + sizeof(cl_float4)*local_ws[0], (void *)NULL, >> + sizeof(cl_int), (void *)&roi->width, >> + sizeof(cl_int), (void *)&roi->height, >> + sizeof(cl_int), (void *)&radius, >> + sizeof(cl_int), (void *)&step_size, NULL); >> + CL_CHECK; >> + cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(), >> + cl_data->kernel[2], 2, >> + NULL, global_ws, local_ws, 0, NULL, NULL ); >> + CL_CHECK; >> + >> + } >> + else >> + { >> + local_ws_hor[0] = 1; >> + local_ws_hor[1] = 256; >> + global_ws_hor[0] = roi->height + 2 * radius; >> + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; >> + >> + local_ws_ver[0] = 1; >> + local_ws_ver[1] = 256; >> + global_ws_ver[0] = roi->height; >> + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; >> + >> + >> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], >> + sizeof(cl_mem), (void*)&in_tex, >> + sizeof(cl_mem), (void*)&aux_tex, >> + sizeof(cl_int), (void*)&roi->width, >> + sizeof(cl_int), (void*)&radius, >> + NULL); >> + CL_CHECK; >> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >> + cl_data->kernel[0], 2, >> + NULL, global_ws_hor, local_ws_hor, >> + 0, NULL, NULL); >> + CL_CHECK; >> + >> + >> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], >> + sizeof(cl_mem), (void*)&aux_tex, >> + sizeof(cl_mem), (void*)&out_tex, >> + sizeof(cl_int), (void*)&roi->width, >> + sizeof(cl_int), (void*)&radius, >> + NULL); >> + CL_CHECK; >> + >> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >> + cl_data->kernel[1], 2, >> + NULL, global_ws_ver, local_ws_ver, >> + 0, NULL, NULL); >> + CL_CHECK; >> + } >> >> return FALSE; >> >> -- >> 1.8.3.2 >> >> >> _______________________________________________ >> gegl-developer-list mailing list >> List address: gegl-developer-list@xxxxxxxxx >> List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list >> > _______________________________________________ > gegl-developer-list mailing list > List address: gegl-developer-list@xxxxxxxxx > List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list > _______________________________________________ gegl-developer-list mailing list List address: gegl-developer-list@xxxxxxxxx List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list