RSS/Atom feed Twitter
Site is read-only, email is disabled

[PATCH] Optimize operation box-blur opencl kernel

This discussion is connected to the gegl-developer-list.gnome.org mailing list which is provided by the GIMP developers and not related to gimpusers.com.

This is a read-only list on gimpusers.com so this discussion thread is read-only, too.

6 of 6 messages available
Toggle history

Please log in to manage your subscriptions.

[PATCH] Optimize operation box-blur opencl kernel Yongjia Zhang 22 Jan 08:21
  [PATCH] Optimize operation box-blur opencl kernel Victor Oliveira 22 Jan 19:05
  [PATCH] Optimize operation box-blur opencl kernel Tom Stellard 22 Jan 19:12
   [PATCH] Optimize operation box-blur opencl kernel Victor Oliveira 22 Jan 19:15
    [PATCH] Optimize operation box-blur opencl kernel 张勇加 23 Jan 01:48
     [PATCH] Optimize operation box-blur opencl kernel Gong, Zhigang 23 Jan 02:07
Yongjia Zhang
2014-01-22 08:21:42 UTC (over 10 years ago)

[PATCH] Optimize operation box-blur opencl kernel

From: Yongjia Zhang

This is a better way to accomplish the box-blur cl operation by using ocl's local memory from the opencv source code. It use the local shared memory to reduce global memory access, which significantly reduces the kernel's processing time by 70 percent compared to the original one. Because of the barriers and local worksize limitation, processing with a radius larger than 110 becomes slower than original algorithm, so I keep the original kernels in order to deal with box-blur with radius larger than 110. All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.

Signed-off-by: Yongjia Zhang ---
opencl/box-blur.cl | 66 +++++++++++++++++++++++++ opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++ operations/common/box-blur.c | 115 ++++++++++++++++++++++++++----------------- 3 files changed, 201 insertions(+), 46 deletions(-)

diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl index e99bea4..a1da9de 100644
--- a/opencl/box-blur.cl
+++ b/opencl/box-blur.cl
@@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux, out[out_index] = mean / (float)(2 * radius + 1); }
}
+
+__kernel void kernel_box_blur_fast(const __global float4 *in, + __global float4 *out, + __local float4 *column_sum, + const int width, + const int height, + const int radius, + const int size) +{ + const int local_id0 = get_local_id(0); + const int twice_radius = 2 * radius; + const int in_width = twice_radius + width; + const int in_height = twice_radius + height; + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); + int column_index_start,column_index_end; + int y = get_global_id(1) * size; + const int out_x = get_group_id(0) + * ( get_local_size(0) - twice_radius ) + local_id0 - radius; + const int in_x = out_x + radius; + int tmp_size = size; + int tmp_index = 0; + float4 tmp_sum = (float4)0.0f; + float4 total_sum = (float4)0.0f; + if( in_x < in_width ) + { + column_index_start = y; + column_index_end = y + twice_radius; + for( int i=0; i< width ) + { + if( local_id0 >= radius + && local_id0 < get_local_size(0) - radius ) + { + total_sum = (float4)0.0f; + for( int i=0; i< in_width ) + { + tmp_sum = column_sum[local_id0]; + tmp_sum -= in[(column_index_start)*in_width+in_x]; + tmp_sum += in[(column_index_end+1)*in_width+in_x]; + ++column_index_start; + ++column_index_end; + column_sum[local_id0] = tmp_sum; + }
+
+ barrier( CLK_LOCAL_MEM_FENCE ); + } +} diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h index bfed601..8f6aa81 100644
--- a/opencl/box-blur.cl.h
+++ b/opencl/box-blur.cl.h
@@ -44,4 +44,70 @@ static const char* box_blur_cl_source = " out[out_index] = mean / (float)(2 * radius + 1); \n" " } \n" "} \n" +" \n" +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n" +" __global float4 *out, \n" +" __local float4 *column_sum, \n" +" const int width, \n" +" const int height, \n" +" const int radius, \n" +" const int size) \n" +"{ \n" +" const int local_id0 = get_local_id(0); \n" +" const int twice_radius = 2 * radius; \n" +" const int in_width = twice_radius + width; \n" +" const int in_height = twice_radius + height; \n" +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n" +" int column_index_start,column_index_end; \n" +" int y = get_global_id(1) * size; \n" +" const int out_x = get_group_id(0) \n" +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n" +" const int in_x = out_x + radius; \n" +" int tmp_size = size; \n" +" int tmp_index = 0; \n" +" float4 tmp_sum = (float4)0.0f; \n" +" float4 total_sum = (float4)0.0f; \n" +" if( in_x < in_width ) \n" +" { \n" +" column_index_start = y; \n" +" column_index_end = y + twice_radius; \n" +" for( int i=0; i< width ) \n" +" { \n" +" if( local_id0 >= radius \n" +" && local_id0 < get_local_size(0) - radius ) \n" +" { \n" +" total_sum = (float4)0.0f; \n" +" for( int i=0; i< in_width ) \n" +" { \n" +" tmp_sum = column_sum[local_id0]; \n" +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n" +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n" +" ++column_index_start; \n" +" ++column_index_end; \n" +" column_sum[local_id0] = tmp_sum; \n" +" } \n" +" \n" +" barrier( CLK_LOCAL_MEM_FENCE ); \n" +" } \n" +"} \n" ;
diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c index afc19ea..cb77ec0 100644
--- a/operations/common/box-blur.c
+++ b/operations/common/box-blur.c
@@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation) #include "buffer/gegl-buffer-cl-iterator.h"
#include "opencl/box-blur.cl.h"
-
static GeglClRunData *cl_data = NULL; -
static gboolean
cl_box_blur (cl_mem in_tex, cl_mem aux_tex, @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex, gint radius) {
cl_int cl_err = 0;
- size_t global_ws_hor[2], global_ws_ver[2]; - size_t local_ws_hor[2], local_ws_ver[2]; -
+ size_t global_ws_hor[2], global_ws_ver[2], global_ws[2]; + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2]; + size_t step_size ;
if (!cl_data)
{
- const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL}; + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL}; cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name); }

if (!cl_data)
return TRUE;
-
- local_ws_hor[0] = 1;
- local_ws_hor[1] = 256;
- global_ws_hor[0] = roi->height + 2 * radius; - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; -
- local_ws_ver[0] = 1;
- local_ws_ver[1] = 256;
- global_ws_ver[0] = roi->height;
- global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; -
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], - sizeof(cl_mem), (void*)&in_tex, - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[0], 2, - NULL, global_ws_hor, local_ws_hor, - 0, NULL, NULL); - CL_CHECK;
-
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_mem), (void*)&out_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[1], 2, - NULL, global_ws_ver, local_ws_ver, - 0, NULL, NULL); - CL_CHECK;
+ step_size = 64;
+ local_ws[0]=256;
+ local_ws[1]=1;
+
+
+ if( radius width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0]; + global_ws[1] = (roi->height + step_size - 1) / step_size; + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2], + sizeof(cl_mem), (void *)&in_tex, + sizeof(cl_mem), (void *)&out_tex, + sizeof(cl_float4)*local_ws[0], (void *)NULL, + sizeof(cl_int), (void *)&roi->width, + sizeof(cl_int), (void *)&roi->height, + sizeof(cl_int), (void *)&radius, + sizeof(cl_int), (void *)&step_size, NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(), + cl_data->kernel[2], 2, + NULL, global_ws, local_ws, 0, NULL, NULL ); + CL_CHECK;
+
+ }
+ else
+ {
+ local_ws_hor[0] = 1;
+ local_ws_hor[1] = 256;
+ global_ws_hor[0] = roi->height + 2 * radius; + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; +
+ local_ws_ver[0] = 1;
+ local_ws_ver[1] = 256;
+ global_ws_ver[0] = roi->height; + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; +
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], + sizeof(cl_mem), (void*)&in_tex, + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[0], 2, + NULL, global_ws_hor, local_ws_hor, + 0, NULL, NULL); + CL_CHECK;
+
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_mem), (void*)&out_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[1], 2, + NULL, global_ws_ver, local_ws_ver, + 0, NULL, NULL); + CL_CHECK;
+ }

return FALSE;

1.8.3.2
Victor Oliveira
2014-01-22 19:05:35 UTC (over 10 years ago)

[PATCH] Optimize operation box-blur opencl kernel

Thanks Yongjia, I think this looks like a nice contribution. I'll push the patch soon.

Victor Oliveira

On Wed, Jan 22, 2014 at 12:21 AM, Yongjia Zhang wrote:

From: Yongjia Zhang

This is a better way to accomplish the box-blur cl operation by using ocl's local memory from the opencv source code. It use the local shared memory to reduce global memory access, which significantly reduces the kernel's processing time by 70 percent compared to the original one. Because of the barriers and local worksize limitation, processing with a radius larger than 110 becomes slower than original algorithm, so I keep the original kernels in order to deal with box-blur with radius larger than 110. All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.

Signed-off-by: Yongjia Zhang ---
opencl/box-blur.cl | 66 +++++++++++++++++++++++++ opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++ operations/common/box-blur.c | 115 ++++++++++++++++++++++++++----------------- 3 files changed, 201 insertions(+), 46 deletions(-)

diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl index e99bea4..a1da9de 100644
--- a/opencl/box-blur.cl
+++ b/opencl/box-blur.cl
@@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux, out[out_index] = mean / (float)(2 * radius + 1); }
}
+
+__kernel void kernel_box_blur_fast(const __global float4 *in, + __global float4 *out, + __local float4 *column_sum, + const int width, + const int height, + const int radius, + const int size) +{
+ const int local_id0 = get_local_id(0); + const int twice_radius = 2 * radius; + const int in_width = twice_radius + width; + const int in_height = twice_radius + height; + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); + int column_index_start,column_index_end; + int y = get_global_id(1) * size; + const int out_x = get_group_id(0) + * ( get_local_size(0) - twice_radius ) + local_id0 - radius; + const int in_x = out_x + radius; + int tmp_size = size;
+ int tmp_index = 0;
+ float4 tmp_sum = (float4)0.0f; + float4 total_sum = (float4)0.0f; + if( in_x < in_width )
+ {
+ column_index_start = y; + column_index_end = y + twice_radius; + for( int i=0; i< width ) + {
+ if( local_id0 >= radius + && local_id0 < get_local_size(0) - radius ) + {
+ total_sum = (float4)0.0f; + for( int i=0; i< in_width ) + {
+ tmp_sum = column_sum[local_id0]; + tmp_sum -= in[(column_index_start)*in_width+in_x]; + tmp_sum += in[(column_index_end+1)*in_width+in_x]; + ++column_index_start; + ++column_index_end; + column_sum[local_id0] = tmp_sum; + }
+
+ barrier( CLK_LOCAL_MEM_FENCE ); + }
+}
diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h index bfed601..8f6aa81 100644
--- a/opencl/box-blur.cl.h
+++ b/opencl/box-blur.cl.h
@@ -44,4 +44,70 @@ static const char* box_blur_cl_source = " out[out_index] = mean / (float)(2 * radius + 1); \n" " } \n" "} \n" +" \n" +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n" +" __global float4 *out, \n" +" __local float4 *column_sum, \n" +" const int width, \n" +" const int height, \n" +" const int radius, \n" +" const int size) \n" +"{ \n" +" const int local_id0 = get_local_id(0); \n" +" const int twice_radius = 2 * radius; \n" +" const int in_width = twice_radius + width; \n" +" const int in_height = twice_radius + height; \n" +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n" +" int column_index_start,column_index_end; \n" +" int y = get_global_id(1) * size; \n" +" const int out_x = get_group_id(0) \n" +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n" +" const int in_x = out_x + radius; \n" +" int tmp_size = size; \n" +" int tmp_index = 0; \n" +" float4 tmp_sum = (float4)0.0f; \n" +" float4 total_sum = (float4)0.0f; \n" +" if( in_x < in_width ) \n" +" { \n" +" column_index_start = y; \n" +" column_index_end = y + twice_radius; \n" +" for( int i=0; i< width ) \n" +" { \n" +" if( local_id0 >= radius \n" +" && local_id0 < get_local_size(0) - radius ) \n" +" { \n" +" total_sum = (float4)0.0f; \n" +" for( int i=0; i< in_width ) \n" +" { \n" +" tmp_sum = column_sum[local_id0]; \n" +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n" +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n" +" ++column_index_start; \n" +" ++column_index_end; \n" +" column_sum[local_id0] = tmp_sum; \n" +" } \n" +" \n" +" barrier( CLK_LOCAL_MEM_FENCE ); \n" +" } \n" +"} \n" ;
diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c index afc19ea..cb77ec0 100644
--- a/operations/common/box-blur.c
+++ b/operations/common/box-blur.c
@@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation) #include "buffer/gegl-buffer-cl-iterator.h"

#include "opencl/box-blur.cl.h" -
static GeglClRunData *cl_data = NULL; -
static gboolean
cl_box_blur (cl_mem in_tex, cl_mem aux_tex, @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex, gint radius) {
cl_int cl_err = 0;
- size_t global_ws_hor[2], global_ws_ver[2]; - size_t local_ws_hor[2], local_ws_ver[2]; -
+ size_t global_ws_hor[2], global_ws_ver[2], global_ws[2]; + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2]; + size_t step_size ;
if (!cl_data)
{
- const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL}; + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL}; cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name); }

if (!cl_data)
return TRUE;
-
- local_ws_hor[0] = 1;
- local_ws_hor[1] = 256;
- global_ws_hor[0] = roi->height + 2 * radius; - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; -
- local_ws_ver[0] = 1;
- local_ws_ver[1] = 256;
- global_ws_ver[0] = roi->height;
- global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; -
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], - sizeof(cl_mem), (void*)&in_tex, - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[0], 2, - NULL, global_ws_hor, local_ws_hor, - 0, NULL, NULL); - CL_CHECK;
-
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_mem), (void*)&out_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[1], 2, - NULL, global_ws_ver, local_ws_ver, - 0, NULL, NULL); - CL_CHECK;
+ step_size = 64;
+ local_ws[0]=256;
+ local_ws[1]=1;
+
+
+ if( radius width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0]; + global_ws[1] = (roi->height + step_size - 1) / step_size; + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2], + sizeof(cl_mem), (void *)&in_tex, + sizeof(cl_mem), (void *)&out_tex, + sizeof(cl_float4)*local_ws[0], (void *)NULL, + sizeof(cl_int), (void *)&roi->width, + sizeof(cl_int), (void *)&roi->height, + sizeof(cl_int), (void *)&radius, + sizeof(cl_int), (void *)&step_size, NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(), + cl_data->kernel[2], 2, + NULL, global_ws, local_ws, 0, NULL, NULL ); + CL_CHECK;
+
+ }
+ else
+ {
+ local_ws_hor[0] = 1;
+ local_ws_hor[1] = 256;
+ global_ws_hor[0] = roi->height + 2 * radius; + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; +
+ local_ws_ver[0] = 1;
+ local_ws_ver[1] = 256;
+ global_ws_ver[0] = roi->height; + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; +
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], + sizeof(cl_mem), (void*)&in_tex, + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[0], 2, + NULL, global_ws_hor, local_ws_hor, + 0, NULL, NULL); + CL_CHECK;
+
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_mem), (void*)&out_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[1], 2, + NULL, global_ws_ver, local_ws_ver, + 0, NULL, NULL); + CL_CHECK;
+ }

return FALSE;

-- 1.8.3.2

_______________________________________________ gegl-developer-list mailing list
List address: gegl-developer-list@gnome.org List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list

Tom Stellard
2014-01-22 19:12:21 UTC (over 10 years ago)

[PATCH] Optimize operation box-blur opencl kernel

On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote:

From: Yongjia Zhang

This is a better way to accomplish the box-blur cl operation by using ocl's local memory from the opencv source code. It use the local shared memory to reduce global memory access, which significantly reduces the kernel's processing time by 70 percent compared to the original one. Because of the barriers and local worksize limitation, processing with a radius larger than 110 becomes slower than original algorithm, so I keep the original kernels in order to deal with box-blur with radius larger than 110. All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.

Signed-off-by: Yongjia Zhang ---
opencl/box-blur.cl | 66 +++++++++++++++++++++++++ opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++ operations/common/box-blur.c | 115 ++++++++++++++++++++++++++----------------- 3 files changed, 201 insertions(+), 46 deletions(-)

diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl index e99bea4..a1da9de 100644
--- a/opencl/box-blur.cl
+++ b/opencl/box-blur.cl
@@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux, out[out_index] = mean / (float)(2 * radius + 1); }
}
+
+__kernel void kernel_box_blur_fast(const __global float4 *in, + __global float4 *out, + __local float4 *column_sum, + const int width, + const int height, + const int radius, + const int size) +{ + const int local_id0 = get_local_id(0); + const int twice_radius = 2 * radius; + const int in_width = twice_radius + width; + const int in_height = twice_radius + height; + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); + int column_index_start,column_index_end; + int y = get_global_id(1) * size; + const int out_x = get_group_id(0) + * ( get_local_size(0) - twice_radius ) + local_id0 - radius; + const int in_x = out_x + radius; + int tmp_size = size; + int tmp_index = 0; + float4 tmp_sum = (float4)0.0f; + float4 total_sum = (float4)0.0f; + if( in_x < in_width ) + { + column_index_start = y; + column_index_end = y + twice_radius; + for( int i=0; i< width ) + { + if( local_id0 >= radius + && local_id0 < get_local_size(0) - radius ) + { + total_sum = (float4)0.0f; + for( int i=0; i
+
+ barrier( CLK_LOCAL_MEM_FENCE );

Is this barrier call guaranteed to be executed by all threads? If not, then this will produce undefined behavior.

-Tom

+
+ ++y; + if( in_x < in_width ) + { + tmp_sum = column_sum[local_id0]; + tmp_sum -= in[(column_index_start)*in_width+in_x]; + tmp_sum += in[(column_index_end+1)*in_width+in_x]; + ++column_index_start; + ++column_index_end; + column_sum[local_id0] = tmp_sum; + }
+
+ barrier( CLK_LOCAL_MEM_FENCE ); + } +} diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h index bfed601..8f6aa81 100644
--- a/opencl/box-blur.cl.h
+++ b/opencl/box-blur.cl.h
@@ -44,4 +44,70 @@ static const char* box_blur_cl_source = " out[out_index] = mean / (float)(2 * radius + 1); \n" " } \n" "} \n" +" \n" +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n" +" __global float4 *out, \n" +" __local float4 *column_sum, \n" +" const int width, \n" +" const int height, \n" +" const int radius, \n" +" const int size) \n" +"{ \n" +" const int local_id0 = get_local_id(0); \n" +" const int twice_radius = 2 * radius; \n" +" const int in_width = twice_radius + width; \n" +" const int in_height = twice_radius + height; \n" +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n" +" int column_index_start,column_index_end; \n" +" int y = get_global_id(1) * size; \n" +" const int out_x = get_group_id(0) \n" +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n" +" const int in_x = out_x + radius; \n" +" int tmp_size = size; \n" +" int tmp_index = 0; \n" +" float4 tmp_sum = (float4)0.0f; \n" +" float4 total_sum = (float4)0.0f; \n" +" if( in_x < in_width ) \n" +" { \n" +" column_index_start = y; \n" +" column_index_end = y + twice_radius; \n" +" for( int i=0; i< width ) \n" +" { \n" +" if( local_id0 >= radius \n" +" && local_id0 < get_local_size(0) - radius ) \n" +" { \n" +" total_sum = (float4)0.0f; \n" +" for( int i=0; i< in_width ) \n" +" { \n" +" tmp_sum = column_sum[local_id0]; \n" +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n" +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n" +" ++column_index_start; \n" +" ++column_index_end; \n" +" column_sum[local_id0] = tmp_sum; \n" +" } \n" +" \n" +" barrier( CLK_LOCAL_MEM_FENCE ); \n" +" } \n" +"} \n" ;
diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c index afc19ea..cb77ec0 100644
--- a/operations/common/box-blur.c
+++ b/operations/common/box-blur.c
@@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation) #include "buffer/gegl-buffer-cl-iterator.h"
#include "opencl/box-blur.cl.h"
-
static GeglClRunData *cl_data = NULL; -
static gboolean
cl_box_blur (cl_mem in_tex, cl_mem aux_tex, @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex, gint radius) {
cl_int cl_err = 0;
- size_t global_ws_hor[2], global_ws_ver[2]; - size_t local_ws_hor[2], local_ws_ver[2]; -
+ size_t global_ws_hor[2], global_ws_ver[2], global_ws[2]; + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2]; + size_t step_size ;
if (!cl_data)
{
- const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL}; + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL}; cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name); }

if (!cl_data)
return TRUE;
-
- local_ws_hor[0] = 1;
- local_ws_hor[1] = 256;
- global_ws_hor[0] = roi->height + 2 * radius; - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; -
- local_ws_ver[0] = 1;
- local_ws_ver[1] = 256;
- global_ws_ver[0] = roi->height;
- global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; -
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], - sizeof(cl_mem), (void*)&in_tex, - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[0], 2, - NULL, global_ws_hor, local_ws_hor, - 0, NULL, NULL); - CL_CHECK;
-
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_mem), (void*)&out_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[1], 2, - NULL, global_ws_ver, local_ws_ver, - 0, NULL, NULL); - CL_CHECK;
+ step_size = 64;
+ local_ws[0]=256;
+ local_ws[1]=1;
+
+
+ if( radius width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0]; + global_ws[1] = (roi->height + step_size - 1) / step_size; + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2], + sizeof(cl_mem), (void *)&in_tex, + sizeof(cl_mem), (void *)&out_tex, + sizeof(cl_float4)*local_ws[0], (void *)NULL, + sizeof(cl_int), (void *)&roi->width, + sizeof(cl_int), (void *)&roi->height, + sizeof(cl_int), (void *)&radius, + sizeof(cl_int), (void *)&step_size, NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(), + cl_data->kernel[2], 2, + NULL, global_ws, local_ws, 0, NULL, NULL ); + CL_CHECK;
+
+ }
+ else
+ {
+ local_ws_hor[0] = 1;
+ local_ws_hor[1] = 256;
+ global_ws_hor[0] = roi->height + 2 * radius; + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; +
+ local_ws_ver[0] = 1;
+ local_ws_ver[1] = 256;
+ global_ws_ver[0] = roi->height; + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; +
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], + sizeof(cl_mem), (void*)&in_tex, + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[0], 2, + NULL, global_ws_hor, local_ws_hor, + 0, NULL, NULL); + CL_CHECK;
+
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_mem), (void*)&out_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[1], 2, + NULL, global_ws_ver, local_ws_ver, + 0, NULL, NULL); + CL_CHECK;
+ }

return FALSE;

--
1.8.3.2

_______________________________________________ gegl-developer-list mailing list
List address: gegl-developer-list@gnome.org List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list

Victor Oliveira
2014-01-22 19:15:54 UTC (over 10 years ago)

[PATCH] Optimize operation box-blur opencl kernel

Indeed, you're right. Just now I noticed the break command before the barrier, that's irreducible control-flow.

Yongjia, can you change your kernel so all threads execute the barriers?

Victor

On Wed, Jan 22, 2014 at 11:12 AM, Tom Stellard wrote:

On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote:

From: Yongjia Zhang

This is a better way to accomplish the box-blur cl operation by using ocl's local memory from the opencv source code. It use the local shared memory to reduce global memory access, which significantly reduces the kernel's processing time by 70 percent compared to the original one. Because of the barriers and local worksize limitation, processing with a radius larger than 110 becomes slower than original algorithm, so I keep the original kernels in order to deal with box-blur with radius larger than 110. All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.

Signed-off-by: Yongjia Zhang ---
opencl/box-blur.cl | 66 +++++++++++++++++++++++++ opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++ operations/common/box-blur.c | 115 ++++++++++++++++++++++++++----------------- 3 files changed, 201 insertions(+), 46 deletions(-)

diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl index e99bea4..a1da9de 100644
--- a/opencl/box-blur.cl
+++ b/opencl/box-blur.cl
@@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux, out[out_index] = mean / (float)(2 * radius + 1); }
}
+
+__kernel void kernel_box_blur_fast(const __global float4 *in, + __global float4 *out, + __local float4 *column_sum, + const int width, + const int height, + const int radius, + const int size) +{
+ const int local_id0 = get_local_id(0); + const int twice_radius = 2 * radius; + const int in_width = twice_radius + width; + const int in_height = twice_radius + height; + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); + int column_index_start,column_index_end; + int y = get_global_id(1) * size; + const int out_x = get_group_id(0) + * ( get_local_size(0) - twice_radius ) + local_id0 - radius; + const int in_x = out_x + radius; + int tmp_size = size;
+ int tmp_index = 0;
+ float4 tmp_sum = (float4)0.0f; + float4 total_sum = (float4)0.0f; + if( in_x < in_width )
+ {
+ column_index_start = y; + column_index_end = y + twice_radius; + for( int i=0; i< width ) + {
+ if( local_id0 >= radius + && local_id0 < get_local_size(0) - radius ) + {
+ total_sum = (float4)0.0f; + for( int i=0; i
+ }
+ if( --tmp_size ==0 || y == height - 1 ) + break;
+
+ barrier( CLK_LOCAL_MEM_FENCE );

Is this barrier call guaranteed to be executed by all threads? If not, then this will produce undefined behavior.

-Tom

+
+ ++y;
+ if( in_x < in_width )
+ {
+ tmp_sum = column_sum[local_id0]; + tmp_sum -= in[(column_index_start)*in_width+in_x]; + tmp_sum += in[(column_index_end+1)*in_width+in_x]; + ++column_index_start; + ++column_index_end; + column_sum[local_id0] = tmp_sum; + }
+
+ barrier( CLK_LOCAL_MEM_FENCE ); + }
+}
diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h index bfed601..8f6aa81 100644
--- a/opencl/box-blur.cl.h
+++ b/opencl/box-blur.cl.h
@@ -44,4 +44,70 @@ static const char* box_blur_cl_source = " out[out_index] = mean / (float)(2 * radius + 1); \n" " } \n" "} \n" +" \n" +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n" +" __global float4 *out, \n" +" __local float4 *column_sum, \n" +" const int width, \n" +" const int height, \n" +" const int radius, \n" +" const int size) \n" +"{ \n" +" const int local_id0 = get_local_id(0); \n" +" const int twice_radius = 2 * radius; \n" +" const int in_width = twice_radius + width; \n" +" const int in_height = twice_radius + height; \n" +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n" +" int column_index_start,column_index_end; \n" +" int y = get_global_id(1) * size; \n" +" const int out_x = get_group_id(0) \n" +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n" +" const int in_x = out_x + radius; \n" +" int tmp_size = size; \n" +" int tmp_index = 0; \n" +" float4 tmp_sum = (float4)0.0f; \n" +" float4 total_sum = (float4)0.0f; \n" +" if( in_x < in_width ) \n" +" { \n" +" column_index_start = y; \n" +" column_index_end = y + twice_radius; \n" +" for( int i=0; i< width ) \n" +" { \n" +" if( local_id0 >= radius \n" +" && local_id0 < get_local_size(0) - radius ) \n" +" { \n" +" total_sum = (float4)0.0f; \n" +" for( int i=0; i< in_width ) \n" +" { \n" +" tmp_sum = column_sum[local_id0]; \n" +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n" +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n" +" ++column_index_start; \n" +" ++column_index_end; \n" +" column_sum[local_id0] = tmp_sum; \n" +" } \n" +" \n" +" barrier( CLK_LOCAL_MEM_FENCE ); \n" +" } \n" +"} \n" ;
diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c index afc19ea..cb77ec0 100644
--- a/operations/common/box-blur.c
+++ b/operations/common/box-blur.c
@@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation) #include "buffer/gegl-buffer-cl-iterator.h"

#include "opencl/box-blur.cl.h" -
static GeglClRunData *cl_data = NULL; -
static gboolean
cl_box_blur (cl_mem in_tex, cl_mem aux_tex, @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex, gint radius) {
cl_int cl_err = 0;
- size_t global_ws_hor[2], global_ws_ver[2]; - size_t local_ws_hor[2], local_ws_ver[2]; -
+ size_t global_ws_hor[2], global_ws_ver[2], global_ws[2]; + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2]; + size_t step_size ;
if (!cl_data)
{
- const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL}; + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL}; cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name); }

if (!cl_data)
return TRUE;
-
- local_ws_hor[0] = 1;
- local_ws_hor[1] = 256;
- global_ws_hor[0] = roi->height + 2 * radius; - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; -
- local_ws_ver[0] = 1;
- local_ws_ver[1] = 256;
- global_ws_ver[0] = roi->height;
- global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; -
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], - sizeof(cl_mem), (void*)&in_tex, - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[0], 2, - NULL, global_ws_hor, local_ws_hor, - 0, NULL, NULL); - CL_CHECK;
-
-
- cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], - sizeof(cl_mem), (void*)&aux_tex, - sizeof(cl_mem), (void*)&out_tex, - sizeof(cl_int), (void*)&roi->width, - sizeof(cl_int), (void*)&radius, - NULL); - CL_CHECK;
-
- cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), - cl_data->kernel[1], 2, - NULL, global_ws_ver, local_ws_ver, - 0, NULL, NULL); - CL_CHECK;
+ step_size = 64;
+ local_ws[0]=256;
+ local_ws[1]=1;
+
+
+ if( radius width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0]; + global_ws[1] = (roi->height + step_size - 1) / step_size; + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2], + sizeof(cl_mem), (void *)&in_tex, + sizeof(cl_mem), (void *)&out_tex, + sizeof(cl_float4)*local_ws[0], (void *)NULL, + sizeof(cl_int), (void *)&roi->width, + sizeof(cl_int), (void *)&roi->height, + sizeof(cl_int), (void *)&radius, + sizeof(cl_int), (void *)&step_size, NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(), + cl_data->kernel[2], 2, + NULL, global_ws, local_ws, 0, NULL, NULL ); + CL_CHECK;
+
+ }
+ else
+ {
+ local_ws_hor[0] = 1;
+ local_ws_hor[1] = 256;
+ global_ws_hor[0] = roi->height + 2 * radius; + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; +
+ local_ws_ver[0] = 1;
+ local_ws_ver[1] = 256;
+ global_ws_ver[0] = roi->height; + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; +
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], + sizeof(cl_mem), (void*)&in_tex, + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[0], 2, + NULL, global_ws_hor, local_ws_hor, + 0, NULL, NULL); + CL_CHECK;
+
+
+ cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], + sizeof(cl_mem), (void*)&aux_tex, + sizeof(cl_mem), (void*)&out_tex, + sizeof(cl_int), (void*)&roi->width, + sizeof(cl_int), (void*)&radius, + NULL); + CL_CHECK;
+
+ cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), + cl_data->kernel[1], 2, + NULL, global_ws_ver, local_ws_ver, + 0, NULL, NULL); + CL_CHECK;
+ }

return FALSE;

-- 1.8.3.2

_______________________________________________ gegl-developer-list mailing list
List address: gegl-developer-list@gnome.org List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list

_______________________________________________ gegl-developer-list mailing list
List address: gegl-developer-list@gnome.org List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list

张勇加
2014-01-23 01:48:15 UTC (over 10 years ago)

[PATCH] Optimize operation box-blur opencl kernel

Hi Victor and Tom,

Though the barrier you were talking about is not guaranteed to be executed by all threads, but the work items in a workgroup's behaviors about this barrier are all the same, so, if a work item executes the barrier, then the others of the same work group execute it as well, and vice versa. It's for two reasons. 1st is the 'size' viarable are all the same value in all work items and it decreases without any condition. 2nd is that if the 'break' is called because of the 'y==height-1', it is because that we were dealing with the last row of the buffer, the work items of a work group are all in the same situation, every work item will pass this barrier and return.
Thanks

Yongjia Zhang

At 2014-01-23 03:15:54,"Victor Oliveira" wrote: >Indeed, you're right. Just now I noticed the break command before the >barrier, that's irreducible control-flow. > >Yongjia, can you change your kernel so all threads execute the barriers? > >Victor > >On Wed, Jan 22, 2014 at 11:12 AM, Tom Stellard wrote: >> On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote: >>> From: Yongjia Zhang >>> >>> This is a better way to accomplish the box-blur cl operation by using ocl's >>> local memory from the opencv source code. It use the local shared memory to >>> reduce global memory access, which significantly reduces the kernel's processing >>> time by 70 percent compared to the original one. Because of the barriers and >>> local worksize limitation, processing with a radius larger than 110 becomes >>> slower than original algorithm, so I keep the original kernels in order to deal >>> with box-blur with radius larger than 110. >>> All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU. >>> >>> Signed-off-by: Yongjia Zhang >>> --- >>> opencl/box-blur.cl | 66 +++++++++++++++++++++++++ >>> opencl/box-blur.cl.h | 66 +++++++++++++++++++++++++ >>> operations/common/box-blur.c | 115 ++++++++++++++++++++++++++----------------- >>> 3 files changed, 201 insertions(+), 46 deletions(-) >>> >>> diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl >>> index e99bea4..a1da9de 100644 >>> --- a/opencl/box-blur.cl >>> +++ b/opencl/box-blur.cl >>> @@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4 *aux, >>> out[out_index] = mean / (float)(2 * radius + 1); >>> } >>> } >>> + >>> +__kernel void kernel_box_blur_fast(const __global float4 *in, >>> + __global float4 *out, >>> + __local float4 *column_sum, >>> + const int width, >>> + const int height, >>> + const int radius, >>> + const int size) >>> +{ >>> + const int local_id0 = get_local_id(0); >>> + const int twice_radius = 2 * radius; >>> + const int in_width = twice_radius + width; >>> + const int in_height = twice_radius + height; >>> + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); >>> + int column_index_start,column_index_end; >>> + int y = get_global_id(1) * size; >>> + const int out_x = get_group_id(0) >>> + * ( get_local_size(0) - twice_radius ) + local_id0 - radius; >>> + const int in_x = out_x + radius; >>> + int tmp_size = size; >>> + int tmp_index = 0; >>> + float4 tmp_sum = (float4)0.0f; >>> + float4 total_sum = (float4)0.0f; >>> + if( in_x < in_width ) >>> + { >>> + column_index_start = y; >>> + column_index_end = y + twice_radius; >>> + for( int i=0; i>> + tmp_sum+=in[(y+i)*in_width+in_x]; >>> + column_sum[local_id0] = tmp_sum; >>> + } >>> + >>> + barrier( CLK_LOCAL_MEM_FENCE ); >>> + >>> + while(1) >>> + { >>> + if( out_x < width ) >>> + { >>> + if( local_id0 >= radius >>> + && local_id0 < get_local_size(0) - radius ) >>> + { >>> + total_sum = (float4)0.0f; >>> + for( int i=0; i>> + total_sum += column_sum[local_id0-radius+i]; >>> + out[y*width+out_x] = total_sum/area; >>> + } >>> + } >>> + if( --tmp_size ==0 || y == height - 1 ) >>> + break; >>> + >>> + barrier( CLK_LOCAL_MEM_FENCE ); >> >> Is this barrier call guaranteed to be executed by all threads? If not, >> then this will produce undefined behavior. >> >> -Tom >> >>> + >>> + ++y; >>> + if( in_x < in_width ) >>> + { >>> + tmp_sum = column_sum[local_id0]; >>> + tmp_sum -= in[(column_index_start)*in_width+in_x]; >>> + tmp_sum += in[(column_index_end+1)*in_width+in_x]; >>> + ++column_index_start; >>> + ++column_index_end; >>> + column_sum[local_id0] = tmp_sum; >>> + } >>> + >>> + barrier( CLK_LOCAL_MEM_FENCE ); >>> + } >>> +} >>> diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h >>> index bfed601..8f6aa81 100644 >>> --- a/opencl/box-blur.cl.h >>> +++ b/opencl/box-blur.cl.h >>> @@ -44,4 +44,70 @@ static const char* box_blur_cl_source = >>> " out[out_index] = mean / (float)(2 * radius + 1); \n" >>> " } \n" >>> "} \n" >>> +" \n" >>> +"__kernel void kernel_box_blur_fast(const __global float4 *in, \n" >>> +" __global float4 *out, \n" >>> +" __local float4 *column_sum, \n" >>> +" const int width, \n" >>> +" const int height, \n" >>> +" const int radius, \n" >>> +" const int size) \n" >>> +"{ \n" >>> +" const int local_id0 = get_local_id(0); \n" >>> +" const int twice_radius = 2 * radius; \n" >>> +" const int in_width = twice_radius + width; \n" >>> +" const int in_height = twice_radius + height; \n" >>> +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) ); \n" >>> +" int column_index_start,column_index_end; \n" >>> +" int y = get_global_id(1) * size; \n" >>> +" const int out_x = get_group_id(0) \n" >>> +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius; \n" >>> +" const int in_x = out_x + radius; \n" >>> +" int tmp_size = size; \n" >>> +" int tmp_index = 0; \n" >>> +" float4 tmp_sum = (float4)0.0f; \n" >>> +" float4 total_sum = (float4)0.0f; \n" >>> +" if( in_x < in_width ) \n" >>> +" { \n" >>> +" column_index_start = y; \n" >>> +" column_index_end = y + twice_radius; \n" >>> +" for( int i=0; i< width ) \n" >>> +" { \n" >>> +" if( local_id0 >= radius \n" >>> +" && local_id0 < get_local_size(0) - radius ) \n" >>> +" { \n" >>> +" total_sum = (float4)0.0f; \n" >>> +" for( int i=0; i< in_width ) \n" >>> +" { \n" >>> +" tmp_sum = column_sum[local_id0]; \n" >>> +" tmp_sum -= in[(column_index_start)*in_width+in_x]; \n" >>> +" tmp_sum += in[(column_index_end+1)*in_width+in_x]; \n" >>> +" ++column_index_start; \n" >>> +" ++column_index_end; \n" >>> +" column_sum[local_id0] = tmp_sum; \n" >>> +" } \n" >>> +" \n" >>> +" barrier( CLK_LOCAL_MEM_FENCE ); \n" >>> +" } \n" >>> +"} \n" >>> ; >>> diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c >>> index afc19ea..cb77ec0 100644 >>> --- a/operations/common/box-blur.c >>> +++ b/operations/common/box-blur.c >>> @@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation) >>> #include "buffer/gegl-buffer-cl-iterator.h" >>> >>> #include "opencl/box-blur.cl.h" >>> - >>> static GeglClRunData *cl_data = NULL; >>> - >>> static gboolean >>> cl_box_blur (cl_mem in_tex, >>> cl_mem aux_tex, >>> @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex, >>> gint radius) >>> { >>> cl_int cl_err = 0; >>> - size_t global_ws_hor[2], global_ws_ver[2]; >>> - size_t local_ws_hor[2], local_ws_ver[2]; >>> - >>> + size_t global_ws_hor[2], global_ws_ver[2], global_ws[2]; >>> + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2]; >>> + size_t step_size ; >>> if (!cl_data) >>> { >>> - const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", NULL}; >>> + const char *kernel_name[] = { "kernel_blur_hor", "kernel_blur_ver","kernel_box_blur_fast", NULL}; >>> cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name); >>> } >>> >>> if (!cl_data) >>> return TRUE; >>> - >>> - local_ws_hor[0] = 1; >>> - local_ws_hor[1] = 256; >>> - global_ws_hor[0] = roi->height + 2 * radius; >>> - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; >>> - >>> - local_ws_ver[0] = 1; >>> - local_ws_ver[1] = 256; >>> - global_ws_ver[0] = roi->height; >>> - global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; >>> - >>> - >>> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], >>> - sizeof(cl_mem), (void*)&in_tex, >>> - sizeof(cl_mem), (void*)&aux_tex, >>> - sizeof(cl_int), (void*)&roi->width, >>> - sizeof(cl_int), (void*)&radius, >>> - NULL); >>> - CL_CHECK; >>> - >>> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >>> - cl_data->kernel[0], 2, >>> - NULL, global_ws_hor, local_ws_hor, >>> - 0, NULL, NULL); >>> - CL_CHECK; >>> - >>> - >>> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], >>> - sizeof(cl_mem), (void*)&aux_tex, >>> - sizeof(cl_mem), (void*)&out_tex, >>> - sizeof(cl_int), (void*)&roi->width, >>> - sizeof(cl_int), (void*)&radius, >>> - NULL); >>> - CL_CHECK; >>> - >>> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >>> - cl_data->kernel[1], 2, >>> - NULL, global_ws_ver, local_ws_ver, >>> - 0, NULL, NULL); >>> - CL_CHECK; >>> + step_size = 64; >>> + local_ws[0]=256; >>> + local_ws[1]=1; >>> + >>> + >>> + if( radius >> + { >>> + global_ws[0] = (roi->width + local_ws[0] - 2 * radius - 1) / ( local_ws[0] - 2 * radius ) * local_ws[0]; >>> + global_ws[1] = (roi->height + step_size - 1) / step_size; >>> + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2], >>> + sizeof(cl_mem), (void *)&in_tex, >>> + sizeof(cl_mem), (void *)&out_tex, >>> + sizeof(cl_float4)*local_ws[0], (void *)NULL, >>> + sizeof(cl_int), (void *)&roi->width, >>> + sizeof(cl_int), (void *)&roi->height, >>> + sizeof(cl_int), (void *)&radius, >>> + sizeof(cl_int), (void *)&step_size, NULL); >>> + CL_CHECK; >>> + cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(), >>> + cl_data->kernel[2], 2, >>> + NULL, global_ws, local_ws, 0, NULL, NULL ); >>> + CL_CHECK; >>> + >>> + } >>> + else >>> + { >>> + local_ws_hor[0] = 1; >>> + local_ws_hor[1] = 256; >>> + global_ws_hor[0] = roi->height + 2 * radius; >>> + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * local_ws_hor[1]; >>> + >>> + local_ws_ver[0] = 1; >>> + local_ws_ver[1] = 256; >>> + global_ws_ver[0] = roi->height; >>> + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * local_ws_ver[1]; >>> + >>> + >>> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0], >>> + sizeof(cl_mem), (void*)&in_tex, >>> + sizeof(cl_mem), (void*)&aux_tex, >>> + sizeof(cl_int), (void*)&roi->width, >>> + sizeof(cl_int), (void*)&radius, >>> + NULL); >>> + CL_CHECK; >>> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >>> + cl_data->kernel[0], 2, >>> + NULL, global_ws_hor, local_ws_hor, >>> + 0, NULL, NULL); >>> + CL_CHECK; >>> + >>> + >>> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1], >>> + sizeof(cl_mem), (void*)&aux_tex, >>> + sizeof(cl_mem), (void*)&out_tex, >>> + sizeof(cl_int), (void*)&roi->width, >>> + sizeof(cl_int), (void*)&radius, >>> + NULL); >>> + CL_CHECK; >>> + >>> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (), >>> + cl_data->kernel[1], 2, >>> + NULL, global_ws_ver, local_ws_ver, >>> + 0, NULL, NULL); >>> + CL_CHECK; >>> + } >>> >>> return FALSE; >>> >>> -- >>> 1.8.3.2 >>> >>> >>> _______________________________________________ >>> gegl-developer-list mailing list >>> List address: gegl-developer-list@gnome.org >>> List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list >>> >> _______________________________________________ >> gegl-developer-list mailing list >> List address: gegl-developer-list@gnome.org >> List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list >>

Gong, Zhigang
2014-01-23 02:07:25 UTC (over 10 years ago)

[PATCH] Optimize operation box-blur opencl kernel

Hi yongjia,

Right, the size is indeed uniform for all work items in the same group. I just guess people may be a little bit concern of the y which is derived from get_local_id(1). You may consider to add the following attribute to your fast kernel to eliminate those concerns.

__attribute__((reqd_work_group_size(256, 1, 1)))

From: 张勇加 [mailto:zhang_yong_jia@126.com] Sent: Thursday, January 23, 2014 9:48 AM To: Victor Oliveira; tom@stellard.net Cc: gegl-developer-list@gnome.org; Zou, Nanhai; Gong, Zhigang; Zhang, Yongjia Subject: Re:Re: [Gegl-developer] [PATCH] Optimize operation box-blur opencl kernel

Hi Victor and Tom,

Though the barrier you were talking about is not guaranteed to be executed by all threads, but the work items in a workgroup's behaviors about this barrier are all the same, so, if a work item executes the barrier, then the others of the same work group execute it as well, and vice versa. It's for two reasons. 1st is the 'size' viarable are all the same value in all work items and it decreases without any condition. 2nd is that if the 'break' is called because of the 'y==height-1', it is because that we were dealing with the last row of the buffer, the work items of a work group are all in the same situation, every work item will pass this barrier and return.

Thanks

Yongjia Zhang