From ff6ea62ce06e32ef4a84c03d061178244b23f8ce Mon Sep 17 00:00:00 2001 From: Mateusz Tabaka Date: Mon, 23 May 2022 15:59:37 +0200 Subject: [PATCH] Fix local work size for conv kernel yxfb_yxio_b16 with fp16 (#11679) convolution_gpu_yxfb_yxio_b16 for fp16 has reqd_work_group_size hardcoded to (16, 1, 1). On devices where CL_DEVICE_MAX_WORK_GROUP_SIZE is 512, GetOptimalLocalWorkGroupSizes picks (16, 2, 1) for LWS. That causes issues during clEnqueueNDRangeKernel, since LWS doesn't match the reqd_work_group_size in the kernel. --- .../convolution/convolution_kernel_yxfb_yxio_b16.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp index 72f8fa4287f156..d6512772da3d08 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b16.cpp @@ -78,6 +78,11 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b16::SetDefault( dispatchData.lws[0] = min_lws; dispatchData.gws[0] = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem); + if (arg.inputs[0].GetDType() == Datatype::F16) { + dispatchData.lws[1] = 1; + dispatchData.lws[2] = 1; + } + return dispatchData; }