Using kernel specific max work group size instead of device max work group size. (#542)

The maximum work group size of a kernel is not a static, device-only
property; in SYCL it also depends on the driver/compiler implementation.
The device max work group size is only the probable maximum allowed by
the device, while the actual maximum for a given kernel depends on how
the driver/compiler built it, e.g. on compilation optimizations. Querying
the kernel-specific max work group size therefore returns the actual
allowed maximum. For example, on Xe, if the compiler chooses SIMD16 and
large GRF (32 HW threads per SS), the actual max work group size is 512
(16 * 32), not the 1024 reported by device::info::max_work_group_size.
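
For illustration, a minimal standalone sketch in plain SYCL of the two
queries being contrasted above; the kernel name Fill and the whole program
are hypothetical and not part of this patch:

#include <sycl/sycl.hpp>
#include <iostream>

class Fill;  // hypothetical kernel name, used only for this illustration

int main() {
  sycl::queue q;
  sycl::device dev = q.get_device();

  // Device-wide upper bound, e.g. 1024 on Xe.
  size_t dev_max = dev.get_info<sycl::info::device::max_work_group_size>();

  // Kernel-specific bound, reflecting how the driver/compiler actually built
  // this kernel (SIMD width, GRF mode, ...), e.g. 512.
  auto exe = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
      q.get_context(), {sycl::get_kernel_id<Fill>()});
  sycl::kernel krn = exe.get_kernel(sycl::get_kernel_id<Fill>());
  size_t krn_max =
      krn.get_info<sycl::info::kernel_device_specific::work_group_size>(dev);

  std::cout << "device max: " << dev_max << ", kernel max: " << krn_max << "\n";

  // Size the launch with the kernel-specific bound so the driver never has to
  // reject the requested work-group size.
  q.parallel_for<Fill>(
      sycl::nd_range<1>{sycl::range<1>{krn_max}, sycl::range<1>{krn_max}},
      [=](sycl::nd_item<1>) { /* no-op */ });
  q.wait();
}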

---------

Signed-off-by: Feng Yuan <[email protected]>
fengyuan14 authored Jul 11, 2024
1 parent d294ebd commit 0253fb9
Showing 29 changed files with 824 additions and 460 deletions.
8 changes: 4 additions & 4 deletions src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
@@ -97,13 +97,13 @@ void launch_glu_backward_kernel(
     OffsetCalc offset_calculator,
     int64_t gI_byte_offset,
     int64_t I_byte_offset) {
-  const int64_t local_size = syclMaxWorkGroupSize();
-  const int64_t num_wg = (numel + local_size - 1) / local_size;
-  const int64_t global_size = num_wg * local_size;
-
   GluBackwardKernelFunctor<scalar_t, OffsetCalc> kfn(
       numel, gI, I, gO, offset_calculator, gI_byte_offset, I_byte_offset);
 
+  const int64_t local_size = syclMaxWorkGroupSize(kfn);
+  const int64_t num_wg = (numel + local_size - 1) / local_size;
+  const int64_t global_size = num_wg * local_size;
+
   sycl_kernel_submit(global_size, local_size, getCurrentSYCLQueue(), kfn);
 }

@@ -194,7 +194,7 @@ struct AdaptiveAvgPool2dBwdSLMKernelFunctor
     numel_ = ib_ * ic_ * ih_ * iw_;
     int total_item = std::min(numel_, syclMaxWorkItemsPerTile());
 
-    local_range_ = syclMaxWorkGroupSize();
+    local_range_ = syclMaxWorkGroupSize(*this);
     global_range_ = total_item < local_range_
         ? local_range_
         : (total_item / local_range_) * local_range_;
15 changes: 10 additions & 5 deletions src/ATen/native/xpu/sycl/BatchKernel.h
@@ -39,6 +39,7 @@ class BatchKernelConfig {
         problem_batch_(problem_batch),
         problem_along_x_(problem_along_x),
         policy_(policy_combine(policies)),
+        prefer_wg_size_(prefer_wg_size),
         problem_wg_range_(0),
         problem_glb_range_(0),
         problem_range_(0),
@@ -47,12 +48,15 @@
         glb_range_x_(0),
         glb_range_y_(0),
         wg_range_x_(0),
-        wg_range_y_(0) {
-    size_t wg_size = syclMaxWorkGroupSize();
+        wg_range_y_(0) {}
+
+  template <class KernelClass>
+  void build() {
+    size_t wg_size = syclMaxWorkGroupSize<KernelClass>();
     size_t sg_size = syclMaxSubGroupSize();
-    if (prefer_wg_size != 0 && prefer_wg_size % sg_size == 0 &&
-        prefer_wg_size < wg_size) {
-      wg_size = prefer_wg_size;
+    if (prefer_wg_size_ != 0 && prefer_wg_size_ % sg_size == 0 &&
+        prefer_wg_size_ < wg_size) {
+      wg_size = prefer_wg_size_;
     }
     wg_range_x_ = sg_size;
     wg_range_y_ = wg_size / wg_range_x_;
@@ -263,6 +267,7 @@ class BatchKernelConfig {
   /* logical active batch */ int64_t problem_batch_;
   bool problem_along_x_;
   Policy policy_;
+  size_t prefer_wg_size_;
   int64_t problem_wg_range_;
   int64_t problem_glb_range_;
   size_t problem_range_;
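For illustration, a hedged call-site sketch of the BatchKernelConfig pattern
implied by the diff above; the functor type and the constructor arguments are
placeholders, not an actual call site from the repository:

// Hypothetical kernel functor type; any kernel functor class can be used.
using KernelClass = MyBatchKernelFunctor;

// Constructor arguments are placeholders for the real problem geometry.
BatchKernelConfig cfg(
    batch, problem, stride, problem_batch, /*problem_along_x=*/true);

// New step introduced by this change: finalize the work-group geometry from
// the kernel's own limit via syclMaxWorkGroupSize<KernelClass>(), instead of
// fixing it in the constructor.
cfg.build<KernelClass>();

Per the last hunk, the stored prefer_wg_size_ lets build() honor a preferred
work-group size when it is a multiple of the sub-group size and stays below
the kernel's limit.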
