intel · fengyuan14 · Jul 11, 2024 · Jul 6, 2024 · Jul 7, 2024 · Jul 10, 2024
diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
@@ -97,13 +97,13 @@ void launch_glu_backward_kernel(
     OffsetCalc offset_calculator,
     int64_t gI_byte_offset,
     int64_t I_byte_offset) {
-  const int64_t local_size = syclMaxWorkGroupSize();
-  const int64_t num_wg = (numel + local_size - 1) / local_size;
-  const int64_t global_size = num_wg * local_size;
-
   GluBackwardKernelFunctor<scalar_t, OffsetCalc> kfn(
       numel, gI, I, gO, offset_calculator, gI_byte_offset, I_byte_offset);
 
+  const int64_t local_size = syclMaxWorkGroupSize(kfn);
+  const int64_t num_wg = (numel + local_size - 1) / local_size;
+  const int64_t global_size = num_wg * local_size;
+
   sycl_kernel_submit(global_size, local_size, getCurrentSYCLQueue(), kfn);
 }
 

diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp
@@ -194,7 +194,7 @@ struct AdaptiveAvgPool2dBwdSLMKernelFunctor
     numel_ = ib_ * ic_ * ih_ * iw_;
     int total_item = std::min(numel_, syclMaxWorkItemsPerTile());
 
-    local_range_ = syclMaxWorkGroupSize();
+    local_range_ = syclMaxWorkGroupSize(*this);
     global_range_ = total_item < local_range_
         ? local_range_
         : (total_item / local_range_) * local_range_;

diff --git a/src/ATen/native/xpu/sycl/BatchKernel.h b/src/ATen/native/xpu/sycl/BatchKernel.h
@@ -39,6 +39,7 @@ class BatchKernelConfig {
         problem_batch_(problem_batch),
         problem_along_x_(problem_along_x),
         policy_(policy_combine(policies)),
+        prefer_wg_size_(prefer_wg_size),
         problem_wg_range_(0),
         problem_glb_range_(0),
         problem_range_(0),
@@ -47,12 +48,15 @@ class BatchKernelConfig {
         glb_range_x_(0),
         glb_range_y_(0),
         wg_range_x_(0),
-        wg_range_y_(0) {
-    size_t wg_size = syclMaxWorkGroupSize();
+        wg_range_y_(0) {}
+
+  template <class KernelClass>
+  void build() {
+    size_t wg_size = syclMaxWorkGroupSize<KernelClass>();
     size_t sg_size = syclMaxSubGroupSize();
-    if (prefer_wg_size != 0 && prefer_wg_size % sg_size == 0 &&
-        prefer_wg_size < wg_size) {
-      wg_size = prefer_wg_size;
+    if (prefer_wg_size_ != 0 && prefer_wg_size_ % sg_size == 0 &&
+        prefer_wg_size_ < wg_size) {
+      wg_size = prefer_wg_size_;
     }
     wg_range_x_ = sg_size;
     wg_range_y_ = wg_size / wg_range_x_;
@@ -263,6 +267,7 @@ class BatchKernelConfig {
   /* logical active batch */ int64_t problem_batch_;
   bool problem_along_x_;
   Policy policy_;
+  size_t prefer_wg_size_;
   int64_t problem_wg_range_;
   int64_t problem_glb_range_;
   size_t problem_range_;