[XPU][TritonGPUToLLVM] Avoid bank conflicts in sub-group transposes
- Store the whole matrix using one SIMD block store per row, leaving a
  single garbage item at the end of each row so that every row holds
  `sub_group_size + 1` elements
- Load each row with vector loads

By introducing this garbage item at the end of each row, we ensure the matrix
loads avoid bank conflicts, as the offset between the positions loaded by
work-items `i` and `i+j` is `N * (sub_group_size + 1)` (assuming
`sub_group_size` banks).
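
A minimal standalone sketch of the bank-conflict argument (illustrative only;
the bank model and names below are assumptions, not code from this patch):
with an unpadded row stride of `sub_group_size`, every work-item's row start
maps to the same bank, whereas a stride of `sub_group_size + 1` spreads the
accesses across all banks.

```cpp
// Hypothetical illustration (not the actual lowering code from this patch):
// models why padding each scratch row with one extra element avoids bank
// conflicts on the transposed load. Assumes `sub_group_size` shared-memory
// banks and that work-item `i` starts its load at element `i * row_stride`.
#include <cstdio>
#include <set>

static int distinctBanks(int sub_group_size, int row_stride) {
  std::set<int> banks;
  for (int wi = 0; wi < sub_group_size; ++wi)
    banks.insert((wi * row_stride) % sub_group_size); // bank serving this access
  return static_cast<int>(banks.size());
}

int main() {
  const int sub_group_size = 16; // matches threads-per-warp = 16 in the tests
  // Unpadded rows: every row start maps to bank 0 -> 16-way conflict.
  std::printf("stride %2d -> %2d distinct bank(s)\n", sub_group_size,
              distinctBanks(sub_group_size, sub_group_size));
  // Padded rows (sub_group_size + 1): each work-item hits a different bank.
  std::printf("stride %2d -> %2d distinct bank(s)\n", sub_group_size + 1,
              distinctBanks(sub_group_size, sub_group_size + 1));
  return 0;
}
```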

Signed-off-by: victor-eds <[email protected]>
victor-eds committed Nov 20, 2024
1 parent 7551a90 commit b6cd04c
Showing 4 changed files with 440 additions and 242 deletions.
6 changes: 3 additions & 3 deletions test/Conversion/intel/intel-allocate-shared-memory.mlir
@@ -24,7 +24,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
-// CHECK-SAME: triton_gpu.shared = 512 : i32
+// CHECK-SAME: triton_gpu.shared = 544 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
tt.func @test_f16(%arg0: tensor<16x16xf16, #blocked>) -> tensor<16x16xf16, #blocked1> {
%0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #blocked1>
@@ -40,7 +40,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
-// CHECK-SAME: triton_gpu.shared = 1024 : i32
+// CHECK-SAME: triton_gpu.shared = 1088 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
tt.func @test_f32(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16x16xf32, #blocked1> {
%0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #blocked1>
@@ -56,7 +56,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
-// CHECK-SAME: triton_gpu.shared = 32768 : i32
+// CHECK-SAME: triton_gpu.shared = 34816 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
tt.func @test_f32(%arg0: tensor<128x64xf32, #blocked>) -> tensor<128x64xf32, #blocked1> {
%0 = triton_gpu.convert_layout %arg0 : tensor<128x64xf32, #blocked> -> tensor<128x64xf32, #blocked1>
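
The new scratch sizes in the test above are consistent with padding each 16x16
tile with one extra column, as described in the commit message (my reading of
the numbers, stated as an assumption): 16 x 17 x 2 bytes = 544 for the f16
case, 16 x 17 x 4 bytes = 1088 for the f32 case, and the 128x64 f32 case
contains 32 such tiles, giving 32 x 1088 = 34816 bytes.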
