From ecc35557f2bea973821834c22995fbd3f1095a49 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 19 Nov 2024 17:30:06 -0500 Subject: [PATCH] Fix coalescing pass (#2760) (#2764) Fix Intel coalescing pass for cases where the result of a SCF loop (containing a coalescable block ptr load) is used by an operation with operands that do not have block ptr type (e.g. `tt.reduce`) --------- Signed-off-by: Tiotto, Ettore (cherry picked from commit a8ca9e558026ff49c3bb74c6471c112b04f63d2d) Co-authored-by: Ettore Tiotto --- test/TritonIntelGPU/coalesce.mlir | 47 +++++++++++++++++++ .../lib/TritonIntelGPUTransforms/Coalesce.cpp | 11 +++-- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/test/TritonIntelGPU/coalesce.mlir b/test/TritonIntelGPU/coalesce.mlir index d9b2de454c..b078158d8b 100644 --- a/test/TritonIntelGPU/coalesce.mlir +++ b/test/TritonIntelGPU/coalesce.mlir @@ -336,3 +336,50 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } + +// ----- + +// COM: Test coalescing on blocked pointers: loop result used by tt.reduce + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 32], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 16 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-DAG: [[BLOCKED_LAYOUT:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 4], order = [1, 0]}> + // CHECK-DAG: [[BLOCKED_LAYOUT1:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 32, 1], warpsPerCTA = [1, 1, 16], order = [0, 1, 2]}> + // CHECK-DAG: [[BLOCKED_LAYOUT2:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [1, 1, 32], warpsPerCTA = [1, 4, 4], order = [2, 1, 0]}> + // CHECK: 
@triton_red_fused_mul_sum_0 + tt.func public @triton_red_fused_mul_sum_0(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) { + %c128_i32 = arith.constant 128 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c262144_i64 = arith.constant 262144 : i64 + %c1_i64 = arith.constant 1 : i64 + %c512_i64 = arith.constant 512 : i64 + %c32_i32 = arith.constant 32 : i32 + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c32_i32 : i32 + %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %3 = tt.expand_dims %2 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> + %4 = arith.divsi %1, %c512_i32 : i32 + %5 = arith.remsi %1, %c512_i32 : i32 + // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr %arg0, {{.*}} : > + %6 = tt.make_tensor_ptr %arg0, [%c512_i64, %c512_i64, %c512_i64], [%c1_i64, %c512_i64, %c262144_i64], [%4, %5, %c0_i32] {order = array} : > + // CHECK: [[RES:%.*]]:2 = scf.for {{.*}} iter_args([[ARG1:%.*]] = [[PTR1]], [[ARG2:%.*]] = {{.*}}) -> (!tt.ptr>, tensor<32x128xf32, [[BLOCKED_LAYOUT]]>) + %8:2 = scf.for %arg5 = %c0_i32 to %c512_i32 step %c128_i32 iter_args(%arg6 = %6, %arg8 = %cst_0) -> (!tt.ptr>, tensor<32x128xf32, #blocked>) : i32 { + // CHECK: [[LOAD:%.*]] = tt.load [[ARG1]] evictionPolicy = evict_last {boundaryCheck = array, padding = 1 : i32} : !tt.ptr> + // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<1x32x128xf32, [[BLOCKED_LAYOUT1]]> -> tensor<1x32x128xf32, [[BLOCKED_LAYOUT2]]> + %17 = tt.load %arg6 evictionPolicy = evict_last {boundaryCheck = array, padding = 1 : i32} : !tt.ptr> + // CHECK: scf.yield [[ARG1]], [[ARG2]] : !tt.ptr>, tensor<32x128xf32, [[BLOCKED_LAYOUT]]> + scf.yield %arg6, %arg8 : !tt.ptr>, tensor<32x128xf32, #blocked> + } + // CHECK: = "tt.reduce"([[RES]]#1) <{axis = 1 : i32}> ({ + // CHECK }) : 
(tensor<32x128xf32, [[BLOCKED_LAYOUT]]>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = [[BLOCKED_LAYOUT]]}>> + %9 = "tt.reduce"(%8#1) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %14 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %14 : f32 + }) : (tensor<32x128xf32, #blocked>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return + } +} diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp index 7f52090f4e..978622ecc0 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp @@ -148,11 +148,13 @@ struct CoalescePass if (op->getNumResults() == 0 && op->getNumRegions() == 0) return true; + // Operations that do not consume a block pointer aren't interesting. + if (llvm::none_of(op->getOperandTypes(), tt::isTensorPointerType)) + return true; + // Operations that do not yield a block pointer aren't interesting. if (op->getNumRegions() == 0 && - llvm::none_of(op->getResultTypes(), [](Type resType) { - return tt::isTensorPointerType(resType); - })) + llvm::none_of(op->getResultTypes(), tt::isTensorPointerType)) return true; return false; @@ -367,8 +369,7 @@ struct CoalescePass }); LLVM_DEBUG({ - DBGS() << "\nlayoutMap:" - << "\n"; + DBGS() << "\nlayoutMap:\n"; for (auto [op, encoding] : layoutMap) { DBGS() << "op: " << *op << "\n"; DBGS() << "encoding: " << encoding << "\n\n";