diff --git a/test/TritonIntelGPU/optimize-elementwise.mlir b/test/TritonIntelGPU/optimize-elementwise.mlir
deleted file mode 100644
index a6c08eaab9..0000000000
--- a/test/TritonIntelGPU/optimize-elementwise.mlir
+++ /dev/null
@@ -1,259 +0,0 @@
-// RUN: triton-opt %s --split-input-file -tritonintelgpu-optimize-elementwise-parallelism | FileCheck %s
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_dpas(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>)
-  tt.func @test_dpas(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>)
-  tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_1]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked_repeat(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>)
-  tt.func @test_blocked_repeat(%arg0: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<64xf32, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<64xf32, #[[$ATTR_1]]> -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [2], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked_multi_warp(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> {
-  tt.func @test_blocked_multi_warp(%arg0: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<32xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<32xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<32xf32, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<32xf32, #[[$ATTR_1]]> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked_multi_warp_double_stride(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> {
-  tt.func @test_blocked_multi_warp_double_stride(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<128xf16, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<128xf16, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_1]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [8], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_mma_multi_warp_double_stride(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> {
-  tt.func @test_mma_multi_warp_double_stride(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_0]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [2], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_mma_multi_warp_double_stride_repeat(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> {
-  tt.func @test_mma_multi_warp_double_stride_repeat(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_0]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_multi_user(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_2:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>)
-  tt.func @test_multi_user(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg2: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_5:.*]] = arith.addf %[[VAL_3]], %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_6:.*]] = triton_gpu.convert_layout %[[VAL_5]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_7:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_8:.*]] = triton_gpu.convert_layout %[[VAL_2]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_9:.*]] = arith.addf %[[VAL_7]], %[[VAL_8]] : tensor<16xf32, #[[$ATTR_0]]>
-    %1 = arith.addf %arg0, %arg2 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_10:.*]] = triton_gpu.convert_layout %[[VAL_9]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_11:.*]] = triton_gpu.convert_layout %[[VAL_6]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_12:.*]] = triton_gpu.convert_layout %[[VAL_10]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_11]], %[[VAL_12]] : tensor<16xf32, #[[$ATTR_0]]>
-    %2 = arith.addf %0, %1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_14:.*]] = triton_gpu.convert_layout %[[VAL_13]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_14]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %2 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-
-// CHECK-LABEL: tt.func @test_basic_loop(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index
-  tt.func @test_basic_loop(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg2: index, %arg3: index, %arg4: index) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_6:.*]] = scf.for %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_4]] iter_args(%[[VAL_8:.*]] = %[[VAL_5]]) -> (tensor<16xf32, #[[$ATTR_0]]>) {
-    %0 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg0) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) {
-// CHECK: %[[VAL_9:.*]] = triton_gpu.convert_layout %[[VAL_8]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_10:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_11:.*]] = triton_gpu.convert_layout %[[VAL_9]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] : tensor<16xf32, #[[$ATTR_0]]>
-      %1 = arith.addf %arg1, %arg6 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_13:.*]] = triton_gpu.convert_layout %[[VAL_12]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_14:.*]] = triton_gpu.convert_layout %[[VAL_13]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: scf.yield %[[VAL_14]] : tensor<16xf32, #[[$ATTR_0]]>
-      scf.yield %1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-    }
-// CHECK: %[[VAL_15:.*]] = triton_gpu.convert_layout %[[VAL_6]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_15]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-
-// CHECK-LABEL: tt.func @test_advanced_loop(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index,
-// CHECK-SAME: %[[VAL_5:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-  tt.func @test_advanced_loop(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg2: index, %arg3: index, %arg4: index, %arg5: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) {
-// CHECK: %[[VAL_6:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_7:.*]] = triton_gpu.convert_layout %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_8:.*]]:2 = scf.for %[[VAL_9:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_4]] iter_args(%[[VAL_10:.*]] = %[[VAL_6]], %[[VAL_11:.*]] = %[[VAL_7]]) -> (tensor<16xf32, #[[$ATTR_0]]>, tensor<16xf32, #[[$ATTR_0]]>) {
-    %0:2 = scf.for %arg6 = %arg2 to %arg3 step %arg4 iter_args(%arg7 = %arg0, %arg8 = %arg5) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) {
-// CHECK: %[[VAL_12:.*]] = triton_gpu.convert_layout %[[VAL_10]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_13:.*]] = triton_gpu.convert_layout %[[VAL_11]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_14:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_15:.*]] = triton_gpu.convert_layout %[[VAL_12]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_16:.*]] = arith.addf %[[VAL_14]], %[[VAL_15]] : tensor<16xf32, #[[$ATTR_0]]>
-      %1 = arith.addf %arg1, %arg7 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_17:.*]] = triton_gpu.convert_layout %[[VAL_16]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_18:.*]] = triton_gpu.convert_layout %[[VAL_17]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_19:.*]] = triton_gpu.convert_layout %[[VAL_13]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_20:.*]] = arith.addf %[[VAL_18]], %[[VAL_19]] : tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_21:.*]] = triton_gpu.convert_layout %[[VAL_20]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_22:.*]] = triton_gpu.convert_layout %[[VAL_17]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_23:.*]] = triton_gpu.convert_layout %[[VAL_21]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: scf.yield %[[VAL_22]], %[[VAL_23]] : tensor<16xf32, #[[$ATTR_0]]>, tensor<16xf32, #[[$ATTR_0]]>
-      %2 = arith.addf %1, %arg8 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-      scf.yield %1, %2 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-    }
-// CHECK: }
-// CHECK: %[[VAL_24:.*]] = triton_gpu.convert_layout %[[VAL_25:.*]]#0 : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_26:.*]] = triton_gpu.convert_layout %[[VAL_25]]#1 : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_24]], %[[VAL_26]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0#0, %0#1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index d1176ba078..99307e50f3 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -252,7 +252,6 @@ def make_ttgir(mod, metadata, opt, properties):
         passes.ttgpuir.add_prefetch(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
         intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
-        intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_reduce_data_duplication(pm)
         passes.ttgpuir.add_reorder_instructions(pm)
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
index 1d81bc4741..c551a96856 100644
--- a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
+++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
@@ -365,52 +365,4 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic
                            "mlir::triton::gpu::TritonGPUDialect"];
 }
 
-def TritonIntelGPUOptimizeElementwiseParallelism
-    : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> {
-  let summary =
-      "Improve parallelism of elementwise operations to better utilize hardware resources.";
-
-  let description = [{
-    Detect elementwise operations with an encoding causing sub-par parallelism,
-    i.e., with data duplication across threads, and convert the operands to a
-    more optimal encoding if the cost of doing so is heuristically estimated to
-    be sufficiently low. As of now, the cost must be 0: we only support
-    "unbroadcasting" tensors, i.e., dropping duplicated values held by other
-    threads by re-distributing them.
-
-    As an example, this pass would modify the following code:
-```mlir
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-  tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-```
-    Obtaining:
-```mlir
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-  tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-    %0 = triton_gpu.convert_layout %arg0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-    %1 = triton_gpu.convert_layout %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-    %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1>
-    %3 = triton_gpu.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-    tt.return %3 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-```
-
-    Note how the converted tensors are not sliced and thus each element in the
-    tensor is held by a single thread.
-  }];
-
-  let dependentDialects = [];
-}
-
 #endif // TRITON_INTEL_GPU_PASSES
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt
index 46d121a070..dbc641e2a3 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt
@@ -4,7 +4,6 @@ add_triton_library(TritonIntelGPUTransforms
   DistributeToWarps.cpp
   MatchTargetSize.cpp
   MaterializeBlockPointer.cpp
-  OptimizeElementwiseParallelism.cpp
   OptimizeReductionLocality.cpp
   Pipeliner/MatmulLoopPipeline.cpp
   Pipeliner/SoftwarePipeliner.cpp
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp
deleted file mode 100644
index af997d2193..0000000000
--- a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp
+++ /dev/null
@@ -1,397 +0,0 @@
-//===- OptimizeElementwiseParallelism.cpp -------------------------------*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// This file implements the `tritonintelgpu-optimize-elementwise-parallelism`
-/// pass.
-//===----------------------------------------------------------------------===//
-
-#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h"
-
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/Triton/IR/Utility.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"
-
-#define DEBUG_TYPE "tritonintelgpu-optimize-elementwise-parallelism"
-
-namespace mlir::triton::gpu::intel {
-#define GEN_PASS_DEF_TRITONINTELGPUOPTIMIZEELEMENTWISEPARALLELISM
-#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h.inc"
-
-namespace {
-bool isMultiWarpValidLayoutForUnbroadcast(const LinearLayout &linearLayout,
-                                          int32_t numWorkGroupPos,
-                                          Builder &builder) {
-  StringAttr kLane = builder.getStringAttr("lane");
-  StringAttr kWarp = builder.getStringAttr("warp");
-  int32_t subGroupSize = linearLayout.getInDimSize(kLane);
-  ArrayRef<int32_t> numContiguousPerWarp = linearLayout.getBasis(kWarp, 0);
-  // Check the warp dimension hasn't been sliced away and we have n *
-  // sub_group_size contiguous elements per warp.
-  if (numContiguousPerWarp == ArrayRef<int32_t>{0} ||
-      numContiguousPerWarp[0] % subGroupSize != 0)
-    return false;
-  int32_t expectedValue = numContiguousPerWarp[0] * 2;
-  for (int32_t pos = 1; pos < numWorkGroupPos; ++pos) {
-    if (linearLayout.getBasis(kWarp, pos) != ArrayRef<int32_t>{expectedValue})
-      return false;
-    expectedValue *= 2;
-  }
-  return true;
-}
-
-/// Return whether the input linear layout can be unbroadcasted.
-///
-/// A layout is valid for being "unbroadcasted" along its lanes if:
-/// - The 'lane' input dimension is zero: this means the lane dimension has been
-/// sliced.
-/// - The size of the input 'block' dimension is 1. This is true for XPU
-/// backend.
-/// - The size of the input 'warp' dimension is 1 or there are n*sub_group_size
-/// contiguous elements per warp.
-///
-/// Broadcasted layouts are layouts with sliced lane, warp or block (not
-/// possible for XPU backend) dimensions, i.e., the same data is owned by
-/// different threads.
-bool isValidLayoutForUnbroadcast(const LinearLayout &linearLayout,
-                                 Builder &builder) {
-  StringAttr kLane = builder.getStringAttr("lane");
-  StringAttr kWarp = builder.getStringAttr("warp");
-  StringAttr kBlock = builder.getStringAttr("block");
-  StringAttr kDim0 = builder.getStringAttr("dim0");
-  // 'lane' dimension must have been sliced away completely.
-  if (!linearLayout.sublayoutIsZero(kLane, kDim0))
-    return false;
-  // Only single block for now.
-  if (linearLayout.getInDimSize(kBlock) != 1)
-    return false;
-  // 'warp' dimension hasn't been sliced away and there are n*sub_group_size
-  // contiguous elements in each warp (or there is a single warp).
-  int32_t numWorkGroupPos = linearLayout.getInDimSizeLog2(kWarp);
-  return numWorkGroupPos == 0 || isMultiWarpValidLayoutForUnbroadcast(
-                                     linearLayout, numWorkGroupPos, builder);
-}
-
-/// Generic checks for the operation not looking at the tensor type.
-bool isCandidateOp(Operation *op) {
-  // Rely on this for a simpler pass.
-  if (!op->hasTrait<OpTrait::SameOperandsAndResultType>() ||
-      op->getNumResults() != 1)
-    return false;
-
-  // Skip complex operations.
-  if (op->hasSuccessors() || op->getNumRegions() != 0)
-    return false;
-
-  return true;
-}
-
-bool optimizationDoesNotWorsenRegisterPressure(
-    Value value, RankedTensorType newType, SmallPtrSetImpl<Value> &visited) {
-  if (!visited.insert(value).second)
-    return true;
-  // All users must be operations we will optimize too or layout conversions we
-  // will introduce later.
-  return llvm::all_of(value.getUses(), [&visited, newType](OpOperand &operand) {
-    Operation *owner = operand.getOwner();
-
-    // We will be introducing just this operation later.
-    if (auto convertLayout = dyn_cast<ConvertLayoutOp>(owner))
-      return convertLayout.getResult().getType() == newType;
-
-    // Broadcasted in source.
-    if (isa<ExpandDimsOp>(owner))
-      return true;
-
-    // Allow for loop optimizations.
-    if (isa<scf::ForOp, scf::YieldOp>(owner))
-      return true;
-
-    // Only allow candidates. Check only operation constraints. We do not have
-    // to check the type as we did already.
-    if (!owner->hasTrait<OpTrait::Elementwise>() || !isCandidateOp(owner))
-      return false;
-
-    // Check other operands fit the constraints.
-    return llvm::all_of(owner->getOperands(),
-                        [&visited, newType](Value operand) {
-                          return optimizationDoesNotWorsenRegisterPressure(
-                              operand, newType, visited);
-                        });
-  });
-}
-
-/// Get optimized unbroadcasted tensor type.
-///
-/// Get optimized ranked tensor type after unbroadcasting. As we only support 1D
-/// tensors, this is as simple as getting an "unbroadcasted" blocked-encoded 1D
-/// tensor type.
-RankedTensorType getOptimizedType(RankedTensorType type,
-                                  const LinearLayout &linearLayout,
-                                  Builder &builder) {
-  StringAttr kWarp = builder.getStringAttr("warp");
-
-  auto encoding = cast<DistributedEncodingTrait>(type.getEncoding());
-  unsigned threadsPerWarp = product(encoding.getThreadsPerWarp());
-  unsigned warpsPerCTA = product(encoding.getWarpsPerCTA());
-  [[maybe_unused]] unsigned ctaSplitNum = product(encoding.getCTASplitNum());
-  assert(ctaSplitNum == 1 && "Expecting single CTA");
-
-  RankedTensorType::Builder typeBuilder(type);
-  int32_t numWorkGroupPos = linearLayout.getInDimSizeLog2(kWarp);
-  unsigned sizePerThread =
-      numWorkGroupPos == 0
-          ? 1
-          : linearLayout.getBasis(kWarp, 0)[0] / threadsPerWarp;
-  CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(builder.getContext(), 1);
-  auto newEncoding = builder.getAttr<BlockedEncodingAttr>(
-      sizePerThread, threadsPerWarp, warpsPerCTA, /*order=*/0, ctaLayout);
-  typeBuilder.setEncoding(newEncoding);
-  return typeBuilder;
-}
-
-bool isCandidateTypeForOptimization(RankedTensorType type) {
-  if (!type)
-    return false;
-
-  // Check if the layout is actually bad and can be optimized using our
-  // approach. We only support 1D tensors for now as these are easier to
-  // handle.
-  Attribute layout = type.getEncoding();
-  if (!layout || type.getRank() != 1)
-    return false;
-  std::optional<LinearLayout> linearLayout =
-      toLinearLayout(type.getShape(), layout);
-
-  LLVM_DEBUG(llvm::dbgs() << "Checking linear layout:\n"
-                          << linearLayout << "\n");
-
-  Builder builder(type.getContext());
-  if (!linearLayout || !isValidLayoutForUnbroadcast(*linearLayout, builder))
-    return false;
-
-  // As we are dealing with 1D tensors, we can do a simple transform to obtain
-  // a more optimized operation.
-  RankedTensorType newType = getOptimizedType(type, *linearLayout, builder);
-
-  LLVM_DEBUG(llvm::dbgs() << "Would convert to type:\n" << newType << "\n");
-
-  return true;
-}
-
-bool canLoopInductionVarBeOptimized(Value initArg, Value regionIterArg,
-                                    Value yieldedVal) {
-  LLVM_DEBUG(llvm::dbgs() << "Checking loop vars:\n"
-                          << initArg << "\n"
-                          << regionIterArg << "\n"
-                          << yieldedVal << "\n");
-
-  // Check the induction variable is a candidate for this optimization based on
-  // its type.
-  auto type = dyn_cast<RankedTensorType>(initArg.getType());
-  if (!isCandidateTypeForOptimization(type))
-    return false;
-  assert(type && "Expecting ranked tensor type");
-
-  // We want to check all the variables involved in the optimization can be
-  // replaced by ones with a more efficient layout without affecting register
-  // pressure.
-
-  LinearLayout linearLayout =
-      *toLinearLayout(type.getShape(), type.getEncoding());
-  Builder builder(type.getContext());
-  RankedTensorType newType = getOptimizedType(type, linearLayout, builder);
-
-  SmallPtrSet<Value, 8> visited;
-  // Only allow initArgs with a single use for now.
-  return initArg.hasOneUse() &&
-         optimizationDoesNotWorsenRegisterPressure(yieldedVal, newType,
-                                                   visited) &&
-         optimizationDoesNotWorsenRegisterPressure(regionIterArg, newType,
-                                                   visited);
-}
-
-struct ElementwiseOptPattern final
-    : OpTraitRewritePattern<OpTrait::Elementwise> {
-  using OpTraitRewritePattern::OpTraitRewritePattern;
-
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const final {
-    LLVM_DEBUG(llvm::dbgs() << "Checking operation:\n" << *op << "\n");
-
-    // Rely on this for a simpler pass.
-    if (!isCandidateOp(op))
-      return failure();
-
-    // Check the operation is a candidate for this optimization based on its
-    // type.
-    auto type = dyn_cast<RankedTensorType>(op->getResult(0).getType());
-    if (!isCandidateTypeForOptimization(type))
-      return failure();
-    assert(type && "Expecting ranked tensor type");
-
-    LinearLayout linearLayout =
-        *toLinearLayout(type.getShape(), type.getEncoding());
-
-    // As we are dealing with 1D tensors, we can do a simple transform to obtain
-    // a more optimized operation.
-    Location loc = op->getLoc();
-    RankedTensorType newType = getOptimizedType(type, linearLayout, rewriter);
-
-    // Check the operands are not used by other operations. This will prevent
-    // register pressure increase:
-    if (SmallPtrSet<Value, 8> visited;
-        !llvm::all_of(op->getOperands(), [&visited, newType](Value operand) {
-          return optimizationDoesNotWorsenRegisterPressure(operand, newType,
-                                                           visited);
-        }))
-      return failure();
-
-    // Obtain converted operands.
-    SmallVector<Value> newOperands(op->getNumOperands());
-    llvm::transform(op->getOperands(), std::begin(newOperands),
-                    [&rewriter, loc, newType](Value operand) {
-                      return rewriter.create<ConvertLayoutOp>(loc, newType,
-                                                              operand);
-                    });
-
-    // Now we create the optimized operation:
-    StringAttr opName = op->getName().getIdentifier();
-    ArrayRef<NamedAttribute> attributes = op->getAttrs();
-    Operation *newElementwiseOp =
-        rewriter.create(loc, opName, newOperands, newType, attributes);
-    assert(newElementwiseOp->getNumResults() == 1 &&
-           "Expecting single result operation");
-
-    // Convert to unoptimized encoding for further use.
-    Value newValue = newElementwiseOp->getResult(0);
-    rewriter.replaceOpWithNewOp<ConvertLayoutOp>(op, type, newValue);
-
-    LLVM_DEBUG(llvm::dbgs() << "Conversion took place.\n");
-
-    return success();
-  }
-};
-
-struct ForOptPattern final : OpRewritePattern<scf::ForOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(scf::ForOp loop,
-                                PatternRewriter &rewriter) const final {
-    LLVM_DEBUG(llvm::dbgs() << "Checking operation:\n" << loop << "\n");
-
-    // Tuples of <init_arg, region_iter_arg, yielded_value>.
-    SmallVector<std::tuple<Value, BlockArgument, Value>> toOptimize;
-    llvm::copy_if(llvm::zip_equal(loop.getInitArgs(), loop.getRegionIterArgs(),
-                                  loop.getYieldedValues()),
-                  std::back_inserter(toOptimize), [](auto entry) {
-                    auto [initArg, regionIterArg, yieldedVal] = entry;
-                    return canLoopInductionVarBeOptimized(
-                        initArg, regionIterArg, yieldedVal);
-                  });
-    if (toOptimize.empty())
-      return failure();
-
-    constexpr auto getRealArgNumber = [](BlockArgument blockArg) {
-      return blockArg.getArgNumber() - 1;
-    };
-
-    LLVM_DEBUG({
-      llvm::dbgs() << "Selected for optimization:\n";
-      for (auto [initArg, regionIterArg, yieldedVal] : toOptimize)
-        llvm::dbgs() << "Value:\n- init_arg: " << initArg
-                     << "\n- index: " << getRealArgNumber(regionIterArg)
-                     << "\n- yielded_value: " << yieldedVal << "\n";
-    });
-
-    Location loc = loop.getLoc();
-    Value lowerBound = loop.getLowerBound();
-    Value upperBound = loop.getUpperBound();
-    Value step = loop.getStep();
-
-    // Convert candidate init args:
-    SmallVector<Value> newInitArgs(loop.getInitArgs());
-    for (auto [initArg, regionIterArg, yieldedVal] : toOptimize) {
-      unsigned index = getRealArgNumber(regionIterArg);
-      auto type = cast<RankedTensorType>(initArg.getType());
-      LinearLayout linearLayout =
-          *toLinearLayout(type.getShape(), type.getEncoding());
-      RankedTensorType newType = getOptimizedType(type, linearLayout, rewriter);
-      newInitArgs[index] =
-          rewriter.create<ConvertLayoutOp>(loc, newType, initArg);
-    }
-
-    // Create new for loop.
-    // We provide a custom loop body builder that will clone the original body,
-    // but adding layout conversions for the optimized block arguments.
-    auto loopBodyBuilder = [&](OpBuilder &builder, Location loc,
-                               Value inductionVar, ValueRange regionIterArgs) {
-      // Add mapping for the cloning.
-      // We need to convert the operations back to the unoptimized layout in the
-      // loop body.
-      SmallVector<Value> argValues{inductionVar};
-      llvm::append_range(argValues, regionIterArgs);
-      for (auto [initArg, regionIterArg, yieldedVal] : toOptimize) {
-        unsigned index = getRealArgNumber(regionIterArg);
-        auto type = cast<RankedTensorType>(initArg.getType());
-        Value backToOriginalLayout = builder.create<ConvertLayoutOp>(
-            loc, regionIterArg.getType(), regionIterArgs[index]);
-        argValues[regionIterArg.getArgNumber()] = backToOriginalLayout;
-      }
-
-      rewriter.mergeBlocks(&loop.getRegion().front(), builder.getBlock(),
-                           argValues);
-
-      // Modify yield operation with updated values.
-      auto yieldOp = cast<scf::YieldOp>(builder.getBlock()->getTerminator());
-      builder.setInsertionPoint(yieldOp);
-      for (auto [initArg, regionIterArg, yieldedVal] : toOptimize) {
-        unsigned index = getRealArgNumber(regionIterArg);
-        Type type = regionIterArgs[index].getType();
-        yieldOp.getResultsMutable()[index].assign(
-            builder.create<ConvertLayoutOp>(
-                loc, type, yieldOp.getResultsMutable()[index].get()));
-      }
-    };
-    auto newForOp = rewriter.create<scf::ForOp>(
-        loc, lowerBound, upperBound, step, newInitArgs, loopBodyBuilder);
-
-    LLVM_DEBUG(llvm::dbgs() << "New loop:\n" << newForOp << "\n");
-
-    // Convert for loop results back to their original types:
-    SmallVector<Value> newVals(newForOp.getResults().size());
-    llvm::transform(llvm::zip_equal(newForOp.getResults(), loop.getResults()),
-                    std::begin(newVals), [&](auto entry) -> Value {
-                      auto [newRes, origRes] = entry;
-                      if (newRes.getType() == origRes.getType())
-                        return newRes;
-                      return rewriter.create<ConvertLayoutOp>(
-                          loc, origRes.getType(), newRes);
-                    });
-    rewriter.replaceOp(loop, newVals);
-    return success();
-  }
-};
-
-struct TritonIntelGPUOptimizeElementwiseParallelism final
-    : impl::TritonIntelGPUOptimizeElementwiseParallelismBase<
-          TritonIntelGPUOptimizeElementwiseParallelism> {
-  using Base::Base;
-
-  void runOnOperation() final {
-    Operation *op = getOperation();
-    MLIRContext *ctx = op->getContext();
-    RewritePatternSet patterns(ctx);
-    patterns.add<ElementwiseOptPattern, ForOptPattern>(ctx);
-    if (failed(
-            applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
-      signalPassFailure();
-  }
-};
-} // namespace
-} // namespace mlir::triton::gpu::intel
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 3a3037f6c0..55db149919 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -100,9 +100,6 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUMaterializeBlockPointer);
   ADD_PASS_WRAPPER_0("add_optimize_reduction_locality",
                      gpu::intel::createTritonIntelGPUOptimizeReductionLocality);
-  ADD_PASS_WRAPPER_0(
-      "add_optimize_elementwise_parallelism",
-      gpu::intel::createTritonIntelGPUOptimizeElementwiseParallelism);
 }
 
 void init_triton_intel(py::module &&m) {