diff --git a/test/TritonIntelGPU/optimize-elementwise.mlir b/test/TritonIntelGPU/optimize-elementwise.mlir
deleted file mode 100644
index a6c08eaab9..0000000000
--- a/test/TritonIntelGPU/optimize-elementwise.mlir
+++ /dev/null
@@ -1,259 +0,0 @@
-// RUN: triton-opt %s --split-input-file -tritonintelgpu-optimize-elementwise-parallelism | FileCheck %s
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_dpas(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>)
-  tt.func @test_dpas(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>)
-  tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_1]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked_repeat(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>)
-  tt.func @test_blocked_repeat(%arg0: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<64xf32, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<64xf32, #[[$ATTR_1]]> -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [2], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked_multi_warp(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> {
-  tt.func @test_blocked_multi_warp(%arg0: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<32xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<32xf32, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<32xf32, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<32xf32, #[[$ATTR_1]]> -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [16], warpsPerCTA = [4], order = [0]}>
-
-#blocked = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_blocked_multi_warp_double_stride(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> {
-  tt.func @test_blocked_multi_warp_double_stride(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<128xf16, #[[$ATTR_1]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<128xf16, #[[$ATTR_1]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_1]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_1]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>
-    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [8], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_mma_multi_warp_double_stride(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> {
-  tt.func @test_mma_multi_warp_double_stride(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_0]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [2], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_mma_multi_warp_double_stride_repeat(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> {
-  tt.func @test_mma_multi_warp_double_stride_repeat(%arg0: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<128xf16, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<128xf16, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<128xf16, #[[$ATTR_0]]> -> tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_5]] : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<128xf16, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-// CHECK-LABEL: tt.func @test_multi_user(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_2:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>)
-  tt.func @test_multi_user(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg2: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_4:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_5:.*]] = arith.addf %[[VAL_3]], %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]>
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_6:.*]] = triton_gpu.convert_layout %[[VAL_5]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_7:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_8:.*]] = triton_gpu.convert_layout %[[VAL_2]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_9:.*]] = arith.addf %[[VAL_7]], %[[VAL_8]] : tensor<16xf32, #[[$ATTR_0]]>
-    %1 = arith.addf %arg0, %arg2 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_10:.*]] = triton_gpu.convert_layout %[[VAL_9]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_11:.*]] = triton_gpu.convert_layout %[[VAL_6]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_12:.*]] = triton_gpu.convert_layout %[[VAL_10]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_11]], %[[VAL_12]] : tensor<16xf32, #[[$ATTR_0]]>
-    %2 = arith.addf %0, %1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_14:.*]] = triton_gpu.convert_layout %[[VAL_13]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_14]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %2 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-
-// CHECK-LABEL: tt.func @test_basic_loop(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index
-  tt.func @test_basic_loop(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg2: index, %arg3: index, %arg4: index) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> {
-// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_6:.*]] = scf.for %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_4]] iter_args(%[[VAL_8:.*]] = %[[VAL_5]]) -> (tensor<16xf32, #[[$ATTR_0]]>) {
-    %0 = scf.for %arg5 = %arg2 to %arg3 step %arg4 iter_args(%arg6 = %arg0) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) {
-// CHECK: %[[VAL_9:.*]] = triton_gpu.convert_layout %[[VAL_8]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_10:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_11:.*]] = triton_gpu.convert_layout %[[VAL_9]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] : tensor<16xf32, #[[$ATTR_0]]>
-      %1 = arith.addf %arg1, %arg6 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_13:.*]] = triton_gpu.convert_layout %[[VAL_12]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_14:.*]] = triton_gpu.convert_layout %[[VAL_13]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: scf.yield %[[VAL_14]] : tensor<16xf32, #[[$ATTR_0]]>
-      scf.yield %1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-    }
-// CHECK: %[[VAL_15:.*]] = triton_gpu.convert_layout %[[VAL_6]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_15]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
-
-// -----
-
-// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-
-// CHECK-LABEL: tt.func @test_advanced_loop(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>,
-// CHECK-SAME: %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index, %[[VAL_4:.*]]: index,
-// CHECK-SAME: %[[VAL_5:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-  tt.func @test_advanced_loop(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg2: index, %arg3: index, %arg4: index, %arg5: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) {
-// CHECK: %[[VAL_6:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_7:.*]] = triton_gpu.convert_layout %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_8:.*]]:2 = scf.for %[[VAL_9:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_4]] iter_args(%[[VAL_10:.*]] = %[[VAL_6]], %[[VAL_11:.*]] = %[[VAL_7]]) -> (tensor<16xf32, #[[$ATTR_0]]>, tensor<16xf32, #[[$ATTR_0]]>) {
-    %0:2 = scf.for %arg6 = %arg2 to %arg3 step %arg4 iter_args(%arg7 = %arg0, %arg8 = %arg5) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) {
-// CHECK: %[[VAL_12:.*]] = triton_gpu.convert_layout %[[VAL_10]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_13:.*]] = triton_gpu.convert_layout %[[VAL_11]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_14:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_15:.*]] = triton_gpu.convert_layout %[[VAL_12]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_16:.*]] = arith.addf %[[VAL_14]], %[[VAL_15]] : tensor<16xf32, #[[$ATTR_0]]>
-      %1 = arith.addf %arg1, %arg7 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-// CHECK: %[[VAL_17:.*]] = triton_gpu.convert_layout %[[VAL_16]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_18:.*]] = triton_gpu.convert_layout %[[VAL_17]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_19:.*]] = triton_gpu.convert_layout %[[VAL_13]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_20:.*]] = arith.addf %[[VAL_18]], %[[VAL_19]] : tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_21:.*]] = triton_gpu.convert_layout %[[VAL_20]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_22:.*]] = triton_gpu.convert_layout %[[VAL_17]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: %[[VAL_23:.*]] = triton_gpu.convert_layout %[[VAL_21]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]>
-// CHECK: scf.yield %[[VAL_22]], %[[VAL_23]] : tensor<16xf32, #[[$ATTR_0]]>, tensor<16xf32, #[[$ATTR_0]]>
-      %2 = arith.addf %1, %arg8 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-      scf.yield %1, %2 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-    }
-// CHECK: }
-// CHECK: %[[VAL_24:.*]] = triton_gpu.convert_layout %[[VAL_25:.*]]#0 : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: %[[VAL_26:.*]] = triton_gpu.convert_layout %[[VAL_25]]#1 : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-// CHECK: tt.return %[[VAL_24]], %[[VAL_26]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>
-    tt.return %0#0, %0#1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>
-  }
-}
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index d1176ba078..99307e50f3 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -252,7 +252,6 @@ def make_ttgir(mod, metadata, opt, properties):
         passes.ttgpuir.add_prefetch(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
         intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
-        intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_reduce_data_duplication(pm)
         passes.ttgpuir.add_reorder_instructions(pm)
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
index 1d81bc4741..c551a96856 100644
--- a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
+++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
@@ -365,52 +365,4 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic
                            "mlir::triton::gpu::TritonGPUDialect"];
 }
 
-def TritonIntelGPUOptimizeElementwiseParallelism
-    : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> {
-  let summary =
-      "Improve parallelism of elementwise operations to better utilize hardware resources.";
-
-  let description = [{
-    Detect elementwise operations with an encoding causing sub-par parallelism,
-    i.e., with data duplication across threads, and convert the operands to a
-    more optimal encoding if the cost of doing so is heuristically estimated to
-    be sufficiently low. As of now, the cost must be 0: we only support
-    "unbroadcasting" tensors, i.e., dropping duplicated values held by other
-    threads by re-distributing them.
-
-    As an example, this pass would modify the following code:
-```mlir
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-  tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-    %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-    tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-```
-    Obtaining:
-```mlir
-#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
-#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>
-
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
-  tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
-    %0 = triton_gpu.convert_layout %arg0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-    %1 = triton_gpu.convert_layout %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
-    %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1>
-    %3 = triton_gpu.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-    tt.return %3 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
-  }
-}
-```
-
-    Note how the converted tensors are not sliced and thus each element in the
-    tensor is held by a single thread.
-  }];
-
-  let dependentDialects = [];
-}
-
 #endif // TRITON_INTEL_GPU_PASSES
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt
index 46d121a070..dbc641e2a3 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt
@@ -4,7 +4,6 @@ add_triton_library(TritonIntelGPUTransforms
   DistributeToWarps.cpp
   MatchTargetSize.cpp
   MaterializeBlockPointer.cpp
-  OptimizeElementwiseParallelism.cpp
   OptimizeReductionLocality.cpp
   Pipeliner/MatmulLoopPipeline.cpp
   Pipeliner/SoftwarePipeliner.cpp
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp
deleted file mode 100644
index af997d2193..0000000000
--- a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp
+++ /dev/null
@@ -1,397 +0,0 @@
-//===- OptimizeElementwiseParallelism.cpp -------------------------------*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// This file implements the `tritonintelgpu-optimize-elementwise-parallelism`
-/// pass.
-//===----------------------------------------------------------------------===//
-
-#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h"
-
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/Triton/IR/Utility.h"
-#include "triton/Dialect/TritonGPU/IR/Dialect.h"
-
-#define DEBUG_TYPE "tritonintelgpu-optimize-elementwise-parallelism"
-
-namespace mlir::triton::gpu::intel {
-#define GEN_PASS_DEF_TRITONINTELGPUOPTIMIZEELEMENTWISEPARALLELISM
-#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h.inc"
-
-namespace {
-bool isMultiWarpValidLayoutForUnbroadcast(const LinearLayout &linearLayout,
-                                          int32_t numWorkGroupPos,
-                                          Builder &builder) {
-  StringAttr kLane = builder.getStringAttr("lane");
-  StringAttr kWarp = builder.getStringAttr("warp");
-  int32_t subGroupSize = linearLayout.getInDimSize(kLane);
-  ArrayRef<int32_t> numContiguousPerWarp = linearLayout.getBasis(kWarp, 0);
-  // Check the warp dimension hasn't been sliced away and we have n *
-  // sub_group_size contiguous elements per warp.
-  if (numContiguousPerWarp == ArrayRef<int32_t>{0} ||
-      numContiguousPerWarp[0] % subGroupSize != 0)
-    return false;
-  int32_t expectedValue = numContiguousPerWarp[0] * 2;
-  for (int32_t pos = 1; pos < numWorkGroupPos; ++pos) {
-    if (linearLayout.getBasis(kWarp, pos) != ArrayRef<int32_t>{expectedValue})
-      return false;
-    expectedValue *= 2;
-  }
-  return true;
-}
-
-/// Return whether the input linear layout can be unbroadcasted.
-///
-/// A layout is valid for being "unbroadcasted" along its lanes if:
-/// - The 'lane' input dimension is zero: this means the lane dimension has been
-/// sliced.
-/// - The size of the input 'block' dimension is 1. This is true for XPU
-/// backend.
-/// - The size of the input 'warp' dimension is 1 or there are n*sub_group_size
-/// contiguous elements per warp.
-///
-/// Broadcasted layouts are layouts with sliced lane, warp or block (not
-/// possible for XPU backend) dimensions, i.e., the same data is owned by
-/// different threads.
-bool isValidLayoutForUnbroadcast(const LinearLayout &linearLayout,
-                                 Builder &builder) {
-  StringAttr kLane = builder.getStringAttr("lane");
-  StringAttr kWarp = builder.getStringAttr("warp");
-  StringAttr kBlock = builder.getStringAttr("block");
-  StringAttr kDim0 = builder.getStringAttr("dim0");
-  // 'lane' dimension must have been sliced away completely.
-  if (!linearLayout.sublayoutIsZero(kLane, kDim0))
-    return false;
-  // Only single block for now.
-  if (linearLayout.getInDimSize(kBlock) != 1)
-    return false;
-  // 'warp' dimension hasn't been sliced away and there are n*sub_group_size
-  // contiguous elements in each warp (or there is a single warp).
-  int32_t numWorkGroupPos = linearLayout.getInDimSizeLog2(kWarp);
-  return numWorkGroupPos == 0 || isMultiWarpValidLayoutForUnbroadcast(
-                                     linearLayout, numWorkGroupPos, builder);
-}
-
-/// Generic checks for the operation not looking at the tensor type.
-bool isCandidateOp(Operation *op) {
-  // Rely on this for a simpler pass.
-  if (!op->hasTrait<OpTrait::SameOperandsAndResultType>() ||
-      op->getNumResults() != 1)
-    return false;
-
-  // Skip complex operations.
-  if (op->hasSuccessors() || op->getNumRegions() != 0)
-    return false;
-
-  return true;
-}
-
-bool optimizationDoesNotWorsenRegisterPressure(
-    Value value, RankedTensorType newType, SmallPtrSetImpl<Value> &visited) {
-  if (!visited.insert(value).second)
-    return true;
-  // All users must be operations we will optimize too or layout conversions we
-  // will introduce later.
-  return llvm::all_of(value.getUses(), [&visited, newType](OpOperand &operand) {
-    Operation *owner = operand.getOwner();
-
-    // We will be introducing just this operation later.
-    if (auto convertLayout = dyn_cast<ConvertLayoutOp>(owner))
-      return convertLayout.getResult().getType() == newType;
-
-    // Broadcasted in source.
-    if (isa<ExpandDimsOp>(owner))
-      return true;
-
-    // Allow for loop optimizations.
-    if (isa<scf::ForOp, scf::YieldOp>(owner))
-      return true;
-
-    // Only allow candidates. Check only operation constraints. We do not have
-    // to check the type as we did already.
-    if (!owner->hasTrait<OpTrait::Elementwise>() || !isCandidateOp(owner))
-      return false;
-
-    // Check other operands fit the constraints.
-    return llvm::all_of(owner->getOperands(),
-                        [&visited, newType](Value operand) {
-                          return optimizationDoesNotWorsenRegisterPressure(
-                              operand, newType, visited);
-                        });
-  });
-}
-
-/// Get optimized unbroadcasted tensor type.
-///
-/// Get optimized ranked tensor type after unbroadcasting. As we only support 1D
-/// tensors, this is as simple as getting an "unbroadcasted" blocked-encoded 1D
-/// tensor type.
-RankedTensorType getOptimizedType(RankedTensorType type,
-                                  const LinearLayout &linearLayout,
-                                  Builder &builder) {
-  StringAttr kWarp = builder.getStringAttr("warp");
-
-  auto encoding = cast<DistributedEncodingTrait>(type.getEncoding());
-  unsigned threadsPerWarp = product(encoding.getThreadsPerWarp());
-  unsigned warpsPerCTA = product(encoding.getWarpsPerCTA());
-  [[maybe_unused]] unsigned ctaSplitNum = product(encoding.getCTASplitNum());
-  assert(ctaSplitNum == 1 && "Expecting single CTA");
-
-  RankedTensorType::Builder typeBuilder(type);
-  int32_t numWorkGroupPos = linearLayout.getInDimSizeLog2(kWarp);
-  unsigned sizePerThread =
-      numWorkGroupPos == 0
-          ? 1
-          : linearLayout.getBasis(kWarp, 0)[0] / threadsPerWarp;
-  CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(builder.getContext(), 1);
-  auto newEncoding = builder.getAttr<BlockedEncodingAttr>(
-      sizePerThread, threadsPerWarp, warpsPerCTA, /*order=*/0, ctaLayout);
-  typeBuilder.setEncoding(newEncoding);
-  return typeBuilder;
-}
-
-bool isCandidateTypeForOptimization(RankedTensorType type) {
-  if (!type)
-    return false;
-
-  // Check if the layout is actually bad and can be optimized using our
-  // approach. We only support 1D tensors for now as these are easier to
-  // handle.
-  Attribute layout = type.getEncoding();
-  if (!layout || type.getRank() != 1)
-    return false;
-  std::optional<LinearLayout> linearLayout =
-      toLinearLayout(type.getShape(), layout);
-
-  LLVM_DEBUG(llvm::dbgs() << "Checking linear layout:\n"
-                          << linearLayout << "\n");
-
-  Builder builder(type.getContext());
-  if (!linearLayout || !isValidLayoutForUnbroadcast(*linearLayout, builder))
-    return false;
-
-  // As we are dealing with 1D tensors, we can do a simple transform to obtain
-  // a more optimized operation.
-  RankedTensorType newType = getOptimizedType(type, *linearLayout, builder);
-
-  LLVM_DEBUG(llvm::dbgs() << "Would convert to type:\n" << newType << "\n");
-
-  return true;
-}
-
-bool canLoopInductionVarBeOptimized(Value initArg, Value regionIterArg,
-                                    Value yieldedVal) {
-  LLVM_DEBUG(llvm::dbgs() << "Checking loop vars:\n"
-                          << initArg << "\n"
-                          << regionIterArg << "\n"
-                          << yieldedVal << "\n");
-
-  // Check the induction variable is a candidate for this optimization based on
-  // its type.
-  auto type = dyn_cast<RankedTensorType>(initArg.getType());
-  if (!isCandidateTypeForOptimization(type))
-    return false;
-  assert(type && "Expecting ranked tensor type");
-
-  // We want to check all the variables involved in the optimization can be
-  // replaced by ones with a more efficient layout without affecting register
-  // pressure.
-
-  LinearLayout linearLayout =
-      *toLinearLayout(type.getShape(), type.getEncoding());
-  Builder builder(type.getContext());
-  RankedTensorType newType = getOptimizedType(type, linearLayout, builder);
-
-  SmallPtrSet<Value, 8> visited;
-  // Only allow initArgs with a single use for now.
-  return initArg.hasOneUse() &&
-         optimizationDoesNotWorsenRegisterPressure(yieldedVal, newType,
-                                                   visited) &&
-         optimizationDoesNotWorsenRegisterPressure(regionIterArg, newType,
-                                                   visited);
-}
-
-struct ElementwiseOptPattern final
-    : OpTraitRewritePattern<OpTrait::Elementwise> {
-  using OpTraitRewritePattern::OpTraitRewritePattern;
-
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const final {
-    LLVM_DEBUG(llvm::dbgs() << "Checking operation:\n" << *op << "\n");
-
-    // Rely on this for a simpler pass.
-    if (!isCandidateOp(op))
-      return failure();
-
-    // Check the operation is a candidate for this optimization based on its
-    // type.
-    auto type = dyn_cast<RankedTensorType>(op->getResult(0).getType());
-    if (!isCandidateTypeForOptimization(type))
-      return failure();
-    assert(type && "Expecting ranked tensor type");
-
-    LinearLayout linearLayout =
-        *toLinearLayout(type.getShape(), type.getEncoding());
-
-    // As we are dealing with 1D tensors, we can do a simple transform to obtain
-    // a more optimized operation.
-    Location loc = op->getLoc();
-    RankedTensorType newType = getOptimizedType(type, linearLayout, rewriter);
-
-    // Check the operands are not used by other operations. This will prevent
-    // register pressure increase:
-    if (SmallPtrSet<Value, 8> visited;
-        !llvm::all_of(op->getOperands(), [&visited, newType](Value operand) {
-          return optimizationDoesNotWorsenRegisterPressure(operand, newType,
-                                                           visited);
-        }))
-      return failure();
-
-    // Obtain converted operands.
-    SmallVector<Value> newOperands(op->getNumOperands());
-    llvm::transform(op->getOperands(), std::begin(newOperands),
-                    [&rewriter, loc, newType](Value operand) {
-                      return rewriter.create<ConvertLayoutOp>(loc, newType,
-                                                              operand);
-                    });
-
-    // Now we create the optimized operation:
-    StringAttr opName = op->getName().getIdentifier();
-    ArrayRef<NamedAttribute> attributes = op->getAttrs();
-    Operation *newElementwiseOp =
-        rewriter.create(loc, opName, newOperands, newType, attributes);
-    assert(newElementwiseOp->getNumResults() == 1 &&
-           "Expecting single result operation");
-
-    // Convert to unoptimized encoding for further use.
-    Value newValue = newElementwiseOp->getResult(0);
-    rewriter.replaceOpWithNewOp<ConvertLayoutOp>(op, type, newValue);
-
-    LLVM_DEBUG(llvm::dbgs() << "Conversion took place.\n");
-
-    return success();
-  }
-};
-
-struct ForOptPattern final : OpRewritePattern<scf::ForOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(scf::ForOp loop,
-                                PatternRewriter &rewriter) const final {
-    LLVM_DEBUG(llvm::dbgs() << "Checking operation:\n" << loop << "\n");
-
-    // Tuples of <init_arg, region_iter_arg, yielded_value>.
-    SmallVector<std::tuple<Value, BlockArgument, Value>> toOptimize;
-    llvm::copy_if(llvm::zip_equal(loop.getInitArgs(), loop.getRegionIterArgs(),
-                                  loop.getYieldedValues()),
-                  std::back_inserter(toOptimize), [](auto entry) {
-                    auto [initArg, regionIterArg, yieldedVal] = entry;
-                    return canLoopInductionVarBeOptimized(
-                        initArg, regionIterArg, yieldedVal);
-                  });
-    if (toOptimize.empty())
-      return failure();
-
-    constexpr auto getRealArgNumber = [](BlockArgument blockArg) {
-      return blockArg.getArgNumber() - 1;
-    };
-
-    LLVM_DEBUG({
-      llvm::dbgs() << "Selected for optimization:\n";
-      for (auto [initArg, regionIterArg, yieldedVal] : toOptimize)
-        llvm::dbgs() << "Value:\n- init_arg: " << initArg
-                     << "\n- index: " << getRealArgNumber(regionIterArg)
-                     << "\n- yielded_value: " << yieldedVal << "\n";
-    });
-
-    Location loc = loop.getLoc();
-    Value lowerBound = loop.getLowerBound();
-    Value upperBound = loop.getUpperBound();
-    Value step = loop.getStep();
-
-    // Convert candidate init args:
-    SmallVector<Value> newInitArgs(loop.getInitArgs());
-    for (auto [initArg, regionIterArg, yieldedVal] : toOptimize) {
-      unsigned index = getRealArgNumber(regionIterArg);
-      auto type = cast<RankedTensorType>(initArg.getType());
-      LinearLayout linearLayout =
-          *toLinearLayout(type.getShape(), type.getEncoding());
-      RankedTensorType newType = getOptimizedType(type, linearLayout, rewriter);
-      newInitArgs[index] =
-          rewriter.create<ConvertLayoutOp>(loc, newType, initArg);
-    }
-
-    // Create new for loop.
-    // We provide a custom loop body builder that will clone the original body,
-    // but adding layout conversions for the optimized block arguments.
-    auto loopBodyBuilder = [&](OpBuilder &builder, Location loc,
-                               Value inductionVar, ValueRange regionIterArgs) {
-      // Add mapping for the cloning.
-      // We need to convert the operations back to the unoptimized layout in the
-      // loop body.
-      SmallVector<Value> argValues{inductionVar};
-      llvm::append_range(argValues, regionIterArgs);
-      for (auto [initArg, regionIterArg, yieldedVal] : toOptimize) {
-        unsigned index = getRealArgNumber(regionIterArg);
-        auto type = cast<RankedTensorType>(initArg.getType());
-        Value backToOriginalLayout = builder.create<ConvertLayoutOp>(
-            loc, regionIterArg.getType(), regionIterArgs[index]);
-        argValues[regionIterArg.getArgNumber()] = backToOriginalLayout;
-      }
-
-      rewriter.mergeBlocks(&loop.getRegion().front(), builder.getBlock(),
-                           argValues);
-
-      // Modify yield operation with updated values.
-      auto yieldOp = cast<scf::YieldOp>(builder.getBlock()->getTerminator());
-      builder.setInsertionPoint(yieldOp);
-      for (auto [initArg, regionIterArg, yieldedVal] : toOptimize) {
-        unsigned index = getRealArgNumber(regionIterArg);
-        Type type = regionIterArgs[index].getType();
-        yieldOp.getResultsMutable()[index].assign(
-            builder.create<ConvertLayoutOp>(
-                loc, type, yieldOp.getResultsMutable()[index].get()));
-      }
-    };
-    auto newForOp = rewriter.create<scf::ForOp>(
-        loc, lowerBound, upperBound, step, newInitArgs, loopBodyBuilder);
-
-    LLVM_DEBUG(llvm::dbgs() << "New loop:\n" << newForOp << "\n");
-
-    // Convert for loop results back to their original types:
-    SmallVector<Value> newVals(newForOp.getResults().size());
-    llvm::transform(llvm::zip_equal(newForOp.getResults(), loop.getResults()),
-                    std::begin(newVals), [&](auto entry) -> Value {
-                      auto [newRes, origRes] = entry;
-                      if (newRes.getType() == origRes.getType())
-                        return newRes;
-                      return rewriter.create<ConvertLayoutOp>(
-                          loc, origRes.getType(), newRes);
-                    });
-    rewriter.replaceOp(loop, newVals);
-    return success();
-  }
-};
-
-struct TritonIntelGPUOptimizeElementwiseParallelism final
-    : impl::TritonIntelGPUOptimizeElementwiseParallelismBase<
-          TritonIntelGPUOptimizeElementwiseParallelism> {
-  using Base::Base;
-
-  void runOnOperation() final {
-    Operation *op = getOperation();
-    MLIRContext *ctx = op->getContext();
-    RewritePatternSet patterns(ctx);
-    patterns.add<ElementwiseOptPattern, ForOptPattern>(ctx);
-    if (failed(
-            applyPatternsAndFoldGreedily(getOperation(), std::move(patterns))))
-      signalPassFailure();
-  }
-};
-} // namespace
-} // namespace mlir::triton::gpu::intel
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 3a3037f6c0..55db149919 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -100,9 +100,6 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUMaterializeBlockPointer);
   ADD_PASS_WRAPPER_0("add_optimize_reduction_locality",
                      gpu::intel::createTritonIntelGPUOptimizeReductionLocality);
-  ADD_PASS_WRAPPER_0(
-      "add_optimize_elementwise_parallelism",
-      gpu::intel::createTritonIntelGPUOptimizeElementwiseParallelism);
 }
 
 void init_triton_intel(py::module &&m) {