Skip to content

Commit

Permalink
Drop elementwise
Browse files Browse the repository at this point in the history
  • Loading branch information
victor-eds committed Nov 19, 2024
1 parent 89ea181 commit 23f3b65
Show file tree
Hide file tree
Showing 6 changed files with 0 additions and 709 deletions.
259 changes: 0 additions & 259 deletions test/TritonIntelGPU/optimize-elementwise.mlir

This file was deleted.

1 change: 0 additions & 1 deletion third_party/intel/backend/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ def make_ttgir(mod, metadata, opt, properties):
passes.ttgpuir.add_prefetch(pm)
passes.ttgpuir.add_optimize_dot_operands(pm, True)
intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
intel.passes.ttgpuir.add_remove_layout_conversions(pm)
intel.passes.ttgpuir.add_reduce_data_duplication(pm)
passes.ttgpuir.add_reorder_instructions(pm)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,52 +365,4 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic
"mlir::triton::gpu::TritonGPUDialect"];
}

def TritonIntelGPUOptimizeElementwiseParallelism
: Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> {
let summary =
"Improve parallelism of elementwise operations by better utilizing hardware resources.";

let description = [{
Detect elementwise operations with an encoding causing sub-par parallelism,
i.e., with data duplication across threads, and convert the operands to a
more optimal encoding if the cost of doing so is heuristically estimated to
be sufficiently low. As of now, the cost should be 0; we only support
"unbroadcasting" tensors, i.e., dropping duplicated values held in other
threads by re-distributing them.

As an example, this pass would modify the following code:
```mlir
#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>

module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
%0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
}
}
```
Obtaining:
```mlir
#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}>

module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> {
%0 = triton_gpu.convert_layout %arg0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
%1 = triton_gpu.convert_layout %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1>
%2 = arith.addf %0, %1 : tensor<16xf32, #blocked1>
%3 = triton_gpu.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
tt.return %3 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
}
}
```

Note how the converted tensors are not sliced and thus each element in the
tensor is held by a single thread.
}];

let dependentDialects = [];
}


#endif // TRITON_INTEL_GPU_PASSES
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ add_triton_library(TritonIntelGPUTransforms
DistributeToWarps.cpp
MatchTargetSize.cpp
MaterializeBlockPointer.cpp
OptimizeElementwiseParallelism.cpp
OptimizeReductionLocality.cpp
Pipeliner/MatmulLoopPipeline.cpp
Pipeliner/SoftwarePipeliner.cpp
Expand Down
Loading

0 comments on commit 23f3b65

Please sign in to comment.