From cc574497bcbfd8a35e45937c42242d9890eeee6e Mon Sep 17 00:00:00 2001
From: HyoungWook Nam
Date: Tue, 30 Jul 2024 19:30:50 -0500
Subject: [PATCH] Adding conversion between LinearLayout and DPASLayout (#1684)

This PR fixes #1295. Upstream Triton tries to abstract all tensor layouts
with the LinearLayout abstraction. Solving #1295 requires using LinearLayout,
so I've added code to convert DPASLayout to LinearLayout. Currently, the
conversion is only correct for the DPAS operand C layout.
---
 scripts/skiplist/a770/language.txt            | 177 ------
 scripts/skiplist/conda/language.txt           |  49 --
 scripts/skiplist/default/language.txt         |  49 --
 scripts/skiplist/lts/language.txt             |  49 --
 .../IR/LinearLayoutConversions.h              |  19 +
 .../Dialect/TritonIntelGPU/IR/CMakeLists.txt  |   2 +-
 .../IR/LinearLayoutConversions.cpp            | 433 ++++++++++++++++++
 .../ConvertLayoutOpToLLVM.cpp                 |  24 +-
 .../lib/TritonIntelGPUToLLVM/Utility.cpp      | 127 +++++
 .../intel/lib/TritonIntelGPUToLLVM/Utility.h  | 194 +++----
 unittest/Dialect/TritonGPU/CMakeLists.txt     |   6 +
 .../TritonGPU/DPAStoLinearLayoutTest.cpp      | 126 +++++
 12 files changed, 799 insertions(+), 456 deletions(-)
 create mode 100644 third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h
 create mode 100644 third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
 create mode 100644 unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp

diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt
index 59a2daec56..4baa756191 100644
--- a/scripts/skiplist/a770/language.txt
+++ b/scripts/skiplist/a770/language.txt
@@ -1,180 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout1-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout2-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout3-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout4-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout5-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout6-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout7-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout1-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout2-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout3-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout4-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout5-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout6-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout7-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout0-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout3-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout2-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout7-float16-64-1] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index 4860e6204d..cdac848de1 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -1,52 +1,3 @@ -# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295 -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt index 4860e6204d..cdac848de1 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -1,52 +1,3 @@ -# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295 -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 501a125278..9a4e14ab13 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -1,52 +1,3 @@ -# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295 -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1]
 # LTS failures
 test/unit/language/test_core.py::test_dot[1-64-128-128-4-True-True-none-tf32-int8-int8-1_0]
 test/unit/language/test_core.py::test_dot[1-64-128-128-4-True-True-none-tf32-int8-int8-1_1]
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h b/third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h
new file mode 100644
index 0000000000..ba497942ae
--- /dev/null
+++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h
@@ -0,0 +1,19 @@
+// Conversions from TritonIntelGPU DpasEncodingAttr to LinearLayout.
+
+#ifndef TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H
+#define TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H
+
+#include <optional>
+
+#include "intel/include/Dialect/TritonIntelGPU/IR/Attributes.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
+
+namespace mlir::triton::gpu {
+
+std::optional<LinearLayout> DPAStoLinearLayout(ArrayRef<int64_t> shape,
+                                               Attribute layout);
+
+} // namespace mlir::triton::gpu
+
+#endif // TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt
index 64070b015d..24334b6773 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_triton_library(TritonIntelGPUIR
   Dialect.cpp
   Ops.cpp
-
+  LinearLayoutConversions.cpp
 DEPENDS
   TritonIntelGPUTableGen
   TritonIntelGPUAttrDefsIncGen
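For orientation, a minimal sketch of how the new entry point is meant to be
called; the shape and `dpasAttr` below are illustrative, not taken from the
patch:

  // dpasAttr: a DpasEncodingAttr obtained from a tensor type's encoding.
  std::optional<LinearLayout> ll =
      mlir::triton::gpu::DPAStoLinearLayout(/*shape=*/{32, 32}, dpasAttr);
  if (ll.has_value()) {
    // ll maps input dims (register, lane, warp, block) to (dim0, dim1).
    llvm::outs() << ll->toString();
  }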
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
new file mode 100644
index 0000000000..2758ed1cfe
--- /dev/null
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
@@ -0,0 +1,433 @@
+#include <vector>
+
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "triton/Dialect/Triton/IR/Utility.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
+#include "triton/Tools/StrUtil.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace mlir::triton::gpu::intel;
+
+namespace mlir::triton::gpu {
+namespace {
+
+// We use the following nomenclature in this file.
+//
+//  - ctaLayout: A layout for one block, i.e. input dims [register, lane,
+//    warp] for register layouts, and input dims [offset] for shared layouts.
+//  - cgaLayout: Arrangement of multiple blocks, i.e. input dims [block].
+//
+// Note that this is inconsistent with the type name CTALayoutAttr. That type
+// is equivalent to our cgaLayout.
+//
+// IMO the name CTALayoutAttr is wrong. If we tried to be consistent anyway,
+// then we'd have to rename ctaLayout to "warpLayout". I think that's more
+// confusing than being inconsistent about "cgaLayout", especially when we
+// have to consider the size of the warpLayout (surely that's not the
+// "warpSize").
+
+#define S(v) StringAttr::get(ctx, (v))
+
+// Returns ["dim0", "dim1", ..., "dim<rank-1>"].
+SmallVector<StringAttr> standardOutDimNames(MLIRContext *ctx, int rank) {
+  SmallVector<StringAttr> ret;
+  for (int i = 0; i < rank; i++) {
+    ret.push_back(S("dim" + llvm::Twine(i)));
+  }
+  return ret;
+}
+
+// Returns a 1D -> ND layout that's equivalent to creating a 1D -> 1D mapping
+// of size product(shape) and then reshaping to permute(shape, order).
+LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
+                        ArrayRef<unsigned> order,
+                        ArrayRef<StringAttr> outDimNames) {
+  assert(shape.size() == order.size());
+
+  MLIRContext *ctx = inDimName.getContext();
+  LinearLayout ret = LinearLayout::empty();
+  for (int i = 0; i < shape.size(); i++) {
+    // Start with the most-minor dimension, which is order[0].
+    int dim = order[i];
+    ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
+  }
+  return ret;
+}
+
+// Make a LinearLayout that maps a block-id to an N-dimensional index.
+//
+// The tensor is split up into CTAsPerCGA pieces, which are distributed among
+// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. group).
+//
+// See the nomenclature note at the top of the file for an explanation of why
+// this is called makeCgaLayout when it accepts a CTALayoutAttr.
+LinearLayout makeCgaLayout(CTALayoutAttr layout) {
+  MLIRContext *ctx = layout.getContext();
+  StringAttr kBlock = S("block");
+
+  int rank = layout.getCTAOrder().size();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  LinearLayout ret = LinearLayout::empty();
+  for (int i = 0; i < rank; i++) {
+    // Start with the most minor dimension, which is order[0].
+    int dim = layout.getCTAOrder()[i];
+    int split = layout.getCTASplitNum()[dim];
+    int ctas = layout.getCTAsPerCGA()[dim];
+    assert(ctas % split == 0);
+    ret *= LinearLayout::identity1D(split, kBlock, outDimNames[dim]) *
+           LinearLayout::zeros1D(ctas / split, kBlock, outDimNames[dim]);
+  }
+
+  // Transpose to standard order (dim0, dim1, ...).
+  return ret.transposeOuts(outDimNames);
+}
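// A worked example of identityND, under assumed inputs (not from the patch):
//
//   identityND(S("register"), /*shape=*/{4, 2}, /*order=*/{1, 0},
//              /*outDimNames=*/{dim0, dim1})
//
// assigns the single dim1 bit first (order[0] = 1 is most minor), then the
// two dim0 bits, giving bases over (dim0, dim1):
//
//   L(register=1) = (0, 1)
//   L(register=2) = (1, 0)
//   L(register=4) = (2, 0)
//
// i.e. a row-major enumeration of a 4x2 tile.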
+
+// Shrinks the output set of a layout function while leaving the input set
+// unchanged, by making high-order inputs in inDimName map to the same output.
+// Attempts to shrink down to desiredSize, but this is not always possible
+// just by modifying the specified input dimension.
+//
+// We do this by making the most-major inputs to the layout map to 0. This
+// effectively duplicates data along that input dimension. For example, this
+// layout has out-dim size 32:
+//
+//   L(register=1) = 8
+//   L(register=2) = 4
+//   L(register=4) = 1
+//   L(lane=1) = 2
+//   L(lane=2) = 16.
+//
+// If we shrink it to size 16 along the `lane` dimension, we set L(lane=2) to
+// 0:
+//
+//   L(register=1) = 8
+//   L(register=2) = 4
+//   L(register=4) = 1
+//   L(lane=1) = 2
+//   L(lane=2) = 0.
+//
+// This means that lane=2 has the same data as lane=0.
+//
+// If we shrink to size 8 along the lane dimension, we set L(lane=1) = 0 as
+// well. But when we do this, we have to remove bit 1 (the value of L(lane=1))
+// from all other bases:
+//
+//   L(register=1) = 4
+//   L(register=2) = 2
+//   L(register=4) = 1
+//   L(lane=1) = 0
+//   L(lane=2) = 0.
+//
+// Note this only works because the bases are powers of two. I don't quite
+// know what to do when they're not.
+LinearLayout shrinkCodomain(const LinearLayout &layout, StringAttr inDimName,
+                            StringAttr outDimName, int desiredSize) {
+  assert(llvm::isPowerOf2_32(desiredSize));
+  int outDimIdx = layout.getOutDimIndex(outDimName);
+  int desiredZeros =
+      llvm::Log2_32(layout.getOutDimSize(outDimName) / desiredSize);
+  if (desiredZeros == 0) {
+    return layout;
+  }
+
+  // Find the desiredZeros most-major basis vectors that are not already zero.
+  // These are the ones we will set to zero.
+  SmallVector<int> basesToZero;
+  for (int i = layout.getInDimSizeLog2(inDimName) - 1;
+       i >= 0 && basesToZero.size() < desiredZeros; i--) {
+    int basis = layout.getBasis(inDimName, i, outDimName);
+    if (basis != 0) {
+      basesToZero.push_back(basis);
+    }
+  }
+
+  // Bail if all the bases are already zero; nothing more we can do.
+  if (basesToZero.empty()) {
+    return layout;
+  }
+
+  // The algorithm below only works because the bases are powers of two. I'm
+  // not sure what to do otherwise.
+  assert(llvm::all_of(basesToZero,
+                      [&](int basis) { return llvm::isPowerOf2_32(basis); }));
+
+  // We want to zero out the bases in `basesToZero`, and also "shift out" the
+  // corresponding bits from all other bases. For example if we remove the
+  // basis with value 8 = 0b100, then if another basis has value 26 = 0b11010,
+  // the 1 in its 3rd position gets removed and it becomes 10 = 0b1010.
+  //
+  // We could manually alter the bases in `layout` to achieve this, but it's
+  // perhaps simpler to use the linearity of LLs to our advantage.
+  //
+  // Consider the function O which is the identity map from out-dims to
+  // out-dims. We can easily calculate what happens when we remove the
+  // relevant bases from O. Call this new function O'.
+  //
+  // Because of linearity, removing the bases from L is equivalent to
+  // composing L with O'. So that's what we do below.
+
+  // Construct the out-dims -> out-dims identity layout O.
+  LinearLayout outputIdentity = LinearLayout::empty();
+  for (StringAttr dim : layout.getOutDimNames()) {
+    outputIdentity *=
+        LinearLayout::identity1D(layout.getOutDimSize(dim), dim, dim);
+  }
+
+  // Modify O to remove the relevant bases.
+  //
+  // TODO(jlebar): I don't like manually modifying bases here. Perhaps this
+  // should be a function on LinearLayout.
+  LinearLayout::BasesT newBases = outputIdentity.getBases();
+  llvm::sort(basesToZero);
+  for (int basis : basesToZero) {
+    int idx = llvm::Log2_32(basis);
+    for (int i = newBases[outDimName].size() - 1; i > idx; i--) {
+      newBases[outDimName][i][outDimIdx] =
+          newBases[outDimName][i - 1][outDimIdx];
+    }
+    newBases[outDimName][idx][outDimIdx] = 0;
+  }
+
+  // Construct O'.
+  LinearLayout transform(std::move(newBases),
+                         llvm::to_vector(layout.getOutDimNames()));
+
+  // Compose O' with L.
+  return layout.compose(transform);
+}
+
+// For each out-dim d, ensure the layout's out-size (i.e. its codomain) is no
+// larger than shape[d]. Do this without changing the size of the layout's
+// inputs (i.e. leave its domain unchanged).
+//
+// This function is invariant to the order of the layout's input and output
+// dimensions.
+LinearLayout ensureLayoutNotLargerThan(
+    const LinearLayout &layout,
+    const llvm::SmallDenseMap<StringAttr, int64_t> &shape) {
+  assert(shape.size() == layout.getNumOutDims());
+  if (shape.empty()) {
+    return layout;
+  }
+  MLIRContext *ctx = shape.begin()->first.getContext();
+
+  // For the purposes of this function, "block" is the "most-minor" dimension.
+  // This is just a consequence of how legacy layouts work: We only put the
+  // same tensor element into two different blocks as a last resort, only
+  // after all the registers in all the lanes in all the warps in a block
+  // already have the same tensor element. (Or, for shared layouts, only after
+  // all values in smem within a block have the same value.)
+  //
+  // inDimNames combines the in dims for register and shared layouts; that's
+  // OK because we skip in-dims that aren't present. So we'll iterate over
+  // {block, register, lane, warp} or {block, offset}.
+  SmallVector<StringAttr> inDimNames = {
+      // for both register and shared layouts
+      S("block"),
+
+      // for register layouts
+      S("register"),
+      S("lane"),
+      S("warp"),
+
+      // for shared layouts
+      S("offset"),
+  };
+
+  LinearLayout ret = layout;
+  for (auto outDimName : layout.getOutDimNames()) {
+    int32_t actualSize = layout.getOutDimSize(outDimName);
+    int32_t desiredSize = shape.lookup(outDimName);
+    if (actualSize <= desiredSize) {
+      continue;
+    }
+    assert(actualSize % desiredSize == 0);
+    for (StringAttr inDimName : llvm::reverse(inDimNames)) {
+      if (ret.hasInDim(inDimName)) {
+        ret = shrinkCodomain(ret, inDimName, outDimName, desiredSize);
+      }
+    }
+    assert(ret.getOutDimSize(outDimName) == desiredSize);
+  }
+  return ret;
+}
+
+// For each out-dim d, ensure the layout's out-size (i.e. its codomain) is no
+// smaller than shape[d]. Do this by increasing the size of the layout's
+// inputs along its most-minor dimension ("register" for register layouts,
+// "offset" for shared layouts).
+//
+// This function is invariant to the order of the layout's input dimensions,
+// but it cares about the order of the output dims, which should be
+// minor-to-major.
+LinearLayout ensureLayoutNotSmallerThan(
+    const LinearLayout &layout,
+    const llvm::SmallDenseMap<StringAttr, int64_t> &shape) {
+  assert(shape.size() == layout.getNumOutDims());
+  if (shape.empty()) {
+    return layout;
+  }
+
+  MLIRContext *ctx = shape.begin()->first.getContext();
+  StringAttr kDim = *layout.getInDimNames().begin();
+  assert(kDim == "register" || kDim == "offset");
+
+  LinearLayout ret = layout;
+  for (StringAttr outDimName : layout.getOutDimNames()) {
+    int32_t actualSize = layout.getOutDimSize(outDimName);
+    int32_t desiredSize = shape.lookup(outDimName);
+    assert(actualSize > desiredSize || desiredSize % actualSize == 0);
+    ret *= LinearLayout::identity1D(desiredSize / actualSize, kDim,
+                                    outDimName);
+    assert(ret.getOutDimSize(outDimName) >= desiredSize);
+  }
+  return ret;
+}
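// To make the two helpers concrete, a small assumed example (values not from
// the patch): a layout whose dim0 out-size is 16 against a desired shape of
// 32 gains one "register" identity bit,
//
//   ensureLayoutNotSmallerThan(ll, {{S("dim0"), 32}, {S("dim1"), 8}})
//     == ll * LinearLayout::identity1D(2, S("register"), S("dim0"))
//
// while a dim0 out-size of 64 against the same shape would instead be shrunk
// by ensureLayoutNotLargerThan, zeroing the most-major bases so the excess
// inputs broadcast.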
+
+// Combines the layout of a CTA (input dims [register, lane, warp]) with the
+// layout of a CGA (i.e. a block), and ensures that the resulting layout has
+// the given shape.
+//
+// See the nomenclature note at the top of the file for why the variable with
+// type CTALayoutAttr is called cgaLayoutAttr.
+LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
+                                    CTALayoutAttr cgaLayoutAttr,
+                                    ArrayRef<int64_t> shape) {
+  int rank = shape.size();
+  assert(ctaLayout.getNumOutDims() == rank);
+  assert(cgaLayoutAttr.getCTAOrder().size() == rank);
+  MLIRContext *ctx = cgaLayoutAttr.getContext();
+
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  llvm::SmallDenseMap<StringAttr, int64_t> labeledShape;
+  for (auto [dim, size] : llvm::zip(outDimNames, shape)) {
+    labeledShape[dim] = size;
+  }
+
+  LinearLayout cgaLayout =
+      ensureLayoutNotLargerThan(makeCgaLayout(cgaLayoutAttr), labeledShape)
+          .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
+
+  // Calculate the shape of the ctaLayout, which is `shape` divided by the
+  // cgaLayout's size.
+  llvm::SmallDenseMap<StringAttr, int64_t> ctaShape;
+  assert(llvm::to_vector(ctaLayout.getOutDimNames()) ==
+         llvm::to_vector(cgaLayout.getOutDimNames()));
+  for (auto dim : ctaLayout.getOutDimNames()) {
+    ctaShape[dim] =
+        std::max(int64_t{1}, labeledShape[dim] / cgaLayout.getOutDimSize(dim));
+  }
+
+  ctaLayout = ensureLayoutNotSmallerThan(ctaLayout, ctaShape);
+  ctaLayout = ensureLayoutNotLargerThan(ctaLayout, ctaShape);
+
+  LinearLayout ret = (ctaLayout * cgaLayout).transposeOuts(outDimNames);
+  for (auto dim : ret.getOutDimNames()) {
+    assert(ret.getOutDimSize(dim) == labeledShape[dim]);
+  }
+  return ret;
+}
+
+} // anonymous namespace
+
+// The layout example below uses repeat_count=8, systolic_depth=8,
+// execution_size=16, and operands_per_chan=2 with a warp size of 32.
+// DPASInst layout of the C operand:
+//              execution size = 16
+// <---------------------------------->
+// t0  t1  t2  t3  ~ t12 t13 t14 t15   ^
+// t16 t17 t18 t19 ~ t28 t29 t30 t31   |
+// .   .   .   .     .   .   .   .     |
+// .   .   .   .     .   .   .   .     | repeatCount = 8
+// t0  t1  t2  t3  ~ t12 t13 t14 t15   |
+// t16 t17 t18 t19 ~ t28 t29 t30 t31   v
+// In this case, the LinearLayout bases are:
+//   Register: {{2,0}, {4,0}}
+//   Lane:     {{0,1}, {0,2}, {0,4}, {0,8}, {1,0}}
+// Currently, LinearLayout is not supported for DotOperandEncoding,
+// so only the operand C conversion is implemented.
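// Reading the bases above, under the same 8x16 configuration: the coordinate
// of (register r, lane t) is the sum of the bases selected by the set bits of
// r and t, e.g.
//
//   lane 19 = 16 + 2 + 1  ->  {1,0} + {0,2} + {0,1} = (row 1, col 3),
//     matching t19 at column 3 of the diagram's second row;
//   register 3 = 2 + 1    ->  {4,0} + {2,0}         = (row 6, col 0),
//     matching each thread's registers stepping two rows apart.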
+std::vector<std::vector<int32_t>>
+DPASRegBasesC(int repeatCount, int executionSize, int threadsPerWarp) {
+  int rowsPerWarp = threadsPerWarp / executionSize;
+
+  std::vector<std::vector<int32_t>> regBases;
+
+  for (int rid = rowsPerWarp; rid < repeatCount; rid = rid * 2) {
+    regBases.push_back({rid, 0});
+  }
+
+  return regBases;
+}
+
+std::vector<std::vector<int32_t>>
+DPASLaneBasesC(int repeatCount, int executionSize, int threadsPerWarp) {
+
+  std::vector<std::vector<int32_t>> laneBases;
+
+  for (int tid = 1; tid < executionSize; tid = tid * 2) {
+    laneBases.push_back({0, tid});
+  }
+  int rowsPerWarp = threadsPerWarp / executionSize;
+  for (int row = 1; row < rowsPerWarp; row = row * 2) {
+    laneBases.push_back({row, 0});
+  }
+
+  return laneBases;
+}
+
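// As a sanity check (in the spirit of the DPAStoLinearLayoutTest.cpp unit
// test this patch adds; the assertions here are assumed, not quoted): for
// repeat_count = 8, execution_size = 16, and a 32-thread warp, rowsPerWarp
// is 2 and
//
//   DPASRegBasesC(8, 16, 32)  == {{2, 0}, {4, 0}}
//   DPASLaneBasesC(8, 16, 32) == {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}
//
// which reproduces the Register/Lane bases listed in the comment above.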
+std::optional<LinearLayout> DPAStoLinearLayout(ArrayRef<int64_t> shape,
+                                               Attribute layout) {
+
+  auto dpas = dyn_cast<DpasEncodingAttr>(layout);
+  assert(dpas && "Must be DPAS Operand C layout");
+
+  int rank = shape.size();
+  assert(rank == dpas.getWarpsPerCTA().size());
+  assert(rank == 2);
+
+  MLIRContext *ctx = dpas.getContext();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  StringAttr kRegister = S("register");
+  StringAttr kLane = S("lane");
+
+  const SmallVector<unsigned> warpsPerCTA = dpas.getWarpsPerCTA();
+  int threadsPerWarp = triton::gpu::getWarpSize(dpas);
+  auto repCluster = dpas.getRepCluster();
+  SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, 2);
+
+  auto tileLayout = LinearLayout::empty();
+  int repeatCount = dpas.getRepeatCount();
+  int executionSize = dpas.getExecutionSize();
+
+  auto regBases = DPASRegBasesC(repeatCount, executionSize, threadsPerWarp);
+  auto laneBases = DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
+  tileLayout =
+      LinearLayout({{kRegister, regBases}, {kLane, laneBases}}, outDimNames);
+
+  // The per-inst layout is repeated at each repCluster.
+  // Hence, multiply with the identity layouts starting from the
+  // least significant dimension.
+  tileLayout *=
+      LinearLayout::identity1D(repCluster[1], kRegister, outDimNames[1]);
+  tileLayout *=
+      LinearLayout::identity1D(repCluster[0], kRegister, outDimNames[0]);
+
+  // Then, it is repeated by DPASRepetitions to form the per-warp layout.
+  tileLayout *=
+      LinearLayout::identity1D(numReps[1], kRegister, outDimNames[1]);
+  tileLayout *=
+      LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
+
+  // Finally, the per-warp layout is repeated among the warps in the CTA.
+  LinearLayout warpLayout =
+      identityND(S("warp"), dpas.getWarpsPerCTA(), {0, 1}, outDimNames);
+  LinearLayout ctaLayout = tileLayout * warpLayout;
+
+  return combineCtaCgaWithShape(ctaLayout,
+                                CTALayoutAttr::getDefault(ctx, rank), shape);
+}
+
+} // namespace mlir::triton::gpu
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
index 638d922856..a7465f50da 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -3,6 +3,8 @@
 
 #include "Utility.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
@@ -471,12 +473,24 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   matchAndRewrite(ConvertLayoutOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = op.getContext();
     const auto &shape = op.getType().getShape();
-    std::optional<LinearLayout> srcLayout =
-        gpu::toLinearLayout(shape, op.getSrc().getType().getEncoding());
-    std::optional<LinearLayout> dstLayout =
-        gpu::toLinearLayout(shape, op.getType().getEncoding());
+    std::optional<LinearLayout> srcLayout;
+    auto srcTy = op.getSrc().getType();
+    if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(srcTy.getEncoding())) {
+      srcLayout = gpu::DPAStoLinearLayout(shape, dpasLayout);
+    } else {
+      srcLayout = gpu::toLinearLayout(shape, srcTy.getEncoding());
+    }
+
+    std::optional<LinearLayout> dstLayout;
+    auto dstTy = op.getType();
+    if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(dstTy.getEncoding())) {
+      dstLayout = gpu::DPAStoLinearLayout(shape, dpasLayout);
+    } else {
+      dstLayout = gpu::toLinearLayout(shape, dstTy.getEncoding());
+    }
+
     if (!srcLayout.has_value() || !dstLayout.has_value()) {
       return failure();
     }
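The DPAS-or-generic dispatch above is written out twice, once for the source
and once for the destination encoding. A small helper (hypothetical, not part
of the patch) would express it once:

  static std::optional<LinearLayout>
  toLinearLayoutMaybeDPAS(ArrayRef<int64_t> shape, Attribute enc) {
    if (auto dpas = dyn_cast<DpasEncodingAttr>(enc))
      return gpu::DPAStoLinearLayout(shape, dpas);
    return gpu::toLinearLayout(shape, enc);
  }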
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
index b4af7fdc5e..a408e7342b 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "Utility.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -140,3 +142,128 @@ Value llPrintf(ConversionPatternRewriter &rewriter, StringRef msg,
 }
 
 } // namespace mlir::LLVM::intel
+
+namespace mlir::triton::intel {
+bool emitTransferBetweenDPASAndShared(
+    RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, Value shmemBase,
+    ArrayRef<Value> shmemStrides, Location loc, RewriterBase &rewriter,
+    const TargetInfoBase &target,
+    std::function<void(VectorType, Value)> perVectorCallback) {
+  MLIRContext *ctx = rewriter.getContext();
+
+  auto shape = registerTy.getShape();
+  int rank = shape.size();
+
+  StringAttr kBlock = str_attr("block");
+  StringAttr kRegister = str_attr("register");
+  StringAttr kLane = str_attr("lane");
+  StringAttr kWarp = str_attr("warp");
+
+  std::optional<LinearLayout> regLayout;
+  if (auto dpas = dyn_cast<DpasEncodingAttr>(registerTy.getEncoding())) {
+    // Default is operandC (opidx == 2).
+    regLayout = triton::gpu::DPAStoLinearLayout(shape, dpas);
+  } else {
+    regLayout = triton::gpu::toLinearLayout(shape, registerTy.getEncoding());
+  }
+
+  std::optional<LinearLayout> sharedLayout;
+  if (auto dpas = dyn_cast<DpasEncodingAttr>(sharedTy.getEncoding())) {
+    sharedLayout = triton::gpu::DPAStoLinearLayout(shape, dpas);
+  } else {
+    sharedLayout = triton::gpu::toLinearLayout(
+        shape, sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth());
+  }
+
+  if (!regLayout.has_value() || !sharedLayout.has_value()) {
+    return false;
+  }
+  auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding());
+
+  // sharedLayout's in-dims are currently (offset, block). Reshape to
+  // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional
+  // shmem strides. (The offsetX's appear in minor-to-major order.)
+  auto sharedLegacy =
+      cast<triton::gpu::SharedEncodingAttr>(sharedTy.getEncoding());
+  SmallVector<std::pair<StringAttr, int32_t>> multiDimSharedSize;
+  for (int i = 0; i < rank; i++) {
+    int dim = sharedOrder[i];
+    int64_t size = std::max(
+        int64_t{1},
+        shape[dim] / sharedLegacy.getCTALayout().getCTASplitNum()[dim]);
+    multiDimSharedSize.push_back(
+        {str_attr("offset" + std::to_string(dim)), size});
+  }
+  multiDimSharedSize.push_back({kBlock, sharedLayout->getInDimSize(kBlock)});
+  sharedLayout = sharedLayout->reshapeIns(multiDimSharedSize);
+
+  // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1,
+  // ..., offsetXN, block), where the offsetX's are in minor-to-major order.
+  LinearLayout regToSharedLayout = regLayout->invertAndCompose(*sharedLayout);
+
+  // TODO(jlebar): We don't currently support loading from shared memory in a
+  // different CTA. We'd need to emit `mapa.shared::cluster` instructions.
+  for (int inBlock = 1; inBlock < regToSharedLayout.getInDimSize(kBlock);
+       inBlock *= 2) {
+    auto idx = llvm::to_vector(llvm::make_second_range(regToSharedLayout.apply(
+        {{kRegister, 0}, {kLane, 0}, {kWarp, 0}, {kBlock, inBlock}})));
+    // offsetX1, ..., offsetXN must all be 0.
+    if (!llvm::all_of(ArrayRef(idx).drop_back(1),
+                      [&](auto offset) { return offset == 0; })) {
+      return false;
+    }
+    int32_t outBlock = idx.back();
+    if (outBlock != inBlock) {
+      return false;
+    }
+  }
+
+  // Determine how many consecutive registers map to consecutive shmem
+  // elements in out-dimension offsetN. This is our load instruction's vector
+  // width.
+  //
+  // It's OK if the vector width we choose here is wider than the hardware
+  // supports; LLVM will legalize it.
+  //
+  // TODO(jlebar): shmemStrides are Values, but most of them are usually
+  // integer constants. We could add those constant strides to the LL, and
+  // then before calling getNumConsecutiveInOut(), we could flatten
+  // consecutive out-dims which have known strides. This would allow us to
+  // vectorize across multiple shmem out dimensions where possible.
+  const int vecElems =
+      std::min(regToSharedLayout.getNumConsecutiveInOut(),
+               maxVecElems.value_or(std::numeric_limits<int>::max()));
+
+  Value threadId = getThreadId(rewriter, loc);
+  Value threadsPerWarp = i32_val(regToSharedLayout.getInDimSize(kLane));
+  Value laneId = urem(threadId, threadsPerWarp);
+  Value warpId = udiv(threadId, threadsPerWarp);
+
+  int numElems = regToSharedLayout.getInDimSize(kRegister);
+  auto vecTy = vec_ty(elemLlvmTy, vecElems);
+  auto ptrTy = ptr_ty(ctx, /*addressSpace=*/3);
+  Value zero = i32_val(0);
+  SmallVector<Value> ret;
+    // The multi-dim address is (offsetX1, ..., offsetXN, block), where the
+    // offsets appear in minor-to-major order, and we drop_end to drop block,
+    // which we know from above will be 0.
+    auto multiDimShmemOffset =
+        llvm::to_vector(llvm::drop_end(llvm::make_second_range(
+            applyLinearLayout(loc, rewriter, regToSharedLayout,
+                              {{kRegister, i32_val(i * vecElems)},
+                               {kLane, laneId},
+                               {kWarp, warpId},
+                               {kBlock, zero}}))));
+
+    // Reorder strides according to `order`. This way they match the
+    // multi-dimensional offsets in regToSharedLayout.
+    Value shmemOffset = dot(rewriter, loc, multiDimShmemOffset,
+                            applyPermutation(shmemStrides, sharedOrder));
+    auto vecAddr = gep(ptrTy, elemLlvmTy, shmemBase, shmemOffset);
+    vecAddr.setInbounds(true);
+    perVectorCallback(vecTy, vecAddr);
+  }
+  return true;
+}
+} // namespace mlir::triton::intel
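emitTransferBetweenDPASAndShared only computes the vectorized shared-memory addresses; whether each address is loaded or stored is decided by the perVectorCallback argument. A minimal load-side sketch of a caller, mirroring the loadSharedToDistributed change in the Utility.h hunk below (regTy, memTy, smemBase, strides, and targetInfo are placeholders from an enclosing lowering):

    SmallVector<Value> loaded;
    bool ok = emitTransferBetweenDPASAndShared(
        regTy, memTy, elemTy, /*maxVecElems=*/std::nullopt, smemBase, strides,
        loc, rewriter, targetInfo, [&](VectorType vecTy, Value vecAddr) {
          // One vectorized load per callback; scalars are unpacked afterwards.
          Value vec = load(vecTy, vecAddr);
          for (int v = 0; v < vecTy.getNumElements(); ++v)
            loaded.push_back(extract_element(elemTy, vec, i32_val(v)));
        });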
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
index 7b9e1fc0c0..883460a2dc 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
@@ -11,6 +11,7 @@
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
@@ -745,6 +746,13 @@ inline DenseMap<unsigned, Value> getSwizzledSharedPtrs(
   return ret;
 }
 
+[[nodiscard]] bool emitTransferBetweenDPASAndShared(
+    RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, Value shmemBase,
+    ArrayRef<Value> shmemStrides, Location loc, RewriterBase &rewriter,
+    const TargetInfoBase &target,
+    std::function<void(VectorType, Value)> perVectorCallback);
+
 inline SmallVector<Value>
 loadSharedToDistributed(Value dst, Value src, SharedMemoryObject &shrMemObj,
                         Type elemTy, Location loc, RewriterBase &rewriter,
@@ -753,68 +761,35 @@ loadSharedToDistributed(Value dst, Value src, SharedMemoryObject &shrMemObj,
                         const TargetInfoBase &target) {
   auto srcTy = cast<MemDescType>(src.getType());
   SmallVector<Value> ret;
 
-  if (emitTransferBetweenRegistersAndShared(
-          dstTy, srcTy, elemTy, /*maxVecElems=*/std::nullopt,
-          shrMemObj.getBase(), shrMemObj.getStrides(), loc, rewriter, target,
-          [&](VectorType vecTy, Value vecAddr) {
-            auto vecVal = load(vecTy, vecAddr);
-            vecVal.setAlignment(vecTy.getNumElements() *
-                                elemTy.getIntOrFloatBitWidth() / 8);
-
-            for (int v = 0; v < vecTy.getNumElements(); v++) {
-              ret.push_back(extract_element(elemTy, vecVal, i32_val(v)));
-            }
-          }))
-    return ret;
-
-  auto dstShape = dstTy.getShape();
-  assert(dstShape.size() <= 2 && "Unexpected rank of loadSharedToDistributed");
-  auto dstDistributedLayout = dstTy.getEncoding();
-  if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(dstDistributedLayout)) {
-    assert((!mmaLayout.isVolta()) &&
-           "ConvertLayout Shared->MMAv1 is not supported yet");
-  }
-  auto srcSharedLayout =
-      cast<triton::gpu::SharedEncodingAttr>(srcTy.getEncoding());
-  auto srcElemTy = srcTy.getElementType();
-  auto dstElemTy = dstTy.getElementType();
-  LDBG("loadSharedToDistributed elemTy " << elemTy << " srcElemTy " << srcElemTy
-                                         << " dstElemTy " << dstElemTy);
-  auto inOrd = triton::gpu::getOrder(srcSharedLayout);
-  auto outOrd = triton::gpu::getOrder(dstDistributedLayout);
-  unsigned outVec = inOrd == outOrd
-                        ? triton::gpu::getUniqueContigPerThread(
-                              dstDistributedLayout, dstShape)[outOrd[0]]
-                        : 1;
-
-  // If the shmem layout is not swizzled, we can trivially vectorize loads
-  // across the whole width of the most-minor dimension of the shape, because
-  // Triton requires all the dims are powers of 2.
-  unsigned inVec = srcSharedLayout.getMaxPhase() == 1
-                       ? srcTy.getShape()[inOrd[0]]
-                       : srcSharedLayout.getVec();
-  unsigned minVec = std::min(outVec, inVec);
-  unsigned outElems = triton::gpu::getTotalElemsPerThread(dstTy);
-  SmallVector<Value> offsetVals(shrMemObj.strides.size(), i32_val(0));
-
-  DenseMap<unsigned, Value> sharedPtrs = ::intel::getSwizzledSharedPtrs(
-      loc, target, outVec, dstTy, srcSharedLayout, elemTy, shrMemObj, rewriter,
-      offsetVals, shrMemObj.strides);
-  assert(outElems % minVec == 0 && "Unexpected number of elements");
-  unsigned numVecs = outElems / minVec;
-  auto wordTy = vec_ty(elemTy, minVec);
-  SmallVector<Value> outVals(outElems);
-  for (unsigned i = 0; i < numVecs; ++i) {
-    Value shrMemAddr = sharedPtrs[i * minVec];
-    shrMemAddr = bitcast(shrMemAddr, ptr_ty(rewriter.getContext(), 3));
-    auto valVec = load(wordTy, shrMemAddr);
-    valVec.setAlignment(minVec * elemTy.getIntOrFloatBitWidth() / 8);
-    for (unsigned v = 0; v < minVec; ++v) {
-      Value currVal = extract_element(elemTy, valVec, i32_val(v));
-      outVals[i * minVec + v] = currVal;
-    }
+  if (isa<triton::gpu::intel::DpasEncodingAttr>(dstTy.getEncoding())) {
+    if (emitTransferBetweenDPASAndShared(
+            dstTy, srcTy, elemTy, /*maxVecElems=*/std::nullopt,
+            shrMemObj.getBase(), shrMemObj.getStrides(), loc, rewriter, target,
+            [&](VectorType vecTy, Value vecAddr) {
+              auto vecVal = load(vecTy, vecAddr);
+              vecVal.setAlignment(vecTy.getNumElements() *
+                                  elemTy.getIntOrFloatBitWidth() / 8);
+              for (int v = 0; v < vecTy.getNumElements(); v++) {
+                ret.push_back(extract_element(elemTy, vecVal, i32_val(v)));
+              }
+            }))
+      return ret;
   }
-  return outVals;
+  bool success = emitTransferBetweenRegistersAndShared(
+      dstTy, srcTy, elemTy, /*maxVecElems=*/std::nullopt, shrMemObj.getBase(),
+      shrMemObj.getStrides(), loc, rewriter, target,
+      [&](VectorType vecTy, Value vecAddr) {
+        auto vecVal = load(vecTy, vecAddr);
+        vecVal.setAlignment(vecTy.getNumElements() *
+                            elemTy.getIntOrFloatBitWidth() / 8);
+        for (int v = 0; v < vecTy.getNumElements(); v++) {
+          ret.push_back(extract_element(elemTy, vecVal, i32_val(v)));
+        }
+      });
+  if (!success)
+    llvm::report_fatal_error("Failed to emit transfer from shared to register");
+
+  return ret;
 }
 
 inline void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
@@ -822,72 +797,39 @@ inline void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
                                      Value smemBase, ArrayRef<Value> dstStrides,
                                      Location loc, RewriterBase &rewriter,
                                      const TargetInfoBase &target) {
-
-  if (emitTransferBetweenRegistersAndShared(
-          srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
-          dstStrides, loc, rewriter, target,
-          [&](VectorType vecTy, Value vecAddr) {
-            ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
-            srcVals = srcVals.drop_front(vecTy.getNumElements());
-
-            Value vec = undef(vecTy);
-            for (int i = 0; i < vals.size(); i++) {
-              vec = insert_element(vec, vals[i], i32_val(i));
-            }
-            store(vec, vecAddr)
-                .setAlignment(vecTy.getNumElements() *
-                              elemLlvmTy.getIntOrFloatBitWidth() / 8);
-          }))
-    return;
-
-  auto srcShape = srcTy.getShape();
-  auto rank = srcShape.size();
-  assert(rank <= 3 && "Unexpected rank of storeDistributedToShared");
-  auto srcDistributedLayout = srcTy.getEncoding();
-  if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(srcDistributedLayout)) {
-    assert((!mmaLayout.isVolta()) &&
-           "ConvertLayout MMAv1->Shared is not supported yet");
-  }
-  auto dstSharedLayout =
-      cast<triton::gpu::SharedEncodingAttr>(dstTy.getEncoding());
-  auto dstElemTy = dstTy.getElementType();
-  auto inOrd = triton::gpu::getOrder(srcDistributedLayout);
-  auto outOrd = dstSharedLayout.getOrder();
-  unsigned inVec = inOrd == outOrd
-                       ? triton::gpu::getUniqueContigPerThread(
-                             srcDistributedLayout, srcShape)[inOrd[0]]
-                       : 1;
-  // If the shmem layout is not swizzled, we can trivially vectorize stores
-  // across the whole width of the most-minor dimension of the shape, because
-  // Triton requires all the dims are powers of 2.
-  unsigned outVec = dstSharedLayout.getMaxPhase() == 1
-                        ? dstTy.getShape()[inOrd[0]]
-                        : dstSharedLayout.getVec();
-  unsigned minVec = std::min(outVec, inVec);
-  unsigned numElems = triton::gpu::getTotalElemsPerThread(srcTy);
-  auto wordTy = vec_ty(elemLlvmTy, minVec);
-  Value word;
-
-  SmallVector<Value> srcStrides(dstStrides);
-  SmallVector<Value> offsetVals(rank, i32_val(0));
-  SharedMemoryObject shrMemObj(smemBase, elemLlvmTy, srcStrides, offsetVals);
-
-  DenseMap<unsigned, Value> sharedPtrs = ::intel::getSwizzledSharedPtrs(
-      loc, target, inVec, srcTy, dstSharedLayout, elemLlvmTy,
-      std::move(shrMemObj), rewriter, offsetVals, srcStrides);
-  LDBG("storeDistributedToShared: numElems = " << numElems << " minVec = "
-                                               << minVec << " " << wordTy);
-  for (unsigned i = 0; i < numElems; ++i) {
-    if (i % minVec == 0)
-      word = undef(wordTy);
-    word = insert_element(wordTy, word, srcVals[i], i32_val(i % minVec));
-    if (i % minVec == minVec - 1) {
-      Value shrMemAddr = sharedPtrs[i / minVec * minVec];
-      shrMemAddr = bitcast(shrMemAddr, ptr_ty(rewriter.getContext(), 3));
-      store(word, shrMemAddr)
-          .setAlignment(minVec * elemLlvmTy.getIntOrFloatBitWidth() / 8);
-    }
+  if (isa<triton::gpu::intel::DpasEncodingAttr>(srcTy.getEncoding())) {
+    if (emitTransferBetweenDPASAndShared(
+            srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
+            dstStrides, loc, rewriter, target,
+            [&](VectorType vecTy, Value vecAddr) {
+              ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
+              srcVals = srcVals.drop_front(vecTy.getNumElements());
+              Value vec = undef(vecTy);
+              for (int i = 0; i < vals.size(); i++) {
+                vec = insert_element(vec, vals[i], i32_val(i));
+              }
+              store(vec, vecAddr)
+                  .setAlignment(vecTy.getNumElements() *
+                                elemLlvmTy.getIntOrFloatBitWidth() / 8);
+            }))
+      return;
   }
+  bool success = emitTransferBetweenRegistersAndShared(
+      srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
+      dstStrides, loc, rewriter, target, [&](VectorType vecTy, Value vecAddr) {
+        ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
+        srcVals = srcVals.drop_front(vecTy.getNumElements());
+
+        Value vec = undef(vecTy);
+        for (int i = 0; i < vals.size(); i++) {
+          vec = insert_element(vec, vals[i], i32_val(i));
+        }
+        store(vec, vecAddr)
+            .setAlignment(vecTy.getNumElements() *
+                          elemLlvmTy.getIntOrFloatBitWidth() / 8);
+      });
+  if (!success)
+    llvm::report_fatal_error("Failed to emit transfer from register to shared");
 }
 
 Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
diff --git a/unittest/Dialect/TritonGPU/CMakeLists.txt b/unittest/Dialect/TritonGPU/CMakeLists.txt
index 3e57b1c005..88e7ac463a 100644
--- a/unittest/Dialect/TritonGPU/CMakeLists.txt
+++ b/unittest/Dialect/TritonGPU/CMakeLists.txt
@@ -13,3 +13,9 @@ add_triton_ut(
   SRCS LinearLayoutConversionsTest.cpp
   LIBS TritonGPUIR
 )
+
+add_triton_ut(
+  NAME DPAStoLinearLayout
+  SRCS DPAStoLinearLayoutTest.cpp
+  LIBS TritonIntelGPUIR
+)
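To read the expectations in the new unit test that follows: a LinearLayout stores, per input dimension, one basis vector for each power-of-two input index, and apply() (used above as regToSharedLayout.apply) XORs the bases selected by the set bits of every input. Worked against the first layout in DPAS_perInst below, with ll denoting that layout, this gives, as a sketch:

    // register bases: {2,0},{4,0}; lane bases: {0,1},{0,2},{0,4},{0,8},{1,0}.
    // register=1 selects {2,0}; lane=17 (bits 0 and 4) selects {0,1} and
    // {1,0}, so the element sits at (dim0 = 2 ^ 1 = 3, dim1 = 1).
    auto coord = ll.apply(
        {{S("register"), 1}, {S("lane"), 17}, {S("warp"), 0}, {S("block"), 0}});
    // coord == {{S("dim0"), 3}, {S("dim1"), 1}}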
diff --git a/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp b/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
new file mode 100644
index 0000000000..19c2c31ccd
--- /dev/null
+++ b/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
@@ -0,0 +1,126 @@
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "mlir/IR/MLIRContext.h"
+#include "triton/Tools/StrUtil.h"
+#include "llvm/Support/Signals.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace mlir {
+std::ostream &operator<<(std::ostream &os, StringAttr str) {
+  os << str.str();
+  return os;
+}
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::triton::gpu::intel;
+
+namespace mlir::triton::gpu {
+namespace {
+
+class DPAStoLinearLayoutTest : public ::testing::Test {
+public:
+  void SetUp() { ctx.getOrLoadDialect<TritonIntelGPUDialect>(); }
+
+  DpasEncodingAttr dpas(ArrayRef<unsigned> warps, unsigned repeatCount,
+                        unsigned systolicDepth, unsigned executionSize,
+                        unsigned opsPerChannel, ArrayRef<unsigned> repCluster,
+                        unsigned threadsPerWarp) {
+    return DpasEncodingAttr::get(&ctx, repeatCount, systolicDepth,
+                                 executionSize, opsPerChannel, warps,
+                                 repCluster, threadsPerWarp);
+  }
+
+  StringAttr S(StringRef str) { return StringAttr::get(&ctx, str); }
+
+protected:
+  MLIRContext ctx;
+};
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) {
+  EXPECT_EQ(DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32)),
+            LinearLayout(
+                {
+                    {S("register"), {{2, 0}, {4, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+  EXPECT_EQ(DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 16)),
+            LinearLayout(
+                {
+                    {S("register"), {{1, 0}, {2, 0}, {4, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) {
+  EXPECT_EQ(DPAStoLinearLayout({32, 32}, dpas({1, 1}, 8, 8, 16, 2, {4, 2}, 16)),
+            LinearLayout(
+                {
+                    {S("register"),
+                     {{1, 0}, {2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+  EXPECT_EQ(DPAStoLinearLayout({32, 32}, dpas({1, 1}, 8, 8, 16, 1, {4, 2}, 16)),
+            LinearLayout(
+                {
+                    {S("register"),
+                     {{1, 0}, {2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_withWarp) {
+  EXPECT_EQ(DPAStoLinearLayout({32, 32}, dpas({4, 1}, 8, 8, 16, 2, {1, 2}, 16)),
+            LinearLayout(
+                {
+                    {S("register"), {{1, 0}, {2, 0}, {4, 0}, {0, 16}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {{8, 0}, {16, 0}}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+  EXPECT_EQ(DPAStoLinearLayout({64, 64}, dpas({2, 2}, 8, 8, 16, 1, {4, 2}, 32)),
+            LinearLayout(
+                {
+                    {S("register"), {{2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+                    {S("warp"), {{32, 0}, {0, 32}}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_withDPASRepetitions) {
+  EXPECT_EQ(DPAStoLinearLayout({64, 64}, dpas({2, 1}, 8, 8, 16, 2, {4, 2}, 32)),
+            LinearLayout(
+                {
+                    {S("register"),
+                     {{2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}, {0, 32}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+                    {S("warp"), {{32, 0}}},
+                    {S("block"), {}},
+                },
{S("dim0"), S("dim1")})); +} + +} // anonymous namespace +} // namespace mlir::triton::gpu + +int main(int argc, char *argv[]) { + llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}