From cc574497bcbfd8a35e45937c42242d9890eeee6e Mon Sep 17 00:00:00 2001
From: HyoungWook Nam
Date: Tue, 30 Jul 2024 19:30:50 -0500
Subject: [PATCH] Adding conversion between LinearLayout and DPASLayout (#1684)

This PR fixes #1295. Upstream Triton tries to abstract all tensor layouts
with the LinearLayout abstraction. Solving #1295 requires using LinearLayout,
so I've added code to convert DPASLayout to LinearLayout. Currently, the
conversion is only correct for the DPAS operand C layout.
---
 scripts/skiplist/a770/language.txt            | 177 ------
 scripts/skiplist/conda/language.txt           |  49 --
 scripts/skiplist/default/language.txt         |  49 --
 scripts/skiplist/lts/language.txt             |  49 --
 .../IR/LinearLayoutConversions.h              |  19 +
 .../Dialect/TritonIntelGPU/IR/CMakeLists.txt  |   2 +-
 .../IR/LinearLayoutConversions.cpp            | 433 ++++++++++++++++++
 .../ConvertLayoutOpToLLVM.cpp                 |  24 +-
 .../lib/TritonIntelGPUToLLVM/Utility.cpp      | 127 +++++
 .../intel/lib/TritonIntelGPUToLLVM/Utility.h  | 194 +++----
 unittest/Dialect/TritonGPU/CMakeLists.txt     |   6 +
 .../TritonGPU/DPAStoLinearLayoutTest.cpp      | 126 +++++
 12 files changed, 799 insertions(+), 456 deletions(-)
 create mode 100644 third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h
 create mode 100644 third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
 create mode 100644 unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp

diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt
index 59a2daec56..4baa756191 100644
--- a/scripts/skiplist/a770/language.txt
+++ b/scripts/skiplist/a770/language.txt
@@ -1,180 +1,3 @@
-# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout1-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout2-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout3-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout4-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout5-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout6-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout7-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout1-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout2-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout3-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout4-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout5-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout6-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout7-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout0-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout3-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout2-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout2-src_layout7-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout0-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout1-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout2-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout3-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout4-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout5-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout6-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout8-interm_layout3-src_layout7-float16-64-1] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index 4860e6204d..cdac848de1 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -1,52 +1,3 @@ -# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295 -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt index 4860e6204d..cdac848de1 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -1,52 +1,3 @@ -# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295 -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] 
-test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 501a125278..9a4e14ab13 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -1,52 +1,3 @@ -# https://github.com/intel/intel-xpu-backend-for-triton/issues/1295 -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout0-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout1-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout2-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout3-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout4-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout1-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout2-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-1-64] -test/unit/language/test_core.py::test_convert2d[dst_layout5-interm_layout3-src_layout8-float16-64-1] -test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-1-64] 
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout1-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout2-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout6-interm_layout3-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout1-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout2-src_layout8-float16-64-1]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-1-64]
-test/unit/language/test_core.py::test_convert2d[dst_layout7-interm_layout3-src_layout8-float16-64-1]
 # LTS failures
 test/unit/language/test_core.py::test_dot[1-64-128-128-4-True-True-none-tf32-int8-int8-1_0]
 test/unit/language/test_core.py::test_dot[1-64-128-128-4-True-True-none-tf32-int8-int8-1_1]
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h b/third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h
new file mode 100644
index 0000000000..ba497942ae
--- /dev/null
+++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h
@@ -0,0 +1,19 @@
+// Conversions from TritonIntelGPU DpasEncodingAttr to LinearLayout.
+
+#ifndef TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H
+#define TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H
+
+#include <optional>
+
+#include "intel/include/Dialect/TritonIntelGPU/IR/Attributes.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
+
+namespace mlir::triton::gpu {
+
+std::optional<LinearLayout> DPAStoLinearLayout(ArrayRef<int64_t> shape,
+                                               Attribute layout);
+
+} // namespace mlir::triton::gpu
+
+#endif // TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt
index 64070b015d..24334b6773 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_triton_library(TritonIntelGPUIR
   Dialect.cpp
   Ops.cpp
-
+  LinearLayoutConversions.cpp
 DEPENDS
   TritonIntelGPUTableGen
   TritonIntelGPUAttrDefsIncGen
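For orientation, a minimal sketch of how the new entry point is meant to be
called; the shape and `dpasAttr` below are illustrative, not taken from the
patch:

  // dpasAttr: a DpasEncodingAttr obtained from a tensor type's encoding.
  std::optional<LinearLayout> ll =
      mlir::triton::gpu::DPAStoLinearLayout(/*shape=*/{32, 32}, dpasAttr);
  if (ll.has_value()) {
    // ll maps input dims (register, lane, warp, block) to (dim0, dim1).
    llvm::outs() << ll->toString();
  }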
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
new file mode 100644
index 0000000000..2758ed1cfe
--- /dev/null
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
@@ -0,0 +1,433 @@
+#include <vector>
+
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "triton/Dialect/Triton/IR/Utility.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
+#include "triton/Tools/StrUtil.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace mlir::triton::gpu::intel;
+
+namespace mlir::triton::gpu {
+namespace {
+
+// We use the following nomenclature in this file.
+//
+//  - ctaLayout: A layout for one block, i.e. input dims [register, lane,
+//    warp] for register layouts, and input dims [offset] for shared layouts.
+//  - cgaLayout: Arrangement of multiple blocks, i.e. input dims [block].
+//
+// Note that this is inconsistent with the type name CTALayoutAttr. That type
+// is equivalent to our cgaLayout.
+//
+// IMO the name CTALayoutAttr is wrong. If we tried to be consistent anyway,
+// then we'd have to rename ctaLayout to "warpLayout". I think that's more
+// confusing than being inconsistent about "cgaLayout", especially when we
+// have to consider the size of the warpLayout (surely that's not the
+// "warpSize").
+
+#define S(v) StringAttr::get(ctx, (v))
+
+// Returns ["dim0", "dim1", ..., "dim<rank-1>"].
+SmallVector<StringAttr> standardOutDimNames(MLIRContext *ctx, int rank) {
+  SmallVector<StringAttr> ret;
+  for (int i = 0; i < rank; i++) {
+    ret.push_back(S("dim" + llvm::Twine(i)));
+  }
+  return ret;
+}
+
+// Returns a 1D -> ND layout that's equivalent to creating a 1D -> 1D mapping
+// of size product(shape) and then reshaping to permute(shape, order).
+LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
+                        ArrayRef<unsigned> order,
+                        ArrayRef<StringAttr> outDimNames) {
+  assert(shape.size() == order.size());
+
+  MLIRContext *ctx = inDimName.getContext();
+  LinearLayout ret = LinearLayout::empty();
+  for (int i = 0; i < shape.size(); i++) {
+    // Start with the most-minor dimension, which is order[0].
+    int dim = order[i];
+    ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
+  }
+  return ret;
+}
+
+// Make a LinearLayout that maps a block-id to an N-dimensional index.
+//
+// The tensor is split up into CTAsPerCGA pieces, which are distributed among
+// the CTAsPerCGA CTAs (i.e. blocks) in the CGA (i.e. group).
+//
+// See the nomenclature note at the top of the file for an explanation of why
+// this is called makeCgaLayout when it accepts a CTALayoutAttr.
+LinearLayout makeCgaLayout(CTALayoutAttr layout) {
+  MLIRContext *ctx = layout.getContext();
+  StringAttr kBlock = S("block");
+
+  int rank = layout.getCTAOrder().size();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  LinearLayout ret = LinearLayout::empty();
+  for (int i = 0; i < rank; i++) {
+    // Start with the most minor dimension, which is order[0].
+    int dim = layout.getCTAOrder()[i];
+    int split = layout.getCTASplitNum()[dim];
+    int ctas = layout.getCTAsPerCGA()[dim];
+    assert(ctas % split == 0);
+    ret *= LinearLayout::identity1D(split, kBlock, outDimNames[dim]) *
+           LinearLayout::zeros1D(ctas / split, kBlock, outDimNames[dim]);
+  }
+
+  // Transpose to standard order (dim0, dim1, ...).
+  return ret.transposeOuts(outDimNames);
+}
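// A worked example of identityND, under assumed inputs (not from the patch):
//
//   identityND(S("register"), /*shape=*/{4, 2}, /*order=*/{1, 0},
//              /*outDimNames=*/{dim0, dim1})
//
// assigns the single dim1 bit first (order[0] = 1 is most minor), then the
// two dim0 bits, giving bases over (dim0, dim1):
//
//   L(register=1) = (0, 1)
//   L(register=2) = (1, 0)
//   L(register=4) = (2, 0)
//
// i.e. a row-major enumeration of a 4x2 tile.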
+
+// Shrinks the output set of a layout function while leaving the input set
+// unchanged, by making high-order inputs in inDimName map to the same output.
+// Attempts to shrink down to desiredSize, but this is not always possible
+// just by modifying the specified input dimension.
+//
+// We do this by making the most-major inputs to the layout map to 0. This
+// effectively duplicates data along that input dimension. For example, this
+// layout has out-dim size 32:
+//
+//   L(register=1) = 8
+//   L(register=2) = 4
+//   L(register=4) = 1
+//   L(lane=1) = 2
+//   L(lane=2) = 16.
+//
+// If we shrink it to size 16 along the `lane` dimension, we set L(lane=2) to
+// 0:
+//
+//   L(register=1) = 8
+//   L(register=2) = 4
+//   L(register=4) = 1
+//   L(lane=1) = 2
+//   L(lane=2) = 0.
+//
+// This means that lane=2 has the same data as lane=0.
+//
+// If we shrink to size 8 along the lane dimension, we set L(lane=1) = 0 as
+// well. But when we do this, we have to remove bit 1 (the value of L(lane=1))
+// from all other bases:
+//
+//   L(register=1) = 4
+//   L(register=2) = 2
+//   L(register=4) = 1
+//   L(lane=1) = 0
+//   L(lane=2) = 0.
+//
+// Note this only works because the bases are powers of two. I don't quite
+// know what to do when they're not.
+LinearLayout shrinkCodomain(const LinearLayout &layout, StringAttr inDimName,
+                            StringAttr outDimName, int desiredSize) {
+  assert(llvm::isPowerOf2_32(desiredSize));
+  int outDimIdx = layout.getOutDimIndex(outDimName);
+  int desiredZeros =
+      llvm::Log2_32(layout.getOutDimSize(outDimName) / desiredSize);
+  if (desiredZeros == 0) {
+    return layout;
+  }
+
+  // Find the desiredZeros most-major basis vectors that are not already zero.
+  // These are the ones we will set to zero.
+  SmallVector<int> basesToZero;
+  for (int i = layout.getInDimSizeLog2(inDimName) - 1;
+       i >= 0 && basesToZero.size() < desiredZeros; i--) {
+    int basis = layout.getBasis(inDimName, i, outDimName);
+    if (basis != 0) {
+      basesToZero.push_back(basis);
+    }
+  }
+
+  // Bail if all the bases are already zero; nothing more we can do.
+  if (basesToZero.empty()) {
+    return layout;
+  }
+
+  // The algorithm below only works because the bases are powers of two. I'm
+  // not sure what to do otherwise.
+  assert(llvm::all_of(basesToZero,
+                      [&](int basis) { return llvm::isPowerOf2_32(basis); }));
+
+  // We want to zero out the bases in `basesToZero`, and also "shift out" the
+  // corresponding bits from all other bases. For example if we remove the
+  // basis with value 8 = 0b100, then if another basis has value 26 = 0b11010,
+  // the 1 in its 3rd position gets removed and it becomes 10 = 0b1010.
+  //
+  // We could manually alter the bases in `layout` to achieve this, but it's
+  // perhaps simpler to use the linearity of LLs to our advantage.
+  //
+  // Consider the function O which is the identity map from out-dims to
+  // out-dims. We can easily calculate what happens when we remove the
+  // relevant bases from O. Call this new function O'.
+  //
+  // Because of linearity, removing the bases from L is equivalent to
+  // composing L with O'. So that's what we do below.
+
+  // Construct the out-dims -> out-dims identity layout O.
+  LinearLayout outputIdentity = LinearLayout::empty();
+  for (StringAttr dim : layout.getOutDimNames()) {
+    outputIdentity *=
+        LinearLayout::identity1D(layout.getOutDimSize(dim), dim, dim);
+  }
+
+  // Modify O to remove the relevant bases.
+  //
+  // TODO(jlebar): I don't like manually modifying bases here. Perhaps this
+  // should be a function on LinearLayout.
+  LinearLayout::BasesT newBases = outputIdentity.getBases();
+  llvm::sort(basesToZero);
+  for (int basis : basesToZero) {
+    int idx = llvm::Log2_32(basis);
+    for (int i = newBases[outDimName].size() - 1; i > idx; i--) {
+      newBases[outDimName][i][outDimIdx] =
+          newBases[outDimName][i - 1][outDimIdx];
+    }
+    newBases[outDimName][idx][outDimIdx] = 0;
+  }
+
+  // Construct O'.
+  LinearLayout transform(std::move(newBases),
+                         llvm::to_vector(layout.getOutDimNames()));
+
+  // Compose O' with L.
+  return layout.compose(transform);
+}
+
+// For each out-dim d, ensure the layout's out-size (i.e. its codomain) is no
+// larger than shape[d]. Do this without changing the size of the layout's
+// inputs (i.e. leave its domain unchanged).
+//
+// This function is invariant to the order of the layout's input and output
+// dimensions.
+LinearLayout ensureLayoutNotLargerThan(
+    const LinearLayout &layout,
+    const llvm::SmallDenseMap<StringAttr, int64_t> &shape) {
+  assert(shape.size() == layout.getNumOutDims());
+  if (shape.empty()) {
+    return layout;
+  }
+  MLIRContext *ctx = shape.begin()->first.getContext();
+
+  // For the purposes of this function, "block" is the "most-minor" dimension.
+  // This is just a consequence of how legacy layouts work: We only put the
+  // same tensor element into two different blocks as a last resort, only
+  // after all the registers in all the lanes in all the warps in a block
+  // already have the same tensor element. (Or, for shared layouts, only after
+  // all values in smem within a block have the same value.)
+  //
+  // inDimNames combines the in dims for register and shared layouts; that's
+  // OK because we skip in-dims that aren't present. So we'll iterate over
+  // {block, register, lane, warp} or {block, offset}.
+  SmallVector<StringAttr> inDimNames = {
+      // for both register and shared layouts
+      S("block"),
+
+      // for register layouts
+      S("register"),
+      S("lane"),
+      S("warp"),
+
+      // for shared layouts
+      S("offset"),
+  };
+
+  LinearLayout ret = layout;
+  for (auto outDimName : layout.getOutDimNames()) {
+    int32_t actualSize = layout.getOutDimSize(outDimName);
+    int32_t desiredSize = shape.lookup(outDimName);
+    if (actualSize <= desiredSize) {
+      continue;
+    }
+    assert(actualSize % desiredSize == 0);
+    for (StringAttr inDimName : llvm::reverse(inDimNames)) {
+      if (ret.hasInDim(inDimName)) {
+        ret = shrinkCodomain(ret, inDimName, outDimName, desiredSize);
+      }
+    }
+    assert(ret.getOutDimSize(outDimName) == desiredSize);
+  }
+  return ret;
+}
+
+// For each out-dim d, ensure the layout's out-size (i.e. its codomain) is no
+// smaller than shape[d]. Do this by increasing the size of the layout's
+// inputs along its most-minor dimension ("register" for register layouts,
+// "offset" for shared layouts).
+//
+// This function is invariant to the order of the layout's input dimensions,
+// but it cares about the order of the output dims, which should be
+// minor-to-major.
+LinearLayout ensureLayoutNotSmallerThan(
+    const LinearLayout &layout,
+    const llvm::SmallDenseMap<StringAttr, int64_t> &shape) {
+  assert(shape.size() == layout.getNumOutDims());
+  if (shape.empty()) {
+    return layout;
+  }
+
+  MLIRContext *ctx = shape.begin()->first.getContext();
+  StringAttr kDim = *layout.getInDimNames().begin();
+  assert(kDim == "register" || kDim == "offset");
+
+  LinearLayout ret = layout;
+  for (StringAttr outDimName : layout.getOutDimNames()) {
+    int32_t actualSize = layout.getOutDimSize(outDimName);
+    int32_t desiredSize = shape.lookup(outDimName);
+    assert(actualSize > desiredSize || desiredSize % actualSize == 0);
+    ret *= LinearLayout::identity1D(desiredSize / actualSize, kDim,
+                                    outDimName);
+    assert(ret.getOutDimSize(outDimName) >= desiredSize);
+  }
+  return ret;
+}
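// To make the two helpers concrete, a small assumed example (values not from
// the patch): a layout whose dim0 out-size is 16 against a desired shape of
// 32 gains one "register" identity bit,
//
//   ensureLayoutNotSmallerThan(ll, {{S("dim0"), 32}, {S("dim1"), 8}})
//     == ll * LinearLayout::identity1D(2, S("register"), S("dim0"))
//
// while a dim0 out-size of 64 against the same shape would instead be shrunk
// by ensureLayoutNotLargerThan, zeroing the most-major bases so the excess
// inputs broadcast.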
+
+// Combines the layout of a CTA (input dims [register, lane, warp]) with the
+// layout of a CGA (i.e. a block), and ensures that the resulting layout has
+// the given shape.
+//
+// See the nomenclature note at the top of the file for why the variable with
+// type CTALayoutAttr is called cgaLayoutAttr.
+LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
+                                    CTALayoutAttr cgaLayoutAttr,
+                                    ArrayRef<int64_t> shape) {
+  int rank = shape.size();
+  assert(ctaLayout.getNumOutDims() == rank);
+  assert(cgaLayoutAttr.getCTAOrder().size() == rank);
+  MLIRContext *ctx = cgaLayoutAttr.getContext();
+
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  llvm::SmallDenseMap<StringAttr, int64_t> labeledShape;
+  for (auto [dim, size] : llvm::zip(outDimNames, shape)) {
+    labeledShape[dim] = size;
+  }
+
+  LinearLayout cgaLayout =
+      ensureLayoutNotLargerThan(makeCgaLayout(cgaLayoutAttr), labeledShape)
+          .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
+
+  // Calculate the shape of the ctaLayout, which is `shape` divided by the
+  // cgaLayout's size.
+  llvm::SmallDenseMap<StringAttr, int64_t> ctaShape;
+  assert(llvm::to_vector(ctaLayout.getOutDimNames()) ==
+         llvm::to_vector(cgaLayout.getOutDimNames()));
+  for (auto dim : ctaLayout.getOutDimNames()) {
+    ctaShape[dim] =
+        std::max(int64_t{1}, labeledShape[dim] / cgaLayout.getOutDimSize(dim));
+  }
+
+  ctaLayout = ensureLayoutNotSmallerThan(ctaLayout, ctaShape);
+  ctaLayout = ensureLayoutNotLargerThan(ctaLayout, ctaShape);
+
+  LinearLayout ret = (ctaLayout * cgaLayout).transposeOuts(outDimNames);
+  for (auto dim : ret.getOutDimNames()) {
+    assert(ret.getOutDimSize(dim) == labeledShape[dim]);
+  }
+  return ret;
+}
+
+} // anonymous namespace
+
+// The layout example below uses repeat_count=8, systolic_depth=8,
+// execution_size=16, and operands_per_chan=2 with a warp size of 32.
+// DPASInst layout of the C operand:
+//              execution size = 16
+// <---------------------------------->
+// t0  t1  t2  t3  ~ t12 t13 t14 t15   ^
+// t16 t17 t18 t19 ~ t28 t29 t30 t31   |
+// .   .   .   .     .   .   .   .     |
+// .   .   .   .     .   .   .   .     | repeatCount = 8
+// t0  t1  t2  t3  ~ t12 t13 t14 t15   |
+// t16 t17 t18 t19 ~ t28 t29 t30 t31   v
+// In this case, the LinearLayout bases are:
+//   Register: {{2,0}, {4,0}}
+//   Lane:     {{0,1}, {0,2}, {0,4}, {0,8}, {1,0}}
+// Currently, LinearLayout is not supported for DotOperandEncoding,
+// so only the operand C conversion is implemented.
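// Reading the bases above, under the same 8x16 configuration: the coordinate
// of (register r, lane t) is the sum of the bases selected by the set bits of
// r and t, e.g.
//
//   lane 19 = 16 + 2 + 1  ->  {1,0} + {0,2} + {0,1} = (row 1, col 3),
//     matching t19 at column 3 of the diagram's second row;
//   register 3 = 2 + 1    ->  {4,0} + {2,0}         = (row 6, col 0),
//     matching each thread's registers stepping two rows apart.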
+std::vector<std::vector<int32_t>>
+DPASRegBasesC(int repeatCount, int executionSize, int threadsPerWarp) {
+  int rowsPerWarp = threadsPerWarp / executionSize;
+
+  std::vector<std::vector<int32_t>> regBases;
+
+  for (int rid = rowsPerWarp; rid < repeatCount; rid = rid * 2) {
+    regBases.push_back({rid, 0});
+  }
+
+  return regBases;
+}
+
+std::vector<std::vector<int32_t>>
+DPASLaneBasesC(int repeatCount, int executionSize, int threadsPerWarp) {
+
+  std::vector<std::vector<int32_t>> laneBases;
+
+  for (int tid = 1; tid < executionSize; tid = tid * 2) {
+    laneBases.push_back({0, tid});
+  }
+  int rowsPerWarp = threadsPerWarp / executionSize;
+  for (int row = 1; row < rowsPerWarp; row = row * 2) {
+    laneBases.push_back({row, 0});
+  }
+
+  return laneBases;
+}
+
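// As a sanity check (in the spirit of the DPAStoLinearLayoutTest.cpp unit
// test this patch adds; the assertions here are assumed, not quoted): for
// repeat_count = 8, execution_size = 16, and a 32-thread warp, rowsPerWarp
// is 2 and
//
//   DPASRegBasesC(8, 16, 32)  == {{2, 0}, {4, 0}}
//   DPASLaneBasesC(8, 16, 32) == {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}
//
// which reproduces the Register/Lane bases listed in the comment above.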
+std::optional<LinearLayout> DPAStoLinearLayout(ArrayRef<int64_t> shape,
+                                               Attribute layout) {
+
+  auto dpas = dyn_cast<DpasEncodingAttr>(layout);
+  assert(dpas && "Must be DPAS Operand C layout");
+
+  int rank = shape.size();
+  assert(rank == dpas.getWarpsPerCTA().size());
+  assert(rank == 2);
+
+  MLIRContext *ctx = dpas.getContext();
+  SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
+
+  StringAttr kRegister = S("register");
+  StringAttr kLane = S("lane");
+
+  const SmallVector<unsigned> warpsPerCTA = dpas.getWarpsPerCTA();
+  int threadsPerWarp = triton::gpu::getWarpSize(dpas);
+  auto repCluster = dpas.getRepCluster();
+  SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, 2);
+
+  auto tileLayout = LinearLayout::empty();
+  int repeatCount = dpas.getRepeatCount();
+  int executionSize = dpas.getExecutionSize();
+
+  auto regBases = DPASRegBasesC(repeatCount, executionSize, threadsPerWarp);
+  auto laneBases = DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
+  tileLayout =
+      LinearLayout({{kRegister, regBases}, {kLane, laneBases}}, outDimNames);
+
+  // The per-inst layout is repeated at each repCluster.
+  // Hence, multiply with the identity layouts starting from the
+  // least significant dimension.
+  tileLayout *=
+      LinearLayout::identity1D(repCluster[1], kRegister, outDimNames[1]);
+  tileLayout *=
+      LinearLayout::identity1D(repCluster[0], kRegister, outDimNames[0]);
+
+  // Then, it is repeated by DPASRepetitions to form the per-warp layout.
+  tileLayout *=
+      LinearLayout::identity1D(numReps[1], kRegister, outDimNames[1]);
+  tileLayout *=
+      LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
+
+  // Finally, the per-warp layout is repeated among the warps in the CTA.
+  LinearLayout warpLayout =
+      identityND(S("warp"), dpas.getWarpsPerCTA(), {0, 1}, outDimNames);
+  LinearLayout ctaLayout = tileLayout * warpLayout;
+
+  return combineCtaCgaWithShape(ctaLayout,
+                                CTALayoutAttr::getDefault(ctx, rank), shape);
+}
+
+} // namespace mlir::triton::gpu
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
index 638d922856..a7465f50da 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -3,6 +3,8 @@
 
 #include "Utility.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
@@ -471,12 +473,24 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   matchAndRewrite(ConvertLayoutOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = op.getContext();
     const auto &shape = op.getType().getShape();
-    std::optional<LinearLayout> srcLayout =
-        gpu::toLinearLayout(shape, op.getSrc().getType().getEncoding());
-    std::optional<LinearLayout> dstLayout =
-        gpu::toLinearLayout(shape, op.getType().getEncoding());
+    std::optional<LinearLayout> srcLayout;
+    auto srcTy = op.getSrc().getType();
+    if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(srcTy.getEncoding())) {
+      srcLayout = gpu::DPAStoLinearLayout(shape, dpasLayout);
+    } else {
+      srcLayout = gpu::toLinearLayout(shape, srcTy.getEncoding());
+    }
+
+    std::optional<LinearLayout> dstLayout;
+    auto dstTy = op.getType();
+    if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(dstTy.getEncoding())) {
+      dstLayout = gpu::DPAStoLinearLayout(shape, dpasLayout);
+    } else {
+      dstLayout = gpu::toLinearLayout(shape, dstTy.getEncoding());
+    }
+
     if (!srcLayout.has_value() || !dstLayout.has_value()) {
       return failure();
     }
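The DPAS-or-generic dispatch above is written out twice, once for the source
and once for the destination encoding. A small helper (hypothetical, not part
of the patch) would express it once:

  static std::optional<LinearLayout>
  toLinearLayoutMaybeDPAS(ArrayRef<int64_t> shape, Attribute enc) {
    if (auto dpas = dyn_cast<DpasEncodingAttr>(enc))
      return gpu::DPAStoLinearLayout(shape, dpas);
    return gpu::toLinearLayout(shape, enc);
  }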
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
index b4af7fdc5e..a408e7342b 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "Utility.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -140,3 +142,128 @@ Value llPrintf(ConversionPatternRewriter &rewriter, StringRef msg,
 }
 
 } // namespace mlir::LLVM::intel
+
+namespace mlir::triton::intel {
+bool emitTransferBetweenDPASAndShared(
+    RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, Value shmemBase,
+    ArrayRef<Value> shmemStrides, Location loc, RewriterBase &rewriter,
+    const TargetInfoBase &target,
+    std::function<void(VectorType, Value)> perVectorCallback) {
+  MLIRContext *ctx = rewriter.getContext();
+
+  auto shape = registerTy.getShape();
+  int rank = shape.size();
+
+  StringAttr kBlock = str_attr("block");
+  StringAttr kRegister = str_attr("register");
+  StringAttr kLane = str_attr("lane");
+  StringAttr kWarp = str_attr("warp");
+
+  std::optional<LinearLayout> regLayout;
+  if (auto dpas = dyn_cast<DpasEncodingAttr>(registerTy.getEncoding())) {
+    // Default is operandC (opidx == 2).
+    regLayout = triton::gpu::DPAStoLinearLayout(shape, dpas);
+  } else {
+    regLayout = triton::gpu::toLinearLayout(shape, registerTy.getEncoding());
+  }
+
+  std::optional<LinearLayout> sharedLayout;
+  if (auto dpas = dyn_cast<DpasEncodingAttr>(sharedTy.getEncoding())) {
+    sharedLayout = triton::gpu::DPAStoLinearLayout(shape, dpas);
+  } else {
+    sharedLayout = triton::gpu::toLinearLayout(
+        shape, sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth());
+  }
+
+  if (!regLayout.has_value() || !sharedLayout.has_value()) {
+    return false;
+  }
+  auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding());
+
+  // sharedLayout's in-dims are currently (offset, block). Reshape to
+  // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional
+  // shmem strides. (The offsetX's appear in minor-to-major order.)
+  auto sharedLegacy =
+      cast<triton::gpu::SharedEncodingAttr>(sharedTy.getEncoding());
+  SmallVector<std::pair<StringAttr, int32_t>> multiDimSharedSize;
+  for (int i = 0; i < rank; i++) {
+    int dim = sharedOrder[i];
+    int64_t size = std::max(
+        int64_t{1},
+        shape[dim] / sharedLegacy.getCTALayout().getCTASplitNum()[dim]);
+    multiDimSharedSize.push_back(
+        {str_attr("offset" + std::to_string(dim)), size});
+  }
+  multiDimSharedSize.push_back({kBlock, sharedLayout->getInDimSize(kBlock)});
+  sharedLayout = sharedLayout->reshapeIns(multiDimSharedSize);
+
+  // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1,
+  // ..., offsetXN, block), where the offsetX's are in minor-to-major order.
+  LinearLayout regToSharedLayout = regLayout->invertAndCompose(*sharedLayout);
+
+  // TODO(jlebar): We don't currently support loading from shared memory in a
+  // different CTA. We'd need to emit `mapa.shared::cluster` instructions.
+  for (int inBlock = 1; inBlock < regToSharedLayout.getInDimSize(kBlock);
+       inBlock *= 2) {
+    auto idx = llvm::to_vector(llvm::make_second_range(regToSharedLayout.apply(
+        {{kRegister, 0}, {kLane, 0}, {kWarp, 0}, {kBlock, inBlock}})));
+    // offsetX1, ..., offsetXN must all be 0.
+    if (!llvm::all_of(ArrayRef(idx).drop_back(1),
+                      [&](auto offset) { return offset == 0; })) {
+      return false;
+    }
+    int32_t outBlock = idx.back();
+    if (outBlock != inBlock) {
+      return false;
+    }
+  }
+
+  // Determine how many consecutive registers map to consecutive shmem
+  // elements in out-dimension offsetN. This is our load instruction's vector
+  // width.
+  //
+  // It's OK if the vector width we choose here is wider than the hardware
+  // supports; LLVM will legalize it.
+  //
+  // TODO(jlebar): shmemStrides are Values, but most of them are usually
+  // integer constants. We could add those constant strides to the LL, and
+  // then before calling getNumConsecutiveInOut(), we could flatten
+  // consecutive out-dims which have known strides. This would allow us to
+  // vectorize across multiple shmem out dimensions where possible.
+  const int vecElems =
+      std::min(regToSharedLayout.getNumConsecutiveInOut(),
+               maxVecElems.value_or(std::numeric_limits<int>::max()));
+
+  Value threadId = getThreadId(rewriter, loc);
+  Value threadsPerWarp = i32_val(regToSharedLayout.getInDimSize(kLane));
+  Value laneId = urem(threadId, threadsPerWarp);
+  Value warpId = udiv(threadId, threadsPerWarp);
+
+  int numElems = regToSharedLayout.getInDimSize(kRegister);
+  auto vecTy = vec_ty(elemLlvmTy, vecElems);
+  auto ptrTy = ptr_ty(ctx, /*addressSpace=*/3);
+  Value zero = i32_val(0);
+  SmallVector<Value> ret;
+    // The multi-dim address is (offsetX1, ..., offsetXN, block), where the
+    // offsets appear in minor-to-major order, and we drop_end to drop block,
+    // which we know from above will be 0.
+    auto multiDimShmemOffset =
+        llvm::to_vector(llvm::drop_end(llvm::make_second_range(
+            applyLinearLayout(loc, rewriter, regToSharedLayout,
+                              {{kRegister, i32_val(i * vecElems)},
+                               {kLane, laneId},
+                               {kWarp, warpId},
+                               {kBlock, zero}}))));
+
+    // Reorder strides according to `order`. This way they match the
+    // multi-dimensional offsets in regToSharedLayout.
+    Value shmemOffset = dot(rewriter, loc, multiDimShmemOffset,
+                            applyPermutation(shmemStrides, sharedOrder));
+    auto vecAddr = gep(ptrTy, elemLlvmTy, shmemBase, shmemOffset);
+    vecAddr.setInbounds(true);
+    perVectorCallback(vecTy, vecAddr);
+  }
+  return true;
+}
+} // namespace mlir::triton::intel
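emitTransferBetweenDPASAndShared only computes the vectorized shared-memory addresses; whether each address is loaded or stored is decided by the perVectorCallback argument. A minimal load-side sketch of a caller, mirroring the loadSharedToDistributed change in the Utility.h hunk below (regTy, memTy, smemBase, strides, and targetInfo are placeholders from an enclosing lowering):

    SmallVector<Value> loaded;
    bool ok = emitTransferBetweenDPASAndShared(
        regTy, memTy, elemTy, /*maxVecElems=*/std::nullopt, smemBase, strides,
        loc, rewriter, targetInfo, [&](VectorType vecTy, Value vecAddr) {
          // One vectorized load per callback; scalars are unpacked afterwards.
          Value vec = load(vecTy, vecAddr);
          for (int v = 0; v < vecTy.getNumElements(); ++v)
            loaded.push_back(extract_element(elemTy, vec, i32_val(v)));
        });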
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
index 7b9e1fc0c0..883460a2dc 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
@@ -11,6 +11,7 @@
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
@@ -745,6 +746,13 @@ inline DenseMap<unsigned, Value> getSwizzledSharedPtrs(
   return ret;
 }
 
+[[nodiscard]] bool emitTransferBetweenDPASAndShared(
+    RankedTensorType registerTy, MemDescType sharedTy, Type elemLlvmTy,
+    std::optional<int32_t> maxVecElems, Value shmemBase,
+    ArrayRef<Value> shmemStrides, Location loc, RewriterBase &rewriter,
+    const TargetInfoBase &target,
+    std::function<void(VectorType, Value)> perVectorCallback);
+
 inline SmallVector<Value>
 loadSharedToDistributed(Value dst, Value src, SharedMemoryObject &shrMemObj,
                         Type elemTy, Location loc, RewriterBase &rewriter,
@@ -753,68 +761,35 @@ loadSharedToDistributed(Value dst, Value src, SharedMemoryObject &shrMemObj,
                         const TargetInfoBase &target) {
   auto srcTy = cast<MemDescType>(src.getType());
   SmallVector<Value> ret;
 
-  if (emitTransferBetweenRegistersAndShared(
-          dstTy, srcTy, elemTy, /*maxVecElems=*/std::nullopt,
-          shrMemObj.getBase(), shrMemObj.getStrides(), loc, rewriter, target,
-          [&](VectorType vecTy, Value vecAddr) {
-            auto vecVal = load(vecTy, vecAddr);
-            vecVal.setAlignment(vecTy.getNumElements() *
-                                elemTy.getIntOrFloatBitWidth() / 8);
-
-            for (int v = 0; v < vecTy.getNumElements(); v++) {
-              ret.push_back(extract_element(elemTy, vecVal, i32_val(v)));
-            }
-          }))
-    return ret;
-
-  auto dstShape = dstTy.getShape();
-  assert(dstShape.size() <= 2 && "Unexpected rank of loadSharedToDistributed");
-  auto dstDistributedLayout = dstTy.getEncoding();
-  if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(dstDistributedLayout)) {
-    assert((!mmaLayout.isVolta()) &&
-           "ConvertLayout Shared->MMAv1 is not supported yet");
-  }
-  auto srcSharedLayout =
-      cast<triton::gpu::SharedEncodingAttr>(srcTy.getEncoding());
-  auto srcElemTy = srcTy.getElementType();
-  auto dstElemTy = dstTy.getElementType();
-  LDBG("loadSharedToDistributed elemTy " << elemTy << " srcElemTy " << srcElemTy
-                                         << " dstElemTy " << dstElemTy);
-  auto inOrd = triton::gpu::getOrder(srcSharedLayout);
-  auto outOrd = triton::gpu::getOrder(dstDistributedLayout);
-  unsigned outVec = inOrd == outOrd
-                        ? triton::gpu::getUniqueContigPerThread(
-                              dstDistributedLayout, dstShape)[outOrd[0]]
-                        : 1;
-
-  // If the shmem layout is not swizzled, we can trivially vectorize loads
-  // across the whole width of the most-minor dimension of the shape, because
-  // Triton requires all the dims are powers of 2.
-  unsigned inVec = srcSharedLayout.getMaxPhase() == 1
-                       ? srcTy.getShape()[inOrd[0]]
-                       : srcSharedLayout.getVec();
-  unsigned minVec = std::min(outVec, inVec);
-  unsigned outElems = triton::gpu::getTotalElemsPerThread(dstTy);
-  SmallVector<Value> offsetVals(shrMemObj.strides.size(), i32_val(0));
-
-  DenseMap<unsigned, Value> sharedPtrs = ::intel::getSwizzledSharedPtrs(
-      loc, target, outVec, dstTy, srcSharedLayout, elemTy, shrMemObj, rewriter,
-      offsetVals, shrMemObj.strides);
-  assert(outElems % minVec == 0 && "Unexpected number of elements");
-  unsigned numVecs = outElems / minVec;
-  auto wordTy = vec_ty(elemTy, minVec);
-  SmallVector<Value> outVals(outElems);
-  for (unsigned i = 0; i < numVecs; ++i) {
-    Value shrMemAddr = sharedPtrs[i * minVec];
-    shrMemAddr = bitcast(shrMemAddr, ptr_ty(rewriter.getContext(), 3));
-    auto valVec = load(wordTy, shrMemAddr);
-    valVec.setAlignment(minVec * elemTy.getIntOrFloatBitWidth() / 8);
-    for (unsigned v = 0; v < minVec; ++v) {
-      Value currVal = extract_element(elemTy, valVec, i32_val(v));
-      outVals[i * minVec + v] = currVal;
-    }
+  if (isa<triton::gpu::intel::DpasEncodingAttr>(dstTy.getEncoding())) {
+    if (emitTransferBetweenDPASAndShared(
+            dstTy, srcTy, elemTy, /*maxVecElems=*/std::nullopt,
+            shrMemObj.getBase(), shrMemObj.getStrides(), loc, rewriter, target,
+            [&](VectorType vecTy, Value vecAddr) {
+              auto vecVal = load(vecTy, vecAddr);
+              vecVal.setAlignment(vecTy.getNumElements() *
+                                  elemTy.getIntOrFloatBitWidth() / 8);
+              for (int v = 0; v < vecTy.getNumElements(); v++) {
+                ret.push_back(extract_element(elemTy, vecVal, i32_val(v)));
+              }
+            }))
+      return ret;
   }
-  return outVals;
+  bool success = emitTransferBetweenRegistersAndShared(
+      dstTy, srcTy, elemTy, /*maxVecElems=*/std::nullopt, shrMemObj.getBase(),
+      shrMemObj.getStrides(), loc, rewriter, target,
+      [&](VectorType vecTy, Value vecAddr) {
+        auto vecVal = load(vecTy, vecAddr);
+        vecVal.setAlignment(vecTy.getNumElements() *
+                            elemTy.getIntOrFloatBitWidth() / 8);
+        for (int v = 0; v < vecTy.getNumElements(); v++) {
+          ret.push_back(extract_element(elemTy, vecVal, i32_val(v)));
+        }
+      });
+  if (!success)
+    llvm::report_fatal_error("Failed to emit transfer from shared to register");
+
+  return ret;
 }
 
 inline void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
@@ -822,72 +797,39 @@ inline void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
                                      Value smemBase, ArrayRef<Value> dstStrides,
                                      Location loc, RewriterBase &rewriter,
                                      const TargetInfoBase &target) {
-
-  if (emitTransferBetweenRegistersAndShared(
-          srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
-          dstStrides, loc, rewriter, target,
-          [&](VectorType vecTy, Value vecAddr) {
-            ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
-            srcVals = srcVals.drop_front(vecTy.getNumElements());
-
-            Value vec = undef(vecTy);
-            for (int i = 0; i < vals.size(); i++) {
-              vec = insert_element(vec, vals[i], i32_val(i));
-            }
-            store(vec, vecAddr)
-                .setAlignment(vecTy.getNumElements() *
-                              elemLlvmTy.getIntOrFloatBitWidth() / 8);
-          }))
-    return;
-
-  auto srcShape = srcTy.getShape();
-  auto rank = srcShape.size();
-  assert(rank <= 3 && "Unexpected rank of storeDistributedToShared");
-  auto srcDistributedLayout = srcTy.getEncoding();
-  if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(srcDistributedLayout)) {
-    assert((!mmaLayout.isVolta()) &&
-           "ConvertLayout MMAv1->Shared is not supported yet");
-  }
-  auto dstSharedLayout =
-      cast<triton::gpu::SharedEncodingAttr>(dstTy.getEncoding());
-  auto dstElemTy = dstTy.getElementType();
-  auto inOrd = triton::gpu::getOrder(srcDistributedLayout);
-  auto outOrd = dstSharedLayout.getOrder();
-  unsigned inVec = inOrd == outOrd
-                       ? triton::gpu::getUniqueContigPerThread(
-                             srcDistributedLayout, srcShape)[inOrd[0]]
-                       : 1;
-  // If the shmem layout is not swizzled, we can trivially vectorize stores
-  // across the whole width of the most-minor dimension of the shape, because
-  // Triton requires all the dims are powers of 2.
-  unsigned outVec = dstSharedLayout.getMaxPhase() == 1
-                        ? dstTy.getShape()[inOrd[0]]
-                        : dstSharedLayout.getVec();
-  unsigned minVec = std::min(outVec, inVec);
-  unsigned numElems = triton::gpu::getTotalElemsPerThread(srcTy);
-  auto wordTy = vec_ty(elemLlvmTy, minVec);
-  Value word;
-
-  SmallVector<Value> srcStrides(dstStrides);
-  SmallVector<Value> offsetVals(rank, i32_val(0));
-  SharedMemoryObject shrMemObj(smemBase, elemLlvmTy, srcStrides, offsetVals);
-
-  DenseMap<unsigned, Value> sharedPtrs = ::intel::getSwizzledSharedPtrs(
-      loc, target, inVec, srcTy, dstSharedLayout, elemLlvmTy,
-      std::move(shrMemObj), rewriter, offsetVals, srcStrides);
-  LDBG("storeDistributedToShared: numElems = " << numElems << " minVec = "
-                                               << minVec << " " << wordTy);
-  for (unsigned i = 0; i < numElems; ++i) {
-    if (i % minVec == 0)
-      word = undef(wordTy);
-    word = insert_element(wordTy, word, srcVals[i], i32_val(i % minVec));
-    if (i % minVec == minVec - 1) {
-      Value shrMemAddr = sharedPtrs[i / minVec * minVec];
-      shrMemAddr = bitcast(shrMemAddr, ptr_ty(rewriter.getContext(), 3));
-      store(word, shrMemAddr)
-          .setAlignment(minVec * elemLlvmTy.getIntOrFloatBitWidth() / 8);
-    }
+  if (isa<triton::gpu::intel::DpasEncodingAttr>(srcTy.getEncoding())) {
+    if (emitTransferBetweenDPASAndShared(
+            srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
+            dstStrides, loc, rewriter, target,
+            [&](VectorType vecTy, Value vecAddr) {
+              ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
+              srcVals = srcVals.drop_front(vecTy.getNumElements());
+              Value vec = undef(vecTy);
+              for (int i = 0; i < vals.size(); i++) {
+                vec = insert_element(vec, vals[i], i32_val(i));
+              }
+              store(vec, vecAddr)
+                  .setAlignment(vecTy.getNumElements() *
+                                elemLlvmTy.getIntOrFloatBitWidth() / 8);
+            }))
+      return;
   }
+  bool success = emitTransferBetweenRegistersAndShared(
+      srcTy, dstTy, elemLlvmTy, /*maxVecElems=*/std::nullopt, smemBase,
+      dstStrides, loc, rewriter, target, [&](VectorType vecTy, Value vecAddr) {
+        ArrayRef<Value> vals = srcVals.take_front(vecTy.getNumElements());
+        srcVals = srcVals.drop_front(vecTy.getNumElements());
+
+        Value vec = undef(vecTy);
+        for (int i = 0; i < vals.size(); i++) {
+          vec = insert_element(vec, vals[i], i32_val(i));
+        }
+        store(vec, vecAddr)
+            .setAlignment(vecTy.getNumElements() *
+                          elemLlvmTy.getIntOrFloatBitWidth() / 8);
+      });
+  if (!success)
+    llvm::report_fatal_error("Failed to emit transfer from register to shared");
 }
 
 Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
diff --git a/unittest/Dialect/TritonGPU/CMakeLists.txt b/unittest/Dialect/TritonGPU/CMakeLists.txt
index 3e57b1c005..88e7ac463a 100644
--- a/unittest/Dialect/TritonGPU/CMakeLists.txt
+++ b/unittest/Dialect/TritonGPU/CMakeLists.txt
@@ -13,3 +13,9 @@ add_triton_ut(
   SRCS LinearLayoutConversionsTest.cpp
   LIBS TritonGPUIR
 )
+
+add_triton_ut(
+  NAME DPAStoLinearLayout
+  SRCS DPAStoLinearLayoutTest.cpp
+  LIBS TritonIntelGPUIR
+)
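To read the expectations in the new unit test that follows: a LinearLayout stores, per input dimension, one basis vector for each power-of-two input index, and apply() (used above as regToSharedLayout.apply) XORs the bases selected by the set bits of every input. Worked against the first layout in DPAS_perInst below, with ll denoting that layout, this gives, as a sketch:

    // register bases: {2,0},{4,0}; lane bases: {0,1},{0,2},{0,4},{0,8},{1,0}.
    // register=1 selects {2,0}; lane=17 (bits 0 and 4) selects {0,1} and
    // {1,0}, so the element sits at (dim0 = 2 ^ 1 = 3, dim1 = 1).
    auto coord = ll.apply(
        {{S("register"), 1}, {S("lane"), 17}, {S("warp"), 0}, {S("block"), 0}});
    // coord == {{S("dim0"), 3}, {S("dim1"), 1}}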
diff --git a/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp b/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
new file mode 100644
index 0000000000..19c2c31ccd
--- /dev/null
+++ b/unittest/Dialect/TritonGPU/DPAStoLinearLayoutTest.cpp
@@ -0,0 +1,126 @@
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
+#include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
+#include "mlir/IR/MLIRContext.h"
+#include "triton/Tools/StrUtil.h"
+#include "llvm/Support/Signals.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace mlir {
+std::ostream &operator<<(std::ostream &os, StringAttr str) {
+  os << str.str();
+  return os;
+}
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::triton::gpu::intel;
+
+namespace mlir::triton::gpu {
+namespace {
+
+class DPAStoLinearLayoutTest : public ::testing::Test {
+public:
+  void SetUp() { ctx.getOrLoadDialect<TritonIntelGPUDialect>(); }
+
+  DpasEncodingAttr dpas(ArrayRef<unsigned> warps, unsigned repeatCount,
+                        unsigned systolicDepth, unsigned executionSize,
+                        unsigned opsPerChannel, ArrayRef<unsigned> repCluster,
+                        unsigned threadsPerWarp) {
+    return DpasEncodingAttr::get(&ctx, repeatCount, systolicDepth,
+                                 executionSize, opsPerChannel, warps,
+                                 repCluster, threadsPerWarp);
+  }
+
+  StringAttr S(StringRef str) { return StringAttr::get(&ctx, str); }
+
+protected:
+  MLIRContext ctx;
+};
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_perInst) {
+  EXPECT_EQ(DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 2, {1, 1}, 32)),
+            LinearLayout(
+                {
+                    {S("register"), {{2, 0}, {4, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+  EXPECT_EQ(DPAStoLinearLayout({8, 16}, dpas({1, 1}, 8, 8, 16, 1, {1, 1}, 16)),
+            LinearLayout(
+                {
+                    {S("register"), {{1, 0}, {2, 0}, {4, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_withRepCluster) {
+  EXPECT_EQ(DPAStoLinearLayout({32, 32}, dpas({1, 1}, 8, 8, 16, 2, {4, 2}, 16)),
+            LinearLayout(
+                {
+                    {S("register"),
+                     {{1, 0}, {2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+  EXPECT_EQ(DPAStoLinearLayout({32, 32}, dpas({1, 1}, 8, 8, 16, 1, {4, 2}, 16)),
+            LinearLayout(
+                {
+                    {S("register"),
+                     {{1, 0}, {2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_withWarp) {
+  EXPECT_EQ(DPAStoLinearLayout({32, 32}, dpas({4, 1}, 8, 8, 16, 2, {1, 2}, 16)),
+            LinearLayout(
+                {
+                    {S("register"), {{1, 0}, {2, 0}, {4, 0}, {0, 16}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}}},
+                    {S("warp"), {{8, 0}, {16, 0}}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+  EXPECT_EQ(DPAStoLinearLayout({64, 64}, dpas({2, 2}, 8, 8, 16, 1, {4, 2}, 32)),
+            LinearLayout(
+                {
+                    {S("register"), {{2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+                    {S("warp"), {{32, 0}, {0, 32}}},
+                    {S("block"), {}},
+                },
+                {S("dim0"), S("dim1")}));
+}
+
+TEST_F(DPAStoLinearLayoutTest, DPAS_withDPASRepetitions) {
+  EXPECT_EQ(DPAStoLinearLayout({64, 64}, dpas({2, 1}, 8, 8, 16, 2, {4, 2}, 32)),
+            LinearLayout(
+                {
+                    {S("register"),
+                     {{2, 0}, {4, 0}, {0, 16}, {8, 0}, {16, 0}, {0, 32}}},
+                    {S("lane"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {1, 0}}},
+                    {S("warp"), {{32, 0}}},
+                    {S("block"), {}},
+                },
{S("dim0"), S("dim1")})); +} + +} // anonymous namespace +} // namespace mlir::triton::gpu + +int main(int argc, char *argv[]) { + llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}