From 3c296abe517d47036bba1c660089eee1cd929a7f Mon Sep 17 00:00:00 2001
From: Hongtao Yu
Date: Mon, 4 Nov 2024 22:59:56 -0800
Subject: [PATCH 1/3] =?UTF-8?q?[BACKEND]=20Update=20llvm=20to=20llvm/llvm-?=
 =?UTF-8?q?project@fa57c7a6a5f594a9e3ae2dbe35=E2=80=A6=20(#5064)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumping llvm to include a loop unroller fix:
https://github.com/llvm/llvm-project/pull/114573. This is needed for
subsequent loop unroller upstreaming work.
---
 cmake/llvm-hash.txt                   | 2 +-
 test/lib/Instrumentation/GPUHello.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt
index 6f79225948..840d6d9f3e 100644
--- a/cmake/llvm-hash.txt
+++ b/cmake/llvm-hash.txt
@@ -1 +1 @@
-b74e588e1f460eb48ceb1a30cf8ac870b7537dcc
+fa57c7a6a5f594a9e3ae2dbe3542cf89a20cdd73
diff --git a/test/lib/Instrumentation/GPUHello.cpp b/test/lib/Instrumentation/GPUHello.cpp
index 3bee8ce90c..5c71857c8f 100644
--- a/test/lib/Instrumentation/GPUHello.cpp
+++ b/test/lib/Instrumentation/GPUHello.cpp
@@ -61,7 +61,7 @@ bool GpuHello::runOnModule(Module &module) {
 PassPluginLibraryInfo getPassPluginInfo() {
   const auto callback = [](PassBuilder &pb) {
-    pb.registerOptimizerLastEPCallback([&](ModulePassManager &mpm, auto) {
+    pb.registerOptimizerLastEPCallback([&](ModulePassManager &mpm, auto, auto) {
       mpm.addPass(GpuHello());
       return true;
     });

From d2b8659913250fc3861c25eea0a29d3823109bb2 Mon Sep 17 00:00:00 2001
From: Hongtao Yu
Date: Mon, 4 Nov 2024 23:41:33 -0800
Subject: [PATCH 2/3] [LoopUnroll] Do not pipeline epilog loops generated by
 loop unrolling (#5027)

The epilog loop created by the loop unroller may not be run at all if the
main unrolled loop covers all of the original loop iterations, so pipelining
it non-speculatively may not be beneficial. Pipelining it can also cause
correctness issues when combined with the downstream PTXAS optimizer.
---
 lib/Dialect/Triton/Transforms/LoopUnroll.cpp | 20 ++++++++++++++------
 test/Triton/loop-unroll.mlir                 |  1 +
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp
index 257e734b7f..cb25d41a25 100644
--- a/lib/Dialect/Triton/Transforms/LoopUnroll.cpp
+++ b/lib/Dialect/Triton/Transforms/LoopUnroll.cpp
@@ -22,8 +22,6 @@
 
 namespace mlir::triton {
 
-static const char *loopUnrollFactorAttrName = "tt.loop_unroll_factor";
-
 namespace {
 
 class LoopUnrollPass : public TritonLoopUnrollBase {
@@ -31,12 +29,15 @@ class LoopUnrollPass : public TritonLoopUnrollBase {
   int getUnrollFactorOrDefault(scf::ForOp forOp) {
     // Use the attribute attached to the loop if it exists otherwise set the
    // factor to 1 to suppress the unrolling.
-    if (auto factor = forOp->getAttrOfType(
-        mlir::triton::loopUnrollFactorAttrName))
+    if (auto factor =
+            forOp->getAttrOfType(loopUnrollFactorAttrName))
       return factor.getInt();
     return 1;
   }
 
+  const char *loopUnrollFactorAttrName = "tt.loop_unroll_factor";
+  const char *pipelineStagesAttrName = "tt.num_stages";
+
 public:
   LoopUnrollPass() = default;
   LoopUnrollPass(const LoopUnrollPass &) {}
@@ -49,11 +50,18 @@ class LoopUnrollPass : public TritonLoopUnrollBase {
       loops.push_back(forOp);
     });
 
+    auto ctx = getOperation()->getContext();
     for (auto loop : loops) {
       auto unrollFactor = getUnrollFactorOrDefault(loop);
-      loop->removeAttr(mlir::triton::loopUnrollFactorAttrName);
+      loop->removeAttr(loopUnrollFactorAttrName);
       LDBG("Unrolling loop by " << unrollFactor << " times\n" << loop);
-      (void)loopUnrollByFactor(loop, unrollFactor);
+      auto resultLoops = loopUnrollByFactor(loop, unrollFactor);
+      // Do not pipeline the epilog loop.
+      if (succeeded(resultLoops) && resultLoops->epilogueLoopOp) {
+        (*resultLoops->epilogueLoopOp)
+            ->setAttr(pipelineStagesAttrName,
+                      mlir::IntegerAttr::get(IntegerType::get(ctx, 32), 1));
+      }
     }
   }
 };
diff --git a/test/Triton/loop-unroll.mlir b/test/Triton/loop-unroll.mlir
index 9166630281..531a14fffa 100644
--- a/test/Triton/loop-unroll.mlir
+++ b/test/Triton/loop-unroll.mlir
@@ -13,6 +13,7 @@ tt.func @add_kernel_unroll(%arg0: tensor<256x!tt.ptr>, %arg1: i32) {
    // CHECK: scf.for
    // CHECK:   tt.load
    // CHECK-NOT:   tt.load
+   // CHECK:   tt.num_stages = 1 : i32
    %2:2 = scf.for %arg3 = %c1_i32 to %arg1 step %c1_i32 iter_args(%arg4 = %1, %arg5 = %arg0) -> (tensor<256xf32>, tensor<256x!tt.ptr>) : i32 {
      %3 = tt.load %arg5 : tensor<256x!tt.ptr>
      %4 = arith.addf %arg4, %3 : tensor<256xf32>

From 38a11b859fff79ea214256d3f1cfe43d54e36c2c Mon Sep 17 00:00:00 2001
From: Mario Lezcano Casado <3291265+lezcano@users.noreply.github.com>
Date: Tue, 5 Nov 2024 08:29:48 +0000
Subject: [PATCH 3/3] [BACKEND] Fix uses of getOrder(DotOperand(Nvidia) and
 MMA(Nvidia)) (#5055)

We use `getOrder` very liberally throughout the codebase, when we really mean
to use `getThreadOrder`. This is an issue when the input layout is a
`DotOperand(mma(opIdx=1))`, where the thread order and the matrix order are
opposite.

We found this to be an issue when a PR changed the `getOrder` of
`DotOperand(Hopper)` to an incorrect one and CI still passed! The issue here
is that the LLVM lowering for wgmma and the LinearLayout do not use
`getOrder`, but many other subsystems do, so many heuristics would get an
incorrect order and potentially be disabled. This is particularly problematic
for `DotOperand(opIdx=1)` on NVIDIA hardware, as `getThreadOrder` and
`getOrder` are different!

While doing so we:
- Audit most (all?) of the calls to `getOrder(dotOperand)`. It turns out that
  most of them really meant `getThreadOrder`
- Fix the ordering methods of `SliceEncodingAttr` to be consistent
- Move the implementation of `getWarpOrder` to the Attr classes, to follow
  OOP principles

The test strategy was to add `llvm::report_fatal_error("Testing");` within
`getOrder(nvidiaMma)` and `getOrder(DotOperand(nvidiaMma))` and triage all
errors that were raised in CI.
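A minimal sketch of the distinction (illustrative only, not taken from the
diff below; `layout` stands for any distributed encoding attribute):

    // Order of the tile's tensor dimensions, fastest-varying first.
    SmallVector<unsigned> order = triton::gpu::getOrder(layout);
    // Order in which the threads of a warp are laid out across those dims.
    SmallVector<unsigned> threadOrder = triton::gpu::getThreadOrder(layout);
    // For blocked layouts the two coincide, so the misuse goes unnoticed; for
    // DotOperand(mma(opIdx=1)) they do not, so heuristics that reason
    // per-thread (pointer contiguity, alignment, redundant-data masks) need
    // the thread order.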
--- include/triton/Analysis/Utility.h | 2 +- lib/Analysis/Allocation.cpp | 3 + lib/Analysis/AxisInfo.cpp | 6 +- lib/Analysis/Utility.cpp | 10 +-- .../TritonGPUToLLVM/ReduceOpToLLVM.cpp | 4 +- .../TritonGPUToLLVM/ViewOpToLLVM.cpp | 7 +- lib/Dialect/TritonGPU/IR/Dialect.cpp | 71 +++++++++---------- .../TritonGPU/IR/LinearLayoutConversions.cpp | 1 + .../Transforms/ReduceDataDuplication.cpp | 10 +-- python/test/unit/language/test_core.py | 1 - .../TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp | 7 +- .../DecomposeUnsupportedConversions.cpp | 2 +- .../LoadStoreOpToLLVM.cpp | 4 +- 13 files changed, 63 insertions(+), 65 deletions(-) diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h index df6029db0d..20da978449 100644 --- a/include/triton/Analysis/Utility.h +++ b/include/triton/Analysis/Utility.h @@ -66,7 +66,7 @@ class ReduceOpHelper { // The shape of the shared memory space needed for the reduction. SmallVector getScratchRepShape(); - SmallVector getOrderWithAxisAtBeginning(); + SmallVector getThreadOrderWithAxisAtBeginning(); unsigned getScratchSizeInBytes(); diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp index e2713916ea..020a9ea4d3 100644 --- a/lib/Analysis/Allocation.cpp +++ b/lib/Analysis/Allocation.cpp @@ -46,6 +46,9 @@ static SmallVector getRepShapeForCvt(RankedTensorType srcTy, auto dstShapePerCTATile = gpu::getShapePerCTATile(dstLayout, dstTy.getShape()); + assert(srcTy.getRank() == dstTy.getRank() && + "src and dst must have the same rank"); + unsigned rank = dstTy.getRank(); SmallVector repShape(rank); for (unsigned d = 0; d < rank; ++d) { diff --git a/lib/Analysis/AxisInfo.cpp b/lib/Analysis/AxisInfo.cpp index f0c5ae3167..d2c2c9fd8d 100644 --- a/lib/Analysis/AxisInfo.cpp +++ b/lib/Analysis/AxisInfo.cpp @@ -1213,7 +1213,7 @@ unsigned ModuleAxisInfoAnalysis::getPtrContiguity(Value ptr) { // Here order should be ordered by contiguous first, so the first element // should have the largest contiguous. 
- auto order = triton::gpu::getOrder(layout); + auto order = triton::gpu::getThreadOrder(layout); unsigned align = getPtrAlignment(ptr); auto uniqueContigPerThread = @@ -1235,7 +1235,7 @@ unsigned ModuleAxisInfoAnalysis::getPtrAlignment(Value ptr) { if (!axisInfo) return 1; auto layout = tensorTy.getEncoding(); - auto order = triton::gpu::getOrder(layout); + auto order = triton::gpu::getThreadOrder(layout); auto maxMultipleBytes = axisInfo->getDivisibility(order[0]); auto maxContig = axisInfo->getContiguity(order[0]); auto elemNumBits = triton::getPointeeBitWidth(tensorTy); @@ -1262,7 +1262,7 @@ unsigned ModuleAxisInfoAnalysis::getMaskAlignment(Value mask) { auto *axisInfo = getAxisInfo(mask); if (!axisInfo) return 1; - auto maskOrder = triton::gpu::getOrder(tensorTy.getEncoding()); + auto maskOrder = triton::gpu::getThreadOrder(tensorTy.getEncoding()); auto alignment = std::max(axisInfo->getConstancy(maskOrder[0]), 1); LDBG("getMaskAlignment maskOrder[0] " << maskOrder[0] << " alignment " << alignment); diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index aa9f8b01ea..8c62e738f7 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -32,9 +32,9 @@ int getParentAxis(Attribute layout, int axis) { return axis; } -SmallVector getParentOrder(Attribute layout) { +SmallVector getParentThreadOrder(Attribute layout) { if (auto sliceEncoding = mlir::dyn_cast(layout)) { - return getParentOrder(sliceEncoding.getParent()); + return getParentThreadOrder(sliceEncoding.getParent()); } return getThreadOrder(layout); } @@ -44,12 +44,12 @@ SmallVector getParentOrder(Attribute layout) { // TODO(jlebar): Move this class into namespace triton. bool ReduceOpHelper::isReductionOnLayoutFastAxis() { return getParentAxis(getSrcLayout(), axis) == - getParentOrder(getSrcLayout())[0]; + getParentThreadOrder(getSrcLayout())[0]; } -SmallVector ReduceOpHelper::getOrderWithAxisAtBeginning() { +SmallVector ReduceOpHelper::getThreadOrderWithAxisAtBeginning() { auto srcLayout = getSrcLayout(); - auto order = getOrder(srcLayout); + auto order = getThreadOrder(srcLayout); auto it = std::find(order.begin(), order.end(), axis); // delete the axis from order order.erase(it); diff --git a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp index 8682706db8..5367ab89f7 100644 --- a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp @@ -283,7 +283,7 @@ struct ReduceOpConversion getMultiDimWarpId(helper, warpId, loc, rewriter); Value warpIdAxis = multiDimWarpId[axis]; - auto smemOrder = helper.getOrderWithAxisAtBeginning(); + auto smemOrder = helper.getThreadOrderWithAxisAtBeginning(); for (auto it : accs) { const SmallVector &key = it.first; SmallVector &acc = it.second; @@ -370,7 +370,7 @@ struct ReduceOpConversion Location loc = op.getLoc(); auto srcLayout = helper.getSrcLayout(); auto axis = op.getAxis(); - auto smemOrder = helper.getOrderWithAxisAtBeginning(); + auto smemOrder = helper.getThreadOrderWithAxisAtBeginning(); SmallVector results(op.getNumOperands()); for (unsigned i = 0; i < op.getNumOperands(); ++i) { auto elemTy = getElementType(op, i); diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp index 297a94e851..be5feedd67 100644 --- a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp @@ -181,9 +181,9 @@ struct SplitOpConversion : public ConvertOpToLLVMPattern { int 
numContiguousValues = 1; auto encoding = cast( cast(op.getSrc().getType()).getEncoding()); - int splitDim = encoding.getOrder().size() - 1; - for (int i = 0; i < encoding.getOrder().size(); i++) { - if (encoding.getOrder()[i] == splitDim) + int splitDim = encoding.getThreadOrder().size() - 1; + for (int i = 0; i < encoding.getThreadOrder().size(); i++) { + if (encoding.getThreadOrder()[i] == splitDim) break; numContiguousValues *= encoding.getSizePerThread()[i]; } @@ -336,7 +336,6 @@ struct BroadcastOpConversion unsigned rank = srcTy.getRank(); auto typeConverter = getTypeConverter(); assert(rank == resultTy.getRank()); - auto order = triton::gpu::getOrder(srcLayout); auto srcOffsets = emitOffsetForLayout(srcLayout, srcTy); auto resultOffsets = emitOffsetForLayout(resultLayout, resultTy); SmallVector srcVals = unpackLLElements(loc, src, rewriter); diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 6a820fc2b3..d0365b4cee 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -217,7 +217,7 @@ bool isExpensiveView(Type srcType, Type dstType) { return getTotalElemsPerThread(srcType) != getTotalElemsPerThread(dstType); } -/* Utility function used by getOrder and getCTAOrder of SliceEncodingAttr. +/* Utility function used by get.*Order methods of SliceEncodingAttr. * Erase dim and decrease all values larger than dim by 1. * Example: order = [0, 2, 4, 3, 1], dim = 2 * resOrder = [0, 3, 2, 1] @@ -262,29 +262,11 @@ SmallVector getOrderForDotOperand(unsigned opIdx, unsigned rank, } SmallVector getWarpOrder(Attribute layout) { - if (auto dotLayout = dyn_cast(layout)) { - if (isa(dotLayout.getParent())) { - return getWarpOrder(dotLayout.getParent()); - } - } - auto order = getOrder(layout); - // FIXME: At the moment, warpOrder in Ampere is N-major but in Hopper it's - // M-major This is awkward. Since we can choose any warpOrder in Ampere, we - // should probably choose M-major and change `LinearLayoutConversion.cpp` and - // `MMAv2.cpp` to match. - if (auto mmaLayout = dyn_cast(layout)) { - if (mmaLayout.isHopper()) { - // Hopper MMA instructions force warps to be column-major - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-wgmma-mma-async-m64nnk8 - return getMatrixOrder(order.size(), /*rowMajor*/ false); - } - } else if (auto dotOpLayout = dyn_cast(layout)) { - // It's quite weird to talk about warp order when that the warps - // are broadcasted along the K dimension - llvm::report_fatal_error( - "DotOperandEncoding::getWarpOrder not implemented"); - } - return order; + if (auto distributedLayout = mlir::dyn_cast(layout)) + return distributedLayout.getWarpOrder(); + else + llvm::report_fatal_error("Unimplemented usage of getThreadOrder"); + return {}; } SmallVector getOrder(Attribute layout) { @@ -293,7 +275,7 @@ SmallVector getOrder(Attribute layout) { } if (auto mmaLayout = dyn_cast(layout)) { // Order doesn't really matter. We just have to be consistent when unpacking - // the elements in the MMAv2/V3 lowerings. We choose row-major + // the output elements in the LLVM lowerings. 
We choose row-major auto distributedLayout = cast(layout); auto rank = distributedLayout.getWarpsPerCTA().size(); return getMatrixOrder(rank, /*rowMajor*/ true); @@ -318,7 +300,7 @@ SmallVector getOrder(Attribute layout) { llvm::report_fatal_error("Unimplemented usage of getOrder"); return {}; -}; +} SmallVector getThreadOrder(Attribute layout) { if (auto distributedLayout = mlir::dyn_cast(layout)) @@ -326,7 +308,7 @@ SmallVector getThreadOrder(Attribute layout) { else llvm::report_fatal_error("Unimplemented usage of getThreadOrder"); return {}; -}; +} CTALayoutAttr getCTALayout(Attribute layout) { if (auto distributedLayout = @@ -769,7 +751,8 @@ SmallVector SliceEncodingAttr::getWarpsPerCTA() const { return warpsPerCTA; } SmallVector SliceEncodingAttr::getWarpOrder() const { - return ::getWarpOrder(*this); + auto parentWarpOrder = ::getWarpOrder(getParent()); + return eraseOrder(parentWarpOrder, getDim()); } SmallVector SliceEncodingAttr::getThreadsPerWarp() const { auto parent = getParent(); @@ -781,7 +764,8 @@ SmallVector SliceEncodingAttr::getThreadsPerWarp() const { return threadsPerWarp; } SmallVector SliceEncodingAttr::getThreadOrder() const { - return ::getOrder(*this); + auto parentThreadOrder = ::getThreadOrder(getParent()); + return eraseOrder(parentThreadOrder, getDim()); } SmallVector SliceEncodingAttr::getSizePerThread() const { auto sizePerThread = ::getSizePerThread(getParent()); @@ -1049,7 +1033,14 @@ SmallVector DotOperandEncodingAttr::getWarpsPerCTA() const { return warps; } SmallVector DotOperandEncodingAttr::getWarpOrder() const { - return ::getWarpOrder(*this); + // FIXME(Lezcano): Preexisting. Do we want to have this path at all? + if (mlir::isa(getParent())) { + return ::getWarpOrder(getParent()); + } + // It's quite weird to talk about warp order when that the warps + // are broadcasted along the K dimension + llvm::report_fatal_error("DotOperandEncoding::getWarpOrder not implemented"); + return {}; } SmallVector DotOperandEncodingAttr::getThreadOrder() const { return getOrderForDotOperand(getOpIdx(), getWarpsPerCTA().size(), @@ -1597,7 +1588,7 @@ SmallVector AMDMfmaEncodingAttr::getWarpsPerCTA() const { return SmallVector(getWarpsPerCTA__()); } SmallVector AMDMfmaEncodingAttr::getWarpOrder() const { - return ::getWarpOrder(*this); + return ::getOrder(*this); } SmallVector AMDMfmaEncodingAttr::getThreadOrder() const { auto order = ::getOrder(*this); @@ -1766,7 +1757,7 @@ SmallVector AMDWmmaEncodingAttr::getWarpsPerCTA() const { return SmallVector(getWarpsPerCTA__()); } SmallVector AMDWmmaEncodingAttr::getWarpOrder() const { - return ::getWarpOrder(*this); + return ::getOrder(*this); } SmallVector AMDWmmaEncodingAttr::getThreadOrder() const { return ::getOrder(*this); @@ -1890,7 +1881,11 @@ SmallVector NvidiaMmaEncodingAttr::getWarpsPerCTA() const { return SmallVector(getWarpsPerCTA__()); } SmallVector NvidiaMmaEncodingAttr::getWarpOrder() const { - return ::getWarpOrder(*this); + auto rank = getWarpsPerCTA().size(); + // Hopper (wgmma) uses column-major as this is embeded in the instruction + // For Ampere we can choose either row-major or column-major. 
+ // We choose row-major as the legacy path did so + return getMatrixOrder(rank, /*rowMajor*/ !isHopper()); } SmallVector NvidiaMmaEncodingAttr::getThreadsPerWarp() const { auto rank = getWarpsPerCTA().size(); @@ -1914,10 +1909,11 @@ SmallVector NvidiaMmaEncodingAttr::getThreadsPerWarp() const { "getThreadsPerWarp not implemented for unknown Mma version "); } SmallVector NvidiaMmaEncodingAttr::getThreadOrder() const { - return ::getOrder(*this); + auto rank = getWarpsPerCTA().size(); + return getMatrixOrder(rank, /*rowMajor*/ true); } SmallVector NvidiaMmaEncodingAttr::getSizePerThread() const { - auto rank = ::getOrder(*this).size(); + auto rank = getWarpsPerCTA().size(); SmallVector res(rank, 1); if (isAmpere()) { res[rank - 2] = 2; @@ -2158,11 +2154,10 @@ NvidiaMmaEncodingAttr::getSizePerThreadForOperand(int kWidth, int opIdx) const { if (opIdx == 0) { sizePerThread[rank - 2] = 2; sizePerThread[rank - 1] = 2 * kWidth; - } else if (opIdx == 1) { + } else { + assert(opIdx == 1); sizePerThread[rank - 2] = 2 * kWidth; sizePerThread[rank - 1] = 1; - } else { - llvm::report_fatal_error("DotOperandEncodingAttr opIdx must be 0 or 1"); } return sizePerThread; } diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 6796307b7e..2668f38497 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -327,6 +327,7 @@ LinearLayout hopperMmaToLinearLayout(ArrayRef shape, assert(n == 8 || n == 16 || n == 32 || n == 64 || n == 128 || n == 256); assert(k == 8 || k == 16 || k == 32); + // TODO Make the getOrder of Hopper explicit here via an assert MLIRContext *ctx = mma.getContext(); LinearLayout ctaLayout( {{S("register"), {{1, 0}, {0, 8}}}, diff --git a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp index b1e296c1bb..dce6c7f2af 100644 --- a/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp +++ b/lib/Dialect/TritonGPU/Transforms/ReduceDataDuplication.cpp @@ -44,17 +44,17 @@ class TritonGPUReduceDataDuplicationPass return; if (!cvtNeedsSharedMemory(srcType, dstType)) return; - auto srcOrder = triton::gpu::getOrder(srcEncoding); - auto rank = srcOrder.size(); + auto srcThreadOrder = triton::gpu::getThreadOrder(srcEncoding); + auto rank = srcThreadOrder.size(); SmallVector sharedOrder; if (rank == 3) { // add all elements except the element that is zero for (unsigned i = 0; i < rank; ++i) - if (srcOrder[i] != 0) - sharedOrder.emplace_back(srcOrder[i]); + if (srcThreadOrder[i] != 0) + sharedOrder.emplace_back(srcThreadOrder[i]); sharedOrder.emplace_back(0); } else { - sharedOrder = srcOrder; + sharedOrder = srcThreadOrder; } auto sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(srcType.getContext()); diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 18e22139ba..eb790153a9 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -1765,7 +1765,6 @@ def kernel(output_ptr, n_elements, BLOCK_SIZE: tl.constexpr, CONSTANT_FIELD: tl. 
kernel[(1, )](output, block_size, BLOCK_SIZE=block_size, num_ctas=num_ctas, CONSTANT_FIELD=constant_field) if constant_field == "value": - print(output, ref) assert torch.all(output == ref) else: assert torch.all(output == 0) diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp index ef0ef5e591..f8edb7f93c 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -40,15 +40,16 @@ Value redundantDataMask(Type valueTy, ConversionPatternRewriter &rewriter, auto sizePerThread = triton::gpu::getSizePerThread(layout); auto threadsPerWarp = triton::gpu::getThreadsPerWarp(layout); auto warpsPerCTA = triton::gpu::getWarpsPerCTA(layout); - auto order = triton::gpu::getOrder(layout); + auto threadOrder = triton::gpu::getThreadOrder(layout); + auto warpOrder = triton::gpu::getWarpOrder(layout); auto shapePerCTATile = triton::gpu::getShapePerCTATile(layout, shape); Value warpSize = i32_val(triton::gpu::getWarpSize(layout)); Value laneId = urem(tid, warpSize); Value warpId = udiv(tid, warpSize); SmallVector multiDimWarpId = - delinearize(rewriter, loc, warpId, warpsPerCTA, order); + delinearize(rewriter, loc, warpId, warpsPerCTA, warpOrder); SmallVector multiDimThreadId = - delinearize(rewriter, loc, laneId, threadsPerWarp, order); + delinearize(rewriter, loc, laneId, threadsPerWarp, threadOrder); for (unsigned dim = 0; dim < rank; ++dim) { // if there is no data replication across threads on this dimension if (shape[dim] >= shapePerCTATile[dim]) diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp index 36b14e270b..6129d77f17 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DecomposeUnsupportedConversions.cpp @@ -54,7 +54,7 @@ class DecomposeLocalLoadToDotOperand type.getShape(), type.getElementType(), triton::gpu::SharedEncodingAttr::get( op.getContext(), dstDotOp, type.getShape(), - triton::gpu::getOrder(parentEnc), + triton::gpu::getThreadOrder(parentEnc), triton::gpu::getCTALayout(parentEnc), type.getElementType()), srcType.getMemorySpace()); auto tmp = rewriter.create( diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp index 945fb092eb..afe7f98be8 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -38,7 +38,7 @@ Value redundantDataMask(Type valueTy, ConversionPatternRewriter &rewriter, auto sizePerThread = triton::gpu::getSizePerThread(layout); auto threadsPerWarp = triton::gpu::getThreadsPerWarp(layout); auto warpsPerCTA = triton::gpu::getWarpsPerCTA(layout); - auto order = triton::gpu::getOrder(layout); + auto threadOrder = triton::gpu::getThreadOrder(layout); auto warpOrder = triton::gpu::getWarpOrder(layout); auto shapePerCTATile = triton::gpu::getShapePerCTATile(layout, shape); Value warpSize = i32_val(32); @@ -47,7 +47,7 @@ Value redundantDataMask(Type valueTy, ConversionPatternRewriter &rewriter, SmallVector multiDimWarpId = delinearize(rewriter, loc, warpId, warpsPerCTA, warpOrder); SmallVector multiDimThreadId = - delinearize(rewriter, loc, laneId, threadsPerWarp, order); + delinearize(rewriter, loc, 
laneId, threadsPerWarp, threadOrder); for (unsigned dim = 0; dim < rank; ++dim) { // if there is no data replication across threads on this dimension if (shape[dim] >= shapePerCTATile[dim])
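Note: the two `delinearize` calls above intentionally receive different
orders, since the warp grid and the lane grid may be laid out differently. As
a rough standalone sketch of the assumed behavior (a mixed-radix decomposition
in which `order[0]` is the fastest-varying dimension); `delinearizeRef` is a
hypothetical name, not the helper used in the lowering:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"

    // Peel a linear id (lane id or warp id) into one coordinate per
    // dimension, consuming the dimensions in `order`, fastest-varying first.
    // Passing warpOrder vs. threadOrder therefore changes which dimension a
    // given id advances along first.
    static llvm::SmallVector<unsigned>
    delinearizeRef(unsigned linear, llvm::ArrayRef<unsigned> shape,
                   llvm::ArrayRef<unsigned> order) {
      llvm::SmallVector<unsigned> coord(shape.size(), 0);
      for (unsigned dim : order) {
        coord[dim] = linear % shape[dim];
        linear /= shape[dim];
      }
      return coord;
    }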