intel · LiyangLingIntel · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Jan 3, 2025
diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
@@ -39,8 +39,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM",
     "TRITON_INTEL_ENABLE_INSTR_SCHED",
     "TRITON_INTEL_ENABLE_POST_PROCESS_LLIR",
-    "TRITON_INTEL_REDUCE_TRANSPOSE",
-    "TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING"
+    "TRITON_INTEL_REDUCE_TRANSPOSE"
     // clang-format on
 };
 

diff --git a/lib/Dialect/TritonGPU/IR/Ops.cpp b/lib/Dialect/TritonGPU/IR/Ops.cpp
@@ -5,7 +5,6 @@
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
-#include "triton/Tools/Sys/GetEnv.hpp"
 
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
@@ -339,21 +338,15 @@ LogicalResult UpcastMXFPOp::verify() {
     return success();
   }
 
-  /// TODO: Temporarily disabled this check to allow for the blocked encoding.
-  /// Enable once we have the dot op encoding UpcastMXFPOp lowering.
   auto dotEncoding = dyn_cast<DotOperandEncodingAttr>(layoutX);
-  if (mlir::triton::tools::getBoolEnv(
-          "TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING") &&
-      !dotEncoding) {
+  if (!dotEncoding) {
     return emitOpError("Expected a DotOperandEncodingAttr for values");
   }
   if (!isa<BlockedEncodingAttr, LinearEncodingAttr>(layoutScale)) {
     return emitOpError(
         "Expected a BlockOperandEncoding or LinearOperandEncoding "
         "for scales");
   }
-  if (!dotEncoding)
-    return success();
 
   if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
     // Necessary to keep all of the scales of a given block of values in the
@@ -411,43 +404,36 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
     } else {
       Type elemType = FloatType::getBF16(ctx);
       Attribute newVEncoding = nullptr;
-      if (auto oldEncoding = dyn_cast<DotOperandEncodingAttr>(encoding)) {
-        const int opIdx = oldEncoding.getOpIdx();
-        const bool hasBatch = xShape.size() == 3;
-        const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
-        newShape[kIdx] *= 2;
-
-        // Note: For Intel the dot operands layout's kWidth parameter must match
-        // the parent's DPAS layout opsPerChannel so we need to materialize a
-        // new DPAS layout.
-        if (auto dpasEncoding =
-                dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
-          auto newDpasEncoding = intel::DpasEncodingAttr::get(
-              ctx, dpasEncoding.getRepeatCount(),
-              dpasEncoding.getSystolicDepth(), dpasEncoding.getExecutionSize(),
-              intel::DpasEncodingAttr::getOpsPerChannel(elemType),
-              dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
-              dpasEncoding.getSubGroupSize());
-          newVEncoding = DotOperandEncodingAttr::get(
-              ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
-        } else {
-          // Figure out the K dimension for the input A/B, given that the return
-          // type is upcasted A/B type so we need to update the proper dim size.
-          newVEncoding = DotOperandEncodingAttr::get(
-              ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
-              oldEncoding.getKWidth() * 2);
-        }
-      } else if (auto oldEncoding = dyn_cast<BlockedEncodingAttr>(encoding)) {
-        // TODO: Temporary code, remove once upcast_mxfp support dot encoding.
-        assert(!tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING"));
-        SmallVector<unsigned> sizePerThread = oldEncoding.getSizePerThread();
-        int opIdx = sizePerThread.back() == 1 ? 1 : 0;
-        sizePerThread[!opIdx] *= 2;
-        newShape[!opIdx] *= 2;
-        newVEncoding = BlockedEncodingAttr::get(
-            ctx, sizePerThread, oldEncoding.getThreadsPerWarp(),
-            oldEncoding.getWarpsPerCTA(), oldEncoding.getCTAOrder(),
-            oldEncoding.getCTALayout());
+      auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+      const int opIdx = oldEncoding.getOpIdx();
+      const bool hasBatch = xShape.size() == 3;
+      const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+      newShape[kIdx] *= 2;
+
+      // Note: For Intel the dot operands layout's kWidth parameter must match
+      // the parent's DPAS layout opsPerChannel so we need to materialize a
+      // new DPAS layout.
+      if (auto dpasEncoding =
+              dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
+        unsigned opsPerChannel =
+            intel::DpasEncodingAttr::getOpsPerChannel(elemType);
+        // e2m1 packs 2 elements per int8, so we must handle 2 contiguous
+        // elements when upcasting to bf16.
+        if (xTy.getElementType() == IntegerType::get(ctx, 8))
+          opsPerChannel *= 2;
+        auto newDpasEncoding = intel::DpasEncodingAttr::get(
+            ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(),
+            dpasEncoding.getExecutionSize(), opsPerChannel,
+            dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
+            dpasEncoding.getSubGroupSize());
+        newVEncoding = DotOperandEncodingAttr::get(
+            ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
+      } else {
+        // Figure out the K dimension for the input A/B, given that the return
+        // type is upcasted A/B type so we need to update the proper dim size.
+        newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
+                                                   oldEncoding.getParent(),
+                                                   oldEncoding.getKWidth() * 2);
       }
       retTy = RankedTensorType::get(newShape, elemType, newVEncoding);
     }

diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -3534,7 +3534,12 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, nu
         if mma == 16 and K == 64:
             pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
     if is_xpu():
-        if M == 128 and N == 128 and K == 64 and not col_a and not col_b and rhs_scale and normal_type == "e4m3" and mxfp_type == "bf16":
+        # Skip these cases because they fail with an L0 runtime error:
+        #   test_scaled_dot[32-64-128-False-False-True-e5m2-bf16-4-16-1]
+        #   test_scaled_dot[64-32-128-False-False-True-e4m3-bf16-4-16-1]
+        if ((M == 32 and N == 64 and K == 128 and not col_a and not col_b and rhs_scale and normal_type == "e5m2"
+             and mxfp_type == "bf16") or (M == 64 and N == 32 and K == 128 and not col_a and not col_b and rhs_scale
+                                          and normal_type == "e4m3" and mxfp_type == "bf16")):
             pytest.skip(
                 f"FIXME: {M}x{N}x{K} col_a={col_a} col_b={col_b} rhs_scale={rhs_scale} normal_type={normal_type} mxfp_type={mxfp_type}"
             )