diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp index 0b5534f012..4bfeb64e25 100644 --- a/include/triton/Tools/Sys/GetEnv.hpp +++ b/include/triton/Tools/Sys/GetEnv.hpp @@ -38,7 +38,8 @@ inline const std::set CACHE_INVALIDATING_ENV_VARS = { "TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM", "TRITON_INTEL_ENABLE_INSTR_SCHED", "TRITON_INTEL_ENABLE_POST_PROCESS_LLIR", - "TRITON_INTEL_REDUCE_TRANSPOSE" + "TRITON_INTEL_REDUCE_TRANSPOSE", + "TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING" // clang-format on }; diff --git a/lib/Dialect/TritonGPU/IR/Ops.cpp b/lib/Dialect/TritonGPU/IR/Ops.cpp index 991fe5ba06..f53d1442f6 100644 --- a/lib/Dialect/TritonGPU/IR/Ops.cpp +++ b/lib/Dialect/TritonGPU/IR/Ops.cpp @@ -4,6 +4,7 @@ #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Tools/Sys/GetEnv.hpp" #define GET_OP_CLASSES #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc" @@ -50,8 +51,12 @@ LogicalResult UpcastMXFPOp::verify() { return success(); } + /// TODO: Temporarily disabled this check to allow for the blocked encoding. + /// Enable once we have the dot op encoding UpcastMXFPOp lowering. auto dotEncoding = dyn_cast(layoutX); - if (!dotEncoding) { + if (mlir::triton::tools::getBoolEnv( + "TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING") && + !dotEncoding) { return emitOpError("Expected a DotOperandEncodingAttr for values"); } if (!isa(layoutScale)) { @@ -59,6 +64,8 @@ LogicalResult UpcastMXFPOp::verify() { "Expected a BlockOperandEncoding or LinearOperandEncoding " "for scales"); } + if (!dotEncoding) + return success(); if (isa(dotEncoding.getParent())) { // Necessary to keep all of the scales of a given block of values in the @@ -114,34 +121,45 @@ LogicalResult UpcastMXFPOp::inferReturnTypes( newShape.back() *= 2; retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx)); } else { - auto oldEncoding = cast(encoding); - - const int opIdx = oldEncoding.getOpIdx(); - const bool hasBatch = xShape.size() == 3; - const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch; - newShape[kIdx] *= 2; Type elemType = FloatType::getBF16(ctx); - - // Note: For Intel the dot operands layout's kWidth parameter must match - // the parent's DPAS layout opsPerChannel so we need to materialize a new - // DPAS layout. - Attribute newVEncoding; - if (auto dpasEncoding = - dyn_cast(oldEncoding.getParent())) { - auto newDpasEncoding = intel::DpasEncodingAttr::get( - ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(), - dpasEncoding.getExecutionSize(), - intel::DpasEncodingAttr::getOpsPerChannel(elemType), - dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(), - dpasEncoding.getSubGroupSize()); - newVEncoding = DotOperandEncodingAttr::get( - ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel()); - } else { - // Figure out the K dimension for the input A/B, given that the return - // type is upcasted A/B type so we need to update the proper dim size. - newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(), - oldEncoding.getParent(), - oldEncoding.getKWidth() * 2); + Attribute newVEncoding = nullptr; + if (auto oldEncoding = dyn_cast(encoding)) { + const int opIdx = oldEncoding.getOpIdx(); + const bool hasBatch = xShape.size() == 3; + const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch; + newShape[kIdx] *= 2; + + // Note: For Intel the dot operands layout's kWidth parameter must match + // the parent's DPAS layout opsPerChannel so we need to materialize a + // new DPAS layout. + if (auto dpasEncoding = + dyn_cast(oldEncoding.getParent())) { + auto newDpasEncoding = intel::DpasEncodingAttr::get( + ctx, dpasEncoding.getRepeatCount(), + dpasEncoding.getSystolicDepth(), dpasEncoding.getExecutionSize(), + intel::DpasEncodingAttr::getOpsPerChannel(elemType), + dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(), + dpasEncoding.getSubGroupSize()); + newVEncoding = DotOperandEncodingAttr::get( + ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel()); + } else { + // Figure out the K dimension for the input A/B, given that the return + // type is upcasted A/B type so we need to update the proper dim size. + newVEncoding = DotOperandEncodingAttr::get( + ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(), + oldEncoding.getKWidth() * 2); + } + } else if (auto oldEncoding = dyn_cast(encoding)) { + // TODO: Temporary code, remove once upcast_mxfp support dot encoding. + assert(!tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING")); + SmallVector sizePerThread = oldEncoding.getSizePerThread(); + int opIdx = sizePerThread.back() == 1 ? 1 : 0; + sizePerThread[!opIdx] *= 2; + newShape[!opIdx] *= 2; + newVEncoding = BlockedEncodingAttr::get( + ctx, sizePerThread, oldEncoding.getThreadsPerWarp(), + oldEncoding.getWarpsPerCTA(), oldEncoding.getCTAOrder(), + oldEncoding.getCTALayout()); } retTy = RankedTensorType::get(newShape, elemType, newVEncoding); } diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 5e96d2d0ab..6d41b6eece 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -3441,7 +3441,10 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, nu if mma == 16 and K == 64: pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot") if is_xpu(): - pytest.skip("scaled_dot isn't supported on XPU") + if M == 128 and N == 128 and K == 64 and not col_a and not col_b and rhs_scale and normal_type == "e4m3" and mxfp_type == "bf16": + pytest.skip( + f"FIXME: {M}x{N}x{K} col_a={col_a} col_b={col_b} rhs_scale={rhs_scale} normal_type={normal_type} mxfp_type={mxfp_type}" + ) @triton.jit def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, stride_b1, b_scale, out, diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 686b00623d..86084b9fa5 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -274,3 +274,1299 @@ test/unit/language/test_core.py::test_dot3d[8-8-64-64-64-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[8-8-64-64-64-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[8-8-64-64-64-32-32-float32-float32] test/unit/language/test_core.py::test_dot3d[8-8-64-64-64-32-32-int8-int8] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-32-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-64-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[32-128-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-32-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-64-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[64-128-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-32-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-64-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-64-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-True-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-True-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e4m3-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e4m3-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e2m1-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e2m1-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e2m1-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-False-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e4m3-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e5m2-bf16-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e5m2-e5m2-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e5m2-e4m3-4-16-1] +test/unit/language/test_core.py::test_scaled_dot[128-128-128-False-False-True-e4m3-bf16-4-16-1] diff --git a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir index f75eb9947a..cd28be3fc2 100644 --- a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s +// RUN: TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING=1 triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s // CHECK: #[[$DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 16], B = [16, 16], C = [32, 16]}> // CHECK: #[[$DPAS_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}> diff --git a/third_party/intel/include/Analysis/DPAS.h b/third_party/intel/include/Analysis/DPAS.h index 3f649625d0..1f12258c5c 100644 --- a/third_party/intel/include/Analysis/DPAS.h +++ b/third_party/intel/include/Analysis/DPAS.h @@ -26,6 +26,14 @@ class DPASAnalysis { BF16_BF16_BF16_BF16, U32_U32_U8_U8, S32_S32_S8_S8, + // data types for dot scaled. + FP32_FP32_BF16_FP8, + FP32_FP32_BF16_FP4, + FP32_FP32_FP8_BF16, + FP32_FP32_FP8_FP8, + FP32_FP32_FP8_FP4, + FP32_FP32_FP4_BF16, + FP32_FP32_FP4_FP8, NOT_APPLICABLE }; @@ -39,17 +47,24 @@ class DPASAnalysis { /// (aka threads per warp) size. Result canUseDPAS(FunctionOpInterface funcOp) const; - /// Given a DotOp operation, return its DPAS engine type. - static DPASEngineType getDPASType(DotOp op); + /// Given a 'DotOp' or 'ScaledDot' operation, return its DPAS engine type. + static DPASEngineType getDPASType(Operation *op); + + // clang-format off + template + typename std::enable_if::value, + DPASAnalysis::DPASEngineType>::type + static getDPASType(OpTy); + // clang-format on private: mlir::ModuleOp mod; - /// Tracks Dot operations and their DPAS engine type. - std::map dotToDPASEngineMap; + /// Tracks Dot/DotScaled operations and their DPAS engine type. + std::map dotToDPASEngineMap; - /// Tracks the Dot operations contained in a function. - std::map> funcToDotMap; + /// Tracks the Dot/DotScaled operations contained in a function. + std::map> funcToDotMap; }; } // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/lib/Analysis/DPAS.cpp b/third_party/intel/lib/Analysis/DPAS.cpp index 90faf63c24..fbf434046b 100644 --- a/third_party/intel/lib/Analysis/DPAS.cpp +++ b/third_party/intel/lib/Analysis/DPAS.cpp @@ -1,5 +1,10 @@ #include "intel/include/Analysis/DPAS.h" #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "llvm/Support/Casting.h" +#include +#include namespace mlir::triton::gpu::intel { @@ -16,19 +21,22 @@ DPASAnalysis::DPASAnalysis(Operation *root) { mod.walk([&](FunctionOpInterface funcOp) { auto it = funcToDotMap.find(funcOp); - funcOp.walk([&](DotOp dotOp) { + funcOp.walk([&](Operation *op) { + if (!isa(op)) + return; + if (it != funcToDotMap.end()) - it->second.push_back(dotOp); + it->second.push_back(op); else - funcToDotMap[funcOp] = {dotOp}; + funcToDotMap[funcOp] = {op}; DPASEngineType dpasEngineType = supportDPAS - ? DPASAnalysis::getDPASType(dotOp) + ? DPASAnalysis::getDPASType(op) : DPASEngineType::NOT_APPLICABLE; if (dpasEngineType == DPASEngineType::FP32_FP32_TF32_TF32 && - dotOp.getInputPrecision() != InputPrecision::TF32) + cast(op).getInputPrecision() != InputPrecision::TF32) dpasEngineType = DPASEngineType::NOT_APPLICABLE; - dotToDPASEngineMap[dotOp] = dpasEngineType; + dotToDPASEngineMap[op] = dpasEngineType; }); }); } @@ -44,7 +52,7 @@ DPASAnalysis::canUseDPAS(FunctionOpInterface funcOp) const { // Ensure all dot operations in the function can be lowered to DPAS // instructions. - for (const DotOp &dotOp : it->second) { + for (Operation *dotOp : it->second) { DPASEngineType dpasEngineType = dotToDPASEngineMap.at(dotOp); if (dpasEngineType == DPASEngineType::NOT_APPLICABLE) return Result::False; @@ -65,53 +73,110 @@ DPASAnalysis::canUseDPAS(FunctionOpInterface funcOp) const { return (threadsPerWarp == minSGSize) ? Result::True : Result::False; } -DPASAnalysis::DPASEngineType DPASAnalysis::getDPASType(DotOp op) { - // d = a * b + c - auto aTy = cast(op.getA().getType()); - auto bTy = cast(op.getB().getType()); +DPASAnalysis::DPASEngineType DPASAnalysis::getDPASType(Operation *op) { + if (auto dotOp = dyn_cast(op)) + return DPASAnalysis::getDPASType(dotOp); + if (auto dotScaledOp = dyn_cast(op)) + return DPASAnalysis::getDPASType(dotScaledOp); + return DPASEngineType::NOT_APPLICABLE; +} + +// This function determines the DPAS engine type for the given operation. +// It checks the element types of the tensors involved in the operation +// and returns the appropriate DPAS engine type based on the type combinations. +template +typename std::enable_if::value, + DPASAnalysis::DPASEngineType>::type +DPASAnalysis::getDPASType(OpTy op) { auto cTy = cast(op.getC().getType()); auto dTy = cast(op.getD().getType()); - Type aElemTy = aTy.getElementType(); - Type bElemTy = bTy.getElementType(); Type cElemTy = cTy.getElementType(); Type dElemTy = dTy.getElementType(); assert(cElemTy == dElemTy && "Unexpected element type mismatch"); - if (aElemTy != bElemTy) - return DPASEngineType::NOT_APPLICABLE; + RankedTensorType aTy, bTy; + Type aElemTy, bElemTy; + + if constexpr (std::is_same_v) { + // d = a * b + c + aTy = cast(op.getA().getType()); + bTy = cast(op.getB().getType()); + aElemTy = aTy.getElementType(); + bElemTy = bTy.getElementType(); + + if (aElemTy != bElemTy) + return DPASEngineType::NOT_APPLICABLE; + + if (dElemTy.isIntOrIndex()) { + if (dElemTy.getIntOrFloatBitWidth() == 32 && + aElemTy.getIntOrFloatBitWidth() == 8) + return dElemTy.isSignedInteger() ? DPASEngineType::S32_S32_S8_S8 + : DPASEngineType::U32_U32_U8_U8; + return DPASEngineType::NOT_APPLICABLE; + } - if (dElemTy.isIntOrIndex()) { - if (dElemTy.getIntOrFloatBitWidth() == 32 && - aElemTy.getIntOrFloatBitWidth() == 8) - return dElemTy.isSignedInteger() ? DPASEngineType::S32_S32_S8_S8 - : DPASEngineType::U32_U32_U8_U8; - return DPASEngineType::NOT_APPLICABLE; + if (isa(dElemTy)) { + if (dElemTy.isF32()) { + if (aElemTy.isF16()) + return DPASEngineType::FP32_FP32_FP16_FP16; + if (aElemTy.isBF16()) + return DPASEngineType::FP32_FP32_BF16_BF16; + if (aElemTy.isF32() && op.getInputPrecision() == InputPrecision::TF32) + return DPASEngineType::FP32_FP32_TF32_TF32; + // For FP8XFP8->FP32, upcast to FP16 + if (aElemTy.isFloat8E5M2()) + return DPASEngineType::FP32_FP32_FP16_FP16; + if (aElemTy.isFloat8E4M3FN()) + return DPASEngineType::FP32_FP32_FP16_FP16; + } else if (dElemTy.isF16()) { + if (aElemTy.isF16()) + return DPASEngineType::FP16_FP16_FP16_FP16; + } else if (dElemTy.isBF16()) { + if (aElemTy.isBF16()) + return DPASEngineType::BF16_BF16_BF16_BF16; + } + } } - if (isa(dElemTy)) { - if (dElemTy.isF32()) { - if (aElemTy.isF16()) - return DPASEngineType::FP32_FP32_FP16_FP16; - if (aElemTy.isBF16()) - return DPASEngineType::FP32_FP32_BF16_BF16; - if (aElemTy.isF32() && op.getInputPrecision() == InputPrecision::TF32) - return DPASEngineType::FP32_FP32_TF32_TF32; - // For FP8XFP8->FP32, upcast to FP16 - if (aElemTy.isFloat8E5M2()) - return DPASEngineType::FP32_FP32_FP16_FP16; - if (aElemTy.isFloat8E4M3FN()) - return DPASEngineType::FP32_FP32_FP16_FP16; - } else if (dElemTy.isF16()) { - if (aElemTy.isF16()) - return DPASEngineType::FP16_FP16_FP16_FP16; - } else if (dElemTy.isBF16()) { - if (aElemTy.isBF16()) - return DPASEngineType::BF16_BF16_BF16_BF16; + if constexpr (std::is_same_v) { + aTy = cast(op.getLhs().getType()); + bTy = cast(op.getRhs().getType()); + aElemTy = aTy.getElementType(); + bElemTy = bTy.getElementType(); + + if (isa(dElemTy)) { + if (dElemTy.isF32()) { + if (aElemTy.isBF16() && + (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2())) + return DPASEngineType::FP32_FP32_BF16_FP8; + // 2 E2M1 are packed into 1 int8 + if (aElemTy.isBF16() && bElemTy.isInteger(8)) + return DPASEngineType::FP32_FP32_BF16_FP4; + if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) && + bElemTy.isBF16()) + return DPASEngineType::FP32_FP32_FP8_BF16; + if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) && + (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2())) + return DPASEngineType::FP32_FP32_FP8_FP8; + if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) && + bElemTy.isInteger(8)) + return DPASEngineType::FP32_FP32_FP8_FP4; + if (aElemTy.isInteger(8) && bElemTy.isBF16()) + return DPASEngineType::FP32_FP32_FP4_BF16; + if (aElemTy.isInteger(8) && + (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2())) + return DPASEngineType::FP32_FP32_FP4_FP8; + } } } - return DPASEngineType::NOT_APPLICABLE; } +// Explicit instantiations. +template DPASAnalysis::DPASEngineType +DPASAnalysis::getDPASType(DotOp op); +template DPASAnalysis::DPASEngineType +DPASAnalysis::getDPASType(DotScaledOp op); + } // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt index 4e86cbd2f2..211c2b185a 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt @@ -24,6 +24,7 @@ add_triton_library(TritonIntelGPUToLLVM TritonGPUToLLVM.cpp TritonOpsToLLVM.cpp TypeConverter.cpp + UpcastMXFPToLLVM.cpp Utility.cpp ViewOpToLLVM.cpp diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PatternTritonGPUOpToLLVM.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PatternTritonGPUOpToLLVM.h index aca8430be1..dd361daf71 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PatternTritonGPUOpToLLVM.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PatternTritonGPUOpToLLVM.h @@ -41,6 +41,11 @@ void populateElementwiseOpToLLVMPatterns( ModuleAxisInfoAnalysis &axisInfoAnalysis, const TargetInfoBase &targetInfo, PatternBenefit benefit); +void populateUpcastMXFPToLLVMPatterns(LLVMTypeConverter &typeConverter, + RewritePatternSet &patterns, + const TargetInfo &targetInfo, + PatternBenefit benefit); + void populateBF16CastsLLVMPatterns(LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, PatternBenefit benefit); diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 7bc577a2b7..102b2c9169 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -251,6 +251,8 @@ class TritonGPUToLLVMPipelineManager { targetInfo, benefit); intel::populateMakeRangeOpToLLVMPattern(typeConverter, targetInfo, patterns, benefit); + intel::populateUpcastMXFPToLLVMPatterns(typeConverter, patterns, + targetInfo, benefit); } intel::populateSPMDOpToLLVMPattern(typeConverter, patterns, targetInfo, diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp new file mode 100644 index 0000000000..feacf9ae1e --- /dev/null +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp @@ -0,0 +1,85 @@ +#include "PatternTritonGPUOpToLLVM.h" + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Transforms/DialectConversion.h" +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" + +using namespace mlir; +using namespace mlir::triton; +using namespace mlir::triton::gpu; + +namespace { + +static Value mxfpScaleBf16(ConversionPatternRewriter &rewriter, Location loc, + Value v, Value scale) { + Value vBf16 = bitcast(v, bf16_ty); + Value nanBf16 = bitcast(i16_val(0x7fff), bf16_ty); + Value scaleIsNan = icmp_eq(scale, i8_val(0xff)); + Value scaleBf16 = bitcast(shl(zext(i16_ty, scale), i16_val(7)), bf16_ty); + + Value v0 = mlir::triton::intel::convertBf16ToFp32(loc, rewriter, vBf16); + Value v1 = mlir::triton::intel::convertBf16ToFp32(loc, rewriter, scaleBf16); + auto result = rewriter.create(loc, f32_ty, v0, v1); + auto undefRounding = static_cast(-1); + Value scaledBf16 = mlir::triton::intel::convertFp32ToBf16( + loc, rewriter, result, undefRounding); + // Value scaledBf16 = fmul(vBf16, scaleBf16); + // Account for NaN in the scale as per the mxfp specification. + return select(scaleIsNan, nanBf16, scaledBf16); +}; + +class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern { +private: + const TargetInfoBase &targetInfo; + +public: + UpcastMXFPOpPattern(LLVMTypeConverter &typeConverter, + const TargetInfoBase &targetInfo, PatternBenefit benefit) + : ConvertOpToLLVMPattern(typeConverter, benefit), + targetInfo(targetInfo) {} + + LogicalResult + matchAndRewrite(UpcastMXFPOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + auto operands = adaptor.getOperands(); + SmallVector xVals = unpackLLElements(loc, operands[0], rewriter); + SmallVector scaleVals = unpackLLElements(loc, operands[1], rewriter); + ScaleDotElemType fpType = op.getFpType(); + + Value tid = tid_val(); + auto mod = op->getParentOfType(); + Value warpSize = + i32_val(triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod)); + Value warpId = udiv(tid, warpSize); + Value laneId = urem(tid, warpSize); + + if (fpType == ScaleDotElemType::E2M1) + xVals = LLVM::convertMxfp4x2ToBf16x2(rewriter, loc, xVals); + + for (auto [i, scaleVal] : llvm::enumerate(scaleVals)) { + for (int j = 0; j < 32; ++j) { + xVals[32 * i + j] = + mxfpScaleBf16(rewriter, loc, xVals[32 * i + j], scaleVal); + } + } + + Value result = + packLLElements(loc, getTypeConverter(), xVals, rewriter, op.getType()); + rewriter.replaceOp(op, result); + return success(); + } +}; +} // anonymous namespace + +void mlir::triton::intel::populateUpcastMXFPToLLVMPatterns( + LLVMTypeConverter &typeConverter, RewritePatternSet &patterns, + const TargetInfo &targetInfo, PatternBenefit benefit) { + patterns.add(typeConverter, targetInfo, benefit); +} diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp index 94fe2aa693..31cea35ef8 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp @@ -12,6 +12,7 @@ #include "triton/Dialect/Triton/IR/Dialect.h" #include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Tools/Sys/GetEnv.hpp" #include "llvm/ADT/TypeSwitch.h" #define PVC_2D_LOAD_MAXIMUM_NUMBER_OF_ROWS 32 @@ -250,6 +251,9 @@ class DecomposeScaledBlocked : public OpRewritePattern { } private: + const bool upcastMXFPUseDotOpEnc = + mlir::triton::tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING"); + struct OpDescriptor { TensorValue op; triton::ScaleDotElemType elemType; @@ -261,8 +265,10 @@ class DecomposeScaledBlocked : public OpRewritePattern { triton::gpu::intel::DpasEncodingAttr dpasEnc, RankedTensorType newRetType, ModuleOp mod, PatternRewriter &rewriter) const { + assert((aDesc.scale || bDesc.scale) && "No scale provided"); + assert(!(aDesc.scale && bDesc.scale) && "NYI: Both LHS and RHS scale"); + if (aDesc.scale) { - assert(bDesc.scale == nullptr && "NYI: both LHS and RHS scale"); TensorValue newA = convertScaledOperand<0>(aDesc, dpasEnc, newRetType, mod, rewriter); TensorValue newB = @@ -270,7 +276,6 @@ class DecomposeScaledBlocked : public OpRewritePattern { return {newA, newB}; } - assert((bDesc.scale && !aDesc.scale) && "NYI: both LHS and RHS scale"); TensorValue newB = convertScaledOperand<1>(bDesc, dpasEnc, newRetType, mod, rewriter); TensorValue newA = @@ -286,37 +291,87 @@ class DecomposeScaledBlocked : public OpRewritePattern { static_assert(opIdx == 0 || opIdx == 1, "Illegal operand index"); assert(opDesc.scale && "Expecting valid operand & scale"); + MLIRContext *ctx = opDesc.op.getContext(); + unsigned numWarps = ttg::TritonGPUDialect::getNumWarps(mod); + unsigned warpSize = ttg::TritonGPUDialect::getThreadsPerWarp(mod); unsigned opsPerChannel = dpasEnc.getOpsPerChannel(); + unsigned rank = retType.getRank(); + + if (upcastMXFPUseDotOpEnc) { + if (opDesc.elemType == tt::ScaleDotElemType::E2M1) + opsPerChannel *= 2; + + auto opEncoding = ttg::intel::DpasEncodingAttr::get( + ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(), + dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(), + dpasEnc.getRepCluster(), dpasEnc.getSubGroupSize()); + + auto newOpEncoding = ttg::DotOperandEncodingAttr::get( + ctx, opIdx, opEncoding, opEncoding.getOpsPerChannel()); + TensorValue op = + createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter); + + unsigned warpSize = ttg::TritonGPUDialect::getThreadsPerWarp(mod); + unsigned instrShapeM = dpasEnc.getDPASInstShapeA()[1]; + SmallVector threadsPerWarp{instrShapeM, + warpSize / instrShapeM}; + SmallVector warpsPerCTA(rank, 1); + warpsPerCTA[0] = numWarps; + auto CTALayout = ttg::getCTALayout(retType.getEncoding()); + + auto newScaleEncoding = ttg::BlockedEncodingAttr::get( + ctx, {1, 1}, threadsPerWarp, warpsPerCTA, newOpEncoding.getCTAOrder(), + CTALayout); + TensorValue scale = createScale(opDesc.scale, newScaleEncoding, rewriter); + + return createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter); + } + + auto scaleEncoding = dyn_cast( + opDesc.scale.getType().getEncoding()); + assert(scaleEncoding && "Expecting blocked encoding for scale"); + + // Referring to + // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + // the scalingBlockSize should be 32 for E5M2, E4M3 and E2M1 + unsigned scalingBlockSize = 32; + // 2 FP4E2M1 are packed in one i8 if (opDesc.elemType == tt::ScaleDotElemType::E2M1) - opsPerChannel *= 2; + scalingBlockSize = 16; - MLIRContext *ctx = opDesc.op.getContext(); - auto opEncoding = ttg::intel::DpasEncodingAttr::get( - ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(), - dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(), - dpasEnc.getRepCluster(), dpasEnc.getSubGroupSize()); + SmallVector sizePerThread = {1, 1}; + SmallVector threadsPerWarp = {1, 1}; + sizePerThread[!opIdx] = scalingBlockSize; + threadsPerWarp[opIdx] = warpSize; + SmallVector warpsPerCTA = {numWarps, 1}; - auto newOpEncoding = ttg::DotOperandEncodingAttr::get( - ctx, opIdx, opEncoding, opEncoding.getOpsPerChannel()); + auto newOpEncoding = ttg::BlockedEncodingAttr::get( + ctx, sizePerThread, threadsPerWarp, warpsPerCTA, + scaleEncoding.getCTAOrder(), scaleEncoding.getCTALayout()); TensorValue op = createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter); - unsigned warpSize = ttg::TritonGPUDialect::getThreadsPerWarp(mod); - unsigned instrShapeM = dpasEnc.getDPASInstShapeA()[1]; - SmallVector threadsPerWarp{instrShapeM, - warpSize / instrShapeM}; - unsigned rank = retType.getRank(); - int numWarps = ttg::TritonGPUDialect::getNumWarps(mod); - SmallVector warpsPerCTA(rank, 1); - warpsPerCTA[0] = numWarps; - auto CTALayout = ttg::getCTALayout(retType.getEncoding()); - - auto newScaleEncoding = - ttg::BlockedEncodingAttr::get(ctx, {1, 1}, threadsPerWarp, warpsPerCTA, - newOpEncoding.getCTAOrder(), CTALayout); + warpsPerCTA = opIdx ? SmallVector{1, numWarps} + : SmallVector{numWarps, 1}; + auto newScaleEncoding = ttg::BlockedEncodingAttr::get( + ctx, {1, 1}, {warpSize, 1}, warpsPerCTA, scaleEncoding.getCTAOrder(), + scaleEncoding.getCTALayout()); TensorValue scale = createScale(opDesc.scale, newScaleEncoding, rewriter); - return createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter); + auto retDpasEncoding = ttg::intel::DpasEncodingAttr::get( + ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(), + dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(), + dpasEnc.getRepCluster(), dpasEnc.getSubGroupSize()); + auto retDotOpEncoding = ttg::DotOperandEncodingAttr::get( + ctx, opIdx, retDpasEncoding, retDpasEncoding.getOpsPerChannel()); + + auto upcastOp = createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter); + + auto resultType = cast(upcastOp.getType()); + resultType = RankedTensorType::get( + resultType.getShape(), resultType.getElementType(), retDotOpEncoding); + return rewriter.create(opDesc.op.getLoc(), resultType, + upcastOp); } template diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp index e91cfa34c0..e1407f971a 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp @@ -307,7 +307,9 @@ bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) { bool isLayoutAnchor(Operation *op) { if (isa(op)) return ttgi::isExpensiveLoadOrStore(op); - if (isa(op)) + // TODO: we should estimate the cost of the not propagating layout for + // AtomicCAS and UpcastMXFP ops for further performance consideration. + if (isa(op)) return true; if (isa(op)) if (auto tensorType =