diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt index 36344442bd..454a94e1f4 100644 --- a/cmake/llvm-hash.txt +++ b/cmake/llvm-hash.txt @@ -1 +1 @@ -df0864e761107b07e38f5503e0cbee0cebb4c5e8 +61f8a7f618901797ee8663389a29722f29216a96 diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 1862c03b9c..d9ebe7ccc1 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -101,7 +101,7 @@ using namespace mlir::triton; #define barrier() rewriter.create(loc) #define undef(...) rewriter.create(loc, __VA_ARGS__) #define null(...) rewriter.create(loc, __VA_ARGS__) -#define call(...) rewriter.create(loc, __VA_ARGS__) +#define call(...) LLVM::createLLVMCallOp(rewriter, loc, __VA_ARGS__) // Types #define int_ty(width) rewriter.getIntegerType(width) @@ -228,6 +228,12 @@ Value createIndexConstant(OpBuilder &builder, Location loc, Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width, int64_t value); +LLVM::CallOp createLLVMCallOp(OpBuilder &builder, Location loc, + LLVMFuncOp funcOp, ValueRange args); +LLVM::CallIntrinsicOp +createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic, + TypeRange types, ValueRange args); + // Is v an integer or floating-point scalar constant equal to 0? bool isConstantZero(Value v); diff --git a/lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp index 4d42db4268..8d5a63eb14 100644 --- a/lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ControlFlowOpToLLVM.cpp @@ -109,6 +109,10 @@ struct CallOpConversion : public ConvertOpToLLVMPattern { auto newCallOp = rewriter.create( callOp.getLoc(), packedResult ? TypeRange(packedResult) : TypeRange(), promotedOperands, callOp->getAttrs()); + newCallOp.getProperties().setOpBundleSizes( + rewriter.getDenseI32ArrayAttr({})); + newCallOp.getProperties().setOperandSegmentSizes( + {static_cast(promotedOperands.size()), 0}); return newCallOp; } diff --git a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp index 787dee35fb..8762942c31 100644 --- a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -299,7 +299,7 @@ struct MulhiUIOpConversion LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, op, funcName, funcType); return { - rewriter.create(loc, funcOp, operands[0]).getResult()}; + LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]).getResult()}; } protected: @@ -327,7 +327,7 @@ struct ExternElementwiseOpConversion LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp( rewriter, op, funcName, funcType, op.getLibname(), op.getLibpath()); return { - rewriter.create(loc, funcOp, operands[0]).getResult()}; + LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]).getResult()}; } }; diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 9de4434528..e857dd36f6 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -1,8 +1,7 @@ #include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" +#include "mlir/IR/Attributes.h" #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h" -#include "triton/Conversion/TritonGPUToLLVM/TypeConverter.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" #include "llvm/ADT/STLExtras.h" @@ -518,6 +517,24 @@ Value createLLVMIntegerConstant(OpBuilder &builder, Location loc, short width, builder.getIntegerAttr(ty, value)); } +LLVM::CallOp createLLVMCallOp(OpBuilder &builder, Location loc, + LLVMFuncOp funcOp, ValueRange args) { + auto op = builder.create(loc, funcOp, args); + op.getProperties().setOpBundleSizes(builder.getDenseI32ArrayAttr({})); + op.getProperties().setOperandSegmentSizes({static_cast(args.size()), 0}); + return op; +} + +LLVM::CallIntrinsicOp +createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic, + TypeRange types, ValueRange args) { + auto op = builder.create(loc, types, args); + op.getProperties().setIntrin(builder.getStringAttr(intrinsic)); + op.getProperties().setOpBundleSizes(builder.getDenseI32ArrayAttr({})); + op.getProperties().setOperandSegmentSizes({static_cast(args.size()), 0}); + return op; +} + bool isConstantZero(Value v) { if (auto constantOp = v.getDefiningOp()) { if (auto attr = dyn_cast(constantOp.getValue())) { diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir index e1b04e7d41..d76f708267 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir @@ -336,10 +336,10 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war // CHECK-LABEL: llvm.func spir_kernelcc @test( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<3>, // CHECK-SAME: %[[VAL_1:.*]]: vector<16xf32>) -> vector<16xf32> -// CHECK: %[[VAL_2:.*]] = llvm.call spir_funccc @_Z16get_sub_group_idv() {{{.*}}} : () -> i32 -// CHECK: %[[VAL_3:.*]] = llvm.sext %[[VAL_2]] : i32 to i64 -// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z22get_sub_group_local_idv() {{{.*}}} : () -> i32 -// CHECK: %[[VAL_5:.*]] = llvm.sext %[[VAL_4]] : i32 to i64 +// CHECK: %[[VAL_2:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {{{.*}}} : () -> i32 +// CHECK: %[[VAL_3:.*]] = llvm.zext %[[VAL_2]] : i32 to i64 +// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z22get_sub_group_local_id() {{{.*}}} : () -> i32 +// CHECK: %[[VAL_5:.*]] = llvm.zext %[[VAL_4]] : i32 to i64 // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(16 : i64) : i64 // CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(256 : i64) : i64 // CHECK: %[[VAL_8:.*]] = llvm.mul %[[VAL_7]], %[[VAL_3]] : i64 diff --git a/test/TritonIntelGPU/blockptr_load.mlir b/test/TritonIntelGPU/blockptr_load.mlir index 2a93b6bac4..f61ff23b42 100644 --- a/test/TritonIntelGPU/blockptr_load.mlir +++ b/test/TritonIntelGPU/blockptr_load.mlir @@ -70,8 +70,8 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war // CHECK: %[[VAL_13:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_12]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> // CHECK: %[[VAL_14:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_13]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> // CHECK: %[[BLOCK_POINTER:.*]] = llvm.insertvalue %[[VAL_0]], %[[VAL_14]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_idv() - // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.sext %[[SUB_GROUP_ID_RAW]] : i32 to i64 + // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.zext %[[SUB_GROUP_ID_RAW]] : i32 to i64 // CHECK: %[[SUB_GROUP_ID:.*]] = llvm.trunc %[[SUB_GROUP_ID_EXT]] : i64 to i32 // CHECK: %[[VAL_17:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[SUB_GROUP_ID_N:.*]] = llvm.urem %[[SUB_GROUP_ID]], %[[VAL_17]] : i32 @@ -142,8 +142,8 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war // CHECK: %[[VAL_12:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_11]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> // CHECK: %[[VAL_13:.*]] = llvm.insertvalue %[[VAL_6]], %[[VAL_12]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> // CHECK: %[[BLOCK_POINTER:.*]] = llvm.insertvalue %[[VAL_0]], %[[VAL_13]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_idv() - // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.sext %[[SUB_GROUP_ID_RAW]] : i32 to i64 + // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.zext %[[SUB_GROUP_ID_RAW]] : i32 to i64 // CHECK: %[[SUB_GROUP_ID:.*]] = llvm.trunc %[[SUB_GROUP_ID_EXT]] : i64 to i32 // CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_17:.*]] = llvm.urem %[[SUB_GROUP_ID]], %[[VAL_16]] : i32 diff --git a/test/TritonIntelGPU/blockptr_store.mlir b/test/TritonIntelGPU/blockptr_store.mlir index 04b90323f2..443743fa76 100644 --- a/test/TritonIntelGPU/blockptr_store.mlir +++ b/test/TritonIntelGPU/blockptr_store.mlir @@ -23,7 +23,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war %12 = arith.truncf %11#0 : tensor<64x64xf32, #dpas> to tensor<64x64xf16, #dpas> %13 = tt.make_tensor_ptr %arg2, [%arg3, %arg5], [%arg6, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > // The next two lines is used to start checking constant related to the BlockStore. - // CHECK-COUNT-3: llvm.call spir_funccc @_Z16get_sub_group_idv + // CHECK-COUNT-3: llvm.call spir_funccc @_Z16get_sub_group_id // CHECK-COUNT-39: llvm.extractvalue // Next constant must be equal to warpsPerCTA[0] // CHECK: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 @@ -83,8 +83,8 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war // CHECK: %[[VAL_80:.*]] = llvm.insertvalue %[[CST_1]], %[[VAL_79]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> // CHECK: %[[BLOCK_PTR:.*]] = llvm.insertvalue %[[base]], %[[VAL_80]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> // CHECK: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_idv() - // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.sext %[[SUB_GROUP_ID_RAW]] : i32 to i64 + // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.zext %[[SUB_GROUP_ID_RAW]] : i32 to i64 // CHECK: %[[SUB_GROUP_ID:.*]] = llvm.trunc %[[SUB_GROUP_ID_EXT]] : i64 to i32 // CHECK: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[SUB_GROUP_ID_N:.*]] = llvm.urem %[[SUB_GROUP_ID]], %[[CST_1]] : i32 diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index a5709ec017..c5480dc8aa 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -8,8 +8,8 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war %c1_i64 = arith.constant 1 : i64 // CHECK: %[[ROW_MAJOR_BLOCK_PTR:.*]] = llvm.insertvalue %arg0, {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_17:.*]] = llvm.call spir_funccc @_Z16get_sub_group_idv() - // CHECK: %[[VAL_18:.*]] = llvm.sext %[[VAL_17]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_18:.*]] = llvm.zext %[[VAL_17]] : i32 to i64 // CHECK: %[[VAL_19:.*]] = llvm.trunc %[[VAL_18]] : i64 to i32 // CHECK: %[[VAL_20:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_21:.*]] = llvm.urem %[[VAL_19]], %[[VAL_20]] : i32 diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp index edf74cea42..18364b67e1 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/BuiltinFuncToLLVM.cpp @@ -4,6 +4,7 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" namespace mlir { namespace triton { @@ -187,11 +188,11 @@ class CallOpConversion : public mlir::RewritePattern { rewriter.create(loc, returnType, op->getResult(0)); } else if (calleeName == "__triton_hip_fast_fdividef") { assert(operands.size() == 2); - auto name = StringAttr::get(callOp.getContext(), "llvm.amdgcn.rcp.f32"); - LLVM::FastmathFlagsAttr defaultFlags{}; - auto rcpOp = rewriter.create( - loc, returnType, name, operands[1], defaultFlags); + const char *intrinsic = "llvm.amdgcn.rcp.f32"; + auto rcpOp = LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsic, + returnType, operands[1]); + LLVM::FastmathFlagsAttr defaultFlags{}; replacementOp = rewriter.create( loc, returnType, operands[0], rcpOp->getResult(0), defaultFlags); } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp index 9368443255..9f575be082 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/WMMA.cpp @@ -24,6 +24,7 @@ #include "../PatternTritonGPUOpToLLVM.h" #include "Utility.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" namespace mlir::triton::AMD { namespace { @@ -219,10 +220,8 @@ Value generateWMMAIntrinsic(ConversionPatternRewriter &rewriter, Location loc, if (32 / dElType.getIntOrFloatBitWidth() > 1 || dElType.isInteger(32)) { operands.push_back(int_val(1, false)); } - auto wmmaIntrinsic = rewriter.create( - loc, TypeRange{valC.getType()}, StringAttr::get(loc.getContext(), name), - operands, defaultFlags); - + auto wmmaIntrinsic = LLVM::createLLVMIntrinsicCallOp( + rewriter, loc, name, valC.getType(), operands); return wmmaIntrinsic.getResult(0); } diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp index 3682f3d7da..47d5fbb355 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -1243,7 +1243,7 @@ struct ExpOpConversionApprox LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, op, funcName, funcType); - return {rewriter.create(loc, funcOp, prod).getResult()}; + return {LLVM::createLLVMCallOp(rewriter, loc, funcOp, prod).getResult()}; } }; @@ -1276,7 +1276,7 @@ struct Exp2OpConversion appendOrGetExternFuncOp(rewriter, op, funcName, funcType); return { - rewriter.create(loc, funcOp, operands[0]).getResult()}; + LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]).getResult()}; } private: diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp index c9413a52f5..9bed879619 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp @@ -38,7 +38,7 @@ void createSchedGroupBarrier(PatternRewriter &rewriter, Location loc, InstructionKindMask maskValue, int sizeValue, int groupIdValue) { MLIRContext *ctx = rewriter.getContext(); - auto intrinsicName = str_attr("llvm.amdgcn.sched.group.barrier"); + const char *intrinsicName = "llvm.amdgcn.sched.group.barrier"; Value mask = LLVM::createConstantI32(loc, rewriter, static_cast(maskValue)); @@ -47,10 +47,8 @@ void createSchedGroupBarrier(PatternRewriter &rewriter, Location loc, Value groupId = LLVM::createConstantI32(loc, rewriter, static_cast(groupIdValue)); - LLVM::FastmathFlagsAttr defaultFlags{}; - rewriter.create(loc, TypeRange{}, intrinsicName, - ValueRange{mask, size, groupId}, - defaultFlags); + LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsicName, TypeRange{}, + ValueRange{mask, size, groupId}); } // Insert intrinsic that controls the types of instructions that may be @@ -58,25 +56,25 @@ void createSchedGroupBarrier(PatternRewriter &rewriter, Location loc, Operation *createSchedBarrier(PatternRewriter &rewriter, Location loc, int64_t maskValue) { MLIRContext *ctx = rewriter.getContext(); - auto intrinsicName = str_attr("llvm.amdgcn.sched.barrier"); + const char *intrinsicName = "llvm.amdgcn.sched.barrier"; LLVM::FastmathFlagsAttr defaultFlags{}; Value mask = LLVM::createConstantI32(loc, rewriter, static_cast(maskValue)); - return rewriter.create(loc, TypeRange{}, intrinsicName, - ValueRange{mask}, defaultFlags); + return LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsicName, + TypeRange{}, ValueRange{mask}); } // Insert an experimental intrinsic for instruction group level parallelism. // The intrinsic takes a value that specifies the strategy. Operation *createIglpOpt(PatternRewriter &rewriter, Location loc, int value) { MLIRContext *ctx = rewriter.getContext(); - auto intrinsicName = str_attr("llvm.amdgcn.iglp.opt"); + const char *intrinsicName = "llvm.amdgcn.iglp.opt"; LLVM::FastmathFlagsAttr defaultFlags{}; Value iglpValue = LLVM::createConstantI32(loc, rewriter, static_cast(value)); - return rewriter.create( - loc, TypeRange{}, intrinsicName, ValueRange{iglpValue}, defaultFlags); + return LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsicName, + TypeRange{}, ValueRange{iglpValue}); } struct InstructionSchedHintsRewriter diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp index 8462eb5fc9..c96ddbbe89 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp @@ -69,12 +69,9 @@ Value TargetInfo::getClusterCTAId(RewriterBase &rewriter, Location loc) const { Value TargetInfo::ballot(RewriterBase &rewriter, Location loc, Type type, Value cmp) const { - auto stringAttr = rewriter.getStringAttr("llvm.amdgcn.ballot"); - SmallVector operands = {cmp}; - Value asmResult = - rewriter.create(loc, type, stringAttr, operands) - ->getResult(0); - return asmResult; + return LLVM::createLLVMIntrinsicCallOp(rewriter, loc, "llvm.amdgcn.ballot", + type, cmp) + ->getResult(0); } void TargetInfo::storeDShared(RewriterBase &rewriter, Location loc, Value ptr, diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp index 2e114c898f..542b1ecbb7 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp @@ -231,11 +231,9 @@ Value llLoad(RewriterBase &rewriter, Location loc, Value ptr, Type elemTy, auto funcName = mangleFunc(getLoadNameRaw(cm), funcType); LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, parent, funcName, funcType); - auto loadVal = - rewriter - .create(loc, funcOp, ValueRange({ptr, pred, falseVal})) - .getResult(); - return loadVal; + return LLVM::createLLVMCallOp(rewriter, loc, funcOp, + ValueRange({ptr, pred, falseVal})) + .getResult(); } void llStore(RewriterBase &rewriter, Location loc, Value ptr, Value val, @@ -276,7 +274,7 @@ void llStore(RewriterBase &rewriter, Location loc, Value ptr, Value val, auto funcName = mangleFunc(getStoreNameRaw(cm), funcType); LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, parent, funcName, funcType); - rewriter.create(loc, funcOp, ValueRange({ptr, val, pred})); + LLVM::createLLVMCallOp(rewriter, loc, funcOp, ValueRange({ptr, val, pred})); } } // namespace mlir::LLVM::AMD diff --git a/third_party/intel/lib/Target/LLVMIR/SLPVectorizer.cpp b/third_party/intel/lib/Target/LLVMIR/SLPVectorizer.cpp index b53bdfe2d5..b79d313b71 100644 --- a/third_party/intel/lib/Target/LLVMIR/SLPVectorizer.cpp +++ b/third_party/intel/lib/Target/LLVMIR/SLPVectorizer.cpp @@ -168,6 +168,21 @@ static bool isValidElementType(Type *Ty) { !Ty->isPPC_FP128Ty(); } +/// \returns the number of elements for Ty. +static unsigned getNumElements(Type *Ty) { + assert(!isa(Ty) && + "ScalableVectorType is not supported."); + if (auto *VecTy = dyn_cast(Ty)) + return VecTy->getNumElements(); + return 1; +} + +/// \returns the vector type of ScalarTy based on vectorization factor. +static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { + return FixedVectorType::get(ScalarTy->getScalarType(), + VF * getNumElements(ScalarTy)); +} + /// \returns True if the value is a constant (but not globals/constant /// expressions). static bool isConstant(Value *V) { @@ -9088,6 +9103,32 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return VecCost - ScalarCost; }; + auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) { + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL); + if (MinMaxID == Intrinsic::not_intrinsic) + return InstructionCost::getInvalid(); + Type *CanonicalType = Ty; + if (CanonicalType->isPtrOrPtrVectorTy()) + CanonicalType = CanonicalType->getWithNewType(IntegerType::get( + CanonicalType->getContext(), + DL->getTypeSizeInBits(CanonicalType->getScalarType()))); + + IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, + {CanonicalType, CanonicalType}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. + if (VI && SelectOnly) { + assert(!Ty->isVectorTy() && "Expected only for scalar type."); + auto *CI = cast(VI->getOperand(0)); + IntrinsicCost -= TTI->getCmpSelInstrCost( + CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(), + CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_AnyValue, TTI::OP_None}, CI); + } + return IntrinsicCost; + }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. @@ -9345,30 +9386,39 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ? CmpInst::BAD_FCMP_PREDICATE : CmpInst::BAD_ICMP_PREDICATE; - return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy, - Builder.getInt1Ty(), CurrentPred, CostKind, - VI); + InstructionCost ScalarCost = TTI->getCmpSelInstrCost( + E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, + CostKind, getOperandInfo(VI->getOperand(0)), + getOperandInfo(VI->getOperand(1)), VI); + InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI); + if (IntrinsicCost.isValid()) + ScalarCost = IntrinsicCost; + + return ScalarCost; }; auto GetVectorCost = [&](InstructionCost CommonCost) { - auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - - InstructionCost VecCost = TTI->getCmpSelInstrCost( - E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects - // in VL. - // - auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL); - if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) { - IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy, - {VecTy, VecTy}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be - // dead and we can adjust the cost by removing their cost. - if (IntrinsicAndUse.second) - IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, - MaskTy, VecPred, CostKind); - VecCost = std::min(VecCost, IntrinsicCost); + auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); + + InstructionCost VecCost = + TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred, + CostKind, getOperandInfo(E->getOperand(0)), + getOperandInfo(E->getOperand(1)), VL0); + if (auto *SI = dyn_cast(VL0)) { + auto *CondType = + getWidenedType(SI->getCondition()->getType(), VL.size()); + unsigned CondNumElements = CondType->getNumElements(); + unsigned VecTyNumElements = getNumElements(VecTy); + assert(VecTyNumElements >= CondNumElements && + VecTyNumElements % CondNumElements == 0 && + "Cannot vectorize Instruction::Select"); + if (CondNumElements != VecTyNumElements) { + // When the return type is i1 but the source is fixed vector type, we + // need to duplicate the condition value. + VecCost += ::getShuffleCost( + *TTI, TTI::SK_PermuteSingleSrc, CondType, + createReplicatedMask(VecTyNumElements / CondNumElements, + CondNumElements)); + } } return VecCost + CommonCost; }; @@ -9580,16 +9630,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecCost += TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind); } else if (auto *CI0 = dyn_cast(VL0)) { - auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, - CI0->getPredicate(), CostKind, VL0); + auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size()); + VecCost = TTIRef.getCmpSelInstrCost( + E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, + VL0); VecCost += TTIRef.getCmpSelInstrCost( E->getOpcode(), VecTy, MaskTy, cast(E->getAltOp())->getPredicate(), CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, E->getAltOp()); } else { Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType(); - auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size()); + auto *SrcTy = getWidenedType(SrcSclTy, VL.size()); if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) { auto SrcIt = MinBWs.find(getOperandEntry(E, 0)); unsigned BWSz = DL->getTypeSizeInBits(ScalarTy); @@ -9598,7 +9651,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (SrcIt != MinBWs.end()) { SrcBWSz = SrcIt->second.first; SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz); - SrcTy = FixedVectorType::get(SrcSclTy, VL.size()); + SrcTy = getWidenedType(SrcSclTy, VL.size()); } if (BWSz <= SrcBWSz) { if (BWSz < SrcBWSz) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ControlFlowOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ControlFlowOpToLLVM.cpp index 2179cd7f72..398ede09a7 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/ControlFlowOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ControlFlowOpToLLVM.cpp @@ -112,6 +112,10 @@ struct CallOpConversion : public ConvertOpToLLVMPattern { auto newCallOp = rewriter.create( callOp.getLoc(), packedResult ? TypeRange(packedResult) : TypeRange(), promotedOperands, callOp->getAttrs()); + newCallOp.getProperties().setOpBundleSizes( + rewriter.getDenseI32ArrayAttr({})); + newCallOp.getProperties().setOperandSegmentSizes( + {static_cast(promotedOperands.size()), 0}); return newCallOp; } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp index 7b31367989..2cd885635d 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -1147,7 +1147,7 @@ struct ExternElementwiseOpConversion LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp( rewriter, op, funcName, funcType, op.getLibname(), op.getLibpath()); - auto callOp = rewriter.create(loc, funcOp, operands[0]); + auto callOp = LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]); callOp.setCConv(LLVM::cconv::CConv::SPIR_FUNC); return {callOp.getResult()}; @@ -1411,7 +1411,7 @@ struct MulhiUIOpConversion Type funcType = getFunctionType(elemTy, operands[0]); LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, op, funcName, funcType); - auto callOp = rewriter.create(loc, funcOp, operands[0]); + auto callOp = LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]); callOp.setCConv(LLVM::cconv::CConv::SPIR_FUNC); return {callOp.getResult()}; } @@ -1444,7 +1444,7 @@ struct OpToExternCallConversion Type funcType = getFunctionType(elemTy, operands[0]); LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, op, funcName, funcType); - auto callOp = rewriter.create(loc, funcOp, operands[0]); + auto callOp = LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]); callOp.setCConv(LLVM::cconv::CConv::SPIR_FUNC); return {callOp.getResult()}; } diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp index 0b663a8754..ef69b96fce 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -5,6 +5,7 @@ #include "mlir/Support/LLVM.h" #include "triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" using namespace mlir::triton::gpu; @@ -912,7 +913,7 @@ struct OpToExternCallConversion LLVM::LLVMFuncOp funcOp = appendOrGetExternFuncOp(rewriter, op, funcName, funcType); return { - rewriter.create(loc, funcOp, operands[0]).getResult()}; + LLVM::createLLVMCallOp(rewriter, loc, funcOp, operands[0]).getResult()}; } private: