From 3a381b0c80ad7d8a62a8d182524aa2fb71d93204 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 3 Jan 2023 19:37:16 +0000 Subject: [PATCH] [WebAssembly] Explicitly add {z,s}ext so extends are selected During DAG legalization, {u,s}itofp instructions on v2i8, v2i16, v4i8 and v4i16 types ended up being legalized into scalar instructions, when they could just be extended to v2i32/v4i32 instead. Fixes https://github.com/llvm/llvm-project/issues/57182 Differential Revision: https://reviews.llvm.org/D140916 --- .../WebAssembly/WebAssemblyISelLowering.cpp | 29 ++++ .../WebAssembly/simd-extending-convert.ll | 154 ++++-------------- 2 files changed, 58 insertions(+), 125 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f32efde45902..94544800a6fb 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -2356,6 +2356,32 @@ performVECTOR_SHUFFLECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DAG.getBitcast(DstType, NewShuffle); } +/// Convert ({u,s}itofp vec) --> ({u,s}itofp ({z,s}ext vec)) so it doesn't get +/// split up into scalar instructions during legalization, and the vector +/// extending instructions are selected in performVectorExtendCombine below. +static SDValue +performVectorExtendToFPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + assert(N->getOpcode() == ISD::UINT_TO_FP || + N->getOpcode() == ISD::SINT_TO_FP); + + EVT InVT = N->getOperand(0)->getValueType(0); + EVT ResVT = N->getValueType(0); + MVT ExtVT; + if (ResVT == MVT::v4f32 && (InVT == MVT::v4i16 || InVT == MVT::v4i8)) + ExtVT = MVT::v4i32; + else if (ResVT == MVT::v2f64 && (InVT == MVT::v2i16 || InVT == MVT::v2i8)) + ExtVT = MVT::v2i32; + else + return SDValue(); + + unsigned Op = + N->getOpcode() == ISD::UINT_TO_FP ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + SDValue Conv = DAG.getNode(Op, SDLoc(N), ExtVT, N->getOperand(0)); + return DAG.getNode(N->getOpcode(), SDLoc(N), ResVT, Conv); +} + static SDValue performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { auto &DAG = DCI.DAG; @@ -2641,6 +2667,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return performVectorExtendCombine(N, DCI); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + return performVectorExtendToFPCombine(N, DCI); case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: case ISD::FP_ROUND: diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll index b42754ad6517..2b871f1e84ad 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll @@ -11,21 +11,8 @@ define <4 x float> @extend_to_float_low_i16x8_u(<8 x i16> %x) { ; CHECK: .functype extend_to_float_low_i16x8_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 0 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 1 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 2 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 3 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i32x4.extend_low_i16x8_u +; CHECK-NEXT: f32x4.convert_i32x4_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> %extended = uitofp <4 x i16> %low to <4 x float> @@ -37,21 +24,8 @@ define <4 x float> @extend_to_float_high_i16x8_u(<8 x i16> %x) { ; CHECK: .functype extend_to_float_high_i16x8_u (v128) -> (v128) ; CHECK-NEXT: # 
%bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 4 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 5 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 6 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 7 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i32x4.extend_high_i16x8_u +; CHECK-NEXT: f32x4.convert_i32x4_u ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> %extended = uitofp <4 x i16> %high to <4 x float> @@ -62,22 +36,10 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) { ; CHECK-LABEL: extend_to_float_low_i8x16_u: ; CHECK: .functype extend_to_float_low_i8x16_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 0 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 1 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 2 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 3 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15 +; CHECK-NEXT: f32x4.convert_i32x4_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> %extended = uitofp <4 x i8> %low to <4 x float> @@ -89,21 +51,9 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) { ; CHECK: .functype extend_to_float_high_i8x16_u (v128) -> (v128) ; CHECK-NEXT: 
# %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 4 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 5 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 6 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_u 7 -; CHECK-NEXT: f32.convert_i32_u -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.shuffle 4, 17, 18, 19, 5, 21, 22, 23, 6, 25, 26, 27, 7, 29, 30, 31 +; CHECK-NEXT: f32x4.convert_i32x4_u ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> %extended = uitofp <4 x i8> %high to <4 x float> @@ -115,21 +65,8 @@ define <4 x float> @extend_to_float_low_i16x8_s(<8 x i16> %x) { ; CHECK: .functype extend_to_float_low_i16x8_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 0 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 1 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 2 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 3 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i32x4.extend_low_i16x8_s +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> %extended = sitofp <4 x i16> %low to <4 x float> @@ -141,21 +78,8 @@ define <4 x float> @extend_to_float_high_i16x8_s(<8 x i16> %x) { ; CHECK: .functype extend_to_float_high_i16x8_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; 
CHECK-NEXT: i16x8.extract_lane_s 4 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 5 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 6 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_s 7 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i32x4.extend_high_i16x8_s +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> %extended = sitofp <4 x i16> %high to <4 x float> @@ -167,21 +91,13 @@ define <4 x float> @extend_to_float_low_i8x16_s(<8 x i8> %x) { ; CHECK: .functype extend_to_float_low_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 0 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.splat ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 1 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 1 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 2 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 3 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> %extended = sitofp <4 x i8> %low to <4 x float> @@ -193,21 +109,13 @@ define <4 x float> @extend_to_float_high_i8x16_s(<8 x i8> %x) { ; CHECK: .functype extend_to_float_high_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: 
i8x16.extract_lane_s 4 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 5 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 1 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 6 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 2 -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i8x16.extract_lane_s 7 -; CHECK-NEXT: f32.convert_i32_s -; CHECK-NEXT: f32x4.replace_lane 3 +; CHECK-NEXT: i8x16.shuffle 4, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shl +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> %extended = sitofp <4 x i8> %high to <4 x float> @@ -230,14 +138,10 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) { ; CHECK-LABEL: extend_to_double_low_i16x4_u: ; CHECK: .functype extend_to_double_low_i16x4_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 0 -; CHECK-NEXT: f64.convert_i32_u -; CHECK-NEXT: f64x2.splat -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.extract_lane_u 1 -; CHECK-NEXT: f64.convert_i32_u -; CHECK-NEXT: f64x2.replace_lane 1 +; CHECK-NEXT: i8x16.shuffle 16, 17, 2, 3, 18, 19, 6, 7, 20, 21, 10, 11, 22, 23, 14, 15 +; CHECK-NEXT: f64x2.convert_low_i32x4_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> %extended = uitofp <2 x i16> %low to <2 x double>