Simd-using functions sometimes scalarize after inlining, even if they use vector ops on their own #321

Open
thomcc opened this issue Dec 5, 2022 · 8 comments

@thomcc
Member

thomcc commented Dec 5, 2022

Godbolt: https://rust.godbolt.org/z/hhMWb6Eja

On aarch64 I have this code:

// Note: Doesn't happen for 16, everything is great then.
const CHUNK: usize = 32;

pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    use std::simd::*;
    const ALL_HI: Simd<u8, CHUNK> = Simd::from_array([0x80; CHUNK]);
    const ZERO: Simd<u8, CHUNK> = Simd::from_array([0; CHUNK]);
    (Simd::<u8, CHUNK>::from_array(*s) & ALL_HI)
        .simd_eq(ZERO)
        .all()
}

pub fn all_ascii(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk(chunk))
}

Wonderfully, all_ascii_chunk compiles to essentially what I want (I mean, it's not perfect, but I certainly wouldn't file a bug about it):

example::all_ascii_chunk:
        ldp     q1, q0, [x0]
        orr.16b v0, v1, v0
        cmlt.16b        v0, v0, #0
        umaxv.16b       b0, v0
        fmov    w8, s0
        mvn     w8, w8
        and     w0, w8, #0x1
        ret

Unfortunately, when it gets called in a loop from all_ascii, we... seem to completely lose our ability to do something reasonable, and get this monstrosity:

example::all_ascii:
        sub     sp, sp, #16
        lsl     x9, x1, #5
LBB0_1:
        mov     x8, x9
        cbz     x9, LBB0_3
        ldp     q0, q1, [x0], #32
        cmlt.16b        v1, v1, #0
        umov.b  w9, v1[1]
        umov.b  w10, v1[0]
        and     w9, w9, #0x1
        and     w10, w10, #0x1
        bfi     w10, w9, #1, #1
        umov.b  w9, v1[2]
        and     w9, w9, #0x1
        bfi     w10, w9, #2, #1
        umov.b  w9, v1[3]
        and     w9, w9, #0x1
        umov.b  w11, v1[4]
        bfi     w10, w9, #3, #1
        and     w9, w11, #0x1
        bfi     w10, w9, #4, #1
        umov.b  w9, v1[5]
        and     w9, w9, #0x1
        bfi     w10, w9, #5, #1
        umov.b  w9, v1[6]
        and     w9, w9, #0x1
        umov.b  w11, v1[7]
        orr     w9, w10, w9, lsl #6
        and     w10, w11, #0x1
        orr     w9, w9, w10, lsl #7
        umov.b  w10, v1[8]
        and     w10, w10, #0x1
        orr     w9, w9, w10, lsl #8
        umov.b  w10, v1[9]
        and     w10, w10, #0x1
        umov.b  w11, v1[10]
        orr     w9, w9, w10, lsl #9
        and     w10, w11, #0x1
        orr     w9, w9, w10, lsl #10
        umov.b  w10, v1[11]
        and     w10, w10, #0x1
        orr     w9, w9, w10, lsl #11
        umov.b  w10, v1[12]
        and     w10, w10, #0x1
        umov.b  w11, v1[13]
        orr     w9, w9, w10, lsl #12
        and     w10, w11, #0x1
        orr     w9, w9, w10, lsl #13
        umov.b  w10, v1[14]
        and     w10, w10, #0x1
        orr     w9, w9, w10, lsl #14
        umov.b  w10, v1[15]
        cmlt.16b        v0, v0, #0
        umov.b  w11, v0[1]
        orr     w9, w9, w10, lsl #15
        and     w10, w11, #0x1
        umov.b  w11, v0[0]
        and     w11, w11, #0x1
        bfi     w11, w10, #1, #1
        umov.b  w10, v0[2]
        and     w10, w10, #0x1
        bfi     w11, w10, #2, #1
        umov.b  w10, v0[3]
        and     w10, w10, #0x1
        bfi     w11, w10, #3, #1
        umov.b  w10, v0[4]
        and     w10, w10, #0x1
        bfi     w11, w10, #4, #1
        umov.b  w10, v0[5]
        and     w10, w10, #0x1
        bfi     w11, w10, #5, #1
        umov.b  w10, v0[6]
        and     w10, w10, #0x1
        orr     w10, w11, w10, lsl #6
        umov.b  w11, v0[7]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #7
        umov.b  w11, v0[8]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #8
        umov.b  w11, v0[9]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #9
        umov.b  w11, v0[10]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #10
        umov.b  w11, v0[11]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #11
        umov.b  w11, v0[12]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #12
        umov.b  w11, v0[13]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #13
        umov.b  w11, v0[14]
        and     w11, w11, #0x1
        orr     w10, w10, w11, lsl #14
        umov.b  w11, v0[15]
        orr     w10, w10, w11, lsl #15
        orr     w10, w10, w9
        sub     x9, x8, #32
        tst     w10, #0xffff
        b.eq    LBB0_1
LBB0_3:
        cmp     x8, #0
        cset    w0, eq
        add     sp, sp, #16
        ret

And performance takes a nose-dive. This is very annoying, because this kind of issue means that I can't rely on functions that appear to codegen well continuing to do so when they're called :(. I know we have little control over this, but it's... kind of a huge issue for using std::simd to optimize functions if we can't rely on it behaving consistently.

This is possibly related to the bad aarch64 scalar reductions I saw before, although it doesn't seem like it because all_ascii_chunk is fine.
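For what it's worth, the only mitigation I can think of right now is to stop the chunk function from being inlined at all, so the loop goes through the good standalone codegen instead. This is an untested sketch (the *_noinline names are just for illustration), and it obviously trades a call per 32-byte chunk for the vector code:

// Untested sketch: force an out-of-line call so the loop reuses the
// standalone (vectorized) codegen of all_ascii_chunk instead of
// re-expanding it inline in the caller.
#[inline(never)]
pub fn all_ascii_chunk_noinline(s: &[u8; CHUNK]) -> bool {
    all_ascii_chunk(s)
}

pub fn all_ascii_noinline(s: &[[u8; CHUNK]]) -> bool {
    s.iter().all(|chunk| all_ascii_chunk_noinline(chunk))
}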

@Nugine

Nugine commented Dec 5, 2022

Sometimes we may need to change the algorithm according to specific target features.

#[cfg(target_arch = "aarch64")]
pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    use std::simd::*;
    let x = Simd::<u8, CHUNK>::from_array(*s);
    x.reduce_max() < 0x80
}

It's annoying that portable-simd is not so "portable" when we try to optimize the codegen of some functions.
One possible solution is to inject the target features into a SIMD function via a generic parameter.
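As a blunter alternative, here is a rough cfg-based sketch of the "change the algorithm per target" idea, keeping the original mask-based formulation for other targets (untested):

#![feature(portable_simd)]
use std::simd::*;

const CHUNK: usize = 32;

// Rough sketch: pick the reduction strategy per target architecture.
#[cfg(target_arch = "aarch64")]
pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    let x = Simd::<u8, CHUNK>::from_array(*s);
    // Horizontal max is cheap on aarch64 (umaxv).
    x.reduce_max() < 0x80
}

#[cfg(not(target_arch = "aarch64"))]
pub fn all_ascii_chunk(s: &[u8; CHUNK]) -> bool {
    let x = Simd::<u8, CHUNK>::from_array(*s);
    // Mask-based formulation, which maps well to movemask-style targets.
    (x & Simd::splat(0x80)).simd_eq(Simd::splat(0)).all()
}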

@thomcc
Member Author

thomcc commented Dec 7, 2022

@Nugine that's not the case here. These operations are fully supported, and LLVM even knows that. Please look more closely at the example (and read the body of the issue again) -- the issue is that the small function vectorizes fine on its own, but when that function is called from a loop it becomes scalarized.

I of course know that NEON does not support the operations exactly as I wrote them, but what they get translated to vectorizes fine in the small example (and for u8x16). Writing it this way is done in order to get good codegen, without target conditionals, on other targets where emitting a check on the maximum would not be efficient (movemsk is much faster than a horizontal max). Anyway, your comment is basically off topic for this issue.

@RDambrosio016

Reproducible Godbolt

The weird thing is that this doesn't appear to be an optimization problem; it's a codegen/ISel (instruction selection) problem. Prior to codegen, the optimized LLVM IR is perfectly vectorized, the code gets inlined as expected, and everything is good:

define noundef zeroext i1 @_ZN7example9all_ascii17h6a179b75d513aa15E(ptr noalias noundef nonnull readonly align 1 %s.0, i64 %s.1) unnamed_addr #0 personality ptr @rust_eh_personality !dbg !5 {
  %0 = getelementptr inbounds [32 x i8], ptr %s.0, i64 %s.1, !dbg !10
  br label %bb1.i, !dbg !34

bb1.i:                                            ; preds = %bb3.i, %start
  %1 = phi ptr [ %2, %bb3.i ], [ %s.0, %start ]
  %_10.i.i = icmp eq ptr %1, %0, !dbg !39
  br i1 %_10.i.i, label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$3all17h8d243705e3f9eef8E.exit", label %bb3.i, !dbg !39

bb3.i:                                            ; preds = %bb1.i
  %2 = getelementptr inbounds [32 x i8], ptr %1, i64 1, !dbg !44
  %.val.i = load <32 x i8>, ptr %1, align 1, !dbg !58, !alias.scope !59, !noalias !62
  %3 = icmp slt <32 x i8> %.val.i, zeroinitializer, !dbg !65
  %4 = bitcast <32 x i1> %3 to i32, !dbg !79
  %5 = icmp eq i32 %4, 0, !dbg !79
  br i1 %5, label %bb1.i, label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$3all17h8d243705e3f9eef8E.exit", !dbg !90

"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$3all17h8d243705e3f9eef8E.exit": ; preds = %bb1.i, %bb3.i
  ret i1 %_10.i.i, !dbg !91
}

However, during the first and most important pass of codegen (ISel), LLVM decides for some reason that the "pseudo-instructions" (the intrinsics) should be unrolled into scalar ops instead of becoming vector ops:

[screenshot of the ISel output, showing the intrinsics expanded into scalar ops]

(the vector instructions are below the first red block, but the right side is too large to show that too).

@calebzulawski
Member

That makes me think this is another instance of #146. In the simpler case, there's probably an optimization pass that's vectorizing the scalar output from ISel. The more complicated case probably disrupts that optimization. Just a guess.

@Nugine

Nugine commented Dec 7, 2022

It's interesting that the "fold" version generates vectorized instructions.

#![feature(portable_simd)]

extern crate core;

use core::simd::*;

const N: usize = 32;

pub fn check(chunk: &[u8; N]) -> bool {
    let x = Simd::<u8, N>::from_array(*chunk);
    let h = Simd::<u8, N>::splat(0x80);
    x.simd_lt(h).all()
}

pub fn all_ascii_v1(s: &[[u8; N]]) -> bool {
    s.iter().fold(true, |acc, x| acc & check(x))
}

pub fn all_ascii_v2(s: &[[u8; N]]) -> bool {
    s.iter().all(check)
}

example::check:
        ldp     q1, q0, [x0]
        orr.16b v0, v1, v0
        cmlt.16b        v0, v0, #0
        umaxv.16b       b0, v0
        fmov    w8, s0
        mvn     w8, w8
        and     w0, w8, #0x1
        ret

example::all_ascii_v1:
        cbz     x1, LBB1_4
        mov     x8, x0
        lsl     x9, x1, #5
        mov     w0, #1
LBB1_2:
        ldp     q1, q0, [x8], #32
        orr.16b v0, v1, v0
        cmlt.16b        v0, v0, #0
        umaxv.16b       b0, v0
        fmov    w10, s0
        bic     w10, w0, w10
        and     w0, w10, #0x1
        subs    x9, x9, #32
        b.ne    LBB1_2
        ret
LBB1_4:
        mov     w0, #1
        ret

@Sp00ph
Member

Sp00ph commented Feb 11, 2023

I think the root cause of this is that mask.all() uses bitcast <N x i1> to iN internally, which LLVM does a bad job with on ARM. I opened an issue about this a while ago on the LLVM repo (llvm/llvm-project#59686), but it doesn't look like it has been fixed yet.
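A possible workaround until that's fixed: doing the reduction on the byte vector instead of on the mask should sidestep the <N x i1> bitcast entirely. Untested sketch, basically the reduce_max idea from above:

// Untested sketch: reduce over the u8 vector itself, so no <N x i1> mask
// reduction (and hence no iN bitcast) is involved.
pub fn all_ascii_chunk_reduce(s: &[u8; 32]) -> bool {
    use std::simd::*;
    let x = Simd::<u8, 32>::from_array(*s);
    (x & Simd::splat(0x80)).reduce_max() == 0
}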

@calebzulawski
Member

calebzulawski commented Feb 11, 2023

I don't think the problem is limited to bitcasts, see llvm/llvm-project#50466

There is no bitcast, only truncate and reduce: https://github.com/rust-lang/rust/blob/e187f8871e3d553181c9d2d4ac111197a139ca0d/compiler/rustc_codegen_llvm/src/intrinsic.rs#L1724

@Sp00ph
Member

Sp00ph commented Feb 11, 2023

The intrinsic does initially get lowered to an llvm.vector.reduce.and, but it seems like LLVM replaces that with a bitcast in some optimisation pass. Enabling --emit=llvm-ir on the given godbolt shows a bitcast <32 x i1> to i32 but no llvm.vector.reduce.and call anymore. In that case the bad bitcast might just be the root cause of the bad reduce.

@workingjubilee added the I-scalarize (Impact: code that should be vectorized, isn't) label on Mar 4, 2023