Rust auto-vectorisation difference?

I was following this blog post on auto-vectorisation and decided to try it myself. I tweaked the third example to this:

pub struct StereoSample {
    l: Vec<f32>,
    r: Vec<f32>,
}

#[repr(transparent)]
pub struct MonoSample(Vec<f32>);

pub fn mix_mono_to_stereo(
    dst: &mut StereoSample,
    src: &MonoSample,
    gain_l: f32,
    gain_r: f32,
) {
    for i in 0..src.0.len() {
        dst.l[i] = src.0[i] * gain_l;
        dst.r[i] = src.0[i] * gain_r;
    }
}

to make it a struct of arrays. Rust 1.61.0 generated this code:

example::mix_mono_to_stereo:
        push    rbp
        push    r15
        push    r14
        push    r12
        push    rbx
        mov     r15, qword ptr [rsi + 16]
        test    r15, r15
        je      .LBB0_14
        mov     rax, rdi
        mov     r14, qword ptr [rsi]
        mov     r9, qword ptr [rdi]
        mov     rdi, qword ptr [rdi + 16]
        mov     r8, qword ptr [rax + 40]
        mov     r10, qword ptr [rax + 24]
        cmp     r8, rdi
        mov     rsi, rdi
        cmovb   rsi, r8
        lea     rax, [r15 - 1]
        cmp     rsi, rax
        cmovae  rsi, rax
        add     rsi, 1
        cmp     rsi, 4
        ja      .LBB0_6
        xor     r11d, r11d
        jmp     .LBB0_3
.LBB0_6:
        cmp     r8, rdi
        mov     rcx, rdi
        cmovb   rcx, r8
        cmp     rcx, rax
        cmovae  rcx, rax
        lea     rdx, [r9 + 4*rcx]
        add     rdx, 4
        lea     rbx, [r10 + 4*rcx]
        add     rbx, 4
        lea     rcx, [r14 + 4*rcx]
        add     rcx, 4
        cmp     r9, rbx
        setb    bpl
        cmp     r10, rdx
        setb    r12b
        cmp     r9, rcx
        setb    al
        cmp     r14, rdx
        setb    dl
        cmp     r10, rcx
        setb    cl
        cmp     r14, rbx
        setb    bl
        xor     r11d, r11d
        test    bpl, r12b
        jne     .LBB0_3
        and     al, dl
        jne     .LBB0_3
        and     cl, bl
        jne     .LBB0_3
        mov     eax, esi
        and     eax, 3
        test    rax, rax
        mov     ecx, 4
        cmovne  rcx, rax
        sub     rsi, rcx
        movaps  xmm2, xmm0
        shufps  xmm2, xmm0, 0
        movaps  xmm3, xmm1
        shufps  xmm3, xmm1, 0
        xor     eax, eax
.LBB0_10:
        movups  xmm4, xmmword ptr [r14 + 4*rax]
        mulps   xmm4, xmm2
        movups  xmmword ptr [r9 + 4*rax], xmm4
        movups  xmm4, xmmword ptr [r14 + 4*rax]
        mulps   xmm4, xmm3
        movups  xmmword ptr [r10 + 4*rax], xmm4
        add     rax, 4
        cmp     rsi, rax
        jne     .LBB0_10
        mov     r11, rsi
.LBB0_3:
        mov     r12, r8
        sub     r12, r11
        mov     rax, rdi
        sub     rax, r11
        sub     r15, r11
        lea     rdx, [r10 + 4*r11]
        lea     rbx, [r9 + 4*r11]
        lea     rcx, [r14 + 4*r11]
        xor     esi, esi
.LBB0_4:
        cmp     rax, rsi
        je      .LBB0_5
        movss   xmm2, dword ptr [rcx + 4*rsi]
        mulss   xmm2, xmm0
        movss   dword ptr [rbx + 4*rsi], xmm2
        cmp     r12, rsi
        je      .LBB0_15
        movss   xmm2, dword ptr [rcx + 4*rsi]
        mulss   xmm2, xmm1
        movss   dword ptr [rdx + 4*rsi], xmm2
        add     rsi, 1
        cmp     r15, rsi
        jne     .LBB0_4
.LBB0_14:
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        pop     rbp
        ret
.LBB0_5:
        lea     rdx, [rip + .L__unnamed_1]
        mov     rsi, rdi
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_15:
        lea     rdx, [rip + .L__unnamed_2]
        mov     rdi, r8
        mov     rsi, r8
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2

.L__unnamed_3:
        .ascii  "/app/example.rs"

.L__unnamed_1:
        .quad   .L__unnamed_3
        .asciz  "\017\000\000\000\000\000\000\000\013\000\000\000\t\000\000"

.L__unnamed_2:
        .quad   .L__unnamed_3
        .asciz  "\017\000\000\000\000\000\000\000\f\000\000\000\t\000\000"

A whopping 132 lines, whereas Rust 1.45.2 generated this code:

example::mix_mono_to_stereo:
        push    rax
        mov     rcx, qword ptr [rsi + 16]
        test    rcx, rcx
        je      .LBB0_8
        mov     rdx, qword ptr [rsi]
        xor     eax, eax
.LBB0_2:
        mov     rsi, qword ptr [rdi + 16]
        cmp     rsi, rax
        jbe     .LBB0_3
        mov     rsi, qword ptr [rdi]
        movss   xmm2, dword ptr [rdx + 4*rax]
        mulss   xmm2, xmm0
        movss   dword ptr [rsi + 4*rax], xmm2
        mov     rsi, qword ptr [rdi + 40]
        cmp     rsi, rax
        jbe     .LBB0_6
        mov     rsi, qword ptr [rdi + 24]
        movss   xmm2, dword ptr [rdx + 4*rax]
        mulss   xmm2, xmm1
        movss   dword ptr [rsi + 4*rax], xmm2
        lea     rsi, [rax + 1]
        mov     rax, rsi
        cmp     rcx, rsi
        jne     .LBB0_2
.LBB0_8:
        pop     rax
        ret
.LBB0_3:
        lea     rdx, [rip + .L__unnamed_1]
        jmp     .LBB0_4
.LBB0_6:
        lea     rdx, [rip + .L__unnamed_2]
.LBB0_4:
        mov     rdi, rax
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2

.L__unnamed_3:
        .ascii  "/app/example.rs"

.L__unnamed_1:
        .quad   .L__unnamed_3
        .asciz  "\017\000\000\000\000\000\000\000\013\000\000\000\t\000\000"

.L__unnamed_2:
        .quad   .L__unnamed_3
        .asciz  "\017\000\000\000\000\000\000\000\f\000\000\000\t\000\000"

Only 49 lines.

I don't know much about assembly, so why does this happen, and which version is faster? Thanks!

If you're trying to get things to vectorize, you don't want to write things like this. It's possible that LLVM will do something smart, but to get the best results you want to give it more information.

Crucially, you haven't written anything in this function that says dst.l and dst.r will be long enough. So LLVM has to carefully ensure that it never writes past the valid end, and that if the out-of-bounds indexing panics and you catch that panic, only the correct elements of those slices have been updated.

The most direct change for this code is what I call "reslicing":

pub fn mix_mono_to_stereo_via_reslicing(
    dst: &mut StereoSample,
    src: &MonoSample,
    gain_l: f32,
    gain_r: f32,
) {
    let n = src.0.len();
    let (dst_l, dst_r, src_0) = (&mut dst.l[..n], &mut dst.r[..n], &src.0[..n]);
    for i in 0..n {
        dst_l[i] = src_0[i] * gain_l;
        dst_r[i] = src_0[i] * gain_r;
    }
}

By doing this, you've made it really clear to LLVM that dst_l, dst_r, and src_0 are all definitely exactly n items long. So either the function will panic before the loop (because the dst vectors aren't long enough) or the loop will always run to completion, taking exactly n iterations, with definitely no out-of-bounds accesses.

That makes it as easy as possible for it to actually vectorize, which it does: https://rust.godbolt.org/z/szv6njr8c

As for checking whether it vectorized, I find it much easier to see in the LLVM IR. Conveniently, there's a section in the function labelled vector.body:

vector.body:                                      ; preds = %vector.body, %vector.ph.new
  %index = phi i64 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ], !dbg !426
  %niter = phi i64 [ 0, %vector.ph.new ], [ %niter.next.1, %vector.body ]
  %17 = getelementptr inbounds [0 x float], [0 x float]* %_21.i.i1.i.i25, i64 0, i64 %index, !dbg !426
  %18 = bitcast float* %17 to <4 x float>*, !dbg !429
  %wide.load = load <4 x float>, <4 x float>* %18, align 4, !dbg !429, !alias.scope !431
  %19 = getelementptr inbounds float, float* %17, i64 4, !dbg !429
  %20 = bitcast float* %19 to <4 x float>*, !dbg !429
  %wide.load48 = load <4 x float>, <4 x float>* %20, align 4, !dbg !429, !alias.scope !431
  %21 = getelementptr inbounds [0 x float], [0 x float]* %_21.i.i1.i.i, i64 0, i64 %index, !dbg !426
  %22 = fmul <4 x float> %wide.load, %broadcast.splat, !dbg !434
  %23 = fmul <4 x float> %wide.load48, %broadcast.splat50, !dbg !434
  %24 = bitcast float* %21 to <4 x float>*, !dbg !434
  store <4 x float> %22, <4 x float>* %24, align 4, !dbg !434, !alias.scope !435, !noalias !437
  %25 = getelementptr inbounds float, float* %21, i64 4, !dbg !434
  %26 = bitcast float* %25 to <4 x float>*, !dbg !434
  store <4 x float> %23, <4 x float>* %26, align 4, !dbg !434, !alias.scope !435, !noalias !437
  %27 = bitcast float* %17 to <4 x float>*, !dbg !439
  %wide.load51 = load <4 x float>, <4 x float>* %27, align 4, !dbg !439, !alias.scope !431
  %28 = bitcast float* %19 to <4 x float>*, !dbg !439
  %wide.load52 = load <4 x float>, <4 x float>* %28, align 4, !dbg !439, !alias.scope !431
  %29 = getelementptr inbounds [0 x float], [0 x float]* %_21.i.i1.i.i22, i64 0, i64 %index, !dbg !426
  %30 = fmul <4 x float> %wide.load51, %broadcast.splat54, !dbg !440
  %31 = fmul <4 x float> %wide.load52, %broadcast.splat56, !dbg !440
  %32 = bitcast float* %29 to <4 x float>*, !dbg !440
  store <4 x float> %30, <4 x float>* %32, align 4, !dbg !440, !alias.scope !441, !noalias !431
  %33 = getelementptr inbounds float, float* %29, i64 4, !dbg !440
  %34 = bitcast float* %33 to <4 x float>*, !dbg !440
  store <4 x float> %31, <4 x float>* %34, align 4, !dbg !440, !alias.scope !441, !noalias !431
  %index.next = or i64 %index, 8, !dbg !426
  %35 = getelementptr inbounds [0 x float], [0 x float]* %_21.i.i1.i.i25, i64 0, i64 %index.next, !dbg !426
  %36 = bitcast float* %35 to <4 x float>*, !dbg !429
  %wide.load.1 = load <4 x float>, <4 x float>* %36, align 4, !dbg !429, !alias.scope !431
  %37 = getelementptr inbounds float, float* %35, i64 4, !dbg !429
  %38 = bitcast float* %37 to <4 x float>*, !dbg !429
  %wide.load48.1 = load <4 x float>, <4 x float>* %38, align 4, !dbg !429, !alias.scope !431
  %39 = getelementptr inbounds [0 x float], [0 x float]* %_21.i.i1.i.i, i64 0, i64 %index.next, !dbg !426
  %40 = fmul <4 x float> %wide.load.1, %broadcast.splat, !dbg !434
  %41 = fmul <4 x float> %wide.load48.1, %broadcast.splat50, !dbg !434
  %42 = bitcast float* %39 to <4 x float>*, !dbg !434
  store <4 x float> %40, <4 x float>* %42, align 4, !dbg !434, !alias.scope !435, !noalias !437
  %43 = getelementptr inbounds float, float* %39, i64 4, !dbg !434
  %44 = bitcast float* %43 to <4 x float>*, !dbg !434
  store <4 x float> %41, <4 x float>* %44, align 4, !dbg !434, !alias.scope !435, !noalias !437
  %45 = bitcast float* %35 to <4 x float>*, !dbg !439
  %wide.load51.1 = load <4 x float>, <4 x float>* %45, align 4, !dbg !439, !alias.scope !431
  %46 = bitcast float* %37 to <4 x float>*, !dbg !439
  %wide.load52.1 = load <4 x float>, <4 x float>* %46, align 4, !dbg !439, !alias.scope !431
  %47 = getelementptr inbounds [0 x float], [0 x float]* %_21.i.i1.i.i22, i64 0, i64 %index.next, !dbg !426
  %48 = fmul <4 x float> %wide.load51.1, %broadcast.splat54, !dbg !440
  %49 = fmul <4 x float> %wide.load52.1, %broadcast.splat56, !dbg !440
  %50 = bitcast float* %47 to <4 x float>*, !dbg !440
  store <4 x float> %48, <4 x float>* %50, align 4, !dbg !440, !alias.scope !441, !noalias !431
  %51 = getelementptr inbounds float, float* %47, i64 4, !dbg !440
  %52 = bitcast float* %51 to <4 x float>*, !dbg !440
  store <4 x float> %49, <4 x float>* %52, align 4, !dbg !440, !alias.scope !441, !noalias !431
  %index.next.1 = add nuw i64 %index, 16, !dbg !426
  %niter.next.1 = add i64 %niter, 2, !dbg !426
  %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter, !dbg !426
  br i1 %niter.ncmp.1, label %middle.block.unr-lcssa, label %vector.body, !dbg !426, !llvm.loop !442

In which you'll find

  • vector loads (load <4 x float>), to read multiple things at once
  • vector floating-point multiplications (fmul <4 x float>), to apply the gain to multiple things at once
  • and vector stores (store <4 x float>), to write the multiple results at once

In assembly those are somewhat less obvious, especially because on x64 it's common to use the SIMD registers even for scalar floating point -- mulps vs mulss doesn't jump off the page the way the vectors do in LLVM-IR.

But the compiler's pretty smart about slices and iterators, so you might also try writing it like this:

pub fn mix_mono_to_stereo_via_zip(
    dst: &mut StereoSample,
    src: &MonoSample,
    gain_l: f32,
    gain_r: f32,
) {
    for ((dst_l, dst_r), src) in std::iter::zip(&mut dst.l, &mut dst.r).zip(&src.0) {
        *dst_l = src * gain_l;
        *dst_r = src * gain_r;
    }
}

That also (same godbolt link) seems to vectorize quite well. Semantically it's a bit different, though, since it stops at whichever of the vectors is shortest, rather than panicking if the destination isn't long enough.
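
Here's a quick sketch of that difference, assuming the type definitions and both functions above are in scope in the same module (the fields aren't pub), with made-up lengths purely for illustration:

fn main() {
    let src = MonoSample(vec![1.0, 2.0, 3.0, 4.0]);
    let mut dst = StereoSample {
        l: vec![0.0; 2],
        r: vec![0.0; 2],
    };

    // The zip version silently mixes only the 2 samples that fit in dst.
    mix_mono_to_stereo_via_zip(&mut dst, &src, 0.5, 0.25);

    // The reslicing version would panic at `&mut dst.l[..n]`, because dst.l
    // is shorter than src.
    // mix_mono_to_stereo_via_reslicing(&mut dst, &src, 0.5, 0.25);
}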


Thanks, this helped me understand more of what LLVM auto-vectorisation in Rust entails. However, you kind of missed my question, which was: why does rustc 1.61.0 produce so many more instructions than 1.45.2, and how will this affect the runtime in theory (or is it inconclusive)?

Because of a different standard library and a different LLVM version.

More instructions might be good; might be bad. Hard to say. The only way to know is to benchmark, ideally with Criterion.rs.
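
A minimal Criterion benchmark might look something like this. It's only a sketch: it assumes the types and mix_mono_to_stereo from your post are in scope, that criterion is a dev-dependency and this file lives under benches/, and the 1 << 16 buffer size is an arbitrary choice.

use criterion::{black_box, criterion_group, criterion_main, Criterion};

fn bench_mix(c: &mut Criterion) {
    // Arbitrary test signal: 65536 mono samples and matching stereo buffers.
    let src = MonoSample(vec![1.0_f32; 1 << 16]);
    let mut dst = StereoSample {
        l: vec![0.0; 1 << 16],
        r: vec![0.0; 1 << 16],
    };
    c.bench_function("mix_mono_to_stereo", |b| {
        b.iter(|| mix_mono_to_stereo(black_box(&mut dst), black_box(&src), 0.8, 0.6))
    });
}

criterion_group!(benches, bench_mix);
criterion_main!(benches);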

CAD97 made some great examples in Converting a BGRA &[u8] to RGB [u8;N] (for images)? - #13 by CAD97, showing that the shorter versions are sometimes way slower.

(But the newer one uses MULPS, Multiply Packed Single-Precision Floating-Point Values, while the older one uses MULSS, Multiply Scalar Single-Precision Floating-Point Values, so I bet the newer code is faster. You could also try optimizing for code size instead of speed, at which point it looks like new Rust gives code more like your 1.45 example: https://rust.godbolt.org/z/Kd5nddGez)


The 1.61 version is vectorized; the 1.45 version is straightforward but naive. The extra code is needed to handle the case where the vector length is not divisible by 4, as well as the case where one of the destination vectors is too short. You're using checked indexing, so rustc still has to generate code that panics at exactly the right spot. The loop that does the bulk of the work, .LBB0_10, is compact.
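
In Rust terms, the "not divisible by 4" part alone already forces a two-loop structure. A hand-written sketch of just that shape (illustrative only; it ignores the alias and bounds checks the real generated code performs before entering the bulk loop, and the name is made up):

pub fn mix_mono_to_stereo_sketch(
    dst: &mut StereoSample,
    src: &MonoSample,
    gain_l: f32,
    gain_r: f32,
) {
    let n = src.0.len();

    // Bulk work over whole blocks of 4 samples (the part .LBB0_10 does with mulps).
    let bulk = n / 4 * 4;
    for i in (0..bulk).step_by(4) {
        for j in 0..4 {
            dst.l[i + j] = src.0[i + j] * gain_l;
            dst.r[i + j] = src.0[i + j] * gain_r;
        }
    }

    // Remainder: up to 3 trailing samples, handled one at a time like .LBB0_4.
    for i in bulk..n {
        dst.l[i] = src.0[i] * gain_l;
        dst.r[i] = src.0[i] * gain_r;
    }
}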


Thanks for all the informative answers! Running the benchmarks, 1.61 did show a vast improvement over 1.45.

