Rustc produces substantially different code for summing the first N elements of a vector vs. a slice

Code version 1:

pub fn sum(v: &[usize], n: usize) -> usize {
    let mut ret = 0;
    for i in 0..n {
        ret += v[i];
    }
    ret
}

output (with -O --target aarch64-unknown-linux-gnu):

sum:
        str     x30, [sp, #-16]!
        cbz     x2, .LBB0_5
        mov     x8, x0
        mov     x0, xzr
        sub     x9, x2, #1
.LBB0_2:
        cmp     x9, x1
        b.hs    .LBB0_6
        ldr     x10, [x8], #8
        subs    x2, x2, #1
        add     x0, x10, x0
        b.ne    .LBB0_2
        ldr     x30, [sp], #16
        ret
.LBB0_5:
        mov     x0, xzr
        ldr     x30, [sp], #16
        ret
.LBB0_6:
        adrp    x2, .L__unnamed_1
        add     x2, x2, :lo12:.L__unnamed_1
        mov     x0, x1
        bl      core::panicking::panic_bounds_check::hb550301af3964b1f

Notice that there is one loop iteration for each element to be summed, and we do the bounds check every time (cmp x9, x1 is the bounds check).

Now, if I change v to be of type &Vec<usize>, the output is as follows:

sum:
        str     x30, [sp, #-16]!
        cbz     x1, .LBB0_3
        mov     x8, x1
        ldp     x10, x1, [x0, #8]
        sub     x9, x8, #1
        cmp     x1, x9
        csel    x11, x1, x9, lo
        add     x11, x11, #1
        cmp     x11, #5
        b.hs    .LBB0_4
        mov     x0, xzr
        mov     x11, xzr
        b       .LBB0_7
.LBB0_3:
        mov     x0, xzr
        ldr     x30, [sp], #16
        ret
.LBB0_4:
        ands    x12, x11, #0x3
        mov     w13, #4
        movi    v0.2d, #0000000000000000
        movi    v1.2d, #0000000000000000
        csel    x12, x13, x12, eq
        sub     x11, x11, x12
        add     x12, x10, #16
        mov     x13, x11
.LBB0_5:
        ldp     q2, q3, [x12, #-16]
        subs    x13, x13, #4
        add     x12, x12, #32
        add     v0.2d, v2.2d, v0.2d
        add     v1.2d, v3.2d, v1.2d
        b.ne    .LBB0_5
        add     v0.2d, v1.2d, v0.2d
        addp    d0, v0.2d
        fmov    x0, d0
.LBB0_7:
        cmp     x1, x11
        b.eq    .LBB0_10
        ldr     x12, [x10, x11, lsl #3]
        add     x11, x11, #1
        cmp     x8, x11
        add     x0, x12, x0
        b.ne    .LBB0_7
        ldr     x30, [sp], #16
        ret
.LBB0_10:
        cmp     x1, x9
        adrp    x2, .L__unnamed_1
        add     x2, x2, :lo12:.L__unnamed_1
        csel    x0, x1, x9, lo
        bl      core::panicking::panic_bounds_check::hb550301af3964b1f

Now we only do one loop iteration per 4 elements (using the 128-bit q registers), and we don't do a bounds check in the hot loop, but only in the non-unrolled tail part.

What accounts for this difference?

I ran this code through godbolt:

pub fn sum(v: &Vec<usize>, n: usize) -> usize {
    let mut ret = 0;
    for i in 0..n {
        ret += v[i];
    }
    ret
}

and got:

example::sum::h94133c41680d7afa:
        push    rax
        test    rsi, rsi
        je      .LBB0_1
        mov     rcx, qword ptr [rdi + 8]
        mov     rdi, qword ptr [rdi + 16]
        lea     rdx, [rsi - 1]
        xor     eax, eax
        xor     r8d, r8d
.LBB0_4:
        cmp     rdi, rdx
        jbe     .LBB0_6
        add     rax, qword ptr [rcx + 8*r8]
        inc     r8
        cmp     rsi, r8
        jne     .LBB0_4
        pop     rcx
        ret
.LBB0_1:
        xor     eax, eax
        pop     rcx
        ret
.LBB0_6:
        lea     rdx, [rip + .L__unnamed_1]
        mov     rsi, rdi
        call    qword ptr [rip + core::panicking::panic_bounds_check::h9bb22f08a42e1ac8@GOTPCREL]

.L__unnamed_2:
        .ascii  "/app/example.rs"

.L__unnamed_1:
        .quad   .L__unnamed_2
        .asciz  "\017\000\000\000\000\000\000\000\004\000\000\000\021\000\000"

That doesn't seem to match your results. What code did you actually use for the Vec case?

1 Like

Here is the exact Godbolt link: Compiler Explorer

Note that I'm compiling for aarch64, whereas the output you posted is for x86-64.

If I add assert!(v.len() >= n); to the beginning of both versions (i.e. bounds-checking before the loop), the non-trivial differences go away. It looks like LLVM is able to determine that the Vec must be aligned to a 16-byte boundary and have an allocation that's a multiple of 16 bytes, and therefore that it's "safe" to read beyond the limits of v.len() for a Vec, but it's also aware that this is not true for an arbitrary slice.

Oddities around bounds checking is one reason why iterators tend to be recommended over indexing - you don't have a bounds check with iteration.

7 Likes

If you're going to loop over indices like that, then you should always use reslicing to make it clear up-front to LLVM that things are in-bounds. For more info, read

5 Likes

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.