Code version 1:
```rust
pub fn sum(v: &[usize], n: usize) -> usize {
    let mut ret = 0;
    for i in 0..n {
        ret += v[i];
    }
    ret
}
```
Output (compiled with `-O --target aarch64-unknown-linux-gnu`):
```asm
sum:
        str x30, [sp, #-16]!
        cbz x2, .LBB0_5
        mov x8, x0
        mov x0, xzr
        sub x9, x2, #1
.LBB0_2:
        cmp x9, x1
        b.hs .LBB0_6
        ldr x10, [x8], #8
        subs x2, x2, #1
        add x0, x10, x0
        b.ne .LBB0_2
        ldr x30, [sp], #16
        ret
.LBB0_5:
        mov x0, xzr
        ldr x30, [sp], #16
        ret
.LBB0_6:
        adrp x2, .L__unnamed_1
        add x2, x2, :lo12:.L__unnamed_1
        mov x0, x1
        bl core::panicking::panic_bounds_check::hb550301af3964b1f
```
Notice that there is one loop iteration for each element to be summed, and we do the bounds check on every iteration (`cmp x9, x1` is the bounds check).
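For context, the check exists because indexing with `v[i]` carries an implicit bounds check, roughly like this hand-written desugaring (illustrative only; the real expansion calls `core::panicking::panic_bounds_check` rather than `panic!`, and `sum_desugared` is just a name for the sketch):

```rust
// Illustrative desugaring of `ret += v[i]`: each indexing operation implies a
// bounds check that can panic. Not the actual compiler expansion.
pub fn sum_desugared(v: &[usize], n: usize) -> usize {
    let mut ret = 0;
    for i in 0..n {
        if i >= v.len() {
            // corresponds to the branch to panic_bounds_check in the assembly above
            panic!("index out of bounds: the len is {} but the index is {}", v.len(), i);
        }
        ret += v[i];
    }
    ret
}
```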
Now I change `v` to be of type `&Vec<usize>`; a sketch of the changed version and its output follow.
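That is, in sketch form (only the parameter type changes; the body stays the same, with indexing going through `Deref` to the underlying slice):

```rust
// Same body as before; only the parameter type differs.
// `v[i]` now goes through Deref<Target = [usize]> to reach the slice.
pub fn sum(v: &Vec<usize>, n: usize) -> usize {
    let mut ret = 0;
    for i in 0..n {
        ret += v[i];
    }
    ret
}
```

With that change, the output is as follows: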
```asm
sum:
        str x30, [sp, #-16]!
        cbz x1, .LBB0_3
        mov x8, x1
        ldp x10, x1, [x0, #8]
        sub x9, x8, #1
        cmp x1, x9
        csel x11, x1, x9, lo
        add x11, x11, #1
        cmp x11, #5
        b.hs .LBB0_4
        mov x0, xzr
        mov x11, xzr
        b .LBB0_7
.LBB0_3:
        mov x0, xzr
        ldr x30, [sp], #16
        ret
.LBB0_4:
        ands x12, x11, #0x3
        mov w13, #4
        movi v0.2d, #0000000000000000
        movi v1.2d, #0000000000000000
        csel x12, x13, x12, eq
        sub x11, x11, x12
        add x12, x10, #16
        mov x13, x11
.LBB0_5:
        ldp q2, q3, [x12, #-16]
        subs x13, x13, #4
        add x12, x12, #32
        add v0.2d, v2.2d, v0.2d
        add v1.2d, v3.2d, v1.2d
        b.ne .LBB0_5
        add v0.2d, v1.2d, v0.2d
        addp d0, v0.2d
        fmov x0, d0
.LBB0_7:
        cmp x1, x11
        b.eq .LBB0_10
        ldr x12, [x10, x11, lsl #3]
        add x11, x11, #1
        cmp x8, x11
        add x0, x12, x0
        b.ne .LBB0_7
        ldr x30, [sp], #16
        ret
.LBB0_10:
        cmp x1, x9
        adrp x2, .L__unnamed_1
        add x2, x2, :lo12:.L__unnamed_1
        csel x0, x1, x9, lo
        bl core::panicking::panic_bounds_check::hb550301af3964b1f
```
Now we only do one loop iteration per 4 elements (using the 128-bit `q` registers), and we don't do a bounds check in the hot loop, only in the non-unrolled tail part.
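Structurally, the second version behaves something like this Rust-level sketch (my rough illustration of the shape of the generated code, not something the compiler actually produces as source; `sum_shape` is just an illustrative name, and it ignores the `n` handling):

```rust
// Rust-level sketch of the vectorized structure: 4 elements per main-loop
// iteration (two u64s per 128-bit register, two registers per iteration),
// a horizontal reduction, and a scalar tail for the remainder.
fn sum_shape(data: &[usize]) -> usize {
    let mut acc = [0usize; 4];
    let chunks = data.chunks_exact(4);
    let tail = chunks.remainder();
    for c in chunks {
        // corresponds to the two `add v?.2d` instructions in .LBB0_5
        acc[0] += c[0];
        acc[1] += c[1];
        acc[2] += c[2];
        acc[3] += c[3];
    }
    // horizontal reduction, like `add v0.2d, v1.2d, v0.2d` + `addp d0, v0.2d`
    let mut total: usize = acc.iter().sum();
    // scalar tail, analogous to the loop at .LBB0_7
    for &x in tail {
        total += x;
    }
    total
}
```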
What accounts for this difference?