I was following this blog post on autovectorisation and decided to try it myself. I tweaked the 3rd example to this:
/// Two `f32` buffers, `l` and `r`, held as separate vectors
/// (struct-of-arrays layout; presumably the left and right channels).
pub struct StereoSample {
// Receives the source samples scaled by `gain_l` in `mix_mono_to_stereo`.
l: Vec<f32>,
// Receives the source samples scaled by `gain_r` in `mix_mono_to_stereo`.
r: Vec<f32>,
}
/// Newtype around a single `Vec<f32>` buffer.
/// `repr(transparent)` gives it the same layout as the wrapped `Vec<f32>`.
#[repr(transparent)]
pub struct MonoSample(Vec<f32>);
/// Writes `src` scaled by `gain_l` into `dst.l` and scaled by `gain_r` into
/// `dst.r`, element by element, for every index of `src`.
///
/// Elements of `dst.l` / `dst.r` beyond `src`'s length are left untouched.
///
/// # Panics
///
/// Panics on the first index that is out of bounds for `dst.l` or `dst.r`,
/// i.e. when either destination vector is shorter than `src`. Elements before
/// that index have already been written when the panic happens.
pub fn mix_mono_to_stereo(
dst: &mut StereoSample,
src: &MonoSample,
gain_l: f32,
gain_r: f32,
) {
// The trip count comes from `src` only; each `dst` access is bounds-checked
// separately on every iteration.
for i in 0..src.0.len() {
dst.l[i] = src.0[i] * gain_l;
dst.r[i] = src.0[i] * gain_r;
}
}
to make it a struct of arrays. Rust 1.61.0
generated this code:
example::mix_mono_to_stereo:  # in: rdi = dst, rsi = src, xmm0 = gain_l, xmm1 = gain_r; SIMD main loop plus bounds-checked scalar tail
push rbp  # save the callee-saved registers that are overwritten below
push r15
push r14
push r12
push rbx
mov r15, qword ptr [rsi + 16]  # r15 = src.0.len(), the loop trip count
test r15, r15
je .LBB0_14  # empty source: nothing to do
mov rax, rdi  # keep dst in rax because rdi is reused just below
mov r14, qword ptr [rsi]  # r14 = src data pointer
mov r9, qword ptr [rdi]  # r9 = dst.l data pointer
mov rdi, qword ptr [rdi + 16]  # rdi = dst.l length
mov r8, qword ptr [rax + 40]  # r8 = dst.r length
mov r10, qword ptr [rax + 24]  # r10 = dst.r data pointer
cmp r8, rdi
mov rsi, rdi
cmovb rsi, r8  # rsi = min(dst.l length, dst.r length)
lea rax, [r15 - 1]  # rax = src length - 1
cmp rsi, rax
cmovae rsi, rax  # rsi = min(rsi, src length - 1)
add rsi, 1  # rsi = iterations that will start, counting one that would fail a bounds check
cmp rsi, 4
ja .LBB0_6  # more than 4 iterations: consider the SIMD path
xor r11d, r11d  # r11 = 0: the scalar loop starts at element 0
jmp .LBB0_3
.LBB0_6:  # runtime alias checks between the three buffers
cmp r8, rdi
mov rcx, rdi
cmovb rcx, r8
cmp rcx, rax
cmovae rcx, rax  # rcx = min(dst.l length, dst.r length, src length - 1), same value as before the +1
lea rdx, [r9 + 4*rcx]
add rdx, 4  # rdx = end of the dst.l range (one past its last touched f32)
lea rbx, [r10 + 4*rcx]
add rbx, 4  # rbx = end of the dst.r range
lea rcx, [r14 + 4*rcx]
add rcx, 4  # rcx = end of the src range
cmp r9, rbx
setb bpl  # bpl = dst.l start < dst.r end
cmp r10, rdx
setb r12b  # r12b = dst.r start < dst.l end
cmp r9, rcx
setb al  # al = dst.l start < src end
cmp r14, rdx
setb dl  # dl = src start < dst.l end
cmp r10, rcx
setb cl  # cl = dst.r start < src end
cmp r14, rbx
setb bl  # bl = src start < dst.r end
xor r11d, r11d  # r11 = 0 in case one of the checks below falls back to the scalar loop
test bpl, r12b
jne .LBB0_3  # dst.l and dst.r ranges overlap: scalar only
and al, dl
jne .LBB0_3  # dst.l and src ranges overlap: scalar only
and cl, bl
jne .LBB0_3  # dst.r and src ranges overlap: scalar only
mov eax, esi
and eax, 3  # eax = rsi mod 4
test rax, rax
mov ecx, 4
cmovne rcx, rax  # rcx = remainder if non-zero, else 4: the scalar tail always gets 1..4 iterations
sub rsi, rcx  # rsi = element count for the SIMD loop (a multiple of 4)
movaps xmm2, xmm0
shufps xmm2, xmm0, 0  # xmm2 = gain_l copied into all four lanes
movaps xmm3, xmm1
shufps xmm3, xmm1, 0  # xmm3 = gain_r copied into all four lanes
xor eax, eax  # rax = element index, starting at 0
.LBB0_10:  # SIMD loop: four f32 per iteration, no bounds checks inside
movups xmm4, xmmword ptr [r14 + 4*rax]  # load src[rax..rax+4] (unaligned load)
mulps xmm4, xmm2
movups xmmword ptr [r9 + 4*rax], xmm4  # dst.l[rax..rax+4] = src * gain_l
movups xmm4, xmmword ptr [r14 + 4*rax]  # the same four src values are loaded again
mulps xmm4, xmm3
movups xmmword ptr [r10 + 4*rax], xmm4  # dst.r[rax..rax+4] = src * gain_r
add rax, 4
cmp rsi, rax
jne .LBB0_10
mov r11, rsi  # r11 = elements already handled; the scalar tail resumes there
.LBB0_3:  # scalar loop setup; r11 = first element still to process
mov r12, r8
sub r12, r11  # r12 = dst.r length - r11
mov rax, rdi
sub rax, r11  # rax = dst.l length - r11
sub r15, r11  # r15 = src length - r11 = scalar iterations remaining
lea rdx, [r10 + 4*r11]  # rdx = address of dst.r[r11]
lea rbx, [r9 + 4*r11]  # rbx = address of dst.l[r11]
lea rcx, [r14 + 4*r11]  # rcx = address of src[r11]
xor esi, esi  # rsi = offset from r11, starting at 0
.LBB0_4:  # scalar loop: one element per iteration, with both bounds checks
cmp rax, rsi
je .LBB0_5  # index reached dst.l length: bounds-check failure
movss xmm2, dword ptr [rcx + 4*rsi]
mulss xmm2, xmm0
movss dword ptr [rbx + 4*rsi], xmm2  # dst.l[i] = src[i] * gain_l
cmp r12, rsi
je .LBB0_15  # index reached dst.r length: bounds-check failure
movss xmm2, dword ptr [rcx + 4*rsi]
mulss xmm2, xmm1
movss dword ptr [rdx + 4*rsi], xmm2  # dst.r[i] = src[i] * gain_r
add rsi, 1
cmp r15, rsi
jne .LBB0_4
.LBB0_14:  # normal exit: restore saved registers in reverse order
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
.LBB0_5:  # panic path for the dst.l index
lea rdx, [rip + .L__unnamed_1]  # rdx = address of the location record for this check
mov rsi, rdi  # rdi and rsi both hold dst.l length here (the failing index equals the length)
call qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
ud2  # traps if execution ever continues past the call
.LBB0_15:  # panic path for the dst.r index
lea rdx, [rip + .L__unnamed_2]  # rdx = address of the location record for this check
mov rdi, r8  # rdi and rsi both receive dst.r length (the failing index equals the length)
mov rsi, r8
call qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
ud2  # traps if execution ever continues past the call
.L__unnamed_3:  # source path bytes, 15 characters, no terminator
.ascii "/app/example.rs"
.L__unnamed_1:  # record passed in rdx on the .LBB0_5 panic path
.quad .L__unnamed_3  # pointer to the path bytes
.asciz "\017\000\000\000\000\000\000\000\013\000\000\000\t\000\000"  # 8-byte 15 (path length), then 4-byte 11 and 4-byte 9 (presumably line and column); .asciz's NUL is the final byte
.L__unnamed_2:  # record passed in rdx on the .LBB0_15 panic path
.quad .L__unnamed_3  # pointer to the path bytes
.asciz "\017\000\000\000\000\000\000\000\f\000\000\000\t\000\000"  # same layout with 12 in place of 11 (presumably the next source line)
That is a whopping 132 lines, whereas Rust 1.45.2
generated this code:
example::mix_mono_to_stereo:  # in: rdi = dst, rsi = src, xmm0 = gain_l, xmm1 = gain_r; scalar loop only
push rax  # moves rsp down by 8; presumably for 16-byte stack alignment at the panic call
mov rcx, qword ptr [rsi + 16]  # rcx = src.0.len(), the loop trip count
test rcx, rcx
je .LBB0_8  # empty source: nothing to do
mov rdx, qword ptr [rsi]  # rdx = src data pointer
xor eax, eax  # rax = i = 0
.LBB0_2:  # one element per iteration; dst fields are re-read from memory every time
mov rsi, qword ptr [rdi + 16]  # rsi = dst.l length
cmp rsi, rax
jbe .LBB0_3  # length <= i: bounds-check failure for dst.l
mov rsi, qword ptr [rdi]  # rsi = dst.l data pointer
movss xmm2, dword ptr [rdx + 4*rax]
mulss xmm2, xmm0
movss dword ptr [rsi + 4*rax], xmm2  # dst.l[i] = src[i] * gain_l
mov rsi, qword ptr [rdi + 40]  # rsi = dst.r length
cmp rsi, rax
jbe .LBB0_6  # length <= i: bounds-check failure for dst.r
mov rsi, qword ptr [rdi + 24]  # rsi = dst.r data pointer
movss xmm2, dword ptr [rdx + 4*rax]
mulss xmm2, xmm1
movss dword ptr [rsi + 4*rax], xmm2  # dst.r[i] = src[i] * gain_r
lea rsi, [rax + 1]
mov rax, rsi  # i += 1
cmp rcx, rsi
jne .LBB0_2
.LBB0_8:  # normal exit
pop rax  # undo the push from the entry
ret
.LBB0_3:  # dst.l check failed; rsi still holds dst.l length
lea rdx, [rip + .L__unnamed_1]  # rdx = address of the location record for this check
jmp .LBB0_4
.LBB0_6:  # dst.r check failed; rsi still holds dst.r length
lea rdx, [rip + .L__unnamed_2]  # rdx = address of the location record for this check
.LBB0_4:  # shared tail of both panic paths
mov rdi, rax  # rdi = the failing index i
call qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
ud2  # traps if execution ever continues past the call
.L__unnamed_3:  # source path bytes, 15 characters, no terminator
.ascii "/app/example.rs"
.L__unnamed_1:  # record passed in rdx on the .LBB0_3 panic path
.quad .L__unnamed_3  # pointer to the path bytes
.asciz "\017\000\000\000\000\000\000\000\013\000\000\000\t\000\000"  # 8-byte 15 (path length), then 4-byte 11 and 4-byte 9 (presumably line and column); .asciz's NUL is the final byte
.L__unnamed_2:  # record passed in rdx on the .LBB0_6 panic path
.quad .L__unnamed_3  # pointer to the path bytes
.asciz "\017\000\000\000\000\000\000\000\f\000\000\000\t\000\000"  # same layout with 12 in place of 11 (presumably the next source line)
That is only 49 lines.
I don't know much about assembly, so why does this happen, and which version is faster? Thanks!