I'm surprised - that doesn't optimise as nicely as I thought it would.
Given these types:
/// A deliberately large value type: a `u16` followed by a 512-byte array
/// stored inline in the struct.
pub struct Foo {
first: u16,
// Fixed-size array held by value (not behind a pointer), so it makes up
// almost all of the struct's size.
big_field: [u8; 512],
}
/// Element type of the vector that `take_last_foo` operates on.
pub enum Really {
/// Carries a `Thing` by value.
Long(Thing),
/// Carries an owned `String`.
Short(String),
}
/// Payload of `Really::Long`: either a large inline `Foo` or a byte vector.
pub enum Thing {
/// Carries a `Foo` by value.
Foo(Foo),
/// Carries an owned `Vec<u8>`.
Bar(Vec<u8>),
}
and this function:
/// Removes the last element of `items` and returns its `Foo` when that element
/// is `Really::Long(Thing::Foo(_))`.
///
/// Returns `None` when `items` is empty, or when the last element is any other
/// variant. In the latter case the popped element is pushed back, so `items`
/// ends up holding the same elements in the same order as before the call.
pub fn take_last_foo(items: &mut Vec<Really>) -> Option<Foo> {
match items.pop() {
// Empty vector: nothing to take.
None => None,
// The last element is the variant we are after: hand its payload to the caller.
Some(Really::Long(Thing::Foo(foo))) => Some(foo),
// Any other variant: restore the element we just popped and report no match.
Some(other) => {
items.push(other);
None
}
}
}
We can ask the playground for the generated assembly when using a nightly compiler in release mode.
# Compiler output for take_last_foo. Lines starting with '#' below are reviewer
# annotations and are not part of the generated listing.
playground::take_last_foo: # @playground::take_last_foo
# Prologue: save callee-saved registers and reserve a 520-byte stack buffer.
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 520
# Keep the incoming rdi in r14; every exit path stores a 16-bit value through
# it and returns it in rax (see .LBB3_11).
mov r14, rdi
# Load the qword at [rsi + 16] and go to .LBB3_10 if it is zero.
# (Presumably the Vec's length, i.e. the empty case; Vec layout is not shown here.)
mov rax, qword ptr [rsi + 16]
test rax, rax
je .LBB3_10
mov r12, rsi
# Write the decremented value back to [rsi + 16].
lea r15, [rax - 1]
mov qword ptr [rsi + 16], r15
mov rbx, qword ptr [rsi]
# rdx = 512 * (rax - 1) + 8 * rax - 8 = 520 * (rax - 1): byte offset of the
# last 520-byte element relative to the base pointer in rbx.
mov rcx, r15
shl rcx, 9
lea rdx, [rcx + 8*rax]
add rdx, -8
# Read the 16-bit value at the start of that element, plus the qwords at +8 and
# +16, and leave rsi pointing at element + 24 for the memcpy calls below.
movzx ecx, word ptr [rbx + rdx]
lea rax, [rbx + rdx + 2]
mov rbp, qword ptr [rbx + rdx + 8]
mov r13, qword ptr [rbx + rdx + 16]
lea rsi, [rbx + rdx]
add rsi, 24
# Dispatch on the 16-bit value: 0 -> .LBB3_5, 3 -> .LBB3_10, anything else
# falls through to the push-back path.
test ecx, ecx
je .LBB3_5
cmp ecx, 3
je .LBB3_10
# Push-back path: rebuild the element in the stack buffer at rsp. Bytes 0..24
# go through registers, the remaining 496 bytes through memcpy (24 + 496 = 520).
mov word ptr [rsp], cx
mov ecx, dword ptr [rax]
mov dword ptr [rsp + 2], ecx
movzx eax, word ptr [rax + 4]
mov word ptr [rsp + 6], ax
mov qword ptr [rsp + 8], rbp
mov qword ptr [rsp + 16], r13
lea rdi, [rsp + 24]
mov edx, 496
call qword ptr [rip + memcpy@GOTPCREL]
# If r15 equals the qword at [r12 + 8] (presumably the capacity), call
# reserve_for_push and reload the base pointer and [r12 + 16] afterwards.
cmp r15, qword ptr [r12 + 8]
jne .LBB3_9
mov rdi, r12
mov rsi, r15
call alloc::raw_vec::RawVec<T,A>::reserve_for_push
mov rbx, qword ptr [r12]
mov r15, qword ptr [r12 + 16]
.LBB3_9:
# Copy the 520-byte stack buffer to offset r15 * 520 from rbx, then store
# r15 + 1 back to [r12 + 16].
mov rax, r15
shl rax, 9
lea rdi, [rax + 8*r15]
add rdi, rbx
mov rsi, rsp
mov edx, 520
call qword ptr [rip + memcpy@GOTPCREL]
inc r15
mov qword ptr [r12 + 16], r15
.LBB3_10:
# eax = 0 becomes the 16-bit value stored through r14 at .LBB3_11.
xor eax, eax
jmp .LBB3_11
.LBB3_5:
# Path taken when the element's leading 16-bit value is 0: copy bytes 2..516 of
# the element to the same offsets at r14 (registers for 2..24, memcpy for the
# remaining 492 bytes), then store 1 through r14.
# NOTE(review): this path stores 1 where .LBB3_10 stores 0, so it is presumably
# the `Some(foo)` return; confirm against the enum layout.
movzx ecx, word ptr [rax + 4]
mov word ptr [r14 + 6], cx
mov eax, dword ptr [rax]
mov dword ptr [r14 + 2], eax
lea rdi, [r14 + 24]
mov edx, 492
call qword ptr [rip + memcpy@GOTPCREL]
mov qword ptr [r14 + 8], rbp
mov qword ptr [r14 + 16], r13
mov ax, 1
.LBB3_11:
# Common exit: store ax through r14, return r14 in rax, release the stack
# buffer and restore the saved registers.
mov word ptr [r14], ax
mov rax, r14
add rsp, 520
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
# Code after ret: drops the `Really` held in the stack buffer (rdi = rsp) and
# resumes unwinding. Presumably the cleanup path for a panic inside
# reserve_for_push.
mov rbx, rax
mov rdi, rsp
call core::ptr::drop_in_place<playground::Really>
mov rdi, rbx
call _Unwind_Resume@PLT
ud2
In particular, there's a call to `alloc::raw_vec::RawVec<T,A>::reserve_for_push()` that I'd expect to be optimised out, because we're guaranteed to have enough capacity for the `push()`. The `capacity` field never changed and was valid to begin with, but I'm guessing LLVM can't see that because it doesn't know what `capacity` was before `take_last_foo()` was called.
There's also a `memcpy()` of 496 bytes onto the stack which I don't understand, and another `memcpy()` of 492 bytes to the return location (`r14`, which contains the value from `rdi`) when returning `None`, but I'd assume you can skip that second big copy because the bytes are all uninit.