So I modified my code to look like this instead, which levels the playing field a bit. Obviously, most of the new cost for the "uuid references" test is that it now has to construct a Uuid:
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use uuid::Uuid;

pub fn criterion_benchmark(c: &mut Criterion) {
    c.bench_function("uuid references", |b| b.iter(|| {
        let uuid = Uuid::from_u128(0);
        by_ref(black_box(&uuid))
    }));
    c.bench_function("uuid values", |b| b.iter(|| {
        let uuid = Uuid::from_u128(0);
        by_val(black_box(uuid))
    }));
}

#[inline(never)]
fn by_ref(id: &Uuid) -> u128 {
    id.as_u128()
}

#[inline(never)]
fn by_val(id: Uuid) -> u128 {
    id.as_u128()
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
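(Side note: another way to level the field would be Criterion's iter_batched, which runs the setup closure outside the timed section so only the call itself is measured. The numbers below are not from this; it's just a sketch:)

use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};
use uuid::Uuid;

pub fn batched_benchmark(c: &mut Criterion) {
    // The Uuid is constructed in the setup closure, outside the measurement.
    c.bench_function("uuid references (batched)", |b| {
        b.iter_batched(
            || Uuid::from_u128(0),
            |uuid| by_ref(black_box(&uuid)),
            BatchSize::SmallInput,
        )
    });
    c.bench_function("uuid values (batched)", |b| {
        b.iter_batched(
            || Uuid::from_u128(0),
            |uuid| by_val(black_box(uuid)),
            BatchSize::SmallInput,
        )
    });
}

// Same functions as above.
#[inline(never)]
fn by_ref(id: &Uuid) -> u128 {
    id.as_u128()
}

#[inline(never)]
fn by_val(id: Uuid) -> u128 {
    id.as_u128()
}

criterion_group!(batched, batched_benchmark);
criterion_main!(batched);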
Results (I ran it multiple times); one representative run is here:
uuid references    time:   [7.6366 ns 7.6687 ns 7.7065 ns]
                   change: [-1.5775% -1.0179% -0.4569%] (p = 0.00 < 0.05)
                   Change within noise threshold.
Found 5 outliers among 100 measurements (5.00%)
  2 (2.00%) high mild
  3 (3.00%) high severe

uuid values        time:   [13.790 ns 13.840 ns 13.896 ns]
                   change: [-1.1393% -0.0666% +0.8301%] (p = 0.90 > 0.05)
                   No change in performance detected.
As you can see, "uuid references" is still winning by a good margin.

But your godbolt link made me curious, so I played around a bit and modified your code by adding two functions, main_by_ref and main_by_val, which call the respective by_ref and by_val functions: godbolt
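The callers look roughly like this (the exact source is behind the godbolt link, so treat this as a sketch):

use uuid::Uuid;

#[inline(never)]
pub fn by_ref(id: &Uuid) -> u128 {
    id.as_u128()
}

#[inline(never)]
pub fn by_val(id: Uuid) -> u128 {
    id.as_u128()
}

// Sketch of the added callers; the zeroed Uuid is an assumption based on
// the xorps/movaps zeroing visible in the generated assembly below.
pub fn main_by_ref() -> u128 {
    let uuid = Uuid::from_u128(0);
    by_ref(&uuid)
}

pub fn main_by_val() -> u128 {
    let uuid = Uuid::from_u128(0);
    by_val(uuid)
}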
main_by_ref generates this code:
example::main_by_ref:
sub rsp, 40
xorps xmm0, xmm0
movaps xmmword ptr [rsp + 16], xmm0
lea rax, [rsp + 16]
mov qword ptr [rsp + 8], rax
mov rdi, qword ptr [rsp + 8]
call qword ptr [rip + example::by_ref@GOTPCREL]
add rsp, 40
ret
But main_by_val generates this monstrosity:
example::main_by_val:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 56
xorps xmm0, xmm0
movaps xmmword ptr [rsp + 16], xmm0
movzx eax, byte ptr [rsp + 31]
mov byte ptr [rsp + 15], al
movzx edi, byte ptr [rsp + 30]
movzx r8d, byte ptr [rsp + 29]
movzx r9d, byte ptr [rsp + 28]
movzx r10d, byte ptr [rsp + 27]
movzx r11d, byte ptr [rsp + 26]
movzx ebp, byte ptr [rsp + 25]
movzx r14d, byte ptr [rsp + 24]
movzx r15d, byte ptr [rsp + 23]
movzx r12d, byte ptr [rsp + 22]
movzx r13d, byte ptr [rsp + 21]
movzx esi, byte ptr [rsp + 20]
movzx edx, byte ptr [rsp + 19]
movzx ebx, byte ptr [rsp + 18]
movzx eax, byte ptr [rsp + 16]
movzx ecx, byte ptr [rsp + 17]
mov byte ptr [rsp + 40], al
mov byte ptr [rsp + 41], cl
mov byte ptr [rsp + 42], bl
mov byte ptr [rsp + 43], dl
mov byte ptr [rsp + 44], sil
mov byte ptr [rsp + 45], r13b
mov byte ptr [rsp + 46], r12b
mov byte ptr [rsp + 47], r15b
mov byte ptr [rsp + 48], r14b
mov byte ptr [rsp + 49], bpl
mov byte ptr [rsp + 50], r11b
mov byte ptr [rsp + 51], r10b
mov byte ptr [rsp + 52], r9b
mov byte ptr [rsp + 53], r8b
mov byte ptr [rsp + 54], dil
movzx eax, byte ptr [rsp + 15]
mov byte ptr [rsp + 55], al
lea rdi, [rsp + 40]
call qword ptr [rip + example::by_val@GOTPCREL]
add rsp, 56
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
So you're right, @H2CO3: both the by_ref and by_val functions generate the same assembly, but the by_val caller generates a lot more! So we're kind of back at square one: is by_ref faster? Why?