I have now had a look at the code for a test program, and I don't think large structs/enums are being moved around [edit: hmm, I changed my mind, see the reply ].
I think I might be able to help the compiler avoid some calls in the inner loop by re-structuring my code slightly, in particular the push_tree function which has a simple case for a "small" (single level) Btree, so it could test for that inline and only do the call if the BTree is not single level. I think a call is a relatively expensive operation. All that said, I think the performance is actually quite ok for practical purposes.
My test program, which runs in about 2 seconds, which I think is 20ns for the loop which is executed 100 million times.
use pstd::collections::BTreeSet;
fn main() {
let set = BTreeSet::from([2, 4, 6]);
let mut x = 1234;
for _rep in 0..100_000_000 {
for e in &set { x += e; }
}
println!("x={}", x);
}
The assembler for the main program:
.section .text.btest::main,"ax",@progbits
.hidden btest::main
.globl btest::main
.p2align 4
.type btest::main,@function
btest::main:
.cfi_startproc
.cfi_personality 155, DW.ref.rust_eh_personality
.cfi_lsda 27, .Lexception3
push rbp
.cfi_def_cfa_offset 16
push r15
.cfi_def_cfa_offset 24
push r14
.cfi_def_cfa_offset 32
push r13
.cfi_def_cfa_offset 40
push r12
.cfi_def_cfa_offset 48
push rbx
.cfi_def_cfa_offset 56
sub rsp, 1784
.cfi_def_cfa_offset 1840
.cfi_offset rbx, -56
.cfi_offset r12, -48
.cfi_offset r13, -40
.cfi_offset r14, -32
.cfi_offset r15, -24
.cfi_offset rbp, -16
mov qword ptr [rsp + 920], 0
lea r12, [rsp + 928]
mov qword ptr [rsp + 928], 1
mov dword ptr [rsp + 936], 0
lea r15, [rsp + 944]
mov word ptr [rsp + 948], 0
mov dword ptr [rsp + 944], 1048640
mov byte ptr [rsp + 88], 1
mov qword ptr [rsp + 48], 0
mov qword ptr [rsp + 80], r15
lea rdx, [rsp + 48]
mov rdi, r12
mov esi, 2
call <pstd::collections::btree_map::Tree<i32, ()>>::insert::<pstd::collections::btree_map::CustomTuning>
cmp dword ptr [rsp + 48], 1
jne .LBB10_30
mov ebx, dword ptr [rsp + 56]
mov rbp, qword ptr [rsp + 64]
mov r13, qword ptr [rsp + 72]
call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL]
mov edi, 32
mov esi, 8
call qword ptr [rip + __rustc::__rust_alloc@GOTPCREL]
test rax, rax
je .LBB10_3
mov r14, rax
mov qword ptr [rax], 1
mov dword ptr [rax + 8], 0
mov qword ptr [rax + 16], 8
mov dword ptr [rax + 24], 0
mov esi, 1
mov rdi, rax
mov rdx, r15
call <pstd::collections::btree_map::vecs::PairVec<i32, ()>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
movzx eax, word ptr [r14 + 8]
cmp ax, word ptr [r14 + 10]
jae .LBB10_13
mov rdi, r14
add rdi, 16
mov rcx, qword ptr [r14]
mov dword ptr [rcx + 4*rax], ebx
inc eax
mov word ptr [r14 + 8], ax
mov esi, 2
mov rdx, r15
call <pstd::collections::btree_map::vecs::ShortVec<pstd::collections::btree_map::Tree<i32, ()>>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
mov rdi, r12
mov r12, rbp
mov rbp, r13
mov rbx, qword ptr [rsp + 928]
mov r13, qword ptr [rsp + 936]
mov qword ptr [rsp + 928], 1
mov dword ptr [rsp + 936], 0
movzx eax, word ptr [r14 + 24]
movzx edx, word ptr [r14 + 26]
cmp ax, dx
jae .LBB10_16
mov rcx, qword ptr [r14 + 16]
mov esi, eax
shl esi, 4
mov qword ptr [rcx + rsi], rbx
mov qword ptr [rcx + rsi + 8], r13
lea rsi, [rax + 1]
mov word ptr [r14 + 24], si
cmp si, dx
jae .LBB10_21
shl esi, 4
mov qword ptr [rcx + rsi], r12
mov qword ptr [rcx + rsi + 8], rbp
add eax, 2
mov word ptr [r14 + 24], ax
mov qword ptr [rsp + 928], 0
mov qword ptr [rsp + 936], r14
mov r12, rdi
.LBB10_30:
cmp byte ptr [rsp + 88], 0
jne .LBB10_32
inc qword ptr [rsp + 920]
.LBB10_32:
mov byte ptr [rsp + 88], 1
mov qword ptr [rsp + 48], 0
mov qword ptr [rsp + 80], r15
lea rdx, [rsp + 48]
mov rdi, r12
mov esi, 4
call <pstd::collections::btree_map::Tree<i32, ()>>::insert::<pstd::collections::btree_map::CustomTuning>
cmp dword ptr [rsp + 48], 1
jne .LBB10_41
mov ebx, dword ptr [rsp + 56]
mov rbp, qword ptr [rsp + 64]
mov r13, qword ptr [rsp + 72]
call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL]
mov edi, 32
mov esi, 8
call qword ptr [rip + __rustc::__rust_alloc@GOTPCREL]
test rax, rax
je .LBB10_3
mov r14, rax
mov qword ptr [rax], 1
mov dword ptr [rax + 8], 0
mov qword ptr [rax + 16], 8
mov dword ptr [rax + 24], 0
mov esi, 1
mov rdi, rax
mov rdx, r15
call <pstd::collections::btree_map::vecs::PairVec<i32, ()>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
movzx eax, word ptr [r14 + 8]
cmp ax, word ptr [r14 + 10]
jae .LBB10_13
mov rdi, r14
add rdi, 16
mov rcx, qword ptr [r14]
mov dword ptr [rcx + 4*rax], ebx
inc eax
mov word ptr [r14 + 8], ax
mov esi, 2
mov rdx, r15
call <pstd::collections::btree_map::vecs::ShortVec<pstd::collections::btree_map::Tree<i32, ()>>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
mov rdi, r12
mov r12, rbp
mov rbp, r13
mov rbx, qword ptr [rsp + 928]
mov r13, qword ptr [rsp + 936]
mov qword ptr [rsp + 928], 1
mov dword ptr [rsp + 936], 0
movzx eax, word ptr [r14 + 24]
movzx edx, word ptr [r14 + 26]
cmp ax, dx
jae .LBB10_16
mov rcx, qword ptr [r14 + 16]
mov esi, eax
shl esi, 4
mov qword ptr [rcx + rsi], rbx
mov qword ptr [rcx + rsi + 8], r13
lea rsi, [rax + 1]
mov word ptr [r14 + 24], si
cmp si, dx
jae .LBB10_21
shl esi, 4
mov qword ptr [rcx + rsi], r12
mov qword ptr [rcx + rsi + 8], rbp
add eax, 2
mov word ptr [r14 + 24], ax
mov qword ptr [rsp + 928], 0
mov qword ptr [rsp + 936], r14
mov r12, rdi
.LBB10_41:
cmp byte ptr [rsp + 88], 0
jne .LBB10_43
inc qword ptr [rsp + 920]
.LBB10_43:
mov byte ptr [rsp + 88], 1
mov qword ptr [rsp + 48], 0
mov qword ptr [rsp + 80], r15
lea rdx, [rsp + 48]
mov rdi, r12
mov esi, 6
call <pstd::collections::btree_map::Tree<i32, ()>>::insert::<pstd::collections::btree_map::CustomTuning>
cmp dword ptr [rsp + 48], 1
jne .LBB10_52
mov ebx, dword ptr [rsp + 56]
mov rbp, qword ptr [rsp + 64]
mov r13, qword ptr [rsp + 72]
call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL]
mov edi, 32
mov esi, 8
call qword ptr [rip + __rustc::__rust_alloc@GOTPCREL]
test rax, rax
je .LBB10_3
mov r14, rax
mov qword ptr [rax], 1
mov dword ptr [rax + 8], 0
mov qword ptr [rax + 16], 8
mov dword ptr [rax + 24], 0
mov esi, 1
mov rdi, rax
mov rdx, r15
call <pstd::collections::btree_map::vecs::PairVec<i32, ()>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
movzx eax, word ptr [r14 + 8]
cmp ax, word ptr [r14 + 10]
jae .LBB10_13
mov rdi, r14
add rdi, 16
mov rcx, qword ptr [r14]
mov dword ptr [rcx + 4*rax], ebx
inc eax
mov word ptr [r14 + 8], ax
mov esi, 2
mov rdx, r15
call <pstd::collections::btree_map::vecs::ShortVec<pstd::collections::btree_map::Tree<i32, ()>>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
mov r12, rbp
mov rbp, r13
mov rbx, qword ptr [rsp + 928]
mov r13, qword ptr [rsp + 936]
mov qword ptr [rsp + 928], 1
mov dword ptr [rsp + 936], 0
movzx eax, word ptr [r14 + 24]
movzx edx, word ptr [r14 + 26]
cmp ax, dx
jae .LBB10_16
mov rcx, qword ptr [r14 + 16]
mov esi, eax
shl esi, 4
mov qword ptr [rcx + rsi], rbx
mov qword ptr [rcx + rsi + 8], r13
lea rsi, [rax + 1]
mov word ptr [r14 + 24], si
cmp si, dx
jae .LBB10_21
shl esi, 4
mov qword ptr [rcx + rsi], r12
mov qword ptr [rcx + rsi + 8], rbp
add eax, 2
mov word ptr [r14 + 24], ax
mov qword ptr [rsp + 928], 0
mov qword ptr [rsp + 936], r14
.LBB10_52:
cmp byte ptr [rsp + 88], 0
jne .LBB10_54
inc qword ptr [rsp + 920]
.LBB10_54:
movups xmm0, xmmword ptr [rsp + 920]
movups xmm1, xmmword ptr [rsp + 936]
movaps xmmword ptr [rsp + 16], xmm0
movaps xmmword ptr [rsp + 32], xmm1
mov dword ptr [rsp + 12], 1234
mov r12, qword ptr [rsp + 16]
test r12, r12
je .LBB10_62
xor ebp, ebp
mov r13d, 1234
lea r14, [rsp + 48]
lea r15, [rsp + 920]
mov rbx, qword ptr [rip + memcpy@GOTPCREL]
jmp .LBB10_56
.p2align 4
.LBB10_81:
cmp ebp, 100000000
je .LBB10_65
.LBB10_56:
mov qword ptr [rsp + 496], 0
mov qword ptr [rsp + 904], 0
xorps xmm0, xmm0
movaps xmmword ptr [rsp + 48], xmm0
movaps xmmword ptr [rsp + 64], xmm0
movaps xmmword ptr [rsp + 80], xmm0
mov rdi, r14
lea rsi, [rsp + 24]
mov edx, 1
call <pstd::collections::btree_map::Range<i32, ()>>::push_tree
inc ebp
mov edx, 864
mov rdi, r15
mov rsi, r14
call rbx
mov edx, 864
mov rdi, r14
mov rsi, r15
call rbx
mov rax, r12
.p2align 4
.LBB10_58:
dec rax
mov qword ptr [rsp + 912], rax
mov rcx, qword ptr [rsp + 56]
cmp rcx, qword ptr [rsp + 64]
je .LBB10_77
mov rax, qword ptr [rsp + 48]
movzx edx, word ptr [rax + 8]
cmp rcx, rdx
jae .LBB10_60
mov rax, qword ptr [rax]
lea rax, [rax + 4*rcx]
inc rcx
mov qword ptr [rsp + 56], rcx
.LBB10_79:
test rax, rax
je .LBB10_81
add r13d, dword ptr [rax]
mov dword ptr [rsp + 12], r13d
mov rax, qword ptr [rsp + 912]
test rax, rax
jne .LBB10_58
jmp .LBB10_81
.LBB10_77:
mov rdi, r14
call <pstd::collections::btree_map::Range<i32, ()>>::next_cold
jmp .LBB10_79
.LBB10_62:
mov ebx, 100000000
lea r14, [rsp + 48]
.p2align 4
.LBB10_63:
mov qword ptr [rsp + 496], 0
mov qword ptr [rsp + 904], 0
xorps xmm0, xmm0
movaps xmmword ptr [rsp + 48], xmm0
movaps xmmword ptr [rsp + 64], xmm0
movaps xmmword ptr [rsp + 80], xmm0
mov rdi, r14
lea rsi, [rsp + 24]
mov edx, 1
call <pstd::collections::btree_map::Range<i32, ()>>::push_tree
dec ebx
jne .LBB10_63
.LBB10_65:
lea rax, [rsp + 12]
mov qword ptr [rsp + 48], rax
mov rax, qword ptr [rip + <i32 as core::fmt::Display>::fmt@GOTPCREL]
mov qword ptr [rsp + 56], rax
lea rdi, [rip + .Lanon.178299d487561156df92f126da52896f.31]
lea rsi, [rsp + 48]
call qword ptr [rip + std::io::stdio::_print@GOTPCREL]
lea rsi, [rsp + 40]
lea rdi, [rsp + 24]
call <pstd::collections::btree_map::Tree<i32, ()>>::dealloc::<pstd::collections::btree_map::CustomTuning>
cmp qword ptr [rsp + 24], 0
jne .LBB10_69
mov rdi, qword ptr [rsp + 32]
mov esi, 32
mov edx, 8
call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
.LBB10_69:
add rsp, 1784
.cfi_def_cfa_offset 56
pop rbx
.cfi_def_cfa_offset 48
pop r12
.cfi_def_cfa_offset 40
pop r13
.cfi_def_cfa_offset 32
pop r14
.cfi_def_cfa_offset 24
pop r15
.cfi_def_cfa_offset 16
pop rbp
.cfi_def_cfa_offset 8
ret
.LBB10_60:
.cfi_def_cfa_offset 1840
lea rdi, [rip + .Lanon.178299d487561156df92f126da52896f.26]
lea rdx, [rip + .Lanon.178299d487561156df92f126da52896f.37]
mov esi, 32
call qword ptr [rip + core::panicking::panic@GOTPCREL]
jmp .LBB10_4
.LBB10_3:
mov edi, 8
mov esi, 32
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
jmp .LBB10_4
.LBB10_13:
lea rdi, [rip + .Lanon.178299d487561156df92f126da52896f.39]
lea rdx, [rip + .Lanon.178299d487561156df92f126da52896f.40]
mov esi, 39
call qword ptr [rip + core::panicking::panic@GOTPCREL]
jmp .LBB10_4
.LBB10_16:
lea rdi, [rip + .Lanon.178299d487561156df92f126da52896f.39]
lea rdx, [rip + .Lanon.178299d487561156df92f126da52896f.43]
mov esi, 39
call qword ptr [rip + core::panicking::panic@GOTPCREL]
jmp .LBB10_4
.LBB10_21:
lea rdi, [rip + .Lanon.178299d487561156df92f126da52896f.39]
lea rdx, [rip + .Lanon.178299d487561156df92f126da52896f.43]
mov esi, 39
call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB10_4:
ud2
jmp .LBB10_73
mov r15, rax
cmp qword ptr [rsp + 24], 0
jne .LBB10_74
mov rdi, qword ptr [rsp + 32]
mov esi, 32
mov edx, 8
call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
mov rdi, r15
call _Unwind_Resume@PLT
jmp .LBB10_25
mov r15, rax
cmp qword ptr [rsp + 48], 0
je .LBB10_28
cmp qword ptr [rsp + 64], 0
jne .LBB10_28
mov r14, qword ptr [rsp + 72]
jmp .LBB10_8
jmp .LBB10_73
jmp .LBB10_73
mov r15, rax
test r12, r12
jne .LBB10_8
mov esi, 32
mov edx, 8
mov rdi, rbp
call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
jmp .LBB10_8
mov r15, rax
test rbx, rbx
jne .LBB10_18
mov esi, 32
mov edx, 8
mov rdi, r13
call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
.LBB10_18:
mov r13, rbp
mov rbp, r12
jmp .LBB10_26
.LBB10_25:
mov r15, rax
.LBB10_26:
mov esi, 32
mov edx, 8
mov rdi, r14
call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
jmp .LBB10_27
mov r15, rax
.LBB10_27:
mov r14, r13
test rbp, rbp
jne .LBB10_28
.LBB10_8:
mov esi, 32
mov edx, 8
mov rdi, r14
call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
.LBB10_28:
lea rdi, [rsp + 920]
call core::ptr::drop_in_place::<pstd::collections::btree_set::BTreeSet<i32>>
jmp .LBB10_74
call qword ptr [rip + core::panicking::panic_in_cleanup@GOTPCREL]
.LBB10_73:
mov r15, rax
lea rdi, [rsp + 16]
call core::ptr::drop_in_place::<pstd::collections::btree_set::BTreeSet<i32>>
.LBB10_74:
mov rdi, r15
call _Unwind_Resume@PLT
call qword ptr [rip + core::panicking::panic_in_cleanup@GOTPCREL]