Is there a good reason for MaybeUninit::uninit to zero memory?

I believe that MaybeUninit::uninit currently zeroes memory rather than doing nothing. I was wondering if there is a good reason for this, or whether it is simply that the “do nothing” optimisation has not yet been implemented.

I cannot think of a good reason, but I could very well be missing something. Any ideas?

Well, your belief may just be wrong?

use std::mem::MaybeUninit;

/// Overwrites `*x` with an initialized `MaybeUninit` holding the byte `0`.
///
/// Exported unmangled so it is easy to find in the compiler output; the
/// assembly quoted below shows it as a single one-byte store of zero.
#[unsafe(no_mangle)]
fn test1(x: &mut MaybeUninit<u8>) {
    *x = MaybeUninit::new(0);
}

/// Overwrites `*x` with `MaybeUninit::uninit()`.
///
/// Counterpart to `test1`: in the assembly quoted below this function is
/// just a `retq`, i.e. no store (and no zeroing) is emitted for it.
#[unsafe(no_mangle)]
fn test2(x: &mut MaybeUninit<u8>) {
    *x = MaybeUninit::uninit();
}

(playground)

test1:
	movb	$0, (%rdi)
	retq

test2:
	retq

Given this observation, I’ll just skip all your subsequent questions in the original post.


If you have some evidence that MaybeUninit::uninit does “zero memory” in certain situations (obviously the above example seems to be not one of them), you should share your code for such a concrete situation :wink:

2 Likes

I will try to provide a simple example, but I encountered it in my own code.

Edit: Well, I have not been able to make a simple example yet, but at least the answer so far seems to be that it doesn’t need to zero the memory (which is what I originally expected).

Ok, here is a non-simple example, the program:


use pstd::collections::BTreeSet;

/// Reproduction program: repeatedly iterates a small `BTreeSet` (the one
/// imported from `pstd::collections`, not `std`) and sums its elements.
fn main() {
    // Three-element set; the element values only feed the running sum.
    let set = BTreeSet::from([2, 4, 6]);
    let mut x = 1234;
    // A fresh iterator is created by `set.iter()` on every outer repetition.
    // In the assembly quoted below, the head of this loop contains a
    // 704-byte `memset` followed by a call to `Range::push_tree`.
    for _rep in 0..100_000_000 {
      for e in set.iter() { x += e; } 
    }
    // Printing the sum keeps the loops from being discarded as unused work.
    println!("x={}", x);
}
```

The assembler for main:

.section .text.btest::main,"ax",@progbits
	.hidden	btest::main
	.globl	btest::main
	.p2align	4
.type	btest::main,@function
btest::main:
	.cfi_startproc
	.cfi_personality 155, DW.ref.rust_eh_personality
	.cfi_lsda 27, .Lexception3
	push rbp
	.cfi_def_cfa_offset 16
	push r15
	.cfi_def_cfa_offset 24
	push r14
	.cfi_def_cfa_offset 32
	push r13
	.cfi_def_cfa_offset 40
	push r12
	.cfi_def_cfa_offset 48
	push rbx
	.cfi_def_cfa_offset 56
	sub rsp, 1480
	.cfi_def_cfa_offset 1536
	.cfi_offset rbx, -56
	.cfi_offset r12, -48
	.cfi_offset r13, -40
	.cfi_offset r14, -32
	.cfi_offset r15, -24
	.cfi_offset rbp, -16
	mov qword ptr [rsp + 56], 0
	lea r12, [rsp + 64]
	mov qword ptr [rsp + 64], 1
	mov dword ptr [rsp + 72], 0
	lea r15, [rsp + 80]
	mov word ptr [rsp + 84], 0
	mov dword ptr [rsp + 80], 1048640
	mov byte ptr [rsp + 808], 1
	mov qword ptr [rsp + 768], 0
	mov qword ptr [rsp + 800], r15
	lea rdx, [rsp + 768]
	mov rdi, r12
	mov esi, 2
	call <pstd::collections::btree_map::Tree<i32, ()>>::insert::<pstd::collections::btree_map::CustomTuning>
	cmp dword ptr [rsp + 768], 1
	jne .LBB10_30
	mov ebx, dword ptr [rsp + 776]
	mov rbp, qword ptr [rsp + 784]
	mov r13, qword ptr [rsp + 792]
	call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL]
	mov edi, 32
	mov esi, 8
	call qword ptr [rip + __rustc::__rust_alloc@GOTPCREL]
	test rax, rax
	je .LBB10_3
	mov r14, rax
	mov qword ptr [rax], 1
	mov dword ptr [rax + 8], 0
	mov qword ptr [rax + 16], 8
	mov dword ptr [rax + 24], 0
	mov esi, 1
	mov rdi, rax
	mov rdx, r15
	call <pstd::collections::btree_map::vecs::PairVec<i32, ()>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
	movzx eax, word ptr [r14 + 8]
	cmp ax, word ptr [r14 + 10]
	jae .LBB10_13
	mov rdi, r14
	add rdi, 16
	mov rcx, qword ptr [r14]
	mov dword ptr [rcx + 4*rax], ebx
	inc eax
	mov word ptr [r14 + 8], ax
	mov esi, 2
	mov rdx, r15
	call <pstd::collections::btree_map::vecs::ShortVec<pstd::collections::btree_map::Tree<i32, ()>>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
	mov rdi, r12
	mov r12, rbp
	mov rbp, r13
	mov rbx, qword ptr [rsp + 64]
	mov r13, qword ptr [rsp + 72]
	mov qword ptr [rsp + 64], 1
	mov dword ptr [rsp + 72], 0
	movzx eax, word ptr [r14 + 24]
	movzx edx, word ptr [r14 + 26]
	cmp ax, dx
	jae .LBB10_16
	mov rcx, qword ptr [r14 + 16]
	mov esi, eax
	shl esi, 4
	mov qword ptr [rcx + rsi], rbx
	mov qword ptr [rcx + rsi + 8], r13
	lea rsi, [rax + 1]
	mov word ptr [r14 + 24], si
	cmp si, dx
	jae .LBB10_21
	shl esi, 4
	mov qword ptr [rcx + rsi], r12
	mov qword ptr [rcx + rsi + 8], rbp
	add eax, 2
	mov word ptr [r14 + 24], ax
	mov qword ptr [rsp + 64], 0
	mov qword ptr [rsp + 72], r14
	mov r12, rdi
.LBB10_30:
	cmp byte ptr [rsp + 808], 0
	jne .LBB10_32
	inc qword ptr [rsp + 56]
.LBB10_32:
	mov byte ptr [rsp + 808], 1
	mov qword ptr [rsp + 768], 0
	mov qword ptr [rsp + 800], r15
	lea rdx, [rsp + 768]
	mov rdi, r12
	mov esi, 4
	call <pstd::collections::btree_map::Tree<i32, ()>>::insert::<pstd::collections::btree_map::CustomTuning>
	cmp dword ptr [rsp + 768], 1
	jne .LBB10_41
	mov ebx, dword ptr [rsp + 776]
	mov rbp, qword ptr [rsp + 784]
	mov r13, qword ptr [rsp + 792]
	call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL]
	mov edi, 32
	mov esi, 8
	call qword ptr [rip + __rustc::__rust_alloc@GOTPCREL]
	test rax, rax
	je .LBB10_3
	mov r14, rax
	mov qword ptr [rax], 1
	mov dword ptr [rax + 8], 0
	mov qword ptr [rax + 16], 8
	mov dword ptr [rax + 24], 0
	mov esi, 1
	mov rdi, rax
	mov rdx, r15
	call <pstd::collections::btree_map::vecs::PairVec<i32, ()>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
	movzx eax, word ptr [r14 + 8]
	cmp ax, word ptr [r14 + 10]
	jae .LBB10_13
	mov rdi, r14
	add rdi, 16
	mov rcx, qword ptr [r14]
	mov dword ptr [rcx + 4*rax], ebx
	inc eax
	mov word ptr [r14 + 8], ax
	mov esi, 2
	mov rdx, r15
	call <pstd::collections::btree_map::vecs::ShortVec<pstd::collections::btree_map::Tree<i32, ()>>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
	mov rdi, r12
	mov r12, rbp
	mov rbp, r13
	mov rbx, qword ptr [rsp + 64]
	mov r13, qword ptr [rsp + 72]
	mov qword ptr [rsp + 64], 1
	mov dword ptr [rsp + 72], 0
	movzx eax, word ptr [r14 + 24]
	movzx edx, word ptr [r14 + 26]
	cmp ax, dx
	jae .LBB10_16
	mov rcx, qword ptr [r14 + 16]
	mov esi, eax
	shl esi, 4
	mov qword ptr [rcx + rsi], rbx
	mov qword ptr [rcx + rsi + 8], r13
	lea rsi, [rax + 1]
	mov word ptr [r14 + 24], si
	cmp si, dx
	jae .LBB10_21
	shl esi, 4
	mov qword ptr [rcx + rsi], r12
	mov qword ptr [rcx + rsi + 8], rbp
	add eax, 2
	mov word ptr [r14 + 24], ax
	mov qword ptr [rsp + 64], 0
	mov qword ptr [rsp + 72], r14
	mov r12, rdi
.LBB10_41:
	cmp byte ptr [rsp + 808], 0
	jne .LBB10_43
	inc qword ptr [rsp + 56]
.LBB10_43:
	mov byte ptr [rsp + 808], 1
	mov qword ptr [rsp + 768], 0
	mov qword ptr [rsp + 800], r15
	lea rdx, [rsp + 768]
	mov rdi, r12
	mov esi, 6
	call <pstd::collections::btree_map::Tree<i32, ()>>::insert::<pstd::collections::btree_map::CustomTuning>
	cmp dword ptr [rsp + 768], 1
	jne .LBB10_52
	mov ebx, dword ptr [rsp + 776]
	mov rbp, qword ptr [rsp + 784]
	mov r13, qword ptr [rsp + 792]
	call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL]
	mov edi, 32
	mov esi, 8
	call qword ptr [rip + __rustc::__rust_alloc@GOTPCREL]
	test rax, rax
	je .LBB10_3
	mov r14, rax
	mov qword ptr [rax], 1
	mov dword ptr [rax + 8], 0
	mov qword ptr [rax + 16], 8
	mov dword ptr [rax + 24], 0
	mov esi, 1
	mov rdi, rax
	mov rdx, r15
	call <pstd::collections::btree_map::vecs::PairVec<i32, ()>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
	movzx eax, word ptr [r14 + 8]
	cmp ax, word ptr [r14 + 10]
	jae .LBB10_13
	mov rdi, r14
	add rdi, 16
	mov rcx, qword ptr [r14]
	mov dword ptr [rcx + 4*rax], ebx
	inc eax
	mov word ptr [r14 + 8], ax
	mov esi, 2
	mov rdx, r15
	call <pstd::collections::btree_map::vecs::ShortVec<pstd::collections::btree_map::Tree<i32, ()>>>::set_alloc::<pstd::collections::btree_map::CustomTuning>
	mov r12, rbp
	mov rbp, r13
	mov rbx, qword ptr [rsp + 64]
	mov r13, qword ptr [rsp + 72]
	mov qword ptr [rsp + 64], 1
	mov dword ptr [rsp + 72], 0
	movzx eax, word ptr [r14 + 24]
	movzx edx, word ptr [r14 + 26]
	cmp ax, dx
	jae .LBB10_16
	mov rcx, qword ptr [r14 + 16]
	mov esi, eax
	shl esi, 4
	mov qword ptr [rcx + rsi], rbx
	mov qword ptr [rcx + rsi + 8], r13
	lea rsi, [rax + 1]
	mov word ptr [r14 + 24], si
	cmp si, dx
	jae .LBB10_21
	shl esi, 4
	mov qword ptr [rcx + rsi], r12
	mov qword ptr [rcx + rsi + 8], rbp
	add eax, 2
	mov word ptr [r14 + 24], ax
	mov qword ptr [rsp + 64], 0
	mov qword ptr [rsp + 72], r14
.LBB10_52:
	cmp byte ptr [rsp + 808], 0
	jne .LBB10_54
	inc qword ptr [rsp + 56]
.LBB10_54:
	movups xmm0, xmmword ptr [rsp + 56]
	movups xmm1, xmmword ptr [rsp + 72]
	movaps xmmword ptr [rsp + 16], xmm0
	movaps xmmword ptr [rsp + 32], xmm1
	mov dword ptr [rsp + 12], 1234
	mov r12, qword ptr [rsp + 16]
	xor ebp, ebp
	mov r13d, 1234
	lea r14, [rsp + 768]
	lea r15, [rsp + 56]
	mov rbx, qword ptr [rip + memcpy@GOTPCREL]
	jmp .LBB10_55
	.p2align	4
.LBB10_70:
	inc ebp
	cmp ebp, 100000000
	je .LBB10_71
.LBB10_55:
	mov edx, 704
	mov rdi, r14
	xor esi, esi
	call qword ptr [rip + memset@GOTPCREL]
	mov qword ptr [rsp + 1472], r12
	mov rdi, r14
	lea rsi, [rsp + 24]
	mov edx, 1
	call <pstd::collections::btree_map::Range<i32, ()>>::push_tree
	mov edx, 712
	mov rdi, r15
	mov rsi, r14
	call rbx
	mov rax, qword ptr [rsp + 760]
	test rax, rax
	je .LBB10_70
	.p2align	4
.LBB10_58:
	dec rax
	mov qword ptr [rsp + 760], rax
	mov rcx, qword ptr [rsp + 64]
	cmp rcx, qword ptr [rsp + 72]
	je .LBB10_69
	mov rax, qword ptr [rsp + 56]
	movzx edx, word ptr [rax + 8]
	cmp rcx, rdx
	jae .LBB10_60
	mov rax, qword ptr [rax]
	lea rax, [rax + 4*rcx]
	inc rcx
	mov qword ptr [rsp + 64], rcx
.LBB10_77:
	test rax, rax
	je .LBB10_70
	add r13d, dword ptr [rax]
	mov dword ptr [rsp + 12], r13d
	mov rax, qword ptr [rsp + 760]
	test rax, rax
	jne .LBB10_58
	jmp .LBB10_70
.LBB10_69:
	mov rdi, r15
	call <pstd::collections::btree_map::Range<i32, ()>>::next_cold
	jmp .LBB10_77
.LBB10_71:
	lea rax, [rsp + 12]
	mov qword ptr [rsp + 768], rax
	mov rax, qword ptr [rip + <i32 as core::fmt::Display>::fmt@GOTPCREL]
	mov qword ptr [rsp + 776], rax
	lea rdi, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.31]
	lea rsi, [rsp + 768]
	call qword ptr [rip + std::io::stdio::_print@GOTPCREL]
	lea rsi, [rsp + 40]
	lea rdi, [rsp + 24]
	call <pstd::collections::btree_map::Tree<i32, ()>>::dealloc::<pstd::collections::btree_map::CustomTuning>
	cmp qword ptr [rsp + 24], 0
	jne .LBB10_75
	mov rdi, qword ptr [rsp + 32]
	mov esi, 32
	mov edx, 8
	call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
.LBB10_75:
	add rsp, 1480
	.cfi_def_cfa_offset 56
	pop rbx
	.cfi_def_cfa_offset 48
	pop r12
	.cfi_def_cfa_offset 40
	pop r13
	.cfi_def_cfa_offset 32
	pop r14
	.cfi_def_cfa_offset 24
	pop r15
	.cfi_def_cfa_offset 16
	pop rbp
	.cfi_def_cfa_offset 8
	ret
.LBB10_60:
	.cfi_def_cfa_offset 1536
	lea rdi, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.26]
	lea rdx, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.36]
	mov esi, 32
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
	jmp .LBB10_4
.LBB10_3:
	mov edi, 8
	mov esi, 32
	call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
	jmp .LBB10_4
.LBB10_13:
	lea rdi, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.38]
	lea rdx, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.39]
	mov esi, 39
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
	jmp .LBB10_4
.LBB10_16:
	lea rdi, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.38]
	lea rdx, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.42]
	mov esi, 39
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
	jmp .LBB10_4
.LBB10_21:
	lea rdi, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.38]
	lea rdx, [rip + .Lanon.4e23a766b1161c58f315fd4481a40bbc.42]
	mov esi, 39
	call qword ptr [rip + core::panicking::panic@GOTPCREL]
.LBB10_4:
	ud2
	jmp .LBB10_65
	mov r15, rax
	cmp qword ptr [rsp + 24], 0
	jne .LBB10_66
	mov rdi, qword ptr [rsp + 32]
	mov esi, 32
	mov edx, 8
	call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
	mov rdi, r15
	call _Unwind_Resume@PLT
	jmp .LBB10_25
	mov r15, rax
	cmp qword ptr [rsp + 768], 0
	je .LBB10_28
	cmp qword ptr [rsp + 784], 0
	jne .LBB10_28
	mov r14, qword ptr [rsp + 792]
	jmp .LBB10_8
	jmp .LBB10_65
	mov r15, rax
	test r12, r12
	jne .LBB10_8
	mov esi, 32
	mov edx, 8
	mov rdi, rbp
	call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
	jmp .LBB10_8
	mov r15, rax
	test rbx, rbx
	jne .LBB10_18
	mov esi, 32
	mov edx, 8
	mov rdi, r13
	call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
.LBB10_18:
	mov r13, rbp
	mov rbp, r12
	jmp .LBB10_26
.LBB10_25:
	mov r15, rax
.LBB10_26:
	mov esi, 32
	mov edx, 8
	mov rdi, r14
	call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
	jmp .LBB10_27
	mov r15, rax
.LBB10_27:
	mov r14, r13
	test rbp, rbp
	jne .LBB10_28
.LBB10_8:
	mov esi, 32
	mov edx, 8
	mov rdi, r14
	call qword ptr [rip + __rustc::__rust_dealloc@GOTPCREL]
.LBB10_28:
	lea rdi, [rsp + 56]
	call core::ptr::drop_in_place::<pstd::collections::btree_set::BTreeSet<i32>>
	jmp .LBB10_66
	call qword ptr [rip + core::panicking::panic_in_cleanup@GOTPCREL]
.LBB10_65:
	mov r15, rax
	lea rdi, [rsp + 16]
	call core::ptr::drop_in_place::<pstd::collections::btree_set::BTreeSet<i32>>
.LBB10_66:
	mov rdi, r15
	call _Unwind_Resume@PLT
	call qword ptr [rip + core::panicking::panic_in_cleanup@GOTPCREL]

Maybe I am mis-interpreting this code:

.LBB10_55:
	mov edx, 704
	mov rdi, r14
	xor esi, esi
	call qword ptr [rip + memset@GOTPCREL]

Now that I look at it more carefully, it seems to be zeroing 704 bytes, but I don’t know why. Hmm, maybe it needs to zero various parts of the struct (the iterator), so it zeroes all of it rather than fiddling around. Except that if the struct is big, that can be slow. I think…

[ There are 2 fixed-length vecs in the struct and some other stuff, I think the only thing that needs to be initialised is the “other stuff” and the two vec lengths, but instead of doing 2 word-stores, it is zeroing it all… Yes, I think that is it. ]

Ok, I finally managed to make a simple example:

This code:
```

	movl	$8016, %edx
	movq	%rbx, %rdi
	xorl	%esi, %esi
	callq	*%r13

is clearing the whole struct, 8016 bytes, when only a couple of words (16 bytes) need to be set.

The layout is 8 bytes for the length + 4000 MaybeUninit bytes, times two.

5 Likes

Well I think it is fairly clear what is going on, the compiler decided to use memset to clear (initialise) an entire struct with large uninit sections, which is somewhat inefficient.

1 Like

SCNR: https://youtu.be/brfqm9k6qzc?si=1dh-Pa5ae6-ovTKk&t=1162

Yes, this looks to be behavior that was designed for cases where uninitialized data is put into something like small padding sections appearing in a struct, and memset-ting the whole thing would be faster than trying to leave the uninitialized padding untouched. I could not immediately find a fitting issue on GitHub yet; perhaps it might be worth just opening one.


Actually, after testing a bit on godbolt.org, it seems this might be new behavior (a regression) in Rust 1.93?


Edit: Bisection points to rust-lang/rust pull request #147827 (“Fix MaybeUninit codegen using GVN”) as the relevant change.

2 Likes

I’ve made an issue ^^

9 Likes

This is usually an LLVM behaviour. It will prefer const-folding to "real" values rather than to poison/undef, for example, even if poison would be correct.

Demo: https://llvm.godbolt.org/z/sbrvb5vq3

It would be legal for that to return poison, but that's not what LLVM does.

So typically the answer is that uninit in Rust gives undef in LLVM, but then LLVM turns that into something else.


Note that the PR linked earlier (Fix MaybeUninit codegen using GVN by saethlin · Pull Request #147827 · rust-lang/rust · GitHub) was fixing a different version of this where MaybeUninit::uninit() got codegen'd as copying a value from rodata, which is obviously also bad.

3 Likes

If you look at what we give LLVM: https://rust.godbolt.org/z/8WcjT98dY

@anon.29d017c02cc7c950eacbf58ef03f904f.0 = private unnamed_addr constant [4000 x i8] undef, align 4

define internal void @example[50e99ae1af2fd79d]::test(ptr dead_on_unwind noalias noundef writable sret([8016 x i8]) align 8 captures(none) dereferenceable(8016) %_0, i32 noundef %i) unnamed_addr {
start:
  %_3 = alloca [4008 x i8], align 8
  %result = alloca [8016 x i8], align 8
  call void @llvm.lifetime.start.p0(ptr %result)
  %0 = getelementptr inbounds i8, ptr %_3, i64 4000
  store i64 0, ptr %0, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %_3, ptr align 4 @anon.29d017c02cc7c950eacbf58ef03f904f.0, i64 4000, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %result, ptr align 8 %_3, i64 4008, i1 false)
  %1 = getelementptr inbounds i8, ptr %result, i64 4008
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %1, ptr align 8 %_3, i64 4008, i1 false)
  call void @example[50e99ae1af2fd79d]::dosomething(ptr noalias noundef align 8 dereferenceable(8016) %result, i32 noundef %i)
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %_0, ptr align 8 %result, i64 8016, i1 false)
  call void @llvm.lifetime.end.p0(ptr %result)
  ret void
}

It's copying from a fully-undef private constant.

And later, MemCpyOptPass replaces that sequence of memcpys with a memset: https://rust.godbolt.org/z/46bjEn446

3 Likes