Uninitialised read in LLVM

Hi guys, I've noticed what seems to be an uninitialised read in the LLVM IR generated when I spawn a thread. This is in debug mode, and it seems to happen with both the 2021 edition and the 2024 edition.

I have put a link to the Rust playground and a snippet of LLVM below:

start:
  %try_result = alloca [16 x i8], align 8

bb9:                                              ; preds = %bb32, %bb7
  %43 = call i32 @__rust_try(ptr @_ZN3std9panicking3try7do_call17h201d696598a54e7cE, ptr %data, ptr @_ZN3std9panicking3try8do_catch17h82189ce1a5941e52E), !dbg !3429
  store i32 %43, ptr %0, align 4, !dbg !3429
  %_60 = load i32, ptr %0, align 4, !dbg !3429
  %44 = icmp eq i32 %_60, 0, !dbg !3429
  br i1 %44, label %bb27, label %bb28, !dbg !3429

bb27:                                             ; preds = %bb9
  store ptr null, ptr %try_result, align 8, !dbg !3430
  br label %bb29, !dbg !3431

bb28:                                             ; preds = %bb9
  %slot.0 = load ptr, ptr %data, align 8, !dbg !3432
  %45 = getelementptr inbounds i8, ptr %data, i64 8, !dbg !3432
  %slot.1 = load ptr, ptr %45, align 8, !dbg !3432
  store ptr %slot.0, ptr %slot.dbg.spill16, align 8, !dbg !3432
  %46 = getelementptr inbounds i8, ptr %slot.dbg.spill16, i64 8, !dbg !3432
  store ptr %slot.1, ptr %46, align 8, !dbg !3432
    #dbg_declare(ptr %slot.dbg.spill16, !3433, !DIExpression(), !3439)
  store ptr %slot.0, ptr %try_result, align 8, !dbg !3441
  %47 = getelementptr inbounds i8, ptr %try_result, i64 8, !dbg !3441
  store ptr %slot.1, ptr %47, align 8, !dbg !3441
  br label %bb29, !dbg !3431

bb29:                                             ; preds = %bb28, %bb27
  %_25.0 = load ptr, ptr %try_result, align 8, !dbg !3303
  %48 = getelementptr inbounds i8, ptr %try_result, i64 8, !dbg !3303
  %_25.1 = load ptr, ptr %48, align 8, !dbg !3303
  %49 = getelementptr inbounds i8, ptr %_24, i64 8, !dbg !3304
  store ptr %_25.0, ptr %49, align 8, !dbg !3304
  %50 = getelementptr inbounds i8, ptr %49, i64 8, !dbg !3304
  store ptr %_25.1, ptr %50, align 8, !dbg !3304
  store i64 1, ptr %_24, align 8, !dbg !3304
  %self17 = getelementptr inbounds i8, ptr %_1, i64 40, !dbg !3305
  store ptr %self17, ptr %self.dbg.spill18, align 8, !dbg !3305
    #dbg_declare(ptr %self.dbg.spill18, !3306, !DIExpression(), !3309)
    #dbg_declare(ptr %self.dbg.spill18, !3310, !DIExpression(), !3313)
  %51 = getelementptr inbounds i8, ptr %_1, i64 40, !dbg !3315
  %_68 = load ptr, ptr %51, align 8, !dbg !3315
  %52 = getelementptr inbounds i8, ptr %_68, i64 16, !dbg !3305
  %self19 = getelementptr inbounds i8, ptr %52, i64 8, !dbg !3305
  store ptr %self19, ptr %self.dbg.spill20, align 8, !dbg !3305
    #dbg_declare(ptr %self.dbg.spill20, !3319, !DIExpression(), !3327)
  %53 = getelementptr inbounds i8, ptr %_68, i64 16, !dbg !3329
  %_69 = getelementptr inbounds i8, ptr %53, i64 8, !dbg !3329
; invoke core::ptr::drop_in_place<core::option::Option<core::result::Result<(),alloc::boxed::Box<dyn core::any::Any+core::marker::Send>>>>
  invoke void @"_ZN4core3ptr158drop_in_place$LT$core..option..Option$LT$core..result..Result$LT$$LP$$RP$$C$alloc..boxed..Box$LT$dyn$u20$core..any..Any$u2b$core..marker..Send$GT$$GT$$GT$$GT$17h413afff4dc7d84d2E"(ptr align 8 %_69)
          to label %bb10 unwind label %cleanup21, !dbg !3330

So if the @__rust_try() call is successful (returns 0), we go to bb27, where null is stored in the first 8 bytes of try_result. Then it jumps to bb29, where both try_result[0] and try_result[8] are read. But I can't see where try_result[8] would have been written on that path?

The Rust code for try_result can be seen here:

        let main = move || {
            if let Err(_thread) = set_current(their_thread.clone()) {
                // Both the current thread handle and the ID should not be
                // initialized yet. Since only the C runtime and some of our
                // platform code run before this, this point shouldn't be
                // reachable. Use an abort to save binary size (see #123356).
                rtabort!("something here is badly broken!");
            }

            if let Some(name) = their_thread.cname() {
                imp::Thread::set_name(name);
            }

            let f = f.into_inner();
            let try_result = panic::catch_unwind(panic::AssertUnwindSafe(|| {
                crate::sys::backtrace::__rust_begin_short_backtrace(|| hooks.run());
                crate::sys::backtrace::__rust_begin_short_backtrace(f)
            }));
            // SAFETY: `their_packet` as been built just above and moved by the
            // closure (it is an Arc<...>) and `my_packet` will be stored in the
            // same `JoinInner` as this closure meaning the mutation will be
            // safe (not modify it and affect a value far away).
            unsafe { *their_packet.result.get() = Some(try_result) };
            // Here `their_packet` gets dropped, and if this is the last `Arc` for that packet that
            // will call `decrement_num_running_threads` and therefore signal that this thread is
            // done.
            drop(their_packet);
            // Here, the lifetime `'scope` can end. `main` keeps running for a bit
            // after that before returning itself.
        };

Found in std/src/thread/mod.rs: rust/library/std/src/thread/mod.rs at a42d5ecf34c1d4ec8f7e35059b647b576cb42d93 · rust-lang/rust · GitHub

Does anyone know why this uninitialised read takes place? I suspect it has to do with the unit type: if the closure passed to thread::spawn() returns a value, try_result[8] is filled with that return value. Is this an optimisation thing, or does the null store somehow fill all 16 bytes allocated for try_result?

Given that it copies the whole Result, I think it's expected that the bytes belonging to the other variant are uninitialized.
Here you can see that only 8 bytes get initialized, but 16 are copied: Compiler Explorer

1 Like

Hi, thanks so much for the reply. I'd like to check I'm understanding this correctly.

So in the Compiler Explorer example you sent, Result is 16 bytes so that it can hold either Ok() or Err(). Since we return Ok(), only 8 bytes are initialised for the Ok(42). However, as an enum must be at least as large as its largest variant, all 16 bytes are copied. This makes sense to me.

However I'm a bit confused about the unit/() type. If I change your example to this: Compiler Explorer

If I am understanding correctly, a 0 is placed in [rsp - 16], and then an uninitialised value is moved into rdx. So I am assuming the Ok variant is represented as a 0 at the assembly level, and the next field of the Result, which should hold the (), is left uninitialised. It still needs to be loaded in order to be stored in the { i64, i64 } struct we return. I've put the LLVM IR for the () example below:


define { i64, i64 } @make_ok() unnamed_addr #0 !dbg !7 {
  %_0 = alloca [16 x i8], align 8
  store i64 0, ptr %_0, align 8, !dbg !35
  %0 = load i64, ptr %_0, align 8, !dbg !36
  %1 = getelementptr inbounds i8, ptr %_0, i64 8, !dbg !36
  %2 = load i64, ptr %1, align 8, !dbg !36
  %3 = insertvalue { i64, i64 } poison, i64 %0, 0, !dbg !36
  %4 = insertvalue { i64, i64 } %3, i64 %2, 1, !dbg !36
  ret { i64, i64 } %4, !dbg !36
}

define { i64, i64 } @square(i64 %r.0, i64 %r.1) unnamed_addr #0 !dbg !37 {
  %r.dbg.spill = alloca [16 x i8], align 8
  store i64 %r.0, ptr %r.dbg.spill, align 8
  %0 = getelementptr inbounds i8, ptr %r.dbg.spill, i64 8
  store i64 %r.1, ptr %0, align 8
  %1 = insertvalue { i64, i64 } poison, i64 %r.0, 0, !dbg !43
  %2 = insertvalue { i64, i64 } %1, i64 %r.1, 1, !dbg !43
  ret { i64, i64 } %2, !dbg !43
}

I've read a paper about LLVM semantics, and I understand that an uninitialised load is not UB unless the loaded value is used. So, if I am getting this right, uninitialised loads can happen with Result types: we need to cover both cases of the Result being either Ok() or Err(), and since Err might be larger, the whole thing still needs to be copied. So it's fine to perform an uninitialised load as long as the value is not used?

Yes, this is my understanding. Another example where this might happen is field padding: the compiler can generate memcpy for the entire struct including uninitialized gaps.

1 Like

So long as the load in LLVM doesn't have !noundef metadata, then it's fine, yes.

To see this in the simplest case, compare the LLVM IR for these two functions:

3 Likes

Ah, an important clarification! Thank you.