Unfortunately, if we guarantee that stack locals do not have overlapping addresses, the presence of the stack-to-stack memcpy is observable, guaranteed behavior.
It's for this reason I think we might need to end up with both move-deinit and overlapping-stack-alloca semantics. Without allowing the two stack locals to have the same address, the optimizer has to prove that you don't observe at least one of the addresses, so that you can't observe that the two stack allocations overlap.
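To make that concrete, here's a minimal sketch (mine, not from the earlier examples): `[u16; 1000]` is `Copy`, so both locals stay live, and since both addresses are observed, today's semantics guarantee they're distinct, pinning the copy in place.

```rust
// Minimal sketch: when both addresses are observed, the two stack locals
// are guaranteed distinct, so the stack-to-stack copy must actually happen.
pub fn addresses_must_differ() {
    let a = [0x0101_u16; 1000];
    let b = a; // `[u16; 1000]` is `Copy`, so `a` remains live here
    // Observing both addresses: guaranteed to pass under current semantics,
    // which is exactly what makes the memcpy non-removable.
    assert_ne!(a.as_ptr(), b.as_ptr());
}
```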
There's all sorts of thought around "allocation planes" and how we actually allow stack allocation to be formally removable anyway, if stack exhaustion is an observable effect. (The way current compilers justify it is either by saying memory exhaustion is a nonobservable condition external to the Virtual Machine, or just handwavy vibes along with the rest of the pointer provenance story.)
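As a hypothetical illustration of the stack-exhaustion point (my sketch, not anything compilers formally commit to): if the optimizer deletes the dead local below, deep recursion can succeed where an unoptimized build would overflow the stack, which is only justifiable if exhaustion is deemed nonobservable.

```rust
// Removing the dead 64 KiB alloca shrinks each frame from ~64 KiB to a few
// bytes, so whether a deep recursion exhausts the stack depends on the
// optimizer. Formally removing allocas therefore requires treating stack
// exhaustion as outside the program's observable behavior.
#[inline(never)]
pub fn recurse(n: u32) {
    let _dead = [0u8; 64 * 1024]; // never read; trivially removable
    if n > 0 {
        recurse(n - 1);
    }
}
```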
It's not (just) that the stack locals are (potentially) guaranteed to live at different addresses, though; an altered version of @afetisov's example proves that. With the inlining, LLVM should be able to fairly trivially show that the address of the first `ManuallyDrop` is never observed (there's no `ptr2int` that could leak the address outside of provenance, so the optimizer can be sure it sees all uses of the first `alloca`).
`ManuallyDrop` (or any other wrapper) isn't even involved in this missed optimization; even just copying an array through a temporary binding introduces the extra `memcpy`. Here's another example of the same.
I believe the missed optimization is due to the array initialization. Consider this example:
```rust
pub unsafe fn no_memcpy() {
    let a = [0x0101_u16; 1000];
    let b = a;
    print_slice(&b);
}

pub unsafe fn yes_memcpy() {
    let a = [0x0102_u16; 1000];
    let b = a;
    print_slice(&b);
}
```
The former initializes the array with a `memset` and contains no `memcpy`. The latter initializes the array with a loop and then `memcpy`s it to a new location. So IIUC, what's going on is that the array initialization causes LLVM to be unwilling (for whatever reason) to optimize out the initial stack allocation, instead keeping it at a separate address from the address we do observe (by passing it to the opaque, unknown output function).
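The constants aren't arbitrary, for what it's worth: `0x0101` is the value whose bytes repeat, which is what makes a single-byte `memset` possible at all. A quick sketch of the distinction (my helper, not from the example):

```rust
// A u16 fill value can be lowered to a plain memset only if both bytes match.
fn memset_representable(v: u16) -> bool {
    let [lo, hi] = v.to_le_bytes();
    lo == hi
}

fn main() {
    assert!(memset_representable(0x0101)); // 0x01, 0x01 -> memset(.., 0x01, ..)
    assert!(!memset_representable(0x0102)); // 0x02, 0x01 -> needs a real loop
}
```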
Here's the exact same missed optimization in C/C++. If someone wants to report this upstream to LLVM (or find an open issue covering it), the unoptimized LLVM IR clang produces from that C/C++ example is included below.
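For reference, here's C++ source along these lines; this is my reconstruction from the IR rather than a verbatim copy (`_Z9print_ptrPKt` demangles to `print_ptr(unsigned short const*)`, and the `i64 1000` suggests a 1000-byte `memcpy`):

```cpp
#include <cstdint>
#include <cstring>

// Kept opaque (defined in another TU) so the compiler can't see through it.
void print_ptr(const uint16_t *);

void yes_memcpy() {
    uint16_t a[1000];
    for (int i = 0; i < 1000; i++)
        a[i] = 0x0102; // bytes differ: stays a real loop when optimized
    uint16_t b[1000];
    std::memcpy(b, a, 1000); // 1000 bytes, matching the IR below
    print_ptr(b);
}

void no_memcpy() {
    uint16_t a[1000];
    for (int i = 0; i < 1000; i++)
        a[i] = 0x0101; // repeated byte: becomes a memset when optimized
    uint16_t b[1000];
    std::memcpy(b, a, 1000);
    print_ptr(b);
}
```

And the unoptimized IR itself: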
```llvm
define dso_local void @_Z10yes_memcpyv() #0 !dbg !188 {
%1 = alloca [1000 x i16], align 16
%2 = alloca i32, align 4
%3 = alloca [1000 x i16], align 16
call void @llvm.dbg.declare(metadata ptr %1, metadata !193, metadata !DIExpression()), !dbg !197
call void @llvm.dbg.declare(metadata ptr %2, metadata !198, metadata !DIExpression()), !dbg !200
store i32 0, ptr %2, align 4, !dbg !200
br label %4, !dbg !201
4: ; preds = %11, %0
%5 = load i32, ptr %2, align 4, !dbg !202
%6 = icmp slt i32 %5, 1000, !dbg !204
br i1 %6, label %7, label %14, !dbg !205
7: ; preds = %4
%8 = load i32, ptr %2, align 4, !dbg !206
%9 = sext i32 %8 to i64, !dbg !207
%10 = getelementptr inbounds [1000 x i16], ptr %1, i64 0, i64 %9, !dbg !207
store i16 258, ptr %10, align 2, !dbg !208
br label %11, !dbg !207
11: ; preds = %7
%12 = load i32, ptr %2, align 4, !dbg !209
%13 = add nsw i32 %12, 1, !dbg !209
store i32 %13, ptr %2, align 4, !dbg !209
br label %4, !dbg !210, !llvm.loop !211
14: ; preds = %4
call void @llvm.dbg.declare(metadata ptr %3, metadata !214, metadata !DIExpression()), !dbg !215
%15 = getelementptr inbounds [1000 x i16], ptr %3, i64 0, i64 0, !dbg !216
%16 = getelementptr inbounds [1000 x i16], ptr %1, i64 0, i64 0, !dbg !216
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %15, ptr align 16 %16, i64 1000, i1 false), !dbg !216
%17 = getelementptr inbounds [1000 x i16], ptr %3, i64 0, i64 0, !dbg !217
call void @_Z9print_ptrPKt(ptr noundef %17), !dbg !218
ret void, !dbg !219
}
define dso_local void @_Z9no_memcpyv() #0 !dbg !220 {
%1 = alloca [1000 x i16], align 16
%2 = alloca i32, align 4
%3 = alloca [1000 x i16], align 16
call void @llvm.dbg.declare(metadata ptr %1, metadata !221, metadata !DIExpression()), !dbg !222
call void @llvm.dbg.declare(metadata ptr %2, metadata !223, metadata !DIExpression()), !dbg !225
store i32 0, ptr %2, align 4, !dbg !225
br label %4, !dbg !226
4: ; preds = %11, %0
%5 = load i32, ptr %2, align 4, !dbg !227
%6 = icmp slt i32 %5, 1000, !dbg !229
br i1 %6, label %7, label %14, !dbg !230
7: ; preds = %4
%8 = load i32, ptr %2, align 4, !dbg !231
%9 = sext i32 %8 to i64, !dbg !232
%10 = getelementptr inbounds [1000 x i16], ptr %1, i64 0, i64 %9, !dbg !232
store i16 257, ptr %10, align 2, !dbg !233
br label %11, !dbg !232
11: ; preds = %7
%12 = load i32, ptr %2, align 4, !dbg !234
%13 = add nsw i32 %12, 1, !dbg !234
store i32 %13, ptr %2, align 4, !dbg !234
br label %4, !dbg !235, !llvm.loop !236
14: ; preds = %4
call void @llvm.dbg.declare(metadata ptr %3, metadata !238, metadata !DIExpression()), !dbg !239
%15 = getelementptr inbounds [1000 x i16], ptr %3, i64 0, i64 0, !dbg !240
%16 = getelementptr inbounds [1000 x i16], ptr %1, i64 0, i64 0, !dbg !240
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %15, ptr align 16 %16, i64 1000, i1 false), !dbg !240
%17 = getelementptr inbounds [1000 x i16], ptr %3, i64 0, i64 0, !dbg !241
call void @_Z9print_ptrPKt(ptr noundef %17), !dbg !242
ret void, !dbg !243
}
```
Ignoring metadata, the only initial difference does seem to be `store i16 257, ptr %10, align 2` versus `store i16 258, ptr %10, align 2` (257 is `0x0101`, a repeating byte pattern; 258 is `0x0102`, which isn't, and that's presumably what gates the later loop-to-`memset` rewrite).