While working on my library and playing with the Compiler Explorer, I discovered that passing the same data in different ways results in different assembly (with max optimizations). Roughly the same happens with my real code.
The shortest assembly is produced when the data is passed as primitive arguments or as pairs (tuples, tuple structs, and regular structs with two fields). Furthermore, the assembly stays roughly the same as the size of the numeric type grows.
Primitive arguments & tuple-y/struct-y pairs (all produce the same asm)
/// Returns the sum of its eight `i32` arguments.
///
/// Every value is passed as its own scalar parameter; this is the
/// "primitive arguments" variant of the comparison. The additions are a
/// single left-to-right chain of plain `+`, with no explicit overflow
/// handling.
fn func_raw_i32(
x1: i32, y1: i32,
x2: i32, y2: i32,
wx1: i32, wy1: i32,
wx2: i32, wy2: i32
) -> i32 {
x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
/// Returns the sum of the eight `i32` values carried by four `(i32, i32)`
/// tuples.
///
/// Each tuple is destructured directly in the parameter list, so the body
/// is the same left-to-right `+` chain as the scalar-argument variant.
fn func_small_tuples_i32(
(x1, y1): (i32, i32),
(x2, y2): (i32, i32),
(wx1, wy1): (i32, i32),
(wx2, wy2): (i32, i32),
) -> i32 {
x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
/// A pair of two values of the same type `T`, written as a tuple struct.
///
/// Derives `Copy` and `Clone`.
///
/// NOTE: a named-field struct that is also called `Point<T>` appears in a
/// later snippet of this post; the two definitions are separate examples
/// and cannot live in the same module under the same name.
#[derive(Copy, Clone)]
struct Point<T>(T, T);
/// Returns the sum of the eight `i32` values carried by four `Point<i32>`
/// tuple structs.
///
/// Each `Point` is taken by value and destructured in the parameter list;
/// the body is a single left-to-right `+` chain over the bound fields.
fn func_small_tuple_structs_i32(
Point(x1, y1): Point<i32>,
Point(x2, y2): Point<i32>,
Point(wx1, wy1): Point<i32>,
Point(wx2, wy2): Point<i32>,
) -> i32 {
x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
/// A pair of two values of the same type `T`, with named fields `x` and `y`.
///
/// Derives `Copy` and `Clone`.
///
/// NOTE: an earlier snippet of this post defines a tuple struct that is
/// also called `Point<T>`; the two definitions are separate examples and
/// cannot live in the same module under the same name.
#[derive(Copy, Clone)]
struct Point<T> {
x: T,
y: T,
}
/// Returns the sum of the `x` and `y` fields of four `Point<i32>` structs.
///
/// The structs are taken by value and their fields are read by name
/// (no destructuring in the parameter list). Fields are added in the order
/// `p1`, `p2`, `w1`, `w2`, with `x` before `y` for each.
fn func_small_structs_i32(
p1: Point<i32>,
p2: Point<i32>,
w1: Point<i32>,
w2: Point<i32>,
) -> i32 {
p1.x + p1.y + p2.x + p2.y + w1.x + w1.y + w2.x + w2.y
}
; Compiler output reported for the scalar-argument and two-element-pair variants.
; Eight 32-bit values are added into eax: six come from registers
; (edi, esi, edx, ecx, r8d, r9d) and two are loaded from the stack.
; NOTE(review): the register/stack split presumably follows the System V
; AMD64 calling convention; confirm against the target used.
func_raw_i32:
; esi = esi + edi
add esi, edi
; eax = low 32 bits of (rdx + rcx); lea is used here as a three-operand add
lea eax, [rdx + rcx]
add eax, esi
add eax, r8d
add eax, r9d
; the remaining two 32-bit addends are read from the stack
add eax, dword ptr [rsp + 8]
add eax, dword ptr [rsp + 16]
; result is left in eax
ret
Surprisingly to me, expressing the pairs as arrays results in more verbose assembly. At `i64`, SIMD-related instructions appear in the assembly.
Array pairs
/// Returns the sum of the eight `i32` values carried by four `[i32; 2]`
/// arrays.
///
/// Each array is taken by value and destructured with an array pattern in
/// the parameter list; the body is the same left-to-right `+` chain used by
/// the tuple and struct variants.
fn func_small_arrays_i32(
[x1, y1]: [i32; 2],
[x2, y2]: [i32; 2],
[wx1, wy1]: [i32; 2],
[wx2, wy2]: [i32; 2],
) -> i32 {
x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
; Compiler output reported for the `[i32; 2]` variant.
; Each of rdi, rsi, rdx and rcx is used twice: once through its low 32 bits
; and once after `shr reg, 32` to reach its high 32 bits. That is consistent
; with each 64-bit register carrying one two-element i32 array
; (presumably; confirm against the ABI of the target used).
; All eight 32-bit halves end up added into eax.
func_small_arrays_i32:
; r8d = low32(rsi) + low32(rdi)
lea r8d, [rsi + rdi]
; move the high halves of rdi and rsi down into their low 32 bits
shr rdi, 32
shr rsi, 32
; esi = high32(original rsi) + low32(rdx)
add esi, edx
shr rdx, 32
; eax = low32(rcx) + high32(original rdx)
lea eax, [rcx + rdx]
shr rcx, 32
; fold the partial sums together
add r8d, edi
add r8d, esi
add eax, ecx
add eax, r8d
; result is left in eax
ret
; Compiler output reported for an i64 array variant.
; NOTE(review): the Rust source for this function is not shown in the post;
; presumably it is func_small_arrays_i32 with i64 in place of i32.
; Four 16-byte operands are loaded through the pointers in rdi, rsi, rdx and
; rcx, added lane-wise as pairs of 64-bit integers, and the two lanes of the
; total are then added together to produce the scalar result in rax.
func_small_arrays_i64:
; unaligned 128-bit loads (two 64-bit lanes each)
movdqu xmm0, xmmword ptr [rdi]
movdqu xmm1, xmmword ptr [rsi]
movdqu xmm2, xmmword ptr [rdx]
; xmm2 = [rdx] + [rdi], per 64-bit lane
paddq xmm2, xmm0
movdqu xmm0, xmmword ptr [rcx]
; xmm0 = [rcx] + [rsi], per 64-bit lane
paddq xmm0, xmm1
; xmm0 = lane-wise total of all four operands
paddq xmm0, xmm2
; shuffle 238 (0xEE) copies the upper 64 bits of xmm0 into the lower 64 bits of xmm1
pshufd xmm1, xmm0, 238
; low lane of xmm1 = high lane + low lane of the total
paddq xmm1, xmm0
; result is the low 64 bits
movq rax, xmm1
ret
Using structs and tuples with four elements, as well as nesting them (pair of pairs), results in even more verbose assembly (twice as long as the minimal version). SIMD instructions start to appear at `i32`.
Bigger structs and tuples (slightly different asm between them)
/// Four values of the same type `T`, stored in the named fields
/// `x1`, `y1`, `x2` and `y2`.
///
/// Unlike the `Point` snippets in this post, no traits are derived here.
struct Window<T> {
x1: T,
y1: T,
x2: T,
y2: T,
}
/// Returns the sum of eight `i32` values: four from two `(i32, i32)` tuples
/// and four from the fields of a `Window<i32>`.
///
/// The tuples are destructured in the parameter list, while `w` is taken by
/// value and its fields are read by name. The body is a single
/// left-to-right `+` chain.
fn func_bigger_struct_i32(
(x1, y1): (i32, i32),
(x2, y2): (i32, i32),
w: Window<i32>,
) -> i32 {
x1 + y1 + x2 + y2 + w.x1 + w.y1 + w.x2 + w.y2
}
; Compiler output reported for an i16 variant of the four-field struct case.
; NOTE(review): the Rust source for this function is not shown in the post;
; presumably it is func_bigger_struct_i32 with i16 in place of i32.
; r8 is read at bit offsets 0, 16, 32 and 48, which is consistent with four
; 16-bit fields packed into one register (presumably the struct argument).
; Those four pieces are added to edi, esi, edx and ecx; the result is in eax.
func_bigger_struct_i16:
; r9d = (r8d >> 16) + r8d, i.e. the pieces at bit offsets 16 and 0
mov r9d, r8d
shr r9d, 16
mov r10, r8
add r9d, r8d
; rax = r8 >> 32 and r10 = r8 >> 48, i.e. the pieces at bit offsets 32 and 48
mov rax, r8
shr rax, 32
shr r10, 48
; sum of the four register operands edi, esi, edx, ecx
add esi, edi
add ecx, edx
add ecx, esi
; fold everything into eax
add r9d, ecx
add eax, r10d
add eax, r9d
ret
; Compiler output reported for func_bigger_struct_i32.
; A 16-byte operand is loaded through the pointer in r8 and reduced to the
; sum of its four 32-bit lanes; that sum is then added to esi, edx, edi and
; ecx. (Presumably r8 points at the Window<i32> argument and the other four
; registers carry the tuple elements; confirm against the target ABI.)
func_bigger_struct_i32:
; unaligned 128-bit load: four 32-bit lanes
movdqu xmm0, xmmword ptr [r8]
; shuffle 238 (0xEE) brings lanes 2 and 3 down to lanes 0 and 1
pshufd xmm1, xmm0, 238
; lane 0 = l0 + l2, lane 1 = l1 + l3
paddd xmm1, xmm0
; shuffle 85 (0x55) broadcasts lane 1
pshufd xmm0, xmm1, 85
; lane 0 = l0 + l1 + l2 + l3
paddd xmm0, xmm1
movd r8d, xmm0
; add the four register operands and the reduced lane sum
lea eax, [rsi + rdx]
add eax, edi
add eax, ecx
add eax, r8d
; result is left in eax
ret
/// Returns the sum of the eight `i32` values carried by two four-element
/// `(i32, i32, i32, i32)` tuples.
///
/// Both tuples are destructured in the parameter list; the body is a single
/// left-to-right `+` chain over the bound elements.
fn func_bigger_tuples_i32(
(x1, y1, x2, y2): (i32, i32, i32, i32),
(wx1, wy1, wx2, wy2): (i32, i32, i32, i32)
) -> i32 {
x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
; Compiler output reported for an i16 variant of the four-element tuple case.
; NOTE(review): the Rust source for this function is not shown in the post;
; presumably it is func_bigger_tuples_i32 with i16 in place of i32.
; rdi and rsi are each read at bit offsets 0, 16, 32 and 48, which is
; consistent with each register carrying four packed 16-bit elements
; (presumably one tuple per register). All eight pieces are added into eax.
func_bigger_tuples_i16:
; ecx = (edi >> 16) + edi, i.e. the pieces of rdi at bit offsets 16 and 0
mov ecx, edi
shr ecx, 16
mov rdx, rdi
add ecx, edi
; rdi = rdi >> 32 and rdx = original rdi >> 48
shr rdi, 32
shr rdx, 48
; eax = (esi >> 16) + esi, i.e. the pieces of rsi at bit offsets 16 and 0
mov eax, esi
shr eax, 16
mov r8, rsi
add eax, esi
; rsi = rsi >> 32 and r8 = original rsi >> 48
shr rsi, 32
shr r8, 48
; fold the partial sums together
add edi, edx
add edi, ecx
add eax, esi
add eax, edi
add eax, r8d
ret
; Compiler output reported for func_bigger_tuples_i32.
; Two 16-byte operands are loaded through the pointers in rdi and rsi, added
; lane-wise as four 32-bit integers, and the four lanes of that sum are then
; reduced to a single 32-bit value returned in eax.
; (Presumably each pointer refers to one of the two tuple arguments;
; confirm against the target ABI.)
func_bigger_tuples_i32:
; unaligned 128-bit loads: four 32-bit lanes each
movdqu xmm0, xmmword ptr [rdi]
movdqu xmm1, xmmword ptr [rsi]
; xmm1 = lane-wise sum of the two operands
paddd xmm1, xmm0
; shuffle 238 (0xEE) brings lanes 2 and 3 down to lanes 0 and 1
pshufd xmm0, xmm1, 238
; lane 0 = s0 + s2, lane 1 = s1 + s3
paddd xmm0, xmm1
; shuffle 85 (0x55) broadcasts lane 1
pshufd xmm1, xmm0, 85
; lane 0 = s0 + s1 + s2 + s3
paddd xmm1, xmm0
; result is the low 32 bits
movd eax, xmm1
ret
The library I'm working on is supposed to be used in a hot loop, and I would like to avoid unnecessary instructions. Maybe a deeper understanding of how these different layouts get translated to assembly will help me find peace and stop ruminating. Where should I look?