Optimizing layout of function arguments

While working on my library and playing with the Compiler Explorer, I discovered that passing the same data in different ways results in different assembly (with max optimizations). Roughly the same happens with my real code.

The minimal assembly is produced when using primitive arguments and pairs (tuples, tuple structs and regular structs with two fields). Furthermore, the assembly stays roughly the same as the numeric type size grows.

Primitive arguments & tuple-y/struct-y pairs (all produce the same asm)
// Baseline case: eight scalar i32 parameters, summed directly.
// In the quoted assembly below, the first six arguments arrive in
// registers and the last two are read from the stack ([rsp + 8] / [rsp + 16]).
fn func_raw_i32(
    x1: i32, y1: i32, 
    x2: i32, y2: i32, 
    wx1: i32, wy1: i32, 
    wx2: i32, wy2: i32
) -> i32 {
    // Plain left-to-right sum of all eight arguments.
    x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
// Same computation as func_raw_i32, but the eight values are grouped
// into four (i32, i32) tuples that are destructured in the signature.
// Per the post, this compiles to the same assembly as the raw version.
fn func_small_tuples_i32(
    (x1, y1): (i32, i32), 
    (x2, y2): (i32, i32), 
    (wx1, wy1): (i32, i32), 
    (wx2, wy2): (i32, i32),
) -> i32 {
    x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
// Generic tuple-struct pair used by func_small_tuple_structs_i32.
// Copy/Clone are derived so values can be passed by value.
#[derive(Copy, Clone)]
struct Point<T>(T, T);

// Same computation again, with the pairs wrapped in the Point tuple
// struct and destructured in the signature. Per the post, this also
// produces the same assembly as the raw-argument version.
fn func_small_tuple_structs_i32(
    Point(x1, y1): Point<i32>, 
    Point(x2, y2): Point<i32>, 
    Point(wx1, wy1): Point<i32>, 
    Point(wx2, wy2): Point<i32>,
) -> i32 {
    x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
// Named-field variant of the pair, used by func_small_structs_i32.
// NOTE(review): this redefines `Point` from the tuple-struct snippet
// above — these are separate Compiler Explorer snippets; rename one of
// them if the examples are ever combined into a single file.
#[derive(Copy, Clone)]
struct Point<T> {
    x: T,
    y: T,
}

// Same computation, taking the named-field Point structs whole and
// reading the fields in the body instead of destructuring in the
// signature. Per the post, still the same assembly as the raw version.
fn func_small_structs_i32(
    p1: Point<i32>, 
    p2: Point<i32>, 
    w1: Point<i32>, 
    w2: Point<i32>,
) -> i32 {
    p1.x + p1.y + p2.x + p2.y + w1.x + w1.y + w2.x + w2.y
}
func_raw_i32:
      add     esi, edi
      lea     eax, [rdx + rcx]
      add     eax, esi
      add     eax, r8d
      add     eax, r9d
      add     eax, dword ptr [rsp + 8]
      add     eax, dword ptr [rsp + 16]
      ret

Surprisingly to me, expressing the pairs as arrays results in more verbose assembly. At i64, SIMD-related instructions appear in the assembly.

Array pairs
// Same computation, but each pair is a [i32; 2] array. In the quoted
// assembly below each array is packed into a single 64-bit register and
// unpacked with `shr 32`, which is what makes this variant more verbose
// than the tuple/struct versions.
fn func_small_arrays_i32(
    [x1, y1]: [i32; 2], 
    [x2, y2]: [i32; 2], 
    [wx1, wy1]: [i32; 2],
    [wx2, wy2]: [i32; 2],
) -> i32 {
    x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
func_small_arrays_i32:
        lea     r8d, [rsi + rdi]
        shr     rdi, 32
        shr     rsi, 32
        add     esi, edx
        shr     rdx, 32
        lea     eax, [rcx + rdx]
        shr     rcx, 32
        add     r8d, edi
        add     r8d, esi
        add     eax, ecx
        add     eax, r8d
        ret
func_small_arrays_i64:
        movdqu  xmm0, xmmword ptr [rdi]
        movdqu  xmm1, xmmword ptr [rsi]
        movdqu  xmm2, xmmword ptr [rdx]
        paddq   xmm2, xmm0
        movdqu  xmm0, xmmword ptr [rcx]
        paddq   xmm0, xmm1
        paddq   xmm0, xmm2
        pshufd  xmm1, xmm0, 238
        paddq   xmm1, xmm0
        movq    rax, xmm1
        ret

Using structs and tuples with four elements, as well as nesting them (pair of pairs) results in even more verbose assembly (twice as long as the minimal version). SIMD-ness begins at i32.

Bigger structs and tuples (slightly different asm between them)
// Four-field struct holding both corner coordinates of a window,
// used by func_bigger_struct_i32 to test a larger aggregate argument.
struct Window<T> {
    x1: T,
    y1: T,
    x2: T,
    y2: T,
}

// Mixes two destructured tuple pairs with a four-field Window<i32>
// passed by value. In the quoted i32 assembly the Window no longer fits
// in registers and arrives behind a pointer (movdqu from [r8]), which
// triggers the SIMD sequence.
fn func_bigger_struct_i32(
    (x1, y1): (i32, i32), 
    (x2, y2): (i32, i32), 
    w: Window<i32>,
) -> i32 {
    x1 + y1 + x2 + y2 + w.x1 + w.y1 + w.x2 + w.y2
}
func_bigger_struct_i16:
        mov     r9d, r8d
        shr     r9d, 16
        mov     r10, r8
        add     r9d, r8d
        mov     rax, r8
        shr     rax, 32
        shr     r10, 48
        add     esi, edi
        add     ecx, edx
        add     ecx, esi
        add     r9d, ecx
        add     eax, r10d
        add     eax, r9d
        ret

func_bigger_struct_i32:
        movdqu  xmm0, xmmword ptr [r8]
        pshufd  xmm1, xmm0, 238
        paddd   xmm1, xmm0
        pshufd  xmm0, xmm1, 85
        paddd   xmm0, xmm1
        movd    r8d, xmm0
        lea     eax, [rsi + rdx]
        add     eax, edi
        add     eax, ecx
        add     eax, r8d
        ret
// Same computation with two 4-element tuples instead of four pairs.
// In the quoted i32 assembly both tuples are loaded through pointers
// (movdqu from [rdi] / [rsi]) and summed with SIMD instructions.
fn func_bigger_tuples_i32(
    (x1, y1, x2, y2): (i32, i32, i32, i32),
    (wx1, wy1, wx2, wy2): (i32, i32, i32, i32)
) -> i32 {
    x1 + y1 + x2 + y2 + wx1 + wy1 + wx2 + wy2
}
func_bigger_tuples_i16:
        mov     ecx, edi
        shr     ecx, 16
        mov     rdx, rdi
        add     ecx, edi
        shr     rdi, 32
        shr     rdx, 48
        mov     eax, esi
        shr     eax, 16
        mov     r8, rsi
        add     eax, esi
        shr     rsi, 32
        shr     r8, 48
        add     edi, edx
        add     edi, ecx
        add     eax, esi
        add     eax, edi
        add     eax, r8d
        ret

func_bigger_tuples_i32:
        movdqu  xmm0, xmmword ptr [rdi]
        movdqu  xmm1, xmmword ptr [rsi]
        paddd   xmm1, xmm0
        pshufd  xmm0, xmm1, 238
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 85
        paddd   xmm1, xmm0
        movd    eax, xmm1
        ret

The library I'm working on is supposed to be used in a hot loop, and I would like to avoid unnecessary instructions. Maybe understanding deeper how these different layouts get translated to assembly will help me find peace and stop ruminating. Where should I look?

That's a noble but naïve goal. If you want to save time, then you should pick the fastest option, not the one with the smallest number of instructions or bytes.

The best (and really, the only) way to make sure that you actually picked the fastest option is to profile the code. Even more so in the presence of other code – such small and trivial functions are overwhelmingly likely to be inlined, so trying to profile them outside the context/loop they will actually be used in is almost completely meaningless.

4 Likes

Godbolt's Compiler Explorer supports a really useful tool for this sort of analysis, called llvm-mca. It has limitations, but it provides a wealth of information on how expensive it is to run any given code block on a CPU (and you can do things like tell llvm-mca to consider different CPUs with the same input assembly code to get a sense of what it's optimizing for).

2 Likes

If your function arguments are expensive, you don't have an argument problem, you have a lack-of-inlining problem.

3 Likes

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.