Avoiding generics when used in multiple crates for compile speed

Hi,

I'm in the process of trying to optimize compile/build time for a large-ish workspace.

One of the suspected culprits is a lot of generic crates imported by many other crates, with the implication that the same code is built many times.

As the most extreme example I have a Crate exporting a structure:

Curve<T> where T: Float

That is used in all other crates and binaries. It's only ever used as Curve&lt;f32&gt; or Curve&lt;f64&gt;.

Now defining new types:

pub type Curve32 = Curve&lt;f32&gt;;

Will obviously have no effect, but if I created Curve32 and Curve64 as tuple structs:

pub struct Curve32(Curve<f32>);

And implemented wrapping methods for all methods of Curve. Doing this (for multiple crates), my thinking was that I would decrease the compile time since I do not expose generics across crate boundaries => the code is only instantiated once.

Do you think this approach makes sense - to avoid generics across crate boundaries?

I don't believe this is the case (IIUC; someone may be able to correct me). The type of the struct is Curve32(Curve<f32>) not Curve32. So, you wouldn't be reducing the type-resolution cost any more than just using pub type Curve32 = Curve<f32>.

The way I understand it, types like Curve<f32> are (hypothetically) not really a problem, because they have only one solution, but in practice functions can be codegen'd multiple times due to compiler implementation details. The unstable option -Zshare-generics should address many compile speed issues related to monomorphization.

On the other hand, it becomes a problem when you have many types: Curve<f32>, Curve<f64>, Curve<N32>, Curve<N64>, Curve<R32>, Curve<R64>, ... because codegen has to do more work to monomorphize and deduplicate implementations generated for each function for every type. And this issue can be addressed, somewhat, with non-generic inner functions or polymorphization.

But anyway, do you know for certain that monomorphization is the problem? Have you benchmarked the compiler building your crate or workspace with cargo --timings or -Zself-profile?

1 Like

I can't speak authoritatively to the overall compilation performance, but this is definitely not true: (Curve<f32>) is just the fields of Curve32, and Curve32 is not a generic type any more than any other struct without type parameters.

To check and demonstrate this, I wrote the following test code in a library package:

pub struct Example(Vec<f32>);

impl Example {
    pub fn new() -> Self {
        Self(Vec::new())
    }

    pub fn push(&mut self, x: f32) {
        self.0.push(x)
    }
}

And then compiled it with cargo rustc --lib --release -- --emit=asm, producing the following assembly file (for x86_64-apple-darwin):

        .section        __TEXT,__text,regular,pure_instructions
        .macosx_version_min 10, 7
        .p2align        4, 0x90
__ZN5alloc7raw_vec11finish_grow17h2cec18ebe4b8467fE:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        pushq   %r15
        pushq   %r14
        pushq   %rbx
        pushq   %rax
        .cfi_offset %rbx, -40
        .cfi_offset %r14, -32
        .cfi_offset %r15, -24
        movq    %rsi, %r14
        movq    %rdi, %rbx
        testq   %rdx, %rdx
        je      LBB0_5
        movq    %rdx, %r15
        cmpq    $0, 16(%rcx)
        je      LBB0_7
        movq    8(%rcx), %rsi
        testq   %rsi, %rsi
        je      LBB0_7
        movq    (%rcx), %rdi
        movq    %r15, %rdx
        movq    %r14, %rcx
        callq   ___rust_realloc
        testq   %rax, %rax
        jne     LBB0_11
LBB0_4:
        movq    %r14, 8(%rbx)
        movq    %r15, 16(%rbx)
        jmp     LBB0_6
LBB0_7:
        testq   %r14, %r14
        je      LBB0_8
        movq    %r14, %rdi
        movq    %r15, %rsi
        callq   ___rust_alloc
        testq   %rax, %rax
        je      LBB0_4
LBB0_11:
        movq    %rax, 8(%rbx)
        movq    %r14, 16(%rbx)
        xorl    %eax, %eax
        jmp     LBB0_12
LBB0_5:
        movq    %r14, 8(%rbx)
        movq    $0, 16(%rbx)
LBB0_6:
        movl    $1, %eax
LBB0_12:
        movq    %rax, (%rbx)
        addq    $8, %rsp
        popq    %rbx
        popq    %r14
        popq    %r15
        popq    %rbp
        retq
LBB0_8:
        movq    %r15, %rax
        testq   %rax, %rax
        jne     LBB0_11
        jmp     LBB0_4
        .cfi_endproc

        .p2align        4, 0x90
__ZN5alloc7raw_vec19RawVec$LT$T$C$A$GT$16reserve_for_push17hd9ce4f6e52de7bc4E:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        pushq   %r14
        pushq   %rbx
        subq    $48, %rsp
        .cfi_offset %rbx, -32
        .cfi_offset %r14, -24
        incq    %rsi
        je      LBB1_10
        movq    %rdi, %rbx
        movq    (%rdi), %rax
        leaq    (%rax,%rax), %rcx
        cmpq    %rsi, %rcx
        cmovaq  %rcx, %rsi
        cmpq    $5, %rsi
        movl    $4, %r14d
        cmovaeq %rsi, %r14
        xorl    %edx, %edx
        movq    %r14, %rcx
        shrq    $61, %rcx
        sete    %dl
        leaq    (,%r14,4), %rsi
        shlq    $2, %rdx
        testq   %rax, %rax
        je      LBB1_3
        movq    8(%rbx), %rcx
        shlq    $2, %rax
        movq    %rcx, -40(%rbp)
        movq    %rax, -32(%rbp)
        movq    $4, -24(%rbp)
        jmp     LBB1_4
LBB1_3:
        movq    $0, -24(%rbp)
LBB1_4:
        leaq    -64(%rbp), %rdi
        leaq    -40(%rbp), %rcx
        callq   __ZN5alloc7raw_vec11finish_grow17h2cec18ebe4b8467fE
        cmpq    $0, -64(%rbp)
        movq    -56(%rbp), %rdi
        je      LBB1_5
        movq    -48(%rbp), %rsi
        movabsq $-9223372036854775807, %rax
        cmpq    %rax, %rsi
        je      LBB1_6
        testq   %rsi, %rsi
        jne     LBB1_9
LBB1_10:
        callq   __ZN5alloc7raw_vec17capacity_overflow17h833df1bab8a6c0cbE
LBB1_5:
        movq    %rdi, 8(%rbx)
        movq    %r14, (%rbx)
LBB1_6:
        addq    $48, %rsp
        popq    %rbx
        popq    %r14
        popq    %rbp
        retq
LBB1_9:
        callq   __ZN5alloc5alloc18handle_alloc_error17h7841226fad19e9beE
        .cfi_endproc

        .globl  __ZN10scratchpad7Example3new17ha3cb64f0783a7025E
        .p2align        4, 0x90
__ZN10scratchpad7Example3new17ha3cb64f0783a7025E:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        movq    %rdi, %rax
        movq    $0, (%rdi)
        movq    $4, 8(%rdi)
        movq    $0, 16(%rdi)
        popq    %rbp
        retq
        .cfi_endproc

        .globl  __ZN10scratchpad7Example4push17hbf1de231eeeb6362E
        .p2align        4, 0x90
__ZN10scratchpad7Example4push17hbf1de231eeeb6362E:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        pushq   %rbx
        pushq   %rax
        .cfi_offset %rbx, -24
        movq    %rdi, %rbx
        movq    16(%rdi), %rsi
        cmpq    (%rdi), %rsi
        jne     LBB3_2
        movq    %rbx, %rdi
        movss   %xmm0, -12(%rbp)
        callq   __ZN5alloc7raw_vec19RawVec$LT$T$C$A$GT$16reserve_for_push17hd9ce4f6e52de7bc4E
        movss   -12(%rbp), %xmm0
        movq    16(%rbx), %rsi
LBB3_2:
        movq    8(%rbx), %rax
        movss   %xmm0, (%rax,%rsi,4)
        incq    %rsi
        movq    %rsi, 16(%rbx)
        addq    $8, %rsp
        popq    %rbx
        popq    %rbp
        retq
        .cfi_endproc

.subsections_via_symbols

This contains code for creating and (re)allocating the Vec<f32>, so monomorphization and code generation has happened at the library compilation stage.

If the type were generic, we would not see any of this machine code; it would only exist as MIR in the .rlib output, and be turned into machine code only when it is used concretely in some other crate.

Sorry, but aren't we saying the same thing? My point is that the type alias and newtype are roughly equivalent WRT codegen.

Change the code to use an alias and the assembly output should be identical, with the exception of symbol names.

pub type Example = Vec<f32>;

pub fn new() -> Example {
    Example::new()
}

pub fn push(vec: &mut Example, x: f32) {
    vec.push(x)
}

By introducing those free functions you have made the code non-generic in another, equivalent way.

I took your original post as saying that @vron 's idea will not work, and I showed that it would. Ah, I see — you are saying that creating structs is not necessary to achieve earlier monomorphization? That's true. The thing that is necessary is to expose non-generic functions instead of generic ones, by whatever means.

I completely agree.

The original hypothesis was that using a newtype instead of a type alias would improve compile speed, but I don't think that is true for all of the reasons stated here. The difference is most likely to be net neutral between the two approaches.

I wasn't saying the idea "won't work", just that either approach will be roughly equivalent. I could have been clearer about what I was disagreeing with in the OP.

Thank you both - that was a very instructive discussion :slight_smile:

So my assumption that a type alias is just that - a type alias that behaves exactly the same as if I had typed out the type - is actually not true for codegen. Good to know - and that makes my task much easier, if that is the case, since I do not have to maintain all the wrapping methods.

However - are you completely sure? (I will be making a test myself.) I'm 99% sure that I previously tested and concluded that a type alias was not enough to prevent a large numerical library crate (linking C++ etc., so the build time is long) from being compiled multiple times when building multiple binary crates in the same workspace. Admittedly that was a while back - so I will be testing it again.

/V

What makes you think so? The example in this thread showed not the alias being the reason for codegen changes, but the usage of it.

Sorry - not getting your point here - if you read a little further I write "not true for codegen" - i.e. I am trying to accept that my initial assumption (the one you are referring to) was wrong.

I.e. My point was to say "thank you - I was apparently wrong" :slight_smile:

Hi,

So I have tried the -Zself-profile as you recommended but struggling to understand the output (there is not much documentation available online).

A first glance at the output below seems to indicate that all the time is spent in codegen & optimization of the binary I'm profiling here - but I'm struggling to reconcile that with the fact that the binary really is only a small tokio/axum server calling a lot of my other crates (which have already been built, so they should only need to be linked). I.e. could it be that the codegen here really includes codegen of generic code from other crates?

If so - how can I profile to see which imported crate is taking up the time? (I'm not seeing that output either from self-profile (showing the below) or from cargo --timings (which shows just the total time, about a minute, to build the binary).)

Thanks for any pointers

+-------------------------------------------------+-----------+-----------------+----------+------------+
| Item                                            | Self time | % of total time | Time     | Item count |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| LLVM_module_optimize                            | 103.62s   | 46.528          | 103.62s  | 17         |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| LLVM_module_codegen_emit_obj                    | 55.70s    | 25.010          | 55.70s   | 17         |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| LLVM_passes                                     | 27.35s    | 12.282          | 27.35s   | 1          |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| finish_ongoing_codegen                          | 20.67s    | 9.281           | 20.67s   | 1          |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| codegen_module                                  | 4.53s     | 2.035           | 5.61s    | 16         |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| run_linker                                      | 2.01s     | 0.901           | 2.01s    | 1          |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| codegen_crate                                   | 1.63s     | 0.730           | 11.49s   | 1          |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| normalize_projection_ty                         | 1.11s     | 0.499           | 1.15s    | 4950       |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| codegen_fulfill_obligation                      | 1.08s     | 0.487           | 1.31s    | 13273      |
+-------------------------------------------------+-----------+-----------------+----------+------------+
| LLVM_module_codegen                             | 820.73ms  | 0.369           | 56.52s   | 17         |
+-------------------------------------------------+-----------+-----------------+----------+------------+

No, that's not quite it. It shouldn't matter whether you use the original type Curve<f32>, a type alias, or a newtype struct, but you do have to write the wrapping non-generic functions — whether they are methods of a newtype (as in my example code) or not (as in parasyte's example). Those functions are where the early code generation you're looking for happens.

But given that you have to write the wrappers (and use them in the dependent crates), writing them as methods on a newtype may be the most usable way to provide them. No effect on performance.

2 Likes

Aaa - got it... Back to trying to actually figure out whether this codegen in today's code with generics is actually the problem, before I start maintaining all those wrappers, then.

Thx

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.