Which one produces better assembly output: pattern matching on an enum, or a fixed-size array lookup indexed by the enum?

I'm trying to understand the difference in assembly output between matching on an enum and looking the enum up in a fixed-size array.

Code 1:

/// Severity level of a log record.
///
/// `#[repr(u8)]` makes the discriminant a `u8`, and each variant is given an
/// explicit value, so the variants map to the contiguous range `0..=4`.
#[derive(Debug, Clone, Copy)]
#[repr(u8)]
pub enum LogLevel {
    /// Discriminant 0.
    Trace = 0,
    /// Discriminant 1.
    Debug = 1,
    /// Discriminant 2.
    Info = 2,
    /// Discriminant 3.
    Warn = 3,
    /// Discriminant 4.
    Error = 4,
}

impl LogLevel {
    /// Returns the display label for this level as a `'static` byte string.
    ///
    /// `Trace` is returned as plain text; every other level is wrapped in an
    /// ANSI escape sequence (`ESC[3Xm` ... `ESC[0m`), so the labels differ in
    /// length (5, 14, 13, 13 and 14 bytes respectively).
    #[inline(always)]
    const fn as_bytes(self) -> &'static [u8] {
        // Exhaustive match with no catch-all arm: adding a variant without a
        // label fails to compile.
        match self {
            LogLevel::Trace => b"TRACE",
            // SGR 34 = blue foreground, SGR 0 = reset.
            LogLevel::Debug => b"\x1b[34mDEBUG\x1b[0m",
            // SGR 32 = green foreground.
            LogLevel::Info => b"\x1b[32mINFO\x1b[0m",
            // SGR 33 = yellow foreground.
            LogLevel::Warn => b"\x1b[33mWARN\x1b[0m",
            // SGR 31 = red foreground.
            LogLevel::Error => b"\x1b[31mERROR\x1b[0m",
        }
    }
}

/// Thin wrapper around `LogLevel::as_bytes` taking a level that is only known
/// at run time.
///
/// `no_mangle` keeps the symbol name as `task`, which is the label that
/// appears in the assembly listing.
#[unsafe(no_mangle)]
fn task(a: LogLevel) -> &'static [u8] {
    a.as_bytes()
}
# Code generated for `task` (Code 1, x86-64, Intel syntax).
# The discriminant is read from `dil`; the slice is returned as a pointer in
# `rax` and a length in `rdx`. Two separate tables are consulted.
task:
        # rax = address of the length table
        lea     rax, [rip + .Lswitch.table.task]
        # ecx = discriminant, zero-extended from 8 bits
        movzx   ecx, dil
        # rdx = length of the selected string (8-byte entries)
        mov     rdx, qword ptr [rax + 8*rcx]
        # rsi = address of the relative-offset table
        lea     rsi, [rip + .Lswitch.table.task.1.rel]
        # rax = sign-extended 32-bit offset of the selected string (4-byte entries)
        movsxd  rax, dword ptr [rsi + 4*rcx]
        # rax = table base + offset = absolute address of the string
        add     rax, rsi
        ret

# String data, one anonymous constant per variant.
.Lanon.478f3a4c51824ad23cb50c1c60670c0f.0:
        .ascii  "TRACE"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.1:
        .ascii  "\033[34mDEBUG\033[0m"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.2:
        .ascii  "\033[32mINFO\033[0m"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.3:
        .ascii  "\033[33mWARN\033[0m"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.4:
        .ascii  "\033[31mERROR\033[0m"

# Table 1: slice lengths in bytes, one 8-byte entry per variant.
.Lswitch.table.task:
        .quad   5
        .quad   14
        .quad   13
        .quad   13
        .quad   14

# Table 2: 32-bit offsets of each string, relative to the start of this table.
.Lswitch.table.task.1.rel:
        .long   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.0-.Lswitch.table.task.1.rel
        .long   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.1-.Lswitch.table.task.1.rel
        .long   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.2-.Lswitch.table.task.1.rel
        .long   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.3-.Lswitch.table.task.1.rel
        .long   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.4-.Lswitch.table.task.1.rel

Code 2:

/// Severity level of a log record.
///
/// `#[repr(u8)]` makes the discriminant a `u8`, and each variant is given an
/// explicit value, so `self as usize` yields an index in `0..=4`.
#[derive(Debug, Clone, Copy)]
#[repr(u8)]
pub enum LogLevel {
    /// Discriminant 0.
    Trace = 0,
    /// Discriminant 1.
    Debug = 1,
    /// Discriminant 2.
    Info = 2,
    /// Discriminant 3.
    Warn = 3,
    /// Discriminant 4.
    Error = 4,
}

impl LogLevel {
    /// Returns the display label for this level as a `'static` byte string,
    /// looked up in a constant table indexed by the enum discriminant.
    #[inline(always)]
    const fn as_bytes(self) -> &'static [u8] {
        // One entry per variant, in discriminant order (0 = Trace ... 4 = Error).
        // The order must be kept in sync with the enum by hand: unlike a
        // `match`, nothing here fails to compile if a variant is added or
        // reordered.
        const LOOKUP: &[&[u8]] = &[
            b"TRACE",
            b"\x1b[34mDEBUG\x1b[0m",
            b"\x1b[32mINFO\x1b[0m",
            b"\x1b[33mWARN\x1b[0m",
            b"\x1b[31mERROR\x1b[0m",
        ];
        // Slice indexing is bounds-checked by the language; the listing for
        // `task` in this post contains no panic path for it.
        LOOKUP[self as usize]
    }
}

/// Thin wrapper around `LogLevel::as_bytes` taking a level that is only known
/// at run time.
///
/// `no_mangle` keeps the symbol name as `task`, which is the label that
/// appears in the assembly listing.
#[unsafe(no_mangle)]
fn task(a: LogLevel) -> &'static[u8] {
    a.as_bytes()
}

# Code generated for `task` (Code 2, x86-64, Intel syntax).
# The discriminant is read from `dil`; the slice is returned as a pointer in
# `rax` and a length in `rdx`. Both come from one 16-byte table entry.
task:
        # ecx = discriminant, zero-extended from 8 bits
        movzx   ecx, dil
        # ecx *= 16: each table entry is an 8-byte pointer plus an 8-byte length
        shl     ecx, 4
        # rdx = address of the lookup table
        lea     rdx, [rip + .Lanon.478f3a4c51824ad23cb50c1c60670c0f.5]
        # rax = pointer field of the selected entry
        mov     rax, qword ptr [rcx + rdx]
        # rdx = length field of the selected entry (8 bytes after the pointer)
        mov     rdx, qword ptr [rcx + rdx + 8]
        ret

# String data, one anonymous constant per variant.
.Lanon.478f3a4c51824ad23cb50c1c60670c0f.0:
        .ascii  "TRACE"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.1:
        .ascii  "\033[34mDEBUG\033[0m"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.2:
        .ascii  "\033[32mINFO\033[0m"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.3:
        .ascii  "\033[33mWARN\033[0m"

.Lanon.478f3a4c51824ad23cb50c1c60670c0f.4:
        .ascii  "\033[31mERROR\033[0m"

# The `LOOKUP` table: five (pointer, length) pairs laid out back to back.
# Each `.asciz` emits 7 bytes plus a terminating NUL, i.e. an 8-byte
# little-endian length: \005 = 5, \016 (octal) = 14, \r = 13.
.Lanon.478f3a4c51824ad23cb50c1c60670c0f.5:
        .quad   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.0
        .asciz  "\005\000\000\000\000\000\000"
        .quad   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.1
        .asciz  "\016\000\000\000\000\000\000"
        .quad   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.2
        .asciz  "\r\000\000\000\000\000\000"
        .quad   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.3
        .asciz  "\r\000\000\000\000\000\000"
        .quad   .Lanon.478f3a4c51824ad23cb50c1c60670c0f.4
        .asciz  "\016\000\000\000\000\000\000"

If the LogLevel is known at compile time, they produce the same assembly. But if the LogLevel is only known at runtime, they produce different assembly, as shown above. My interpretation is that Code 2 is better because it has one linear table and fewer instructions, and its records are stored contiguously, so the two loads read memory addresses that are close to each other (cache locality, for a better cache hit rate). Code 1, on the other hand, has two separate switch tables, which could cause cache misses if the tables are big, and it has more instructions.

What is your interpretation?

My interpretation is that it doesn't matter at all because if you're outputting to a human to look at -- which you must be or you wouldn't bother with the colour codes -- then you're going to be doing it so infrequently that it's not going to matter.

Write the obvious code and trust that LLVM will do something reasonable.

2 Likes

Hmm, I explicitly want the best assembly output here. That is why I'm asking. The answer "it doesn't matter" doesn't align with my goal (please don't be so defensive about Rust that you normalize every piece of Rust code no matter what; an evaluative/pragmatic answer is better. I don't like defensive answers because they block progress). I want to verify whether my interpretation is correct, learn more about what each assembly line does there, and understand the performance effect.

What I gave is the pragmatic answer.

Care about the assembly in the things that you run often enough that it matters, absolutely. I'm happy to experiment for things like image conversions where millions of pixels are normal -- see Converting a BGRA &[u8] to RGB [u8;N] (for images)? - #12 by scottmcm -- for example, since how you do it can make a material difference.

But your console-output-to-humans is gated on human brain speed, and even horrible slow code is still way faster than that.

4 Likes

Hmm, you said it doesn't matter, when I had already said I explicitly want the best assembly output. You are assuming I do this in infrequently called code; please don't assume something like this, just answer what the asker wants.

I'm creating logger code that will be called millions of times; one request alone will call the logger tens of times. The console log output is readable.

I already explicitly said I don't want slower code, so I'm searching for information. The answer I'm looking for is an explanation of the two assembly listings above and their performance effect.

Looks like there are two table lookups in the first example and only one in the second, which indexes into a single table of 2×8-byte values (slice: pointer + length). The second is also one instruction shorter, so it should be the better version.

1 Like

Thanks for the answer. I thought so too, but I had doubts because I'm still learning assembly. Your answer really helped clear up my doubt.

I'd back up "which one is better" with benchmark times (and flamegraphs for more complicated code). I am somewhat lazy and will happily use a system time utility to run and time an executable, but I believe the recommended approach is Criterion: Criterion.rs - Criterion.rs Documentation

(I do also expect based on the assembly that the second is better, but it's likewise hard for me to be confident about assembly.)

3 Likes