Rust auto-vectorization is 9000% slower

I am developing a game that uses Worley noise. The benchmark I am using to measure performance runs in 60 microseconds when no SIMD code is generated, and ~9 milliseconds when SIMD code is generated (checked with perf annotate). The SIMD version of the function is several hundred lines of assembly, while the non-SIMD version is 20-30 lines.

How do I stop Rust from generating SIMD code? Whether or not it is generated doesn't seem to depend on any single factor.

use glam::Vec2;

// !! This function compiles to SIMD.
/// Returns the two smallest distances from `pos` to the points that
/// `perm.cell2` yields for the 3×3 block of cells centred on `pos.as_ivec2()`.
///
/// Element 0 of the result is the closest candidate and element 1 the
/// second closest; each entry is `(distance, point)`. Slots that were never
/// filled keep the initial `(f32::INFINITY, Vec2::ZERO)` value.
///
/// NOTE(review): `cell2` is presumably the feature point assigned to a grid
/// cell — confirm against `Permutation`.
#[inline]
pub fn worley2x2(pos: Vec2, perm: &Permutation) -> [(f32, Vec2); 2] {
    let base = pos.as_ivec2();

    // Running (squared distance, point) for the best and second-best candidates.
    let mut nearest = (f32::INFINITY, Vec2::ZERO);
    let mut second = (f32::INFINITY, Vec2::ZERO);

    // Visit neighbours row by row (x varies fastest) so that ties are resolved
    // in favour of the candidate encountered first.
    for dy in -1..=1 {
        for dx in -1..=1 {
            let point = perm.cell2(base + ivec2(dx, dy));
            let dist_sq = (pos.x - point.x).powi(2) + (pos.y - point.y).powi(2);

            if dist_sq < nearest.0 {
                // New best: the previous best is demoted to second place.
                second = nearest;
                nearest = (dist_sq, point);
            } else if dist_sq < second.0 {
                second = (dist_sq, point);
            }
        }
    }

    // Comparisons were done on squared distances; take the root only once at the end.
    [
        (nearest.0.sqrt(), nearest.1),
        (second.0.sqrt(), second.1),
    ]
}

What target-cpu are you compiling for, and what SIMD ISA instructions are emitted? We have seen these kinds of bugs with SIMD lowering before. It's kind of tricky to diagnose, but you might also be running into a known issue. I would suggest searching for similar reports.

To disable SIMD on the entire binary, you can use things like RUSTFLAGS='-C target-feature=-avx,-avx2' for whatever series of features you want to disable.

4 Likes

I'm not compiling for any specific features. I'm just running cargo bench "name".

I don't want to disable SIMD entirely; it works very well for Perlin noise that utilizes f32x4.

This is the output of my baseline function (with outer loop from benchmark)

 Percent |	Source code & Disassembly of worley-f094cb88dc8042c1 for cycles:P (35707 samples, percent: local period)
------------------------------------------------------------------------------------------------------------------------
         :
         :
         :
         : 3      Disassembly of section .text:
         :
         : 5      0000000000368170 <worley::baseline_worley2x2>:
    0.00 :   368170: mov    0x10(%rdi),%rax
    0.00 :   368174: test   %rax,%rax
    0.00 :   368177: je     3681c2 <worley::baseline_worley2x2+0x52>
    0.00 :   368179: push   %rbx
    0.00 :   36817a: mov    0x1c(%rdi),%ecx
    0.00 :   36817d: mov    0x20(%rdi),%edx
    0.00 :   368180: mov    0x24(%rdi),%esi
    0.00 :   368183: mov    0x28(%rdi),%r8d
    0.00 :   368187: mov    0x2c(%rdi),%r9d
    0.00 :   36818b: mov    0x8(%rdi),%r10
    0.00 :   36818f: mov    0x18(%rdi),%edi
    0.00 :   368192: shl    $0x3,%rax
    0.00 :   368196: xor    %r11d,%r11d
    0.00 :   368199: mov    %ecx,%ebx
    0.00 :   36819b: jmp    3681b4 <worley::baseline_worley2x2+0x44>
    0.00 :   36819d: nopl   (%rax)
   16.32 :   3681a0: add    %r9d,%ebx
   32.66 :   3681a3: movq   $0x0,(%r10,%r11,1)
   16.51 :   3681ab: add    $0x8,%r11
   16.35 :   3681af: cmp    %r11,%rax
    0.00 :   3681b2: je     3681c1 <worley::baseline_worley2x2+0x51>
   16.42 :   3681b4: cmp    %esi,%ebx
    0.00 :   3681b6: jl     3681a0 <worley::baseline_worley2x2+0x30>
    1.03 :   3681b8: add    %r8d,%edi
    0.47 :   3681bb: mov    %ecx,%ebx
    0.21 :   3681bd: cmp    %edx,%edi
    0.02 :   3681bf: jl     3681a0 <worley::baseline_worley2x2+0x30>
    0.00 :   3681c1: pop    %rbx
    0.00 :   3681c2: ret

And this is the output when I put #[inline(never)] on the problem worley2x2 function.

 Percent |	Source code & Disassembly of worley-f094cb88dc8042c1 for cycles:P (34556 samples, percent: local period)
------------------------------------------------------------------------------------------------------------------------
         :
         :
         :
         : 3      Disassembly of section .text:
         :
         : 5      000000000032e930 <terrain::noise::worley::worley2x2>:
    0.13 :   32e930: push   %rbp
    0.32 :   32e931: push   %r15
    0.28 :   32e933: push   %r14
    0.13 :   32e935: push   %r12
    0.12 :   32e937: push   %rbx
    0.27 :   32e938: cvttss2si %xmm0,%ecx
    0.15 :   32e93c: movss  -0x2e9f44(%rip),%xmm2        # 44a00 <anon.8a551d1f60a9618a374ee40eae30dd74.44.llvm.9167635081569607948+0xbc0>
    0.26 :   32e944: ucomiss %xmm2,%xmm0
    0.11 :   32e947: mov    $0x7fffffff,%r9d
    0.17 :   32e94d: cmova  %r9d,%ecx
    0.15 :   32e951: xor    %edx,%edx
    0.30 :   32e953: ucomiss %xmm0,%xmm0
    0.10 :   32e956: cmovp  %edx,%ecx
    0.27 :   32e959: cvttss2si %xmm1,%r8d
    0.25 :   32e95e: ucomiss %xmm2,%xmm1
    0.15 :   32e961: mov    %rdi,%rax
    0.15 :   32e964: cmova  %r9d,%r8d
    0.24 :   32e968: ucomiss %xmm1,%xmm1
    0.13 :   32e96b: cmovp  %edx,%r8d
    0.15 :   32e96f: dec    %r8d
    0.14 :   32e972: unpcklps %xmm1,%xmm0
    0.16 :   32e975: dec    %ecx
    0.18 :   32e977: movss  -0x2eb147(%rip),%xmm2        # 43838 <anon.d84a67f96832bb96bdd3e33829159445.308.llvm.13447959004201808895+0x404>
    0.15 :   32e97f: mov    $0xaaaaaaab,%edi
    0.14 :   32e984: xorps  %xmm1,%xmm1
    0.12 :   32e987: movaps -0x2ecede(%rip),%xmm3        # 41ab0 <anon.04905de9e591e303b6bdd27fb733ee02.158.llvm.9333659855979572961+0x140>
    0.10 :   32e98e: xor    %r10d,%r10d
    0.12 :   32e991: movaps %xmm2,%xmm4
    0.13 :   32e994: xor    %r9d,%r9d
    0.12 :   32e997: xor    %r11d,%r11d
    0.11 :   32e99a: xor    %ebx,%ebx
    0.13 :   32e99c: jmp    32e9ab <terrain::noise::worley::worley2x2+0x7b>
    0.00 :   32e99e: xchg   %ax,%ax
    1.28 :   32e9a0: inc    %ebx
    1.24 :   32e9a2: cmp    $0x9,%ebx
    0.00 :   32e9a5: je     32eaa6 <terrain::noise::worley::worley2x2+0x176>
    1.41 :   32e9ab: mov    %ebx,%r14d
    1.25 :   32e9ae: imul   %rdi,%r14
    1.19 :   32e9b2: shr    $0x21,%r14
    1.33 :   32e9b6: lea    (%r14,%r14,2),%ebp
    1.26 :   32e9ba: mov    %ecx,%r15d
    1.46 :   32e9bd: sub    %ebp,%r15d
    1.44 :   32e9c0: add    %ebx,%r15d
    1.41 :   32e9c3: add    %r8d,%r14d
    1.67 :   32e9c6: mov    %r15d,%ebp
    1.30 :   32e9c9: rol    $0x3,%ebp
    1.53 :   32e9cc: xor    %r14d,%ebp
    1.38 :   32e9cf: mov    %ebp,%r12d
    1.36 :   32e9d2: rol    $0x5,%r12d
    1.45 :   32e9d6: xor    %ebp,%r12d
    1.41 :   32e9d9: mov    %r12d,%ebp
    1.64 :   32e9dc: shr    $0x7,%ebp
    1.93 :   32e9df: xor    %r12d,%ebp
    1.69 :   32e9e2: movzbl %bpl,%r12d
    1.28 :   32e9e6: rol    $0x3,%r14d
    1.77 :   32e9ea: xor    %r15d,%r14d
    1.69 :   32e9ed: mov    %r14d,%ebp
    1.39 :   32e9f0: rol    $0x5,%ebp
    1.50 :   32e9f3: xor    %r14d,%ebp
    1.50 :   32e9f6: mov    %ebp,%r14d
    1.90 :   32e9f9: shr    $0x7,%r14d
    1.47 :   32e9fd: xor    %ebp,%r14d
    1.55 :   32ea00: movzbl %r14b,%r14d
    1.47 :   32ea04: movzbl (%rsi,%r14,1),%ebp
    1.63 :   32ea09: movzbl 0x100(%rsi,%r12,1),%r15d
    1.44 :   32ea12: movzbl 0x100(%rsi,%r14,1),%r14d
    1.55 :   32ea1b: shl    $0x8,%r14d
    1.38 :   32ea1f: or     %ebp,%r14d
    2.84 :   32ea22: movd   %r14d,%xmm5
    1.43 :   32ea27: movzbl (%rsi,%r12,1),%ebp
    1.74 :   32ea2c: shl    $0x8,%ebp
    1.41 :   32ea2f: or     %r15d,%ebp
    2.67 :   32ea32: movd   %ebp,%xmm6
    1.44 :   32ea36: pxor   %xmm5,%xmm6
    1.23 :   32ea3a: punpcklbw %xmm1,%xmm6
    1.34 :   32ea3e: punpcklwd %xmm1,%xmm6
    1.27 :   32ea42: cvtdq2ps %xmm6,%xmm6
    1.27 :   32ea45: mulps  %xmm3,%xmm6
    1.42 :   32ea48: movaps %xmm0,%xmm7
    1.24 :   32ea4b: subps  %xmm6,%xmm7
    1.29 :   32ea4e: mulps  %xmm7,%xmm7
    1.28 :   32ea51: movaps %xmm7,%xmm5
    1.24 :   32ea54: shufps $0x55,%xmm7,%xmm5
    1.29 :   32ea58: addss  %xmm7,%xmm5
    2.57 :   32ea5c: ucomiss %xmm5,%xmm4
    1.23 :   32ea5f: jbe    32e9a0 <terrain::noise::worley::worley2x2+0x70>
    1.51 :   32ea65: ucomiss %xmm5,%xmm2
    0.69 :   32ea68: movd   %xmm6,%r9d
    0.76 :   32ea6d: shufps $0x55,%xmm6,%xmm6
    0.76 :   32ea71: movd   %xmm6,%r11d
    0.58 :   32ea76: movaps %xmm5,%xmm6
    0.68 :   32ea79: mov    %edx,%ebp
    0.63 :   32ea7b: cmova  %r9d,%ebp
    0.77 :   32ea7f: mov    %r10d,%r14d
    0.69 :   32ea82: cmova  %r11d,%r14d
    0.70 :   32ea86: minss  %xmm2,%xmm6
    0.99 :   32ea8a: cmova  %edx,%r9d
    0.65 :   32ea8e: movaps %xmm2,%xmm4
    0.71 :   32ea91: cmova  %r10d,%r11d
    0.68 :   32ea95: maxss  %xmm5,%xmm4
    0.69 :   32ea99: movaps %xmm6,%xmm2
    0.64 :   32ea9c: mov    %ebp,%edx
    0.69 :   32ea9e: mov    %r14d,%r10d
    0.68 :   32eaa1: jmp    32e9a0 <terrain::noise::worley::worley2x2+0x70>
    0.20 :   32eaa6: sqrtss %xmm2,%xmm0
    0.14 :   32eaaa: xorps  %xmm1,%xmm1
    0.15 :   32eaad: sqrtss %xmm4,%xmm1
    0.12 :   32eab1: movss  %xmm0,(%rax)
    0.18 :   32eab5: mov    %edx,0x4(%rax)
    0.21 :   32eab8: mov    %r10d,0x8(%rax)
    0.14 :   32eabc: movss  %xmm1,0xc(%rax)
    0.18 :   32eac1: mov    %r9d,0x10(%rax)
    0.14 :   32eac5: mov    %r11d,0x14(%rax)
    0.21 :   32eac9: pop    %rbx
    0.21 :   32eaca: pop    %r12
    0.16 :   32eacc: pop    %r14
    0.18 :   32eace: pop    %r15
    0.13 :   32ead0: pop    %rbp
    0.40 :   32ead1: ret

Your "baseline" doesn't appear to have any floating point operations -- I suspect the entire calculation was optimized away as an unused result.

2 Likes

I'm embarrassed. I had a mistake in the baseline function; once I fixed it, the two results came out equal. Thank you for your help.

3 Likes