I'm not compiling with any specific target features enabled. I'm just running `cargo bench "name"`.
I don't want to disable SIMD entirely; it works very well for Perlin noise, which uses `f32x4`.
This is the output for my baseline function (including the outer loop from the benchmark):
Percent | Source code & Disassembly of worley-f094cb88dc8042c1 for cycles:P (35707 samples, percent: local period)
------------------------------------------------------------------------------------------------------------------------
:
:
:
: 3 Disassembly of section .text:
:
: 5 0000000000368170 <worley::baseline_worley2x2>:
0.00 : 368170: mov 0x10(%rdi),%rax
0.00 : 368174: test %rax,%rax
0.00 : 368177: je 3681c2 <worley::baseline_worley2x2+0x52>
0.00 : 368179: push %rbx
0.00 : 36817a: mov 0x1c(%rdi),%ecx
0.00 : 36817d: mov 0x20(%rdi),%edx
0.00 : 368180: mov 0x24(%rdi),%esi
0.00 : 368183: mov 0x28(%rdi),%r8d
0.00 : 368187: mov 0x2c(%rdi),%r9d
0.00 : 36818b: mov 0x8(%rdi),%r10
0.00 : 36818f: mov 0x18(%rdi),%edi
0.00 : 368192: shl $0x3,%rax
0.00 : 368196: xor %r11d,%r11d
0.00 : 368199: mov %ecx,%ebx
0.00 : 36819b: jmp 3681b4 <worley::baseline_worley2x2+0x44>
0.00 : 36819d: nopl (%rax)
16.32 : 3681a0: add %r9d,%ebx
32.66 : 3681a3: movq $0x0,(%r10,%r11,1)
16.51 : 3681ab: add $0x8,%r11
16.35 : 3681af: cmp %r11,%rax
0.00 : 3681b2: je 3681c1 <worley::baseline_worley2x2+0x51>
16.42 : 3681b4: cmp %esi,%ebx
0.00 : 3681b6: jl 3681a0 <worley::baseline_worley2x2+0x30>
1.03 : 3681b8: add %r8d,%edi
0.47 : 3681bb: mov %ecx,%ebx
0.21 : 3681bd: cmp %edx,%edi
0.02 : 3681bf: jl 3681a0 <worley::baseline_worley2x2+0x30>
0.00 : 3681c1: pop %rbx
0.00 : 3681c2: ret
And this is the output when I put `#[inline(never)]` on the problematic `worley2x2` function:
Percent | Source code & Disassembly of worley-f094cb88dc8042c1 for cycles:P (34556 samples, percent: local period)
------------------------------------------------------------------------------------------------------------------------
:
:
:
: 3 Disassembly of section .text:
:
: 5 000000000032e930 <terrain::noise::worley::worley2x2>:
0.13 : 32e930: push %rbp
0.32 : 32e931: push %r15
0.28 : 32e933: push %r14
0.13 : 32e935: push %r12
0.12 : 32e937: push %rbx
0.27 : 32e938: cvttss2si %xmm0,%ecx
0.15 : 32e93c: movss -0x2e9f44(%rip),%xmm2 # 44a00 <anon.8a551d1f60a9618a374ee40eae30dd74.44.llvm.9167635081569607948+0xbc0>
0.26 : 32e944: ucomiss %xmm2,%xmm0
0.11 : 32e947: mov $0x7fffffff,%r9d
0.17 : 32e94d: cmova %r9d,%ecx
0.15 : 32e951: xor %edx,%edx
0.30 : 32e953: ucomiss %xmm0,%xmm0
0.10 : 32e956: cmovp %edx,%ecx
0.27 : 32e959: cvttss2si %xmm1,%r8d
0.25 : 32e95e: ucomiss %xmm2,%xmm1
0.15 : 32e961: mov %rdi,%rax
0.15 : 32e964: cmova %r9d,%r8d
0.24 : 32e968: ucomiss %xmm1,%xmm1
0.13 : 32e96b: cmovp %edx,%r8d
0.15 : 32e96f: dec %r8d
0.14 : 32e972: unpcklps %xmm1,%xmm0
0.16 : 32e975: dec %ecx
0.18 : 32e977: movss -0x2eb147(%rip),%xmm2 # 43838 <anon.d84a67f96832bb96bdd3e33829159445.308.llvm.13447959004201808895+0x404>
0.15 : 32e97f: mov $0xaaaaaaab,%edi
0.14 : 32e984: xorps %xmm1,%xmm1
0.12 : 32e987: movaps -0x2ecede(%rip),%xmm3 # 41ab0 <anon.04905de9e591e303b6bdd27fb733ee02.158.llvm.9333659855979572961+0x140>
0.10 : 32e98e: xor %r10d,%r10d
0.12 : 32e991: movaps %xmm2,%xmm4
0.13 : 32e994: xor %r9d,%r9d
0.12 : 32e997: xor %r11d,%r11d
0.11 : 32e99a: xor %ebx,%ebx
0.13 : 32e99c: jmp 32e9ab <terrain::noise::worley::worley2x2+0x7b>
0.00 : 32e99e: xchg %ax,%ax
1.28 : 32e9a0: inc %ebx
1.24 : 32e9a2: cmp $0x9,%ebx
0.00 : 32e9a5: je 32eaa6 <terrain::noise::worley::worley2x2+0x176>
1.41 : 32e9ab: mov %ebx,%r14d
1.25 : 32e9ae: imul %rdi,%r14
1.19 : 32e9b2: shr $0x21,%r14
1.33 : 32e9b6: lea (%r14,%r14,2),%ebp
1.26 : 32e9ba: mov %ecx,%r15d
1.46 : 32e9bd: sub %ebp,%r15d
1.44 : 32e9c0: add %ebx,%r15d
1.41 : 32e9c3: add %r8d,%r14d
1.67 : 32e9c6: mov %r15d,%ebp
1.30 : 32e9c9: rol $0x3,%ebp
1.53 : 32e9cc: xor %r14d,%ebp
1.38 : 32e9cf: mov %ebp,%r12d
1.36 : 32e9d2: rol $0x5,%r12d
1.45 : 32e9d6: xor %ebp,%r12d
1.41 : 32e9d9: mov %r12d,%ebp
1.64 : 32e9dc: shr $0x7,%ebp
1.93 : 32e9df: xor %r12d,%ebp
1.69 : 32e9e2: movzbl %bpl,%r12d
1.28 : 32e9e6: rol $0x3,%r14d
1.77 : 32e9ea: xor %r15d,%r14d
1.69 : 32e9ed: mov %r14d,%ebp
1.39 : 32e9f0: rol $0x5,%ebp
1.50 : 32e9f3: xor %r14d,%ebp
1.50 : 32e9f6: mov %ebp,%r14d
1.90 : 32e9f9: shr $0x7,%r14d
1.47 : 32e9fd: xor %ebp,%r14d
1.55 : 32ea00: movzbl %r14b,%r14d
1.47 : 32ea04: movzbl (%rsi,%r14,1),%ebp
1.63 : 32ea09: movzbl 0x100(%rsi,%r12,1),%r15d
1.44 : 32ea12: movzbl 0x100(%rsi,%r14,1),%r14d
1.55 : 32ea1b: shl $0x8,%r14d
1.38 : 32ea1f: or %ebp,%r14d
2.84 : 32ea22: movd %r14d,%xmm5
1.43 : 32ea27: movzbl (%rsi,%r12,1),%ebp
1.74 : 32ea2c: shl $0x8,%ebp
1.41 : 32ea2f: or %r15d,%ebp
2.67 : 32ea32: movd %ebp,%xmm6
1.44 : 32ea36: pxor %xmm5,%xmm6
1.23 : 32ea3a: punpcklbw %xmm1,%xmm6
1.34 : 32ea3e: punpcklwd %xmm1,%xmm6
1.27 : 32ea42: cvtdq2ps %xmm6,%xmm6
1.27 : 32ea45: mulps %xmm3,%xmm6
1.42 : 32ea48: movaps %xmm0,%xmm7
1.24 : 32ea4b: subps %xmm6,%xmm7
1.29 : 32ea4e: mulps %xmm7,%xmm7
1.28 : 32ea51: movaps %xmm7,%xmm5
1.24 : 32ea54: shufps $0x55,%xmm7,%xmm5
1.29 : 32ea58: addss %xmm7,%xmm5
2.57 : 32ea5c: ucomiss %xmm5,%xmm4
1.23 : 32ea5f: jbe 32e9a0 <terrain::noise::worley::worley2x2+0x70>
1.51 : 32ea65: ucomiss %xmm5,%xmm2
0.69 : 32ea68: movd %xmm6,%r9d
0.76 : 32ea6d: shufps $0x55,%xmm6,%xmm6
0.76 : 32ea71: movd %xmm6,%r11d
0.58 : 32ea76: movaps %xmm5,%xmm6
0.68 : 32ea79: mov %edx,%ebp
0.63 : 32ea7b: cmova %r9d,%ebp
0.77 : 32ea7f: mov %r10d,%r14d
0.69 : 32ea82: cmova %r11d,%r14d
0.70 : 32ea86: minss %xmm2,%xmm6
0.99 : 32ea8a: cmova %edx,%r9d
0.65 : 32ea8e: movaps %xmm2,%xmm4
0.71 : 32ea91: cmova %r10d,%r11d
0.68 : 32ea95: maxss %xmm5,%xmm4
0.69 : 32ea99: movaps %xmm6,%xmm2
0.64 : 32ea9c: mov %ebp,%edx
0.69 : 32ea9e: mov %r14d,%r10d
0.68 : 32eaa1: jmp 32e9a0 <terrain::noise::worley::worley2x2+0x70>
0.20 : 32eaa6: sqrtss %xmm2,%xmm0
0.14 : 32eaaa: xorps %xmm1,%xmm1
0.15 : 32eaad: sqrtss %xmm4,%xmm1
0.12 : 32eab1: movss %xmm0,(%rax)
0.18 : 32eab5: mov %edx,0x4(%rax)
0.21 : 32eab8: mov %r10d,0x8(%rax)
0.14 : 32eabc: movss %xmm1,0xc(%rax)
0.18 : 32eac1: mov %r9d,0x10(%rax)
0.14 : 32eac5: mov %r11d,0x14(%rax)
0.21 : 32eac9: pop %rbx
0.21 : 32eaca: pop %r12
0.16 : 32eacc: pop %r14
0.18 : 32eace: pop %r15
0.13 : 32ead0: pop %rbp
0.40 : 32ead1: ret