Auto-vectorization in Rust

chriselrod · January 20, 2019, 12:01am

Hi everyone!

I've been playing around with benchmarking a simple problem in a variety of languages/compilers. So far that list is Rust, clang++/g++, ispc, Julia, and gfortran/Flang.

Unfortunately, I cannot get Rust to vectorize the code. This is fairly simple numerical code. A Rust version of the code:

#[inline]
pub fn pdbacksolve(x: (f32,f32,f32), s: (f32,f32,f32,f32,f32,f32)) -> (f32, f32, f32) {

    let (x1,x2,x3) = x;
    let (s11,s12,s22,s13,s23,s33) = s;

    let ui33 = s33.sqrt().recip();
    let u13 = s13 * ui33;
    let u23 = s23 * ui33;
    let ui22 = (s22 - u23*u23).sqrt().recip();
    let u12 = (s12 - u13*u23) * ui22;

    let ui11 = (s11 - u12*u12 - u13*u13).sqrt().recip();
    let ui12 = - u12 * ui11 * ui22;
    let ui33x3 = ui33 * x3;

    (
        ui11*x1 + ui12*x2 - (u13 * ui11 + u23 * ui12) * ui33x3,
        ui22*x1 - u23 * ui22 * ui33x3,
        ui33x3
    )
}

fn processbpp_safe(x: &mut [f32], bpp: &[f32], n: usize){
    for i in 0..n {
        let (x1,x2,x3) = pdbacksolve(
            (bpp[i], bpp[i + n], bpp[i + 2*n]),
            (bpp[i + 4*n], bpp[i + 5*n], bpp[i + 6*n], bpp[i + 7*n], bpp[i + 8*n], bpp[i + 9*n])
        );

        x[i      ] = x1;
        x[i +   n] = x2;
        x[i + 2*n] = x3;
    }
}

#[no_mangle]
pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
    processbpp_safe(unsafe { std::slice::from_raw_parts_mut(x, 3*n) }, unsafe { std::slice::from_raw_parts(bpp, 10*n) }, n);
}

The idea is to create a shared library that could be called from another language.
I am not sure I did that correctly, because I get warnings about the unsafe blocks being unnecessary:

$ cargo rustc --release -- --emit asm
   Compiling process_inputs v0.1.0 (/home/chriselrod/Documents/progwork/rust/process_inputs)
warning: unnecessary `unsafe` block
  --> src/lib.rs:48:21
   |
47 | pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
   | ----------------------------------------------------------------------- because it's nested under this `unsafe` fn
48 |     processbpp_safe(unsafe { std::slice::from_raw_parts_mut(x, 3*n) }, unsafe { std::slice::from_raw_parts(bpp, 10*n) }, n);
   |                     ^^^^^^ unnecessary `unsafe` block
   |
   = note: #[warn(unused_unsafe)] on by default

warning: unnecessary `unsafe` block
  --> src/lib.rs:48:72
   |
47 | pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
   | ----------------------------------------------------------------------- because it's nested under this `unsafe` fn
48 |     processbpp_safe(unsafe { std::slice::from_raw_parts_mut(x, 3*n) }, unsafe { std::slice::from_raw_parts(bpp, 10*n) }, n);
   |                                                                        ^^^^^^ unnecessary `unsafe` block

    Finished release [optimized] target(s) in 0.36s

I compiled with these flags on an architecture with avx512f:

$ rustc --version
rustc 1.33.0-nightly (c76f3c374 2019-01-18)
$ cat .cargo/config 
[target.'cfg(any(windows, unix))']
rustflags = ["-C", "target-cpu=native", "-C", "llvm-args=-ffast-math", "-C", "opt-level=3", "-C", "llvm-args=-force-vector-width=16"]
$ rustc --print cfg -C target-cpu=native -C opt-level=3
target_arch="x86_64"
target_endian="little"
target_env="gnu"
target_family="unix"
target_feature="adx"
target_feature="aes"
target_feature="avx"
target_feature="avx2"
target_feature="avx512bw"
target_feature="avx512cd"
target_feature="avx512dq"
target_feature="avx512f"
target_feature="avx512vl"
target_feature="bmi1"
target_feature="bmi2"
target_feature="cmpxchg16b"
target_feature="fma"
target_feature="fxsr"
target_feature="lzcnt"
target_feature="mmx"
target_feature="pclmulqdq"
target_feature="popcnt"
target_feature="rdrand"
target_feature="rdseed"
target_feature="sse"
target_feature="sse2"
target_feature="sse3"
target_feature="sse4.1"
target_feature="sse4.2"
target_feature="ssse3"
target_feature="xsave"
target_feature="xsavec"
target_feature="xsaveopt"
target_feature="xsaves"
target_has_atomic="16"
target_has_atomic="32"
target_has_atomic="64"
target_has_atomic="8"
target_has_atomic="cas"
target_has_atomic="ptr"
target_os="linux"
target_pointer_width="64"
target_thread_local
target_vendor="unknown"
unix

However, looking at the asm reveals there was no vectorization.

This is the main for loop when compiler with rust:

.LBB0_2:
	cmpq	%rdx, %rcx
	jae	.LBB0_21
	leaq	(%r8,%rcx), %r11
	cmpq	%rdx, %r11
	jae	.LBB0_23
	leaq	(%r14,%rcx), %r10
	cmpq	%rdx, %r10
	jae	.LBB0_24
	leaq	(%rbx,%rcx), %rax
	cmpq	%rdx, %rax
	jae	.LBB0_25
	movq	16(%rsp), %rax
	leaq	(%rax,%rcx), %rax
	cmpq	%rdx, %rax
	jae	.LBB0_26
	movq	8(%rsp), %rax
	leaq	(%rax,%rcx), %rax
	cmpq	%rdx, %rax
	jae	.LBB0_27
	leaq	(%rcx,%rbp), %rax
	cmpq	%rdx, %rax
	jae	.LBB0_28
	leaq	(%rcx,%r13), %rax
	cmpq	%rdx, %rax
	jae	.LBB0_29
	movq	(%rsp), %rax
	leaq	(%rax,%rcx), %rax
	cmpq	%rdx, %rax
	jae	.LBB0_30
	leaq	(%rsi,%rcx,4), %rax
	vmovss	(%rsi,%rcx,4), %xmm3
	vmovss	(%rax,%rbx), %xmm11
	addq	%rbx, %rax
	vmovss	(%rax,%rbx), %xmm12
	addq	%rbx, %rax
	vmovss	(%rax,%r13), %xmm0
	addq	%r13, %rax
	vmovss	(%rax,%rbx), %xmm4
	addq	%rbx, %rax
	vmovss	(%rax,%rbx), %xmm7
	addq	%rbx, %rax
	vmovss	(%rax,%rbx), %xmm2
	addq	%rbx, %rax
	vmovss	(%rax,%rbx), %xmm5
	addq	%rbx, %rax
	vmovss	(%rbx,%rax), %xmm6
	vucomiss	%xmm6, %xmm9
	vmovaps	%xmm10, %xmm1
	ja	.LBB0_13
	vsqrtss	%xmm6, %xmm6, %xmm1
	vdivss	%xmm1, %xmm8, %xmm1
.LBB0_13:
	vmulss	%xmm1, %xmm2, %xmm2
	vmulss	%xmm1, %xmm5, %xmm6
	vmulss	%xmm6, %xmm6, %xmm5
	vsubss	%xmm5, %xmm7, %xmm5
	vucomiss	%xmm5, %xmm9
	vmovaps	%xmm10, %xmm7
	ja	.LBB0_15
	vsqrtss	%xmm5, %xmm5, %xmm5
	vdivss	%xmm5, %xmm8, %xmm7
.LBB0_15:
	vmulss	%xmm6, %xmm2, %xmm5
	vsubss	%xmm5, %xmm4, %xmm4
	vmulss	%xmm7, %xmm4, %xmm5
	vmulss	%xmm5, %xmm5, %xmm4
	vsubss	%xmm4, %xmm0, %xmm0
	vmulss	%xmm2, %xmm2, %xmm4
	vsubss	%xmm4, %xmm0, %xmm0
	vucomiss	%xmm0, %xmm9
	vmovaps	%xmm10, %xmm4
	ja	.LBB0_17
	vsqrtss	%xmm0, %xmm0, %xmm0
	vdivss	%xmm0, %xmm8, %xmm4
.LBB0_17:
	cmpq	%r9, %rcx
	jae	.LBB0_31
	vmulss	%xmm4, %xmm5, %xmm0
	vmulss	%xmm0, %xmm7, %xmm0
	vxorps	.LCPI0_2(%rip){1to4}, %xmm0, %xmm5
	vmulss	%xmm1, %xmm12, %xmm0
	vmulss	%xmm4, %xmm2, %xmm1
	vmulss	%xmm5, %xmm6, %xmm2
	vaddss	%xmm2, %xmm1, %xmm1
	vmulss	%xmm1, %xmm0, %xmm1
	vmulss	%xmm4, %xmm3, %xmm2
	vmulss	%xmm5, %xmm11, %xmm4
	vaddss	%xmm4, %xmm2, %xmm2
	vsubss	%xmm1, %xmm2, %xmm1
	vmovss	%xmm1, (%rdi,%rcx,4)
	cmpq	%r9, %r11
	jae	.LBB0_32
	vmulss	%xmm7, %xmm6, %xmm1
	vmulss	%xmm1, %xmm0, %xmm1
	vmulss	%xmm7, %xmm3, %xmm2
	vsubss	%xmm1, %xmm2, %xmm1
	vmovss	%xmm1, (%r15,%rcx,4)
	cmpq	%r9, %r10
	jae	.LBB0_20
	vmovss	%xmm0, (%r12,%rcx,4)
	addq	$1, %rcx
	cmpq	%r8, %rcx
	jb	.LBB0_2

versus compiled with

$ /usr/bin/clang++ --version
clang version 7.0.1 (Fedora 7.0.1-1.fc29)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
$ /usr/bin/clang++ -Ofast -march=native -S -fPIC process_inputs.cpp -o process_inputs.s

The C++ code looks more or less the same as the rust code, except that instead of creating and unpacking tuples, I moved that from the for loop to the inside of a function. I tested a Rust version written that way, and there was no change from the version written the way presented here.
The Fortran version was written like the Rust version, and was vectorized well with Flang (although not as well as with Clang). I would share that code if anyone is interested.

.LBB0_8:                                # =>This Inner Loop Header: Depth=1
	leaq	(%rsi,%rcx,4), %r14
	vmovups	(%rax,%r14), %zmm2
	addq	%rax, %r14
	leaq	(%r14,%rax), %rbx
	vmovups	(%r9,%rbx), %zmm3
	addq	%r9, %rbx
	leaq	(%rbx,%rax), %r15
	leaq	(%r15,%rax), %r12
	leaq	(%r12,%rax), %r13
	leaq	(%rax,%r13), %rbp
	vmovups	(%rax,%rbp), %zmm4
	vrsqrt14ps	%zmm4, %zmm5
	vmulps	%zmm5, %zmm4, %zmm4
	vfmadd213ps	%zmm0, %zmm5, %zmm4 # zmm4 = (zmm5 * zmm4) + zmm0
	vmulps	%zmm1, %zmm5, %zmm5
	vmulps	%zmm4, %zmm5, %zmm4
	vmulps	(%rax,%r12), %zmm4, %zmm5
	vmulps	(%rax,%r13), %zmm4, %zmm6
	vmovups	(%rax,%r15), %zmm7
	vfnmadd231ps	%zmm6, %zmm6, %zmm7 # zmm7 = -(zmm6 * zmm6) + zmm7
	vrsqrt14ps	%zmm7, %zmm8
	vmulps	%zmm8, %zmm7, %zmm7
	vfmadd213ps	%zmm0, %zmm8, %zmm7 # zmm7 = (zmm8 * zmm7) + zmm0
	vmulps	%zmm1, %zmm8, %zmm8
	vmulps	%zmm7, %zmm8, %zmm7
	vmovups	(%rax,%rbx), %zmm8
	vfnmadd231ps	%zmm6, %zmm5, %zmm8 # zmm8 = -(zmm5 * zmm6) + zmm8
	vmulps	%zmm8, %zmm7, %zmm8
	vmulps	%zmm5, %zmm5, %zmm9
	vfmadd231ps	%zmm8, %zmm8, %zmm9 # zmm9 = (zmm8 * zmm8) + zmm9
	vsubps	%zmm9, %zmm3, %zmm3
	vrsqrt14ps	%zmm3, %zmm9
	vmulps	%zmm9, %zmm3, %zmm3
	vfmadd213ps	%zmm0, %zmm9, %zmm3 # zmm3 = (zmm9 * zmm3) + zmm0
	vmulps	%zmm1, %zmm9, %zmm9
	vmulps	%zmm3, %zmm9, %zmm3
	vmulps	%zmm8, %zmm7, %zmm8
	vmulps	%zmm3, %zmm8, %zmm8
	vmulps	(%rax,%r14), %zmm4, %zmm4
	vmulps	%zmm8, %zmm2, %zmm9
	vfmsub231ps	(%rsi,%rcx,4), %zmm3, %zmm9 # zmm9 = (zmm3 * mem) - zmm9
	vmulps	%zmm5, %zmm3, %zmm3
	vfmsub231ps	%zmm8, %zmm6, %zmm3 # zmm3 = (zmm6 * zmm8) - zmm3
	vfmadd213ps	%zmm9, %zmm4, %zmm3 # zmm3 = (zmm4 * zmm3) + zmm9
	vmovups	%zmm3, (%rdi,%rcx,4)
	vfnmadd213ps	%zmm2, %zmm4, %zmm6 # zmm6 = -(zmm4 * zmm6) + zmm2
	vmulps	%zmm6, %zmm7, %zmm2
	vmovups	%zmm2, (%r10,%rcx,4)
	vmovups	%zmm4, (%r11,%rcx,4)
	addq	$16, %rcx
	cmpq	%rcx, %r8
	jne	.LBB0_8

There is also a second copy of the loop using xmm registers, to catch the n mod vector width remainder.

A few noteworthy differences: the C++ version was vectorized, using zmm registers.
The Rust version was not vectorized.
The C++ version used f(n)m(add/sub) instructions. The Rust version did not.
The C++ version used reciprocal square root instructions.

Other than the fact that calling cargo rustc --release -- --emit asm throws errors when I misspecify the rustflags, these flags do not seem to be doing anything. As far as I can tell, the asm remains the same (and unoptimized).

Am I missing something on how to get Rust to vectorize this numerical code?
If I understand the asm correctly, the bounds checks are a (perhaps the?) problem.

OptimisticPeach · January 20, 2019, 12:10am

Just as a note (I don't know anything about the compiler's internal optimizations) The warnings you get are because the function itself is already declared to have an unsafe context and therefore require an unsafe context to run, so another unsafe context is unnecessary. IE:

pub fn main() {
    unsafe { //We declare an unsafe context here, which we need to call `foo`
        foo();
    }
}
// When we declare this function as unsafe, it means that it can only be run in
// an unsafe context, like the one we declare in `main` with the `unsafe` keyword
// again.
pub unsafe fn foo() {}

kornel · January 20, 2019, 12:10am

You must add -C target-cpu=native to rust flags to use zmm register. By default Rust supports very old CPUs.

for i in 0..n loops are generally the slowest in Rust. If you can't use iterators (which eliminate bounds checks), then the trick is to hint LLVM that the slices are large enough:

    let x = &x[0..n];

but that very much depends on whether LLVM will figure out that the range of the slice matches range used in the for loop. In your more complex case it doesn't.

Out of bounds panic is an observable side effect, so it will totally ruin any autovectorization.

https://rust.godbolt.org/ is awesome for checking ASM output without fiddling with the rustc flags. Here's a version that looks more vectorized:

chriselrod · January 20, 2019, 1:50am

My .cargo/config did have target-cpu=native:

$ cat .cargo/config 
[target.'cfg(any(windows, unix))']
rustflags = ["-C", "target-cpu=native", "-C", "llvm-args=-ffast-math", "-C", "opt-level=3", "-C", "llvm-args=-force-vector-width=16"]

I've now taken out "-C", "llvm-args=-force-vector-width=16" (forcing vector width mostly just seems necessary with gcc-compilers).

Anyway, very nice! That was a simple fix.

However, even with -C llvm-args=-ffast-math it wont emit f[n]m[add/sub] instructions.
Additionally, it does domain checks on the square roots:

  vcmpltps %zmm0, %zmm12, %k1
  vsqrtps %zmm12, %zmm12
  vdivps %zmm12, %zmm1, %zmm12
  vbroadcastss %xmm2, %zmm12 {%k1}

or if you prefer Intel syntax:

  vcmpltps k1, zmm12, zmm0
  vsqrtps zmm12, zmm12
  vdivps zmm12, zmm1, zmm12
  vbroadcastss zmm12 {k1}, xmm2

gcc also does checks, but it did masked square root operations. I'm not sure what xmm2 contains.
I'd have to try the code with non-positive definite matrices to find out.
But in practice, the inputs should be guaranteed to contain valid data (so that the square roots are positive).

EDIT:
However, something is incorrect (results were wrong; mostly NaN).

fn processbpp_safe(x: &mut [f32], bpp: &[f32], n: usize){
    let x = &mut x[0..3*n];
    let bpp = &bpp[0..10*n];
    for (x, bpp) in x.chunks_exact_mut(3).zip(bpp.windows(10)) {
        let i = 0;
        let n = 1;
        let (x1,x2,x3) = pdbacksolve(
            (bpp[i], bpp[i + n], bpp[i + 2*n]),
            (bpp[i + 4*n], bpp[i + 5*n], bpp[i + 6*n], bpp[i + 7*n], bpp[i + 8*n], bpp[i + 9*n])
        );

        x[i      ] = x1;
        x[i +   n] = x2;
        x[i + 2*n] = x3;
    }
}

I suspect the issue is with the stride. The data layout is "struct of arrays" style, so that x is organized as n instances of x1, followed by n instances of x2, etc...

For what it's worth, this awkward version is also vectorized, and makes the underlying data layout more clear:

Ideally, there would be an actual struct wrapping the arrays to make the API (and working with elements) more clear.

EDIT:
For what it's worth, here is a series of benchmarks run from Julia, with N = 2827, with results sorted from fastest to slowest:

julia> @benchmark clangtest($X32,  $BPP32,  $N) # LLVM, C++
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.032 μs (0.00% GC)
  median time:      2.042 μs (0.00% GC)
  mean time:        2.132 μs (0.00% GC)
  maximum time:     5.793 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark icpctest($X32,  $BPP32,  $N) # Intel, C++
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.284 μs (0.00% GC)
  median time:      2.364 μs (0.00% GC)
  mean time:        2.429 μs (0.00% GC)
  maximum time:     6.087 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark flangtest2($X32,  $BPP32,  $N)  # LLVM Fortran, with workaround for GNU performance bug
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.330 μs (0.00% GC)
  median time:      2.346 μs (0.00% GC)
  mean time:        2.454 μs (0.00% GC)
  maximum time:     6.004 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark flangtest($X32,  $BPP32,  $N) # llvm Fortran
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.330 μs (0.00% GC)
  median time:      2.404 μs (0.00% GC)
  mean time:        2.477 μs (0.00% GC)
  maximum time:     6.159 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark ispctest($X32,  $BPP32,  $N) #LLVM single program multiple data compiler
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.348 μs (0.00% GC)
  median time:      2.429 μs (0.00% GC)
  mean time:        2.477 μs (0.00% GC)
  maximum time:     6.165 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark juliatest_fast!($X32,  $BPP32) # Julia, but I cheated with a loop vectorization macro and rsqrt intrinsic
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.528 μs (0.00% GC)
  median time:      2.605 μs (0.00% GC)
  mean time:        2.665 μs (0.00% GC)
  maximum time:     6.547 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark juliatest!($X32,  $BPP32) # like the above, but I added a Newton step after the rsqrt for increased accuracy
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.554 μs (0.00% GC)
  median time:      3.760 μs (0.00% GC)
  mean time:        3.885 μs (0.00% GC)
  maximum time:     7.952 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8

julia> @benchmark gforttest2($X32,  $BPP32,  $N) # GNU Fortran, with workaround
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.244 μs (0.00% GC)
  median time:      4.259 μs (0.00% GC)
  mean time:        4.428 μs (0.00% GC)
  maximum time:     9.575 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     7

julia> @benchmark gpptest($X32,  $BPP32,  $N) # GNU C++
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.255 μs (0.00% GC)
  median time:      4.375 μs (0.00% GC)
  mean time:        4.449 μs (0.00% GC)
  maximum time:     9.463 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     7

julia> @benchmark rusttest($X32,  $BPP32,  $NU) # Rust
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.547 μs (0.00% GC)
  median time:      4.600 μs (0.00% GC)
  mean time:        4.747 μs (0.00% GC)
  maximum time:     10.810 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     7

julia> @benchmark ifort2($X32,  $BPP32,  $N) # Intel Fortran; with workaround for GNU Fortran performance bug
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     16.798 μs (0.00% GC)
  median time:      17.011 μs (0.00% GC)
  mean time:        17.473 μs (0.00% GC)
  maximum time:     50.643 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark ifort($X32,  $BPP32,  $N) # Intel Fortran; no workaround for GNU Fortran performance bug
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     16.796 μs (0.00% GC)
  median time:      17.049 μs (0.00% GC)
  mean time:        17.683 μs (0.00% GC)
  maximum time:     50.748 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark gforttest($X32,  $BPP32,  $N) # GNU Fortran; no workaround for GNU Fortran performance bug
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     150.899 μs (0.00% GC)
  median time:      151.311 μs (0.00% GC)
  mean time:        156.347 μs (0.00% GC)
  maximum time:     226.430 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

Basic summary: clang++ produced the fastest code, at about 2 microseconds.
Rust clocked in at around 4.5 microseconds. This is largely because of calculating the reciprocal square roots via dividing 1 by the square root. Dividing 1 by the square root is more accurate, so that is reasonable.
The lack of fused arithmetic instructions play a roll.

g++, and gfortran ran on a code to avoid a performance regression were a little faster than Rust. They use the reciprocal square root intrinsic, but add a Newton step for better accuracy. They also fuse multiplications and additions.
The Julia version using a Newton iteration after the reciprocal square root took about 3.6 microseconds. Without the step, it took 2.6. (For the record, all the compilers using reciprocal square roots on their own did add the step).

ifort fell far behind at around 16 microseconds, while gfortran completely failed to vectorize the first version of the Fortran code, taking over 150 microseconds (there are open bug reports on bugzilla).

C++ is the C++ code, assembly, and compiler options.
Rust is the Rust code.
Fortran code. GNUprocessBPP has the workaround for GNU Fortran; processBPP does not. Unfortunately, Flang is not included among the optional compilers.
ispc code.

leonardo · January 20, 2019, 9:34am

While this generates the same assembly, I think this is a better signature for your Rust function:

pub fn pd_back_solve([x1,x2,x3]: [f32; 3], [s11,s12,s22,s13,s23,s33]: [f32; 6]) -> [f32; 3] {

chriselrod · January 22, 2019, 1:53am

A little googling showed I could use std::intrinsics to get fastmath.

#![feature(core_intrinsics)]

#[inline]
pub unsafe fn pd_back_solve([x1,x2,x3]: [f32; 3], [s11,s12,s22,s13,s23,s33]: [f32; 6]) -> [f32; 3] {

    let ui33 = std::intrinsics::fdiv_fast(1f32, std::intrinsics::sqrtf32(s33));
    let ui33x3 = std::intrinsics::fmul_fast(ui33, x3);
    let u13 = std::intrinsics::fmul_fast(s13, ui33);
    let u23 = std::intrinsics::fmul_fast(s23, ui33);
    let ui22 = std::intrinsics::fdiv_fast(1f32, std::intrinsics::sqrtf32(std::intrinsics::fsub_fast(s22, std::intrinsics::fmul_fast(u23,u23))));
    let u12 = std::intrinsics::fsub_fast(s12, std::intrinsics::fmul_fast(u13,u23)) * ui22;

    let ui11 = std::intrinsics::fdiv_fast(1f32, std::intrinsics::sqrtf32(std::intrinsics::fsub_fast(std::intrinsics::fsub_fast(s11,std::intrinsics::fmul_fast(u13,u13)),std::intrinsics::fmul_fast(u12,u12))));
    let nui12 = std::intrinsics::fmul_fast(std::intrinsics::fmul_fast(u12, ui22), ui11);

    [
        std::intrinsics::fsub_fast(std::intrinsics::fsub_fast(std::intrinsics::fmul_fast(ui11,x1),std::intrinsics::fmul_fast(std::intrinsics::fsub_fast(std::intrinsics::fmul_fast(u13, ui11), std::intrinsics::fmul_fast(u23, nui12)), ui33x3)), std::intrinsics::fmul_fast(nui12,x2)),
        std::intrinsics::fsub_fast(ui22*x1, std::intrinsics::fmul_fast(std::intrinsics::fmul_fast(u23,ui33x3),ui22)),
        ui33x3
    ]
}

fn processbpp_safe(y1: &mut [f32], y2: &mut [f32], y3: &mut [f32], x1: &[f32], x2: &[f32], x3: &[f32],
                    s11: &[f32], s12: &[f32], s22: &[f32], s13: &[f32], s23: &[f32], s33: &[f32]){

    let n = y1.len();
    assert_eq!(y2.len(), n);
    assert_eq!(y3.len(), n);
    assert_eq!(x1.len(), n);
    assert_eq!(x2.len(), n);
    assert_eq!(x3.len(), n);
    assert_eq!(s11.len(), n);
    assert_eq!(s12.len(), n);
    assert_eq!(s22.len(), n);
    assert_eq!(s13.len(), n);
    assert_eq!(s23.len(), n);
    assert_eq!(s33.len(), n);
    for i in 0..n {
        unsafe {
            let [y1i,y2i,y3i] = pd_back_solve([x1[i],x2[i],x3[i]], [s11[i],s12[i],s22[i],s13[i],s23[i],s33[i]]);
            y1[i] = y1i;
            y2[i] = y2i;
            y3[i] = y3i;
        }
    }
}

#[no_mangle]
pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
    processbpp_safe(
        std::slice::from_raw_parts_mut(x,                      n),
        std::slice::from_raw_parts_mut(x.offset(  n as isize), n),
        std::slice::from_raw_parts_mut(x.offset(2*n as isize), n),
        std::slice::from_raw_parts(bpp,                      n),
        std::slice::from_raw_parts(bpp.offset(  n as isize), n),
        std::slice::from_raw_parts(bpp.offset(2*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(4*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(5*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(6*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(7*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(8*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(9*n as isize), n)
    );
}

The code is now unsafe, and it requires Rust nightly, Godbolt for reference (thanks @leonardo -- not sure how I missed that; I knew gcc-trunk, etc, are options). This is the relevant for loop:

.LBB0_7:
	movq	%r8, %rcx
	movq	%r14, %rbx
	leaq	(%rsi,%r11,4), %r14
	leaq	(%r14,%r9), %r15
	leaq	(%r15,%r9), %rax
	movq	(%rsp), %rbp
	vmovups	(%rbp,%rax), %zmm2
	addq	%rbp, %rax
	leaq	(%rax,%r9), %r12
	leaq	(%r12,%r9), %r13
	leaq	(%r9,%r13), %rbp
	leaq	(%r9,%rbp), %r8
	vmovups	(%r9,%r8), %zmm3
	vrsqrt14ps	%zmm3, %zmm4
	vmovups	(%rsi,%r11,4), %zmm5
	vmulps	%zmm4, %zmm3, %zmm3
	vfmadd213ps	%zmm0, %zmm4, %zmm3
	vmulps	%zmm1, %zmm4, %zmm4
	vmulps	%zmm3, %zmm4, %zmm3
	vmulps	(%r9,%r13), %zmm3, %zmm4
	vmulps	(%r9,%rbp), %zmm3, %zmm6
	vmovups	(%r9,%r12), %zmm7
	vfnmadd231ps	%zmm6, %zmm6, %zmm7
	vrsqrt14ps	%zmm7, %zmm8
	vmulps	%zmm8, %zmm7, %zmm7
	vfmadd213ps	%zmm0, %zmm8, %zmm7
	vmulps	%zmm1, %zmm8, %zmm8
	vmulps	%zmm7, %zmm8, %zmm7
	vmovups	(%r9,%rax), %zmm8
	vfnmadd231ps	%zmm6, %zmm4, %zmm8
	vmulps	%zmm7, %zmm8, %zmm8
	vmulps	%zmm4, %zmm4, %zmm9
	vfmadd231ps	%zmm8, %zmm8, %zmm9
	vsubps	%zmm9, %zmm2, %zmm2
	vrsqrt14ps	%zmm2, %zmm9
	vmulps	(%r9,%r15), %zmm3, %zmm3
	vmulps	%zmm9, %zmm2, %zmm2
	vfmadd213ps	%zmm0, %zmm9, %zmm2
	vmulps	%zmm1, %zmm9, %zmm9
	vmulps	%zmm2, %zmm9, %zmm2
	vmulps	%zmm7, %zmm8, %zmm8
	vmulps	%zmm2, %zmm8, %zmm8
	vmulps	%zmm4, %zmm2, %zmm4
	vfmsub231ps	%zmm6, %zmm8, %zmm4
	vmulps	(%r9,%r14), %zmm8, %zmm8
	movq	%rbx, %r14
	movq	%rcx, %r8
	vfmsub213ps	%zmm8, %zmm3, %zmm4
	vfmadd213ps	%zmm4, %zmm5, %zmm2
	vmovups	%zmm2, (%rdi,%r11,4)
	vfnmadd213ps	%zmm5, %zmm3, %zmm6
	vmulps	%zmm6, %zmm7, %zmm2
	vmovups	%zmm2, (%rcx,%r11,4)
	vmovups	%zmm3, (%rbx,%r11,4)
	addq	$16, %r11
	cmpq	%r11, %rdx
	jne	.LBB0_7

Clang++ benchmark:

julia> @benchmark clangtest($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.029 μs (0.00% GC)
  median time:      2.138 μs (0.00% GC)
  mean time:        2.183 μs (0.00% GC)
  maximum time:     5.741 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

vs Rust:

julia> @benchmark rusttest($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.338 μs (0.00% GC)
  median time:      2.357 μs (0.00% GC)
  mean time:        2.486 μs (0.00% GC)
  maximum time:     6.324 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

So speed is now close. But the Rust code is still slower, is unsafe, and depends on a less-than-pretty nightly-only API. It is faster than g++, however.

The smarter thing API-wise, if I were to do a lot of this, would probably be define a wrapper type, where arithmetic operations default to these intrinsics. That was a suggested approach here, for example. Someone also shared code through a link without an attached license agreement.

I assume there's a terms of service that means all code actually posted on the discourse is under something like MIT? That's what I intend when I post code, anyway.

Zarenor · January 22, 2019, 3:30pm

You could probably post your code and add that as a license option for anyone wishing to use it. The TOS here specifies anything posted on this forum is under CC BY-NC-SA 3.0-NP. Attribution is required for usages, and commercial use is forbidden. Anything created through a use of this license must also be shared under this license.

leonardo · January 22, 2019, 6:16pm

It compiles on Godbolt if you use the Nightly compiler on the menu.

robsmith11 · January 23, 2019, 10:53pm

If you haven't seen it already, there is an example of using fast-math operations with a wrapped floating point type: https://github.com/bluss/fast-floats

chriselrod · January 24, 2019, 6:42am

Thanks @Zarenor. I'll post the code somewhere eventually. I brought that up because the wrapper type code didn't have a license (and was hosted elsewhere), which AFAIK not only renders it unusable, but means even looking at it becomes questionable if we're likely to implement something similar in the future. Even though I'd the author's intention was for people to actually look at and use the code.

@robsmith11

Cool, that is neater:

#![feature(core_intrinsics)]
extern crate fast_floats;

use fast_floats::Fast;

#[inline(always)]
fn rsqrt32(a: Fast<f32>) -> Fast<f32> {
    unsafe {
        Fast(std::intrinsics::fdiv_fast(1f32, std::intrinsics::sqrtf32(a.get())))
    }
}

#[inline]
pub fn pd_back_solve([x1,x2,x3]: [Fast<f32>; 3], [s11,s12,s22,s13,s23,s33]: [Fast<f32>; 6]) -> [Fast<f32>; 3] {

    let ui33 = rsqrt32(s33);
    let ui33x3 = ui33 * x3;
    let u13 = s13 * ui33;
    let u23 = s23 * ui33;
    let ui22 = rsqrt32(s22 - u23*u23);
    let u12 = (s12 - u13*u23) * ui22;

    let ui11 = rsqrt32(s11 - u13*u13 - u12*u12);
    let nui12 = u12 * ui11 * ui22;

    [
        ui11*x1 - nui12*x2 - (u13 * ui11 - u23 * nui12) * ui33x3,
        ui22*x1 - u23 * ui22 * ui33x3,
        ui33x3
    ]
}

fn processbpp_safe(y1: &mut [Fast<f32>], y2: &mut [Fast<f32>], y3: &mut [Fast<f32>], x1: &[Fast<f32>], x2: &[Fast<f32>], x3: &[Fast<f32>],
                    s11: &[Fast<f32>], s12: &[Fast<f32>], s22: &[Fast<f32>], s13: &[Fast<f32>], s23: &[Fast<f32>], s33: &[Fast<f32>]){

    let n = y1.len();
    assert_eq!(y2.len(), n);
    assert_eq!(y3.len(), n);
    assert_eq!(x1.len(), n);
    assert_eq!(x2.len(), n);
    assert_eq!(x3.len(), n);
    assert_eq!(s11.len(), n);
    assert_eq!(s12.len(), n);
    assert_eq!(s22.len(), n);
    assert_eq!(s13.len(), n);
    assert_eq!(s23.len(), n);
    assert_eq!(s33.len(), n);
    for i in 0..n {
            let [y1i,y2i,y3i] = pd_back_solve([x1[i],x2[i],x3[i]], [s11[i],s12[i],s22[i],s13[i],s23[i],s33[i]]);
            y1[i] = y1i;
            y2[i] = y2i;
            y3[i] = y3i;
    }
}

#[no_mangle]
pub unsafe extern fn processbpp(x: *mut Fast<f32>, bpp: *const Fast<f32>, n: usize){
    processbpp_safe(
        std::slice::from_raw_parts_mut(x,                      n),
        std::slice::from_raw_parts_mut(x.offset(  n as isize), n),
        std::slice::from_raw_parts_mut(x.offset(2*n as isize), n),
        std::slice::from_raw_parts(bpp,                      n),
        std::slice::from_raw_parts(bpp.offset(  n as isize), n),
        std::slice::from_raw_parts(bpp.offset(2*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(4*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(5*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(6*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(7*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(8*n as isize), n),
        std::slice::from_raw_parts(bpp.offset(9*n as isize), n)
    );
}

Now:

julia> @benchmark rusttest($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.062 μs (0.00% GC)
  median time:      2.074 μs (0.00% GC)
  mean time:        2.163 μs (0.00% GC)
  maximum time:     6.962 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark clangtest($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.069 μs (0.00% GC)
  median time:      2.084 μs (0.00% GC)
  mean time:        2.158 μs (0.00% GC)
  maximum time:     5.806 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

Anyway, to make this code fast, a few tricks are needed:

Recognize that you can vectorize it. This is where gfortran version 1, and ifort failed. gfortran will be fixed in gcc 10; gcc 9 is currently only accepting regression-fixes. Here is a link to the mailing list, for reference.
fma, and especially rsqrt approximation and Newton Raphson step. A bug was uncovered in gcc on skylakex, where it replaces r = 1/sqrt(a) not with rsqrt(a) -> NR, but with the sqrt approximation a * (rsqrt(a) -> NR), and then a reciprocal approximation followed by yet another NR step. Yikes. I'm still following the discussion online, but because that is a smaller change than (1) and more obviously a regression-fix, I'm guessing it'll make it into gcc 9. All that takes about 4.25 microseconds on that data set.
Instead, using fma instructions but calculating the reciprocal square root via a vsqrt followed by a div takes 3.5 microseconds. I've improved gcc's times to this by no longer passing -funsafe-math-optimizations or -ffast-math, but hopefully the regression will be fixed for the next release in a few months.
LLVM 6, which Julia currently uses, doesn't do the rsqrt -> NR, so (having gotten the autovectorizer to actually vectorize the code), it also takes about 3.5 microseconds.
ispc (LLVM 7), Flang (LLVM 7), and Julia with a macro that manually transforms the loop and inserts intrinsics (including rsqrt -> NR) all take about 2.3 microseconds.
While Clang and Rust now take about 2 microseconds. I haven't found out yet what's causing that last difference.

leonardo · January 24, 2019, 11:14am

extern crate fast_floats;

With recent rustc compilers the extern isn't necessary.

Cool, that is neater:

With a new type your code look could improve a bit:

type Ff32 = Fast<f32>;

bpp.offset(2*n as isize)

Try:

bpp.add(2 * n)

pub unsafe extern fn processbpp(x: *mut Ff32, bpp: *const Ff32, n: usize) {
    processbpp_safe(
        std::slice::from_raw_parts_mut(x,                      n),
        std::slice::from_raw_parts_mut(x.offset(  n as isize), n),
        ...

=>

pub unsafe extern fn processbpp(x: *mut Ff32, bpp: *const Ff32, n: usize) {
    import std::slice::from_raw_parts_mut;
    
    processbpp_safe(
        from_raw_parts_mut(x,                      n),
        from_raw_parts_mut(x.offset(  n as isize), n),

Probably there are other ways to improve the look of your code.

microcolonel · January 25, 2019, 12:17pm

Hi.

In my experience, at these time quantums system specific power-performance optimisations often get in the way of benchmarking and have the potential to muddy things.

This may not be the case for you but to be sure: Are you running these benchmarks on a Linux host ? If so, it would be prudent to get DVFS out of the loop by switching cpufreq to the performance governor.

See this for ideas on how to do that.

Also, if your host has a multi-socketed x86_64 processor, it would be prudent to verify that to indeed be the case and if so restrict the kernel task scheduler to a single scheduling domain (all cores in the same socket) by CPU hotplugging out any cores not in the single domain.

See this for ideas on how to do that.

I would go one step further and aim to run such experiments at the lowest system run-level to avoid fuzz added by misc system services.

See this for ideas on how to do that.

robsmith11 · January 26, 2019, 8:55am

I've started using fast-math wrappers in my own numeric project and discovered that bluss's crate is a bit incomplete, so I've created my own version that also includes a few other related features that I commonly use:

Fast approximations of exp() and ln() (will add more later).
An Ord implementation that allows for easy sorting when it is known that NaNs will not exist.

I've tried to make it as easy as possible to mix wrapped and unwrapped values, but rust doesn't always make it easy. So it's possble to do fa(1.1) <= 1.2, but not 1.2 <= fa(1.1).

I've just started using it, so please let me know if you have any suggestions on how to improve it.

chriselrod · January 27, 2019, 9:47am

use std::slice::from_raw_parts(_mut) worked.
On the topic of making the code prettier, I tried briefly to implement struct of arrays types to try and abstract the underlying memory layout. Unfortunately, I realized there are limitations on the implementations of the index and index_mut traits.

You're right, sometimes results seem oddly inconsistent. This is especially the case for times in the nanoseconds, but I also could not reproduce the benchmark where Rust was about as fast as clang with the code I had written.

Looking at the assembly, the reasons become clear. For starters, it's obviously worth noting

that this code is low level and written in a simple way, not using much in the way of standard library features or abstractions that have to be removed during compilation
The compilers are just front ends to LLVM. We should expect the same performance, especially given "1."

When Rust took 4.5 microseconds, Rust was not uses fused arithmetic instructions, and it was calculating the reciprocal square roots via vsrqrt, and then div.

Using fast-floats allowed Rust to make both of those optimizations, reducing the runtime to about 2.2 microseconds, vs 2 microseconds for Clang. I will try controlling things as you described to give benchmarks we can be more confident about.

Looking at the asm, this is how the for loop started in the Rust code:

.LBB0_7:
	leaq	(%rsi,%r11,4), %r15
	movq	%r14, %rbx
	leaq	(%r15,%r9), %r14
	leaq	(%r14,%r9), %rax
	vmovups	(%r10,%rax), %zmm2
	vrsqrt14ps	%zmm2, %zmm3
	addq	%r10, %rax
	vmulps	%zmm3, %zmm2, %zmm2
	vfmadd213ps	%zmm0, %zmm3, %zmm2
	vmulps	%zmm1, %zmm3, %zmm3
	vmulps	%zmm2, %zmm3, %zmm2
	leaq	(%rax,%r9), %r12
	leaq	(%r12,%r9), %r13
	leaq	(%r13,%r9), %rbp
	leaq	(%rbp,%r9), %r8
	vmovups	(%r9,%r8), %zmm3

from there, the arithmetic starts. There are a couple more movq further down in the loop body.
These are the first few lines of the C++ for loop asm:

.LBB0_10:                               # =>This Inner Loop Header: Depth=1
	vmovups	(%r14,%r13,4), %zmm2
	vmovups	(%r12,%r13,4), %zmm3
	vrsqrt14ps	%zmm3, %zmm4
	vmovups	(%rbp,%r13,4), %zmm5
	vmulps	%zmm4, %zmm3, %zmm3
	vfmadd213ps	%zmm0, %zmm4, %zmm3 # zmm3 = (zmm4 * zmm3) + zmm0
	vmulps	%zmm1, %zmm4, %zmm4
	vmulps	%zmm3, %zmm4, %zmm3
	vmulps	(%rax,%r13,4), %zmm3, %zmm4
	vmulps	(%r15,%r13,4), %zmm3, %zmm6
	vmovups	(%rbx,%r13,4), %zmm7
	vfnmadd231ps	%zmm6, %zmm6, %zmm7 # zmm7 = -(zmm6 * zmm6) + zmm7

The loop contains no leaq and no movq
That is, Rust spent a lot of time on memory address calculations. C++ did not.
Ideally, these instructions shouldn't matter too much / conflict with floating point operations, but they evidently had an impact.

I posted the C++ version in a Godbolt link. The major difference between that code and the Rust code is that in the C++ function, the loop is calling a void function that assigns to the array.

Unfortunately, using the fast-float crate, I now can't use Godbolt (unless I am mistaken?), but writing the function the same way:

#![feature(core_intrinsics)]

use fast_floats::Fast;
type Ff32 = Fast<f32>;

#[inline(always)]
fn rsqrt32(a: Ff32) -> Ff32 {
    unsafe {
        Fast(std::intrinsics::fdiv_fast(1f32, std::intrinsics::sqrtf32(a.get())))
    }
}

#[inline]
pub fn pd_forward_solve2(y1: &mut [Ff32], y2: &mut [Ff32], y3: &mut [Ff32], x1: &[Ff32], x2: &[Ff32], x3: &[Ff32],
                    s11: &[Ff32], s12: &[Ff32], s22: &[Ff32], s13: &[Ff32], s23: &[Ff32], s33: &[Ff32], i: usize) {

    let r11 = rsqrt32(s11[i]);
    let r11x1 = r11 * x1[i];
    let l21 = r11 * s12[i];
    let l31 = r11 * s13[i];
    let r22 = rsqrt32(s22[i] - l21*l21);
    let l32 = r22 * (s23[i] - l21*l31);
    let r33 = rsqrt32(s33[i] - l31*l31 - l32*l32);

    let nr21x1 = r22 * l21*r11x1;
    let r31x1 = r33 * ( l32*nr21x1 - l31*r11x1 );
    let nr32 = r33 * l32 * r22;

    y1[i] = r11x1;
    y2[i] = r22*x2[i] - nr21x1;
    y3[i] = r31x1 - nr32*x2[i] + r33*x3[i];
}

fn processbpp_safe2(y1: &mut [Ff32], y2: &mut [Ff32], y3: &mut [Ff32], x1: &[Ff32], x2: &[Ff32], x3: &[Ff32],
                    s11: &[Ff32], s12: &[Ff32], s22: &[Ff32], s13: &[Ff32], s23: &[Ff32], s33: &[Ff32]){

    let n = y1.len();
    assert_eq!( y2.len(), n);
    assert_eq!( y3.len(), n);
    assert_eq!( x1.len(), n);
    assert_eq!( x2.len(), n);
    assert_eq!( x3.len(), n);
    assert_eq!(s11.len(), n);
    assert_eq!(s12.len(), n);
    assert_eq!(s22.len(), n);
    assert_eq!(s13.len(), n);
    assert_eq!(s23.len(), n);
    assert_eq!(s33.len(), n);
    for i in 0..n {
            pd_forward_solve2(y1, y2, y3, x1, x2, x3, s11, s12, s22, s13, s23, s33, i);
    }
}

#[no_mangle]
pub unsafe extern fn processbpp2(x: *mut Ff32, bpp: *const Ff32, n: usize){

    use std::slice::from_raw_parts_mut;
    use std::slice::from_raw_parts;
    let sn = n as isize;

    processbpp_safe2(
        from_raw_parts_mut(x,              n),
        from_raw_parts_mut(x.offset(  sn), n),
        from_raw_parts_mut(x.offset(2*sn), n),
        from_raw_parts(bpp,              n),
        from_raw_parts(bpp.offset(  sn), n),
        from_raw_parts(bpp.offset(2*sn), n),
        from_raw_parts(bpp.offset(4*sn), n),
        from_raw_parts(bpp.offset(5*sn), n),
        from_raw_parts(bpp.offset(6*sn), n),
        from_raw_parts(bpp.offset(7*sn), n),
        from_raw_parts(bpp.offset(8*sn), n),
        from_raw_parts(bpp.offset(9*sn), n)
    );
}

Also, switching to "forward" solving instead of "back" solving, because I (a) decided to reorganize the code that inspired this, and as far as having an example goes, this method is preferable.
For reference:
the inputs are a set of 3d points, x, and a 3x3 positive definite matrix, in a struct-of-arrays memory layout.
The "forward" solve function takes the Cholesky factor, L of the PD matrix, and then solves for y in Ly = x.

Digression asside, Rust's assembly now looks like:

.LBB1_7:
	vmovups	(%r13,%rbp,4), %zmm2
	vrsqrt14ps	%zmm2, %zmm3
	vmulps	%zmm3, %zmm2, %zmm2
	vfmadd213ps	%zmm0, %zmm3, %zmm2
	vmulps	%zmm1, %zmm3, %zmm3
	vmulps	%zmm2, %zmm3, %zmm2
	vmulps	(%rsi,%rbp,4), %zmm2, %zmm3
	vmulps	(%r14,%rbp,4), %zmm2, %zmm4
	vmulps	(%rcx,%rbp,4), %zmm2, %zmm2
	vmovups	(%r15,%rbp,4), %zmm5
	vfnmadd231ps	%zmm4, %zmm4, %zmm5
	vrsqrt14ps	%zmm5, %zmm6
	vmulps	%zmm6, %zmm5, %zmm5
	vfmadd213ps	%zmm0, %zmm6, %zmm5

There are no leaqs, and no movqs. It looks a lot like the C++ asm, and I have no reason to expect their performance to be different.

I don't know why the compiler struggles to eliminate addressing calculations when doing the assignments outside of the inlined function.

Thank you for the rest of your advice and links. When I make an updated post with benchmarks, I will follow it.
Julia's benchmarking macro does have the advantage of taking a lot of samples. Eg, when a benchmark takes 2 microseconds, it takes 10,000 samples, with about 9 function evaluations per sample, for about 90,0000 total function evaluations. This should be enough for an on demand governor to ramp up, and the minimum time should be relatively noise free.
But they could still all be correlated with one another, and thus systematically biased.

Cool. I did try making the switch, however, the code would not vectorize. This was the loop body:

	movq	_ZN35_$LT$fastfloat..Fast$LT$f32$GT$$GT$4sqrt17ha0e4244e6de12ef4E@GOTPCREL(%rip), %r15
	movq	_ZN9fastfloat89_$LT$impl$u20$core..ops..arith..Div$LT$fastfloat..Fast$LT$f32$GT$$GT$$u20$for$u20$f32$GT$3div17hbf69dd08d370295aE@GOTPCREL(%rip), %r14
	.p2align	4, 0x90
.LBB0_2:
	vmovss	(%rbx,%r13,4), %xmm0
	vmovss	%xmm0, 24(%rsp)
	leaq	(%rbx,%r13,4), %rax
	vmovss	(%rbp,%rax), %xmm0
	vmovss	%xmm0, 32(%rsp)
	addq	%rbp, %rax
	vmovss	(%rbp,%rax), %xmm0
	vmovss	%xmm0, 28(%rsp)
	addq	%rbp, %rax
	vmovss	(%r12,%rax), %xmm0
	addq	%r12, %rax
	vmovss	(%rbp,%rax), %xmm1
	vmovss	%xmm1, 8(%rsp)
	addq	%rbp, %rax
	vmovss	(%rbp,%rax), %xmm1
	vmovss	%xmm1, 4(%rsp)
	addq	%rbp, %rax
	vmovss	(%rbp,%rax), %xmm1
	vmovss	%xmm1, 16(%rsp)
	addq	%rbp, %rax
	vmovss	(%rbp,%rax), %xmm1
	vmovss	%xmm1, 12(%rsp)
	addq	%rbp, %rax
	vmovss	(%rbp,%rax), %xmm1
	vmovss	%xmm1, 36(%rsp)
.Ltmp0:
	callq	*%r15
.Ltmp1:
	vmovaps	%xmm0, %xmm1
.Ltmp2:
	vmovss	.LCPI0_0(%rip), %xmm0
	callq	*%r14
	vmovss	%xmm0, 20(%rsp)
.Ltmp3:
	vmovss	8(%rsp), %xmm0
	vmulss	20(%rsp), %xmm0, %xmm1
	vmovss	%xmm1, 8(%rsp)
	vmovss	4(%rsp), %xmm0
	vfnmadd231ss	%xmm1, %xmm1, %xmm0
.Ltmp4:
	callq	*%r15
.Ltmp5:
	vmovaps	%xmm0, %xmm1
.Ltmp6:
	vmovss	.LCPI0_0(%rip), %xmm0
	callq	*%r14
	vmovss	%xmm0, 4(%rsp)
.Ltmp7:
	vmovss	16(%rsp), %xmm0
	vmulss	20(%rsp), %xmm0, %xmm1
	vmovss	12(%rsp), %xmm0
	vfnmadd231ss	8(%rsp), %xmm1, %xmm0
	vmulss	4(%rsp), %xmm0, %xmm2
	vmovss	%xmm1, 16(%rsp)
	vmulss	%xmm1, %xmm1, %xmm0
	vmovss	%xmm2, 12(%rsp)
	vfmadd231ss	%xmm2, %xmm2, %xmm0
	vmovss	36(%rsp), %xmm1
	vsubss	%xmm0, %xmm1, %xmm0
.Ltmp8:
	callq	*%r15
.Ltmp9:
	vmovaps	%xmm0, %xmm1
.Ltmp10:
	vmovss	.LCPI0_0(%rip), %xmm0
	callq	*%r14
.Ltmp11:
	vmovss	20(%rsp), %xmm1
	vmulss	24(%rsp), %xmm1, %xmm1
	vmulss	8(%rsp), %xmm1, %xmm2
	vmovss	4(%rsp), %xmm3
	vmulss	%xmm3, %xmm2, %xmm2
	vmulss	32(%rsp), %xmm3, %xmm3
	vsubss	%xmm3, %xmm2, %xmm4
	vmovss	16(%rsp), %xmm5
	vfnmadd213ss	28(%rsp), %xmm1, %xmm5
	vmovss	12(%rsp), %xmm6
	vfmadd213ss	%xmm5, %xmm4, %xmm6
	vmulss	%xmm6, %xmm0, %xmm0
	movq	40(%rsp), %rax
	vmovss	%xmm1, (%rax,%r13,4)
	vsubss	%xmm2, %xmm3, %xmm1
	movq	64(%rsp), %rax
	vmovss	%xmm1, (%rax,%r13,4)
	movq	56(%rsp), %rax
	vmovss	%xmm0, (%rax,%r13,4)
	leaq	1(%r13), %r13
	cmpq	48(%rsp), %r13
	jb	.LBB0_2

The problem is that neither sqrt or div were inlined (as you see from the first two lines I pasted, %r15 is sqrt, and %14 is Div.
I would add a healthy dose of #[inline(always)] or #[inline]. As far as I'm concerned, the primary purposes of -ffast-math are fused arithmetic operations and enabling vectorization.

GCC can vectorize log and exp.
Clang seems to be extremely fickle, even if I specify -fveclib.
It would be nice if Rust could do this.

I haven't played with Rust's metaprogramming at all, but that may be one route.
That was my approach in Julia:

julia> (@macroexpand @vectorize Float32 for i ∈ 1:size(Data,1)
                   X[i,:] .= notarealfunction(
                       exp(Data[i,1]),log(Data[i,2]),sin(Data[i,3]),
                       cos(Data[i,5]),exp2(Data[i,6]),exp10(Data[i,7]),sincos(Data[i,8]),log2(Data[i,9]),log10(Data[i,10])
                   )
               end) |> striplines
quote
    ##N#364 = size(Data, 1)
    (##Q#381, ##rem#382) = (LoopVectorization.VectorizationBase).size_loads(Data, 1, Val{16}())
    ##pData#370 = vectorizable(Data)
    ##pX#365 = vectorizable(X)
    begin
        ##stride#369 = LoopVectorization.stride_row(X) * sizeof(eltype(##pX#365))
        ##stride#371 = LoopVectorization.stride_row(Data) * sizeof(eltype(##pData#370))
    end
    begin
        for ##i#363 = 0:64:##Q#381 * 64 - 4
            begin
                ####pData#370_i#372 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 0##stride#371)
                ####pData#370_i#373 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 1##stride#371)
                ####pData#370_i#374 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 2##stride#371)
                ####pData#370_i#375 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 4##stride#371)
                ####pData#370_i#376 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 5##stride#371)
                ####pData#370_i#377 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 6##stride#371)
                ####pData#370_i#378 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 7##stride#371)
                ####pData#370_i#379 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 8##stride#371)
                ####pData#370_i#380 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 9##stride#371)
                begin
                    ##B#366 = LoopVectorization.extract_data.(notarealfunction(SLEEF.exp(####pData#370_i#372), SLEEF.log_fast(####pData#370_i#373), SLEEF.sin_fast(####pData#370_i#374), SLEEF.cos_fast(####pData#370_i#375), SLEEF.exp2(####pData#370_i#376), SLEEF.exp10(####pData#370_i#377), SLEEF.sincos_fast(####pData#370_i#378), SLEEF.log2(####pData#370_i#379), SLEEF.log10(####pData#370_i#380)))
                    for ##j#368 = 0:SIMDPirates.vsub(length(##B#366), 1)
                        begin
                            $(Expr(:inbounds, true))
                            local #19#val = SIMDPirates.vstore(getindex(##B#366, SIMDPirates.vadd(1, ##j#368)), SIMDPirates.vadd(##pX#365, ##i#363, SIMDPirates.vmul(##stride#369, ##j#368)))
                            $(Expr(:inbounds, :pop))
                            #19#val
                        end
                    end
                end
            end
        end
    end
    begin
        if ##rem#382 > 0
            ##mask#383 = one(UInt16) << ##rem#382 - one(UInt16)
            ##i#363 = (##N#364 - ##rem#382) * 4
            begin
                ####pData#370_i#372 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 0##stride#371, ##mask#383)
                ####pData#370_i#373 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 1##stride#371, ##mask#383)
                ####pData#370_i#374 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 2##stride#371, ##mask#383)
                ####pData#370_i#375 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 4##stride#371, ##mask#383)
                ####pData#370_i#376 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 5##stride#371, ##mask#383)
                ####pData#370_i#377 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 6##stride#371, ##mask#383)
                ####pData#370_i#378 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 7##stride#371, ##mask#383)
                ####pData#370_i#379 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 8##stride#371, ##mask#383)
                ####pData#370_i#380 = SIMDPirates.vload(VectorizationBase.SVec{16,Float32}, ##pData#370 + ##i#363 + 9##stride#371, ##mask#383)
                begin
                    ##B#366 = LoopVectorization.extract_data.(notarealfunction(SLEEF.exp(####pData#370_i#372), SLEEF.log_fast(####pData#370_i#373), SLEEF.sin_fast(####pData#370_i#374), SLEEF.cos_fast(####pData#370_i#375), SLEEF.exp2(####pData#370_i#376), SLEEF.exp10(####pData#370_i#377), SLEEF.sincos_fast(####pData#370_i#378), SLEEF.log2(####pData#370_i#379), SLEEF.log10(####pData#370_i#380)))
                    for ##j#368 = 0:SIMDPirates.vsub(length(##B#366), 1)
                        begin
                            $(Expr(:inbounds, true))
                            local #20#val = SIMDPirates.vstore(getindex(##B#366, SIMDPirates.vadd(1, ##j#368)), SIMDPirates.vadd(##pX#365, ##i#363, SIMDPirates.vmul(##stride#369, ##j#368)), ##mask#383)
                            $(Expr(:inbounds, :pop))
                            #20#val
                        end
                    end
                end
            end
        end
    end
end

(All the generated names are mangled for hygiene; names with # would otherwise be illegal; these declare .)
In Julia, macros are prefixed with @ (instead of suffixed with a bang).
@macroexpand expands a macro, so you can see what it does. |> is a pipe; I pipe the output to striplines, which removes extraneous debugging info that clutters up the output.

The @vectorize macro takes in a type (eg, Float32 or Float64), as well as a for loop. It then uses the host's architectures SIMD vector width to manually unroll the for loop, and uses a dictionary to substitute appropriate function calls with vectorized versions. For example, the function call gets replaced with:

##B#366 = LoopVectorization.extract_data.(notarealfunction(SLEEF.exp(####pData#370_i#372), SLEEF.log_fast(####pData#370_i#373), SLEEF.sin_fast(####pData#370_i#374), SLEEF.cos_fast(####pData#370_i#375), SLEEF.exp2(####pData#370_i#376), SLEEF.exp10(####pData#370_i#377), SLEEF.sincos_fast(####pData#370_i#378), SLEEF.log2(####pData#370_i#379), SLEEF.log10(####pData#370_i#380)))

so that all these special function calls now call my fork of the Julia port of a now outdated version of the SLEEF vectorized math library.
There are rust bindings to SLEEF. For good measure, I have Julia bindings to SLEEF as well, which provides a similar macro as LoopVectorization, except using the C bindings instead.

The Julia port has the advantage of being inline-able. This allows loop constants to be held in registers across loop iterations. With avx512 (and 32 registers), this can be quite the boon. Note the functions introduce a lot of constants themselves, eg in horner polynomials.

The macro has a lot of other things going on, eg relying on dispatch of the "vectorizable" function to gracefully handle different memory layouts, like ordinary arrays or struct of array types.
How sophisticated are the transformations enabled by Rust's macros?
Or, maybe the iterator system?

But if you want to get serious about vectorizing numerical workloads in Rust, those SLEEF C bindings may be a great start.

robsmith11 · January 27, 2019, 11:37am

Thanks for the feedback! It appears LTO is not always as aggressive as I thought it was. I've published a new version with all functions marked as inlined. (My own project got an additional 8% speed-up!)

Your code now compiles to the same assembly as with fast-floats. Using the sqrt from my crate, I was also able to produce vrsqrtps without your helper function by just writing 1.0 / s11[i].sqrt().

chriselrod · January 27, 2019, 3:55pm

The OS is Fedora 29, with kernel 4.20.3-200.fc29.x86_64.
I am currently on a different system than the one on which I ran previous benchmarks. But both systems are single socket, with the same OS.

The current system's CPU is an i9 9940X.
I set the governor to performance.
I haven't set the system's run label. But I could consider doing that. The maximum time is still obviously much higher than the minimum, median, and mean times.

Repeating benchmarks a bunch of times, alternating between (100,000 runs of) Flang, Clang, Julia, and Rust; the benchmarks were run one after the other with no pause:

Initial run

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.725 μs (0.00% GC)
  median time:      1.740 μs (0.00% GC)
  mean time:        1.743 μs (0.00% GC)
  maximum time:     4.285 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.677 μs (0.00% GC)
  median time:      1.740 μs (0.00% GC)
  mean time:        1.737 μs (0.00% GC)
  maximum time:     4.981 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.865 μs (0.00% GC)
  median time:      1.872 μs (0.00% GC)
  mean time:        1.875 μs (0.00% GC)
  maximum time:     4.476 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.804 μs (0.00% GC)
  median time:      1.813 μs (0.00% GC)
  mean time:        1.816 μs (0.00% GC)
  maximum time:     4.476 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.710 μs (0.00% GC)
  median time:      1.738 μs (0.00% GC)
  mean time:        1.742 μs (0.00% GC)
  maximum time:     4.373 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.729 μs (0.00% GC)
  median time:      1.737 μs (0.00% GC)
  mean time:        1.740 μs (0.00% GC)
  maximum time:     4.394 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.803 μs (0.00% GC)
  median time:      1.812 μs (0.00% GC)
  mean time:        1.835 μs (0.00% GC)
  maximum time:     4.998 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.804 μs (0.00% GC)
  median time:      1.813 μs (0.00% GC)
  mean time:        1.817 μs (0.00% GC)
  maximum time:     4.388 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.710 μs (0.00% GC)
  mean time:        1.709 μs (0.00% GC)
  maximum time:     5.175 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.676 μs (0.00% GC)
  median time:      1.684 μs (0.00% GC)
  mean time:        1.699 μs (0.00% GC)
  maximum time:     5.057 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.803 μs (0.00% GC)
  median time:      1.810 μs (0.00% GC)
  mean time:        1.814 μs (0.00% GC)
  maximum time:     4.406 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.749 μs (0.00% GC)
  median time:      1.757 μs (0.00% GC)
  mean time:        1.775 μs (0.00% GC)
  maximum time:     4.320 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.691 μs (0.00% GC)
  median time:      1.714 μs (0.00% GC)
  mean time:        1.721 μs (0.00% GC)
  maximum time:     4.337 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.677 μs (0.00% GC)
  median time:      1.733 μs (0.00% GC)
  mean time:        1.713 μs (0.00% GC)
  maximum time:     4.325 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.804 μs (0.00% GC)
  median time:      1.871 μs (0.00% GC)
  mean time:        1.859 μs (0.00% GC)
  maximum time:     4.422 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.748 μs (0.00% GC)
  median time:      1.813 μs (0.00% GC)
  mean time:        1.805 μs (0.00% GC)
  maximum time:     4.974 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.692 μs (0.00% GC)
  median time:      1.732 μs (0.00% GC)
  mean time:        1.727 μs (0.00% GC)
  maximum time:     4.346 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.677 μs (0.00% GC)
  median time:      1.685 μs (0.00% GC)
  mean time:        1.707 μs (0.00% GC)
  maximum time:     4.322 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.803 μs (0.00% GC)
  median time:      1.812 μs (0.00% GC)
  mean time:        1.830 μs (0.00% GC)
  maximum time:     4.944 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.749 μs (0.00% GC)
  median time:      1.756 μs (0.00% GC)
  mean time:        1.759 μs (0.00% GC)
  maximum time:     4.377 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.736 μs (0.00% GC)
  mean time:        1.735 μs (0.00% GC)
  maximum time:     5.409 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.677 μs (0.00% GC)
  median time:      1.738 μs (0.00% GC)
  mean time:        1.735 μs (0.00% GC)
  maximum time:     4.592 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.804 μs (0.00% GC)
  median time:      1.811 μs (0.00% GC)
  mean time:        1.814 μs (0.00% GC)
  maximum time:     5.027 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.748 μs (0.00% GC)
  median time:      1.757 μs (0.00% GC)
  mean time:        1.774 μs (0.00% GC)
  maximum time:     4.876 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.716 μs (0.00% GC)
  mean time:        1.725 μs (0.00% GC)
  maximum time:     4.623 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.685 μs (0.00% GC)
  median time:      1.736 μs (0.00% GC)
  mean time:        1.726 μs (0.00% GC)
  maximum time:     4.296 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.803 μs (0.00% GC)
  median time:      1.810 μs (0.00% GC)
  mean time:        1.814 μs (0.00% GC)
  maximum time:     4.461 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.804 μs (0.00% GC)
  median time:      1.813 μs (0.00% GC)
  mean time:        1.816 μs (0.00% GC)
  maximum time:     4.353 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

Rerun with more aggressive CPU clock

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.594 μs (0.00% GC)
  median time:      1.625 μs (0.00% GC)
  mean time:        1.622 μs (0.00% GC)
  maximum time:     4.750 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.582 μs (0.00% GC)
  median time:      1.588 μs (0.00% GC)
  mean time:        1.599 μs (0.00% GC)
  maximum time:     4.672 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.689 μs (0.00% GC)
  median time:      1.696 μs (0.00% GC)
  mean time:        1.718 μs (0.00% GC)
  maximum time:     3.563 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.636 μs (0.00% GC)
  median time:      1.688 μs (0.00% GC)
  mean time:        1.683 μs (0.00% GC)
  maximum time:     3.698 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.595 μs (0.00% GC)
  median time:      1.608 μs (0.00% GC)
  mean time:        1.616 μs (0.00% GC)
  maximum time:     3.556 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.581 μs (0.00% GC)
  median time:      1.588 μs (0.00% GC)
  mean time:        1.604 μs (0.00% GC)
  maximum time:     3.986 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.695 μs (0.00% GC)
  mean time:        1.706 μs (0.00% GC)
  maximum time:     3.652 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.636 μs (0.00% GC)
  median time:      1.688 μs (0.00% GC)
  mean time:        1.678 μs (0.00% GC)
  maximum time:     3.590 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.594 μs (0.00% GC)
  median time:      1.608 μs (0.00% GC)
  mean time:        1.615 μs (0.00% GC)
  maximum time:     3.532 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.582 μs (0.00% GC)
  median time:      1.588 μs (0.00% GC)
  mean time:        1.601 μs (0.00% GC)
  maximum time:     3.836 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.696 μs (0.00% GC)
  mean time:        1.719 μs (0.00% GC)
  maximum time:     3.652 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.635 μs (0.00% GC)
  median time:      1.687 μs (0.00% GC)
  mean time:        1.673 μs (0.00% GC)
  maximum time:     4.186 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.594 μs (0.00% GC)
  median time:      1.609 μs (0.00% GC)
  mean time:        1.618 μs (0.00% GC)
  maximum time:     3.510 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.581 μs (0.00% GC)
  median time:      1.589 μs (0.00% GC)
  mean time:        1.606 μs (0.00% GC)
  maximum time:     3.570 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.696 μs (0.00% GC)
  mean time:        1.716 μs (0.00% GC)
  maximum time:     3.665 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.634 μs (0.00% GC)
  median time:      1.685 μs (0.00% GC)
  mean time:        1.668 μs (0.00% GC)
  maximum time:     3.507 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.593 μs (0.00% GC)
  median time:      1.609 μs (0.00% GC)
  mean time:        1.617 μs (0.00% GC)
  maximum time:     4.522 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.580 μs (0.00% GC)
  median time:      1.587 μs (0.00% GC)
  mean time:        1.598 μs (0.00% GC)
  maximum time:     3.557 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.696 μs (0.00% GC)
  mean time:        1.715 μs (0.00% GC)
  maximum time:     4.570 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.635 μs (0.00% GC)
  median time:      1.642 μs (0.00% GC)
  mean time:        1.660 μs (0.00% GC)
  maximum time:     3.525 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.594 μs (0.00% GC)
  median time:      1.607 μs (0.00% GC)
  mean time:        1.612 μs (0.00% GC)
  maximum time:     3.429 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.581 μs (0.00% GC)
  median time:      1.587 μs (0.00% GC)
  mean time:        1.593 μs (0.00% GC)
  maximum time:     3.534 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.689 μs (0.00% GC)
  median time:      1.697 μs (0.00% GC)
  mean time:        1.721 μs (0.00% GC)
  maximum time:     3.671 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.635 μs (0.00% GC)
  median time:      1.642 μs (0.00% GC)
  mean time:        1.654 μs (0.00% GC)
  maximum time:     4.271 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark flangtest4!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.594 μs (0.00% GC)
  median time:      1.625 μs (0.00% GC)
  mean time:        1.622 μs (0.00% GC)
  maximum time:     3.607 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark clangtest!($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.582 μs (0.00% GC)
  median time:      1.588 μs (0.00% GC)
  mean time:        1.603 μs (0.00% GC)
  maximum time:     3.571 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.690 μs (0.00% GC)
  median time:      1.696 μs (0.00% GC)
  mean time:        1.714 μs (0.00% GC)
  maximum time:     3.528 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

julia> @benchmark rusttest2!($X32,  $BPP32,  $NU)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.635 μs (0.00% GC)
  median time:      1.643 μs (0.00% GC)
  mean time:        1.660 μs (0.00% GC)
  maximum time:     4.171 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

N was set to 2827.
That means the vectorized part of the loop repeats 176 times, and there is a remainder of 11 covered in the scalar loop (or the masked iteration, in the case of my Julia macro).

The assembly between Flang, Clang, and Rust now look extremely similar. The vectorized loop bodies:
Flang (Fortran front end to LLVM 7):

asm

.LBB7_7:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
	.loc	1 146 1 is_stmt 1       # vectorization_test.f90:146:1
	vmovups	(%r11,%r14,4), %zmm2
	vrsqrt14ps	%zmm2, %zmm3
	vmulps	%zmm3, %zmm2, %zmm2
	vfmadd213ps	%zmm0, %zmm3, %zmm2 # zmm2 = (zmm3 * zmm2) + zmm0
	vmulps	%zmm1, %zmm3, %zmm3
	vmulps	%zmm2, %zmm3, %zmm2
	.loc	1 147 1                 # vectorization_test.f90:147:1
	vmulps	(%r13,%r14,4), %zmm2, %zmm3
	.loc	1 148 1                 # vectorization_test.f90:148:1
	vmulps	(%r10,%r14,4), %zmm2, %zmm4
	vmovups	(%r8,%r14,4), %zmm5
	.loc	1 150 1                 # vectorization_test.f90:150:1
	vfnmadd231ps	%zmm4, %zmm4, %zmm5 # zmm5 = -(zmm4 * zmm4) + zmm5
	vrsqrt14ps	%zmm5, %zmm6
	.loc	1 149 1                 # vectorization_test.f90:149:1
	vmulps	(%r9,%r14,4), %zmm2, %zmm2
	.loc	1 150 1                 # vectorization_test.f90:150:1
	vmulps	%zmm6, %zmm5, %zmm5
	vfmadd213ps	%zmm0, %zmm6, %zmm5 # zmm5 = (zmm6 * zmm5) + zmm0
	vmulps	%zmm1, %zmm6, %zmm6
	vmulps	%zmm5, %zmm6, %zmm5
	.loc	1 154 1                 # vectorization_test.f90:154:1
	vmulps	%zmm4, %zmm3, %zmm6
	.loc	1 151 1                 # vectorization_test.f90:151:1
	vfnmadd213ps	(%rbp,%r14,4), %zmm2, %zmm4 # zmm4 = -(zmm2 * zmm4) + mem
	vmulps	%zmm4, %zmm5, %zmm4
	.loc	1 152 1                 # vectorization_test.f90:152:1
	vmovups	(%rdx,%r14,4), %zmm7
	vmulps	%zmm2, %zmm2, %zmm8
	vfmadd231ps	%zmm4, %zmm4, %zmm8 # zmm8 = (zmm4 * zmm4) + zmm8
	vsubps	%zmm8, %zmm7, %zmm7
	vrsqrt14ps	%zmm7, %zmm8
	vmulps	%zmm8, %zmm7, %zmm7
	vfmadd213ps	%zmm0, %zmm8, %zmm7 # zmm7 = (zmm8 * zmm7) + zmm0
	.loc	1 158 1                 # vectorization_test.f90:158:1
	vmovups	%zmm3, (%rdi,%r14,4)
	.loc	1 159 1                 # vectorization_test.f90:159:1
	vmovups	(%rax,%r14,4), %zmm9
	vsubps	%zmm6, %zmm9, %zmm10
	vmulps	%zmm5, %zmm10, %zmm10
	vmovups	%zmm10, (%rbx,%r14,4)
	vmulps	%zmm1, %zmm8, %zmm8
	vsubps	%zmm9, %zmm6, %zmm6
	vmulps	%zmm6, %zmm4, %zmm4
	vfnmadd213ps	(%r15,%r14,4), %zmm2, %zmm3 # zmm3 = -(zmm2 * zmm3) + mem
	vfmadd231ps	%zmm4, %zmm5, %zmm3 # zmm3 = (zmm5 * zmm4) + zmm3
	vmulps	%zmm3, %zmm8, %zmm2
	.loc	1 160 1                 # vectorization_test.f90:160:1
	vmulps	%zmm2, %zmm7, %zmm2
	vmovups	%zmm2, (%r12,%r14,4)
	addq	$16, %r14
	cmpq	%r14, %rsi
	jne	.LBB7_7

Flang inserts some debug info by default, and the compiler refuses no to. Perhaps irrelevant, but I compiled with -Wl,-strip-debug to strip them.

Clang:

.LBB0_10:                               # =>This Inner Loop Header: Depth=1
	vmovups	(%rbp,%r13,4), %zmm2
	vrsqrt14ps	%zmm2, %zmm3
	vmovups	(%r14,%r13,4), %zmm4
	vmulps	%zmm3, %zmm2, %zmm2
	vfmadd213ps	%zmm0, %zmm3, %zmm2 # zmm2 = (zmm3 * zmm2) + zmm0
	vmulps	%zmm1, %zmm3, %zmm3
	vmulps	%zmm2, %zmm3, %zmm2
	vmovups	(%r12,%r13,4), %zmm3
	vmulps	(%rsi,%r13,4), %zmm2, %zmm5
	vmulps	(%rcx,%r13,4), %zmm2, %zmm6
	vmovups	(%rbx,%r13,4), %zmm7
	vfnmadd231ps	%zmm6, %zmm6, %zmm7 # zmm7 = -(zmm6 * zmm6) + zmm7
	vrsqrt14ps	%zmm7, %zmm8
	vmulps	(%rax,%r13,4), %zmm2, %zmm2
	vmulps	%zmm8, %zmm7, %zmm7
	vfmadd213ps	%zmm0, %zmm8, %zmm7 # zmm7 = (zmm8 * zmm7) + zmm0
	vmulps	%zmm1, %zmm8, %zmm8
	vmulps	%zmm7, %zmm8, %zmm7
	vmulps	%zmm6, %zmm5, %zmm8
	vfnmadd213ps	(%r15,%r13,4), %zmm2, %zmm6 # zmm6 = -(zmm2 * zmm6) + mem
	vmulps	%zmm6, %zmm7, %zmm6
	vmulps	%zmm2, %zmm2, %zmm9
	vfmadd231ps	%zmm6, %zmm6, %zmm9 # zmm9 = (zmm6 * zmm6) + zmm9
	vsubps	%zmm9, %zmm3, %zmm3
	vrsqrt14ps	%zmm3, %zmm9
	vmulps	%zmm9, %zmm3, %zmm3
	vfmadd213ps	%zmm0, %zmm9, %zmm3 # zmm3 = (zmm9 * zmm3) + zmm0
	vmulps	%zmm1, %zmm9, %zmm9
	vfnmadd213ps	(%r10,%r13,4), %zmm5, %zmm2 # zmm2 = -(zmm5 * zmm2) + mem
	vmovups	%zmm5, (%rdi,%r13,4)
	vsubps	%zmm8, %zmm4, %zmm5
	vmulps	%zmm5, %zmm7, %zmm5
	vmovups	%zmm5, (%rdx,%r13,4)
	vsubps	%zmm4, %zmm8, %zmm4
	vmulps	%zmm7, %zmm6, %zmm5
	vfmadd213ps	%zmm2, %zmm4, %zmm5 # zmm5 = (zmm4 * zmm5) + zmm2
	vmulps	%zmm5, %zmm9, %zmm2
	vmulps	%zmm2, %zmm3, %zmm2
	vmovups	%zmm2, (%r11,%r13,4)
	addq	$16, %r13
	cmpq	%r13, %r9
	jne	.LBB0_10

Julia:

L288:
	vmovups	(%rbx,%r15), %zmm2
	vmovups	(%r10,%r15), %zmm3
	vrsqrt14ps	%zmm3, %zmm4
	vmulps	%zmm4, %zmm4, %zmm5
	vfmadd213ps	%zmm0, %zmm3, %zmm5
	vmulps	%zmm1, %zmm4, %zmm3
	vmulps	%zmm5, %zmm3, %zmm3
	vmulps	(%r14,%r15), %zmm3, %zmm4
	vmulps	(%rbp,%r15), %zmm3, %zmm5
	vmovups	(%rdi,%r15), %zmm6
	vfnmadd231ps	%zmm5, %zmm5, %zmm6
	vrsqrt14ps	%zmm6, %zmm7
	vmulps	%zmm7, %zmm7, %zmm8
	vfmadd213ps	%zmm0, %zmm6, %zmm8
	vmulps	%zmm1, %zmm7, %zmm6
	vmulps	%zmm8, %zmm6, %zmm6
	vmulps	(%r12,%r15), %zmm3, %zmm3
	vmovups	(%rsi,%r15), %zmm7
	vfnmadd231ps	%zmm3, %zmm5, %zmm7
	vmulps	%zmm4, %zmm3, %zmm8
	vfnmadd213ps	(%r11,%r15), %zmm3, %zmm3
	vmulps	%zmm6, %zmm7, %zmm7
	vfnmadd231ps	%zmm7, %zmm7, %zmm3
	vrsqrt14ps	%zmm3, %zmm9
	vmulps	%zmm9, %zmm9, %zmm10
	vfmadd213ps	%zmm0, %zmm3, %zmm10
	vmulps	%zmm1, %zmm9, %zmm3
	vmulps	%zmm10, %zmm3, %zmm3
	vmulps	%zmm4, %zmm5, %zmm5
	vmulps	%zmm6, %zmm5, %zmm5
	vfmsub231ps	%zmm5, %zmm7, %zmm8
	vmulps	%zmm7, %zmm6, %zmm7
	vmulps	%zmm3, %zmm7, %zmm7
	vmulps	%zmm7, %zmm2, %zmm7
	vfmsub231ps	%zmm8, %zmm3, %zmm7
	vfmadd231ps	(%rcx,%r15), %zmm3, %zmm7
	vmovups	%zmm4, (%r13,%r15)
	vfmsub213ps	%zmm5, %zmm6, %zmm2
	vmovups	%zmm2, (%r9,%r15)
	vmovups	%zmm7, (%r8,%r15)
	addq	$64, %r15
	cmpq	%r15, %rax
	jne	L288

Rust:

.LBB0_7:
	vmovups	(%r13,%rbp,4), %zmm2
	vrsqrt14ps	%zmm2, %zmm3
	vmulps	%zmm3, %zmm2, %zmm2
	vfmadd213ps	%zmm0, %zmm3, %zmm2
	vmulps	%zmm1, %zmm3, %zmm3
	vmulps	%zmm2, %zmm3, %zmm2
	vmulps	(%rsi,%rbp,4), %zmm2, %zmm3
	vmulps	(%r14,%rbp,4), %zmm2, %zmm4
	vmulps	(%rcx,%rbp,4), %zmm2, %zmm2
	vmovups	(%r15,%rbp,4), %zmm5
	vfnmadd231ps	%zmm4, %zmm4, %zmm5
	vrsqrt14ps	%zmm5, %zmm6
	vmulps	%zmm6, %zmm5, %zmm5
	vfmadd213ps	%zmm0, %zmm6, %zmm5
	vmulps	%zmm1, %zmm6, %zmm6
	vmulps	%zmm5, %zmm6, %zmm5
	vmulps	%zmm3, %zmm4, %zmm6
	vfnmadd213ps	(%rbx,%rbp,4), %zmm2, %zmm4
	vmulps	%zmm4, %zmm5, %zmm4
	vmovups	(%r12,%rbp,4), %zmm7
	vmulps	%zmm2, %zmm2, %zmm8
	vfmadd231ps	%zmm4, %zmm4, %zmm8
	vsubps	%zmm8, %zmm7, %zmm7
	vrsqrt14ps	%zmm7, %zmm8
	vmulps	%zmm8, %zmm7, %zmm7
	vfmadd213ps	%zmm0, %zmm8, %zmm7
	vmulps	%zmm1, %zmm8, %zmm8
	vmulps	%zmm7, %zmm8, %zmm7
	vmulps	%zmm5, %zmm6, %zmm6
	vmovups	%zmm3, (%rdi,%rbp,4)
	vmulps	(%r10,%rbp,4), %zmm5, %zmm5
	vsubps	%zmm6, %zmm5, %zmm8
	vmovups	%zmm8, (%r8,%rbp,4)
	vsubps	%zmm5, %zmm6, %zmm5
	vfnmadd213ps	(%r11,%rbp,4), %zmm2, %zmm3
	vfmadd231ps	%zmm5, %zmm4, %zmm3
	vmulps	%zmm7, %zmm3, %zmm2
	vmovups	%zmm2, (%r9,%rbp,4)
	addq	$16, %rbp
	cmpq	%rbp, %rax
	jne	.LBB0_7

The differences in asm:
Clang and Flang are essentially the same, except that Flang has debug info. Rust has one less vmovups (with 4 vmuls having memory as an argument, instead of 3 as in the other three languages).
Julia's is fairly different, seemingly having extra instructions. Probably my fault with the macro that inserts intrinsics, probably preventing some compiler optimizations. Once Julia upgrades to LLVM 7, it'll probably be more or less the same as the other three.
[EDIT: Just confirmed. The function is written to have (adding in the NR steps from rsqrt) 19 vmuls and 11 f(n)m(add/sub) instructions, which is exactly how much the Julia asm has. That function was created by a macro that transformed a loop to use LLVM intrinsics. I used the macro because LLVM 6 doesn't use vrsqrt instructions on my platform.
Those optimized by LLVM7 instead had 18 vmuls, 8 f(n)m(add/sub) instructions, and 3 vsubps.
Seems the function can be algebraically simplified.

I wouldn't worry about any of these differences (other than pondering improvements to my Julia macro).
Performance differences are minimal.
Missed optimizations make a substantially bigger difference than what we see here now.

Topic		Replies	Views
Rust autovectorization issues help	8	445	March 2, 2025
Vectorize This Loop help	5	1352	January 9, 2020
Optimizing linear algebra code help	47	3152	March 20, 2020
Rust performance help (convolution) help	36	4186	August 4, 2020
Experience porting a simple MC simulation to Rust	29	7022	June 6, 2017

Auto-vectorization in Rust

Related topics