Hi everyone!
I've been playing around with benchmarking a simple problem in a variety of languages/compilers. So far that list is Rust, clang++/g++, ispc, Julia, and gfortran/Flang.
Unfortunately, I cannot get Rust to vectorize the code. This is fairly simple numerical code. A Rust version of the code:
#[inline]
pub fn pdbacksolve(x: (f32,f32,f32), s: (f32,f32,f32,f32,f32,f32)) -> (f32, f32, f32) {
let (x1,x2,x3) = x;
let (s11,s12,s22,s13,s23,s33) = s;
let ui33 = s33.sqrt().recip();
let u13 = s13 * ui33;
let u23 = s23 * ui33;
let ui22 = (s22 - u23*u23).sqrt().recip();
let u12 = (s12 - u13*u23) * ui22;
let ui11 = (s11 - u12*u12 - u13*u13).sqrt().recip();
let ui12 = - u12 * ui11 * ui22;
let ui33x3 = ui33 * x3;
(
ui11*x1 + ui12*x2 - (u13 * ui11 + u23 * ui12) * ui33x3,
ui22*x1 - u23 * ui22 * ui33x3,
ui33x3
)
}
fn processbpp_safe(x: &mut [f32], bpp: &[f32], n: usize){
for i in 0..n {
let (x1,x2,x3) = pdbacksolve(
(bpp[i], bpp[i + n], bpp[i + 2*n]),
(bpp[i + 4*n], bpp[i + 5*n], bpp[i + 6*n], bpp[i + 7*n], bpp[i + 8*n], bpp[i + 9*n])
);
x[i ] = x1;
x[i + n] = x2;
x[i + 2*n] = x3;
}
}
#[no_mangle]
pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
processbpp_safe(unsafe { std::slice::from_raw_parts_mut(x, 3*n) }, unsafe { std::slice::from_raw_parts(bpp, 10*n) }, n);
}
The idea is to create a shared library that could be called from another language.
I am not sure I did that correctly, because I get warnings about the unsafe blocks being unnecessary:
$ cargo rustc --release -- --emit asm
Compiling process_inputs v0.1.0 (/home/chriselrod/Documents/progwork/rust/process_inputs)
warning: unnecessary `unsafe` block
--> src/lib.rs:48:21
|
47 | pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
| ----------------------------------------------------------------------- because it's nested under this `unsafe` fn
48 | processbpp_safe(unsafe { std::slice::from_raw_parts_mut(x, 3*n) }, unsafe { std::slice::from_raw_parts(bpp, 10*n) }, n);
| ^^^^^^ unnecessary `unsafe` block
|
= note: #[warn(unused_unsafe)] on by default
warning: unnecessary `unsafe` block
--> src/lib.rs:48:72
|
47 | pub unsafe extern fn processbpp(x: *mut f32, bpp: *const f32, n: usize){
| ----------------------------------------------------------------------- because it's nested under this `unsafe` fn
48 | processbpp_safe(unsafe { std::slice::from_raw_parts_mut(x, 3*n) }, unsafe { std::slice::from_raw_parts(bpp, 10*n) }, n);
| ^^^^^^ unnecessary `unsafe` block
Finished release [optimized] target(s) in 0.36s
I compiled with these flags on an architecture with avx512f:
$ rustc --version
rustc 1.33.0-nightly (c76f3c374 2019-01-18)
$ cat .cargo/config
[target.'cfg(any(windows, unix))']
rustflags = ["-C", "target-cpu=native", "-C", "llvm-args=-ffast-math", "-C", "opt-level=3", "-C", "llvm-args=-force-vector-width=16"]
$ rustc --print cfg -C target-cpu=native -C opt-level=3
target_arch="x86_64"
target_endian="little"
target_env="gnu"
target_family="unix"
target_feature="adx"
target_feature="aes"
target_feature="avx"
target_feature="avx2"
target_feature="avx512bw"
target_feature="avx512cd"
target_feature="avx512dq"
target_feature="avx512f"
target_feature="avx512vl"
target_feature="bmi1"
target_feature="bmi2"
target_feature="cmpxchg16b"
target_feature="fma"
target_feature="fxsr"
target_feature="lzcnt"
target_feature="mmx"
target_feature="pclmulqdq"
target_feature="popcnt"
target_feature="rdrand"
target_feature="rdseed"
target_feature="sse"
target_feature="sse2"
target_feature="sse3"
target_feature="sse4.1"
target_feature="sse4.2"
target_feature="ssse3"
target_feature="xsave"
target_feature="xsavec"
target_feature="xsaveopt"
target_feature="xsaves"
target_has_atomic="16"
target_has_atomic="32"
target_has_atomic="64"
target_has_atomic="8"
target_has_atomic="cas"
target_has_atomic="ptr"
target_os="linux"
target_pointer_width="64"
target_thread_local
target_vendor="unknown"
unix
However, looking at the asm reveals there was no vectorization.
This is the main for loop when compiler with rust:
.LBB0_2:
cmpq %rdx, %rcx
jae .LBB0_21
leaq (%r8,%rcx), %r11
cmpq %rdx, %r11
jae .LBB0_23
leaq (%r14,%rcx), %r10
cmpq %rdx, %r10
jae .LBB0_24
leaq (%rbx,%rcx), %rax
cmpq %rdx, %rax
jae .LBB0_25
movq 16(%rsp), %rax
leaq (%rax,%rcx), %rax
cmpq %rdx, %rax
jae .LBB0_26
movq 8(%rsp), %rax
leaq (%rax,%rcx), %rax
cmpq %rdx, %rax
jae .LBB0_27
leaq (%rcx,%rbp), %rax
cmpq %rdx, %rax
jae .LBB0_28
leaq (%rcx,%r13), %rax
cmpq %rdx, %rax
jae .LBB0_29
movq (%rsp), %rax
leaq (%rax,%rcx), %rax
cmpq %rdx, %rax
jae .LBB0_30
leaq (%rsi,%rcx,4), %rax
vmovss (%rsi,%rcx,4), %xmm3
vmovss (%rax,%rbx), %xmm11
addq %rbx, %rax
vmovss (%rax,%rbx), %xmm12
addq %rbx, %rax
vmovss (%rax,%r13), %xmm0
addq %r13, %rax
vmovss (%rax,%rbx), %xmm4
addq %rbx, %rax
vmovss (%rax,%rbx), %xmm7
addq %rbx, %rax
vmovss (%rax,%rbx), %xmm2
addq %rbx, %rax
vmovss (%rax,%rbx), %xmm5
addq %rbx, %rax
vmovss (%rbx,%rax), %xmm6
vucomiss %xmm6, %xmm9
vmovaps %xmm10, %xmm1
ja .LBB0_13
vsqrtss %xmm6, %xmm6, %xmm1
vdivss %xmm1, %xmm8, %xmm1
.LBB0_13:
vmulss %xmm1, %xmm2, %xmm2
vmulss %xmm1, %xmm5, %xmm6
vmulss %xmm6, %xmm6, %xmm5
vsubss %xmm5, %xmm7, %xmm5
vucomiss %xmm5, %xmm9
vmovaps %xmm10, %xmm7
ja .LBB0_15
vsqrtss %xmm5, %xmm5, %xmm5
vdivss %xmm5, %xmm8, %xmm7
.LBB0_15:
vmulss %xmm6, %xmm2, %xmm5
vsubss %xmm5, %xmm4, %xmm4
vmulss %xmm7, %xmm4, %xmm5
vmulss %xmm5, %xmm5, %xmm4
vsubss %xmm4, %xmm0, %xmm0
vmulss %xmm2, %xmm2, %xmm4
vsubss %xmm4, %xmm0, %xmm0
vucomiss %xmm0, %xmm9
vmovaps %xmm10, %xmm4
ja .LBB0_17
vsqrtss %xmm0, %xmm0, %xmm0
vdivss %xmm0, %xmm8, %xmm4
.LBB0_17:
cmpq %r9, %rcx
jae .LBB0_31
vmulss %xmm4, %xmm5, %xmm0
vmulss %xmm0, %xmm7, %xmm0
vxorps .LCPI0_2(%rip){1to4}, %xmm0, %xmm5
vmulss %xmm1, %xmm12, %xmm0
vmulss %xmm4, %xmm2, %xmm1
vmulss %xmm5, %xmm6, %xmm2
vaddss %xmm2, %xmm1, %xmm1
vmulss %xmm1, %xmm0, %xmm1
vmulss %xmm4, %xmm3, %xmm2
vmulss %xmm5, %xmm11, %xmm4
vaddss %xmm4, %xmm2, %xmm2
vsubss %xmm1, %xmm2, %xmm1
vmovss %xmm1, (%rdi,%rcx,4)
cmpq %r9, %r11
jae .LBB0_32
vmulss %xmm7, %xmm6, %xmm1
vmulss %xmm1, %xmm0, %xmm1
vmulss %xmm7, %xmm3, %xmm2
vsubss %xmm1, %xmm2, %xmm1
vmovss %xmm1, (%r15,%rcx,4)
cmpq %r9, %r10
jae .LBB0_20
vmovss %xmm0, (%r12,%rcx,4)
addq $1, %rcx
cmpq %r8, %rcx
jb .LBB0_2
versus compiled with
$ /usr/bin/clang++ --version
clang version 7.0.1 (Fedora 7.0.1-1.fc29)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
$ /usr/bin/clang++ -Ofast -march=native -S -fPIC process_inputs.cpp -o process_inputs.s
The C++ code looks more or less the same as the rust code, except that instead of creating and unpacking tuples, I moved that from the for loop to the inside of a function. I tested a Rust version written that way, and there was no change from the version written the way presented here.
The Fortran version was written like the Rust version, and was vectorized well with Flang (although not as well as with Clang). I would share that code if anyone is interested.
.LBB0_8: # =>This Inner Loop Header: Depth=1
leaq (%rsi,%rcx,4), %r14
vmovups (%rax,%r14), %zmm2
addq %rax, %r14
leaq (%r14,%rax), %rbx
vmovups (%r9,%rbx), %zmm3
addq %r9, %rbx
leaq (%rbx,%rax), %r15
leaq (%r15,%rax), %r12
leaq (%r12,%rax), %r13
leaq (%rax,%r13), %rbp
vmovups (%rax,%rbp), %zmm4
vrsqrt14ps %zmm4, %zmm5
vmulps %zmm5, %zmm4, %zmm4
vfmadd213ps %zmm0, %zmm5, %zmm4 # zmm4 = (zmm5 * zmm4) + zmm0
vmulps %zmm1, %zmm5, %zmm5
vmulps %zmm4, %zmm5, %zmm4
vmulps (%rax,%r12), %zmm4, %zmm5
vmulps (%rax,%r13), %zmm4, %zmm6
vmovups (%rax,%r15), %zmm7
vfnmadd231ps %zmm6, %zmm6, %zmm7 # zmm7 = -(zmm6 * zmm6) + zmm7
vrsqrt14ps %zmm7, %zmm8
vmulps %zmm8, %zmm7, %zmm7
vfmadd213ps %zmm0, %zmm8, %zmm7 # zmm7 = (zmm8 * zmm7) + zmm0
vmulps %zmm1, %zmm8, %zmm8
vmulps %zmm7, %zmm8, %zmm7
vmovups (%rax,%rbx), %zmm8
vfnmadd231ps %zmm6, %zmm5, %zmm8 # zmm8 = -(zmm5 * zmm6) + zmm8
vmulps %zmm8, %zmm7, %zmm8
vmulps %zmm5, %zmm5, %zmm9
vfmadd231ps %zmm8, %zmm8, %zmm9 # zmm9 = (zmm8 * zmm8) + zmm9
vsubps %zmm9, %zmm3, %zmm3
vrsqrt14ps %zmm3, %zmm9
vmulps %zmm9, %zmm3, %zmm3
vfmadd213ps %zmm0, %zmm9, %zmm3 # zmm3 = (zmm9 * zmm3) + zmm0
vmulps %zmm1, %zmm9, %zmm9
vmulps %zmm3, %zmm9, %zmm3
vmulps %zmm8, %zmm7, %zmm8
vmulps %zmm3, %zmm8, %zmm8
vmulps (%rax,%r14), %zmm4, %zmm4
vmulps %zmm8, %zmm2, %zmm9
vfmsub231ps (%rsi,%rcx,4), %zmm3, %zmm9 # zmm9 = (zmm3 * mem) - zmm9
vmulps %zmm5, %zmm3, %zmm3
vfmsub231ps %zmm8, %zmm6, %zmm3 # zmm3 = (zmm6 * zmm8) - zmm3
vfmadd213ps %zmm9, %zmm4, %zmm3 # zmm3 = (zmm4 * zmm3) + zmm9
vmovups %zmm3, (%rdi,%rcx,4)
vfnmadd213ps %zmm2, %zmm4, %zmm6 # zmm6 = -(zmm4 * zmm6) + zmm2
vmulps %zmm6, %zmm7, %zmm2
vmovups %zmm2, (%r10,%rcx,4)
vmovups %zmm4, (%r11,%rcx,4)
addq $16, %rcx
cmpq %rcx, %r8
jne .LBB0_8
There is also a second copy of the loop using xmm registers, to catch the n mod vector width remainder.
A few noteworthy differences: the C++ version was vectorized, using zmm registers.
The Rust version was not vectorized.
The C++ version used f(n)m(add/sub) instructions. The Rust version did not.
The C++ version used reciprocal square root instructions.
Other than the fact that calling cargo rustc --release -- --emit asm throws errors when I misspecify the rustflags, these flags do not seem to be doing anything. As far as I can tell, the asm remains the same (and unoptimized).
Am I missing something on how to get Rust to vectorize this numerical code?
If I understand the asm correctly, the bounds checks are a (perhaps the?) problem.