Enable avx2 + parallelisation => reach peak flops

I wrote a simple compute benchmark in Rust (translated from C++, where it delivers about 80% of peak flops — 95 of 120 GFlop/s). I'm quite disappointed since I cannot make the Rust version deliver more than 3.5 GFlop/s.
The generated assembly doesn't appear to use AVX2 or FMA (both available on my machine).

Any help would be appreciated.

My Makefile :

RUSTFLAGS='-C target-feature=+avx2,+fma' cargo build --release

the code :

use std::time::{Instant};
use rayon::prelude::*; // is provided for parallel iterators

// this function can't be changed (may only add attributes)
/// Degree-15 polynomial approximation of `exp(x) - 1`, evaluated in Horner
/// form; the trailing constant is presumably 1/15! (TODO confirm), making
/// this a truncated Taylor series. Each polynomial step is one multiply-add,
/// so the whole call maps to a chain of FMAs when fused.
/// Do NOT re-associate or reorder the arithmetic: the floating-point result
/// (and `verify`'s exact-equality check) depends on this exact evaluation
/// order.
#[inline]
fn expm1(x: f64) -> f64 {
	return ((((((((((((((15.0 + x) * x + 210.0) * x + 2730.0) * x + 32760.0) * x + 360360.0) * x + 3603600.0) * x + 32432400.0) * x + 259459200.0) * x + 1816214400.0) * x + 10897286400.0) * x + 54486432000.0) * x + 217945728000.0) * x + 653837184000.0) * x + 1307674368000.0)	* x * 7.6471637318198164759011319857881e-13;
}

// this function can't be changed (may only add attributes)
/// Applies `expm1` twelve times in a row (the benchmark kernel).
///
/// `#[inline(always)]` is the allowed fix: left as an out-of-line call,
/// the optimizer treats each element as one opaque scalar call and the
/// caller's loop cannot be auto-vectorized. Forcing inlining exposes the
/// entire 12-deep polynomial chain to LLVM so it can lay the work out
/// with AVX2/FMA across loop iterations. Per the constraint above, only
/// an attribute is added — the body is byte-identical.
#[inline(always)]
fn twelve(x: f64) -> f64 {
	return expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1(x))))))))))));
}

// you might want to optimize this one as well
fn populate(data: &mut [f64]) {
    data.par_iter_mut().for_each(|d| *d=0.1);
}

// optimize this one
fn apply(data: &mut [f64]) {
    data.par_iter_mut().for_each(|d| *d=twelve(*d));
}

// you might want to optimize this one as well
/// Checks that every element equals `twelve(0.1)` and prints any mismatch.
///
/// `twelve(0.1)` is loop-invariant; the original recomputed it for every
/// element, turning verification into a second full benchmark pass.
/// Exact (bitwise) equality is intentional: every input is exactly 0.1,
/// so every output must match the reference bit-for-bit.
fn verify(data: &[f64], n: usize) {
    let expected = twelve(0.1);
    for (i, &value) in data.iter().take(n).enumerate() {
        if expected != value {
            println!("error at {:?} - {:?} != {:?}", i, value, expected);
        }
    }
}

/// Runs one benchmark iteration: populate the input, time `apply`,
/// verify the results, and return (and print) the measured GFlop/s.
fn run(data: &mut [f64], n: usize) -> f64 {
    populate(data);
    let start = Instant::now();
    apply(data);
    // `as_millis` truncates to whole milliseconds — it loses precision and
    // divides by zero (infinite GFlop/s) for sub-millisecond runs.
    // `as_secs_f64` keeps the full nanosecond resolution of `Instant`.
    let duration = start.elapsed().as_secs_f64();
    verify(data, n);
    // Flop count: 12 `expm1` calls per element, 15 ops counted per call
    // (one fused multiply-add per polynomial step) — presumably counting
    // each FMA as a single flop; TODO confirm the per-call convention.
    let gflops = (n as f64) * 12.0 * 15.0E-9;
    println!("{:?}", gflops / duration);
    gflops / duration
}

/// Benchmark driver: report CPU capabilities, size the problem to the
/// core count, run ten iterations, and print the best GFlop/s seen.
fn main() {
    // AVX2 must also be enabled at compile time (RUSTFLAGS); this only
    // reports runtime CPU support.
    println!("avx2: {:?}", is_x86_feature_detected!("avx2"));
    let cpu: usize = num_cpus::get();
    println!("num cores {:?}", cpu);
    // take n as large as possible
    let n: usize = cpu * 256 * 1024;
    let mut input = vec![0.1; n];
    // Best of ten runs is the reported metric.
    let best = (0..10)
        .map(|_| run(&mut input, n))
        .fold(0.0_f64, f64::max);
    println!("Metric : {:?} GFlop/s", best);
}

and cargo.toml :

[package]
name = "expm1"
version = "0.0.1"
edition = "2018"

[dependencies]
rayon = "1.5"
num_cpus = "1.13.0"

What's the C++ code you are comparing to? Do you use -ffast-math or equivalent on the C++ side?

Yes I do. Here is the cpp code and build command line :

g++ -fopenmp -mavx2 -mfma -O3 main.cpp

#include <cstdio>
#include <cstdlib>
#include <omp.h>

// Degree-15 polynomial approximation of exp(x) - 1 in Horner form; the
// trailing constant is presumably 1/15! (truncated Taylor series) — TODO
// confirm. Do not re-associate: the FP result depends on this exact order.
// NOTE(review): this shadows the C library's expm1 from <math.h>; fine as
// long as <cmath>/<math.h> is never included in this translation unit —
// verify.
__attribute__((always_inline))
inline double expm1(double x) {
	return ((((((((((((((15.0 + x) * x + 210.0) * x + 2730.0) * x + 32760.0) * x + 360360.0) * x + 3603600.0) * x + 32432400.0) * x + 259459200.0) * x + 1816214400.0) * x + 10897286400.0) * x + 54486432000.0) * x + 217945728000.0) * x + 653837184000.0) * x + 1307674368000.0)	* x * 7.6471637318198164759011319857881e-13;
}

// Applies expm1 twelve times in a row; always_inline exposes the whole
// 12-deep polynomial chain to the optimizer so the caller's unrolled loop
// can be vectorized with AVX2/FMA.
__attribute__((always_inline))
inline double twelve(double x) {
	return expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1( expm1(x))))))))))));
}

// Applies twelve() to every element of data, parallelized with OpenMP and
// manually unrolled 32-wide so the compiler sees 32 independent polynomial
// chains per iteration and vectorizes them with AVX2/FMA.
//
// NOTE(review): __builtin_assume_aligned(data, 1024) asserts 1024-byte
// alignment of the buffer; that is undefined behavior if the allocation is
// not actually aligned that strictly — confirm how data is allocated.
// NOTE(review): assumes N divides evenly by the core count and each slice
// length is a multiple of 32; any remainder elements are silently skipped.
void apply(double * data, int N) {
	// Promise alignment to the optimizer (see review note above).
	data = (double*) __builtin_assume_aligned(data, 1024);
	// One contiguous slice of the array per hardware thread.
	int CPUCOUNT = omp_get_num_procs();
	int slice = N / CPUCOUNT;
	#pragma omp parallel for
	for(int i = 0; i < CPUCOUNT; ++i) {
		int start = i * slice;
		int stop = start + slice;
		// Manual 32-way unroll: enough independent work per iteration
		// for the auto-vectorizer to fill the SIMD lanes.
		for(int j = start; j < stop; j += 32) {
			data[j+0] = twelve(data[j+0]);
			data[j+1] = twelve(data[j+1]);
			data[j+2] = twelve(data[j+2]);
			data[j+3] = twelve(data[j+3]);
			data[j+4] = twelve(data[j+4]);
			data[j+5] = twelve(data[j+5]);
			data[j+6] = twelve(data[j+6]);
			data[j+7] = twelve(data[j+7]);
			data[j+8] = twelve(data[j+8]);
			data[j+9] = twelve(data[j+9]);
			data[j+10] = twelve(data[j+10]);
			data[j+11] = twelve(data[j+11]);
			data[j+12] = twelve(data[j+12]);
			data[j+13] = twelve(data[j+13]);
			data[j+14] = twelve(data[j+14]);
			data[j+15] = twelve(data[j+15]);
			data[j+16] = twelve(data[j+16]);
			data[j+17] = twelve(data[j+17]);
			data[j+18] = twelve(data[j+18]);
			data[j+19] = twelve(data[j+19]);
			data[j+20] = twelve(data[j+20]);
			data[j+21] = twelve(data[j+21]);
			data[j+22] = twelve(data[j+22]);
			data[j+23] = twelve(data[j+23]);
			data[j+24] = twelve(data[j+24]);
			data[j+25] = twelve(data[j+25]);
			data[j+26] = twelve(data[j+26]);
			data[j+27] = twelve(data[j+27]);
			data[j+28] = twelve(data[j+28]);
			data[j+29] = twelve(data[j+29]);
			data[j+30] = twelve(data[j+30]);
			data[j+31] = twelve(data[j+31]);
		}
	}
}

Since you've manually unrolled the C++ version, you could try the same with Rayon. At a high level, you could add with_min_len(32) to enforce a lower bound on the adaptive parallel splitting, but I'm not sure if that will be enough for LLVM to unroll more aggressively. You could instead use par_chunks_mut(32) or par_chunks_exact_mut(32), and then serially iterate each slice or write the unrolled update yourself.

Also, if your C++ test was compiled with gcc, you could try with clang to see how LLVM fares. If that's missing some optimization, then Rust will be similarly limited.

try

23.6 GFlop/s on i7-4810MQ CPU @ 2.80GHz (4 cores, 8 threads)