Is this usage of SIMD correct?

#![feature(portable_simd)]
use std::simd::f64x8;

/// Divides `a` by `b` element-wise, eight `f64` lanes at a time.
///
/// Returns an array `res` with `res[i] == a[i] / b[i]` for every `i < N`.
/// Zero divisors get no special handling: each lane is a plain `f64` division.
///
/// Full groups of eight elements are loaded straight into `f64x8` vectors.
/// A trailing group shorter than eight (when `N` is not a multiple of 8) is
/// padded before the division and only its real lanes are written back.
pub fn div_matrix<const N: usize>(a: &[f64; N], b: &[f64; N]) -> [f64; N] {
    const LANES: usize = 8;

    let mut res = [0.0; N];

    let a_chunks = a.chunks_exact(LANES);
    let b_chunks = b.chunks_exact(LANES);
    // Grab the leftovers up front; the chunk iterators are consumed by `zip` below.
    let a_tail = a_chunks.remainder();
    let b_tail = b_chunks.remainder();

    let mut out_chunks = res.chunks_exact_mut(LANES);
    // All three iterators yield exactly N / 8 chunks, each exactly 8 long,
    // so `from_slice` and `copy_from_slice` never see a length mismatch.
    for ((out, x), y) in out_chunks.by_ref().zip(a_chunks).zip(b_chunks) {
        let quotient = f64x8::from_slice(x) / f64x8::from_slice(y);
        out.copy_from_slice(quotient.as_array());
    }

    let out_tail = out_chunks.into_remainder();
    if !out_tail.is_empty() {
        // Pad unused lanes with 1.0 so they compute a harmless 1.0 / 1.0
        // rather than dividing by zero; those lanes are discarded anyway.
        let mut p = [1.0; LANES];
        let mut q = [1.0; LANES];
        p[..a_tail.len()].copy_from_slice(a_tail);
        q[..b_tail.len()].copy_from_slice(b_tail);

        let quotient = f64x8::from_array(p) / f64x8::from_array(q);
        out_tail.copy_from_slice(&quotient.as_array()[..out_tail.len()]);
    }

    res
}

/// Demo: divides two ten-element arrays element-wise and dumps the quotients.
fn main() {
    let numerators: [f64; 10] = [1.0, 6.0, 3.0, 3.0, 20.0, 9.0, 1.0, 9.0, 4.0, 3.0];
    let denominators: [f64; 10] = [10.0, 2.0, 2.0, 6.0, 40.0, 3.0, 6.0, 7.0, 63.0, 95.2];

    // N = 10 is inferred from the argument types.
    let p = div_matrix(&numerators, &denominators);
    dbg!(p);
}

I have never used SIMD before.
Do you have any suggestions for performance improvements?

Look at the generated assembly. In most cases, a SIMD hot loop is expected to compile down to a handful of instructions without any function calls. This is a general suggestion.

See https://crates.io/crates/cargo-show-asm

got it.

Since you're using nightly anyway for std::simd, use more nightly stuff to simplify your code.

For example, check out .as_chunks::<8>() to get [f64; 8]s to turn into the f64x8s.

2 Likes