SIMD for a FIR filter: unstable result and partitioning in as_simd

Oh, actually, to @H2CO3's previous point, it turns out you can write that without mentioning simd at all and LLVM will notice what you're doing and emit the same thing.

You just need to make it really obvious what order you want the floating-point additions done in, because LLVM won't reassociate them on its own:

/// Computes the dot product of two equal-length `f32` arrays.
///
/// The inputs are walked four elements at a time, and each of the four
/// positions within a group is accumulated into its own running sum. The
/// four partial sums are then combined pairwise: `(s0 + s1) + (s2 + s3)`.
/// Spelling the accumulation out lane by lane fixes the floating-point
/// evaluation order explicitly instead of leaving it as one serial chain.
///
/// # Panics
///
/// Panics if `N` is not a multiple of 4.
pub fn dot<const N: usize>(x: &[f32; N], y: &[f32; N]) -> f32 {
    let (x_quads, x_rest) = x.as_chunks::<4>();
    let (y_quads, y_rest) = y.as_chunks::<4>();

    assert!(x_rest.is_empty() && y_rest.is_empty(), "N must be a multiple of 4");

    // One independent accumulator per lane; lanes never mix inside the loop.
    let [s0, s1, s2, s3] = x_quads.iter().zip(y_quads).fold(
        [0.0_f32; 4],
        |[a0, a1, a2, a3], (&[x0, x1, x2, x3], &[y0, y1, y2, y3])| {
            [a0 + x0 * y0, a1 + x1 * y1, a2 + x2 * y2, a3 + x3 * y3]
        },
    );

    // Pairwise horizontal reduction of the four lane sums.
    (s0 + s1) + (s2 + s3)
}

And you get the same vectorized-and-unrolled inner loop: https://rust.godbolt.org/z/7aen7ofce

bb10.i:                                           ; preds = %bb10.i, %start
  %iter.sroa.8.013.i = phi i64 [ 0, %start ], [ %8, %bb10.i ]
  %0 = phi <4 x float> [ zeroinitializer, %start ], [ %14, %bb10.i ]
  %1 = or i64 %iter.sroa.8.013.i, 1, !dbg !36
  %2 = getelementptr inbounds [4 x float], ptr %x, i64 %iter.sroa.8.013.i, !dbg !38
  %3 = getelementptr inbounds [4 x float], ptr %y, i64 %iter.sroa.8.013.i, !dbg !58
  %4 = load <4 x float>, ptr %2, align 4, !dbg !62, !alias.scope !11, !noalias !15
  %5 = load <4 x float>, ptr %3, align 4, !dbg !64, !alias.scope !15, !noalias !11
  %6 = fmul <4 x float> %4, %5, !dbg !66
  %7 = fadd <4 x float> %0, %6, !dbg !68
  %8 = add nuw nsw i64 %iter.sroa.8.013.i, 2, !dbg !36
  %9 = getelementptr inbounds [4 x float], ptr %x, i64 %1, !dbg !38
  %10 = getelementptr inbounds [4 x float], ptr %y, i64 %1, !dbg !58
  %11 = load <4 x float>, ptr %9, align 4, !dbg !62, !alias.scope !11, !noalias !15
  %12 = load <4 x float>, ptr %10, align 4, !dbg !64, !alias.scope !15, !noalias !11
  %13 = fmul <4 x float> %11, %12, !dbg !66
  %14 = fadd <4 x float> %7, %13, !dbg !68
  %exitcond.not.i.1 = icmp eq i64 %8, 1024, !dbg !17
  br i1 %exitcond.not.i.1, label %example::dot.exit, label %bb10.i, !dbg !17

EDIT, many years later, since someone was looking at this:

It now also works to just use array::from_fn for the body of the loop, as in https://rust.godbolt.org/z/qn6We5WeE

let p: [f32; 4] = std::array::from_fn(|i| x[i] * y[i]);
sums = std::array::from_fn(|i| sums[i] + p[i]);

with the same vectorized inner loop:

%7 = load <4 x float>, ptr %_3.i.i, align 4
%8 = load <4 x float>, ptr %_3.i1.i, align 4
%9 = fmul <4 x float> %7, %8
%10 = fadd <4 x float> %9, %5
8 Likes