Oh, actually, to @H2CO3's previous point, it turns out you can write that without mentioning simd at all and LLVM will notice what you're doing and emit the same thing.
You just need to make it really obvious what order you want it to do things:
/// Computes the dot product of two `f32` arrays of the same length `N`.
///
/// Both inputs are viewed as a sequence of 4-element chunks. Four independent
/// running sums are kept, one per position within a chunk, and they are only
/// combined at the very end as `(s0 + s1) + (s2 + s3)`. Spelling the order of
/// the floating-point additions out this way is what lets the optimizer turn
/// the loop into 4-wide vector multiplies and adds without reassociating
/// anything.
///
/// # Panics
///
/// Panics with "N must be a multiple of 4" if `N` is not divisible by 4.
pub fn dot<const N: usize>(x: &[f32; N], y: &[f32; N]) -> f32 {
    let (x_chunks, x_rest) = x.as_chunks::<4>();
    let (y_chunks, y_rest) = y.as_chunks::<4>();
    assert!(x_rest.is_empty() && y_rest.is_empty(), "N must be a multiple of 4");

    // One accumulator per lane; lane `i` only ever sees element `i` of each chunk.
    let mut acc = [0.0_f32; 4];
    for (a, b) in x_chunks.iter().zip(y_chunks) {
        for (lane_sum, (&p, &q)) in acc.iter_mut().zip(a.iter().zip(b)) {
            *lane_sum += p * q;
        }
    }

    // Pairwise reduction of the four lanes, in a fixed order.
    let [s0, s1, s2, s3] = acc;
    (s0 + s1) + (s2 + s3)
}
And you get the same vectorized-and-unrolled inner loop: https://rust.godbolt.org/z/7aen7ofce
bb10.i: ; preds = %bb10.i, %start
%iter.sroa.8.013.i = phi i64 [ 0, %start ], [ %8, %bb10.i ]
%0 = phi <4 x float> [ zeroinitializer, %start ], [ %14, %bb10.i ]
%1 = or i64 %iter.sroa.8.013.i, 1, !dbg !36
%2 = getelementptr inbounds [4 x float], ptr %x, i64 %iter.sroa.8.013.i, !dbg !38
%3 = getelementptr inbounds [4 x float], ptr %y, i64 %iter.sroa.8.013.i, !dbg !58
%4 = load <4 x float>, ptr %2, align 4, !dbg !62, !alias.scope !11, !noalias !15
%5 = load <4 x float>, ptr %3, align 4, !dbg !64, !alias.scope !15, !noalias !11
%6 = fmul <4 x float> %4, %5, !dbg !66
%7 = fadd <4 x float> %0, %6, !dbg !68
%8 = add nuw nsw i64 %iter.sroa.8.013.i, 2, !dbg !36
%9 = getelementptr inbounds [4 x float], ptr %x, i64 %1, !dbg !38
%10 = getelementptr inbounds [4 x float], ptr %y, i64 %1, !dbg !58
%11 = load <4 x float>, ptr %9, align 4, !dbg !62, !alias.scope !11, !noalias !15
%12 = load <4 x float>, ptr %10, align 4, !dbg !64, !alias.scope !15, !noalias !11
%13 = fmul <4 x float> %11, %12, !dbg !66
%14 = fadd <4 x float> %7, %13, !dbg !68
%exitcond.not.i.1 = icmp eq i64 %8, 1024, !dbg !17
br i1 %exitcond.not.i.1, label %example::dot.exit, label %bb10.i, !dbg !17
EDIT, many years later, since someone was looking at this:
It now also works to just use array::from_fn, like https://rust.godbolt.org/z/qn6We5WeE
// NOTE(review): presumably these two lines replace the body of the `for` loop in
// `dot`, with `x` and `y` being the current 4-element chunks — confirm against the
// linked example.
// Element-wise products of the four positions of `x` and `y`.
let p: [f32; 4] = std::array::from_fn(|i| x[i] * y[i]);
// Rebuild the accumulator: each position is its previous sum plus that position's product.
sums = std::array::from_fn(|i| sums[i] + p[i]);
with the same vectorized inner loop:
%7 = load <4 x float>, ptr %_3.i.i, align 4
%8 = load <4 x float>, ptr %_3.i1.i, align 4
%9 = fmul <4 x float> %7, %8
%10 = fadd <4 x float> %9, %5