Hi,
I am trying to autovectorize the following masked sum, but I am struggling to achieve the performance that I would expect for this operation.
The slow operation is `simd_nonnull_sum`, whose inner loop over the validity bits is not auto-vectorized well: I observe a 4x-5x slowdown compared with the non-masked sum (`simd_sum`, also below). No such performance difference is observed when I use `packed_simd` (which requires nightly).
For completeness, I present the whole benchmark setup.
Any ideas on how this can be improved further, or on how I can help the compiler vectorize it?
use std::convert::TryInto;
use criterion::{criterion_group, criterion_main, Criterion};
/// Number of `f32` values processed per chunk; one validity bit is consumed per lane.
const LANES: usize = 16;
/// Single-bit masks: `U16_MASK[i]` has only bit `i` set and selects the validity bit of lane `i`.
static U16_MASK: [u16; 16] = [
    1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
];

/// Sums the entries of `values` whose validity bit is set in `bitmap`.
///
/// `values[i]` is included when bit `i % 8` (least-significant bit first) of
/// `bitmap[i / 8]` is set. Entries whose bit is clear contribute `0.0`, so a
/// `NaN` stored in a masked-out slot does not affect the result.
///
/// Full chunks of `LANES` values are accumulated lane-wise into a fixed-size
/// array so the compiler sees constant trip counts; the trailing
/// `values.len() % LANES` values are then added with a scalar loop.
///
/// # Panics
///
/// Panics if `bitmap` holds fewer than `values.len()` bits.
fn simd_nonnull_sum(values: &[f32], bitmap: &[u8]) -> f32 {
    /// Bytes of validity bitmap consumed by one chunk of `LANES` values.
    const BYTES_PER_CHUNK: usize = LANES / 8;

    assert!(
        values.len() <= bitmap.len() * 8,
        "bitmap must provide one validity bit per value"
    );

    let chunks = values.chunks_exact(LANES);
    let tail = chunks.remainder();

    // The assertion above guarantees the bitmap covers every full chunk, so
    // this split cannot go out of bounds. `tail_bits` starts at the first
    // validity byte that belongs to `tail`.
    let full_chunks = values.len() / LANES;
    let (chunk_bits, tail_bits) = bitmap.split_at(full_chunks * BYTES_PER_CHUNK);

    let lanes = chunks.zip(chunk_bits.chunks_exact(BYTES_PER_CHUNK)).fold(
        [0.0f32; LANES],
        |mut acc, (chunk, bits)| {
            let chunk: [f32; LANES] = chunk
                .try_into()
                .expect("chunks_exact yields LANES-sized chunks");
            // Little-endian keeps bit `i` of the bitmap aligned with lane `i`
            // on every host; native-endian would swap the two bytes on
            // big-endian targets.
            let bits = u16::from_le_bytes(
                bits.try_into()
                    .expect("chunks_exact yields BYTES_PER_CHUNK-sized chunks"),
            );
            for ((slot, &value), &mask) in acc.iter_mut().zip(chunk.iter()).zip(U16_MASK.iter()) {
                // Select, then add: null lanes contribute an explicit 0.0.
                *slot += if bits & mask != 0 { value } else { 0.0 };
            }
            acc
        },
    );

    // Horizontal reduction of the per-lane partial sums.
    let reduced = lanes.iter().fold(0.0f32, |total, &lane| total + lane);

    // Scalar pass over the values that did not fill a whole chunk.
    let tail_sum = tail
        .iter()
        .enumerate()
        .filter(|&(i, _)| tail_bits[i / 8] & (1u8 << (i % 8)) != 0)
        .fold(0.0f32, |total, (_, &value)| total + value);

    reduced + tail_sum
}
/// Sums `values` with `LANES` independent partial sums.
///
/// Full chunks of `LANES` elements are added lane-wise into a fixed-size
/// accumulator; the leftover elements (fewer than `LANES`) are summed
/// separately and added to the horizontally reduced accumulator.
fn simd_sum(values: &[f32]) -> f32 {
    let mut blocks = values.chunks_exact(LANES);
    let mut lanes = [0.0f32; LANES];

    // `by_ref` leaves `blocks` usable afterwards so the remainder can be read.
    for block in blocks.by_ref() {
        let block: [f32; LANES] = block.try_into().unwrap();
        for (slot, value) in lanes.iter_mut().zip(block.iter()) {
            *slot += *value;
        }
    }

    let tail: f32 = blocks.remainder().iter().copied().sum();

    // Reduce the lanes in index order, starting from zero.
    let lane_total = lanes.iter().fold(0.0f32, |total, &lane| total + lane);
    lane_total + tail
}
/// Baseline: adds the elements of `values` one after another using the
/// standard library's `Sum` implementation.
fn naive_sum(values: &[f32]) -> f32 {
    values.iter().copied().sum::<f32>()
}
/// Registers the three sum benchmarks (`simd_sum`, `naive_sum`,
/// `simd_nonnull_sum`) on `c`, all running over the same 513 * 7 element input.
fn add_benchmark(c: &mut Criterion) {
    // Input values are 0.0, 1.0, 2.0, ... as `f32`.
    let values: Vec<f32> = (0..513 * 7).map(|n| n as f32).collect();

    c.bench_function("simd_sum", |bencher| bencher.iter(|| simd_sum(&values)));
    c.bench_function("naive_sum", |bencher| bencher.iter(|| naive_sum(&values)));

    // Validity bytes are the index truncated to `u8`, giving a mix of set and
    // cleared bits.
    let mask: Vec<u8> = (0..513 * 7).map(|n| n as u8).collect();
    c.bench_function("simd_nonnull_sum", |bencher| {
        bencher.iter(|| simd_nonnull_sum(&values, &mask))
    });
}
// Collect `add_benchmark` into a Criterion group named `benches`, then hand
// that group to `criterion_main!` to serve as the benchmark entry point.
criterion_group!(benches, add_benchmark);
criterion_main!(benches);