Hi folks
I managed to parallelize a code that does a simulation, but there are clones and allocations everywhere. Would any of you mind giving some tips here? I will paste the pieces of code required to understand the basics.
use std::path::Path;
use std::sync::Arc;
use std::sync::mpsc::channel;

use threadpool::ThreadPool;
use wide::f64x4;
/// State of one simulated body. Vector quantities are packed into SIMD
/// lanes (presumably x, y, z in the first three lanes — TODO confirm the
/// convention for the unused 4th lane; reduce_add in the kernel sums all 4).
#[derive(Clone)]
pub struct Particle {
position: f64x4,
velocity: f64x4,
// [0] = current acceleration, [1] = previous step's value
// (Bodies::accelerate shifts [0] into [1] each update).
acceleration: [f64x4; 2],
mass: f64,
}
/// The whole simulation: particle storage plus the parallel machinery.
pub struct Bodies {
particles: Vec<Particle>,
// Worker pool, sized to the CPU count in Bodies::new.
pool: ThreadPool,
// Per-worker scratch accumulators, indexed accs[worker][particle];
// reused across accelerate() calls to avoid reallocating.
accs: Vec<Vec<f64x4>>,
// Half-open index ranges (start, end) assigning a slice of the outer
// pair loop to each worker; produced by make_splits.
splits: Vec<(usize, usize)>,
}
/// Make splits to separate slices of the particles.
///
/// Returns `k` half-open index ranges `(start, end)` covering `0..n`.
/// Boundaries follow `n * (1 - sqrt(i/k))`, so later chunks are wider;
/// this balances work for a triangular pair loop, where a low index `i`
/// iterates over many more partners `j > i` than a high index does.
fn make_splits(n: usize, k: usize) -> Vec<(usize, usize)> {
    // Boundaries: 0, then n*(1 - sqrt(i/k)) for i = k-1 down to 0
    // (i.e. ascending after .rev(); the i = 0 term is exactly n).
    let bounds: Vec<usize> = (0..1)
        .chain(
            (0..k)
                .map(|i| f64::ceil((n as f64) * (1. - f64::sqrt((i as f64) / (k as f64)))) as usize)
                .rev(),
        )
        .collect();
    // Pair consecutive boundaries into (start, end) ranges.
    bounds.windows(2).map(|w| (w[0], w[1])).collect()
}
impl Bodies {
pub fn new(path_name: &str) -> Bodies {
let file_path = Path::new(path_name);
let particles: Vec<Particle> = ...;
let k = num_cpus::get();
let n = (&particles).len();
let splits = make_splits(n, k);
let accs = vec![vec![f64x4::splat(0.0); n]; k];
Bodies {
particles,
pool: ThreadPool::new(k),
accs,
splits,
}
}
pub fn accelerate(&mut self) {
let (tx, rx) = channel();
// I wonder if I can get rid of these allocations
let positions: Vec<f64x4> = self.particles.iter().map(|p| p.position).collect();
let masses: Vec<f64> = self.particles.iter().map(|p| p.mass).collect();
for (i, split) in self.splits.iter().enumerate() {
let tx = tx.clone();
let split = *split;
// Do these clones hurt performance?
let positions = positions.clone();
let masses = masses.clone();
self.pool.execute(move || {
let acc_i = accelerated_chuncked(split, &positions, &masses);
tx.send((i, acc_i)).unwrap();
});
}
let k = self.splits.len();
// Waiting for the values and replacing them without allocaction of a new Vec
for (i, acc_k) in rx.iter().take(k) {
self.accs[i]
.iter_mut()
.zip(acc_k.iter())
.for_each(|(e, a)| *e = *a);
}
// Transposing Vec<Vec<f64x4>> from [K;N] to [N;K] and summing over K for each N
let accs = &self.accs;
let accs_par = (0..accs[0].len())
.map(|i| accs.iter().map(|row| row[i]).reduce(|a,b| a + b));
// Replacing new value of acc and storing the previous one
// This is the aggregation step.
self.particles
.iter_mut()
.zip(accs_par)
.filter(|(p, a)| a.is_some())
.for_each(|(p, a)| {
p.acceleration[1] = p.acceleration[0];
p.acceleration[0] = a.unwrap();
});
}
}
/// Accumulate gravitational accelerations for every pair (i, j) whose
/// lower index `i` lies in `split.0..split.1` (and `j > i`).
///
/// Exploits the symmetry of the interaction: each pair updates both
/// `acc[i]` and `acc[j]`, so the returned buffer is a full-length partial
/// sum the caller must add to the other chunks' results.
fn accelerated_chuncked(split: (usize, usize), pos: &[f64x4], mass: &[f64]) -> Vec<f64x4> {
    let n = pos.len();
    // A full-length scratch buffer is required: writes to acc[j] can land
    // anywhere in j > i, so chunks cannot write into disjoint sub-slices.
    let mut acc = vec![f64x4::splat(0.0); n];
    for i in split.0..split.1 {
        let pos_i = pos[i];
        let m_i = mass[i];
        for j in i + 1..n {
            let mut dr = pos_i - pos[j];
            // 1/r^3 from the squared distance; reduce_add sums all four
            // lanes (assumes the unused 4th lane is zero — TODO confirm).
            let rinv3 = (1. / (dr * dr).reduce_add().sqrt()).powi(3);
            dr = dr * rinv3;
            acc[i] -= mass[j] * dr;
            acc[j] += m_i * dr;
        }
    }
    acc
}
I am thinking of removing the Particle
struct and keeping 3 Vec<f64x4>
and 1 Vec<f64>
so I can avoid some of these allocations by cloning an iterator over them instead. But I don't know if it's a good idea.
Thanks in advance!