Hello,
I have two different implementations of a simple RK4 solver below. The first was my original attempt, which did a whole lot of cloning while pushing results to a Vec. In my second attempt I was hopeful that, by preallocating and performing all operations in place, I would see some big performance gains. In the end, the benchmarks were consistently about 10% slower than the first implementation. I'm guessing that either some compiler optimizations make this a wash, or that under the hood clone_from into a preallocated slot is basically the same work as cloning into a Vec created with Vec::with_capacity. Any insights to help me understand why this is the case? Any recommendations on a better way to perform these operations? I can't make the state Copy, since the state is created dynamically at run time through a UI.
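To make the comparison concrete, here is a minimal sketch of the two storage patterns I'm referring to, with Vec<f64> standing in for the real state type (these are not my actual types, just an illustration):

```rust
// Sketch only: Vec<f64> stands in for the real, run-time-built state type.
fn push_with_clone(x0: &Vec<f64>, steps: usize) -> Vec<Vec<f64>> {
    let mut out = Vec::with_capacity(steps);
    for _ in 0..steps {
        // version 1: push a fresh clone of the state every step
        out.push(x0.clone());
    }
    out
}

fn preallocate_and_clone_from(x0: &Vec<f64>, steps: usize) -> Vec<Vec<f64>> {
    // version 2: fill the buffer up front, then overwrite each slot in place
    let mut out = vec![x0.clone(); steps];
    for slot in out.iter_mut() {
        slot.clone_from(x0);
    }
    out
}
```

The two full implementations follow; first the clone-heavy version: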
```rust
pub fn solve_fixed_rk4<F, T>(solver: &Solver<F, T>) -> (Vec<f64>, Vec<T>)
where
    F: Fn(T, f64) -> T,
    T: Integrable,
{
    let Solver {
        func,
        x0,
        tstart,
        tstop,
        dt,
        ..
    } = solver;
    assert!(dt.abs() > f64::EPSILON, "0.0 dt not allowed!");
    let half_dt = dt / 2.0;
    let mut x = x0.clone();
    let mut t = *tstart;
    let result_length = ((tstop - tstart) / dt).ceil() as usize;
    let mut result = Vec::with_capacity(result_length);
    let mut time = Vec::with_capacity(result_length);
    result.push(x.clone());
    time.push(t);
    while t <= *tstop {
        // RK4 stages, each built from a fresh clone of the state
        let k1 = func(x.clone(), t);
        let k2 = func(x.clone() + k1.clone() * half_dt, t + half_dt);
        let k3 = func(x.clone() + k2.clone() * half_dt, t + half_dt);
        let k4 = func(x.clone() + k3.clone() * *dt, t + dt);
        // x += (k1 + 2*k2 + 2*k3 + k4) * dt / 6
        x = x + (k1 + k2 * 2.0 + k3 * 2.0 + k4) * *dt / 6.0;
        t += dt;
        result.push(x.clone());
        time.push(t);
    }
    (time, result)
}
```
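For reference, the first version only needs the state to support these operations; this is a trimmed-down stand-in for the Integrable bound (just enough to make the snippet above compile, not my exact trait):

```rust
use std::ops::{Add, Div, Mul};

// Trimmed-down stand-in for Integrable: only the by-value operations the
// first implementation uses (clone, +, * f64, / f64).
pub trait Integrable:
    Clone + Add<Output = Self> + Mul<f64, Output = Self> + Div<f64, Output = Self>
{
}

impl<T> Integrable for T where
    T: Clone + Add<Output = T> + Mul<f64, Output = T> + Div<f64, Output = T>
{
}
```

The second, preallocated/in-place implementation: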
```rust
pub fn solve_fixed_rk4<F, P, T>(solver: &Solver<F, P, T>) -> (Vec<f64>, Vec<T>)
where
    F: OdeFunction<P, T>,
    T: Integrable,
{
    let Solver {
        func,
        x0,
        parameters,
        tstart,
        tstop,
        dt,
        ..
    } = solver;
    assert!(dt.abs() > f64::EPSILON, "0.0 dt not allowed!");
    let half_dt = dt / 2.0;
    let x = &mut x0.clone();
    let mut t = *tstart;
    let result_length = ((tstop - tstart) / dt).ceil() as usize + 1;
    let mut result = Vec::with_capacity(result_length);
    result.extend(std::iter::repeat(x0.clone()).take(result_length));
    let mut time = Vec::with_capacity(result_length);
    time.extend(std::iter::repeat(0.0).take(result_length));
    let mut idx = 1usize; // 0 will be manually stored
    result[0].clone_from(x0);
    time[0].clone_from(&t);
    t += dt;
    // RK4 stage buffers
    let k1 = &mut x0.clone();
    let k2 = &mut x0.clone();
    let k3 = &mut x0.clone();
    let k4 = &mut x0.clone();
    // temporary buffers for in-place stage calculation
    let k2_tmp = &mut x0.clone();
    let k3_tmp = &mut x0.clone();
    let k4_tmp = &mut x0.clone();
    let x2 = &mut x0.clone();
    let x3 = &mut x0.clone();
    let x4 = &mut x0.clone();
    while t <= *tstop {
        x2.clone_from(x);
        x3.clone_from(x);
        x4.clone_from(x);
        // k1
        func(k1, &x, parameters, t);
        // k2
        k2_tmp.clone_from(k1);
        *k2_tmp *= half_dt;
        *x2 += k2_tmp;
        func(k2, x2, parameters, t + half_dt);
        // k3
        k3_tmp.clone_from(k2);
        *k3_tmp *= half_dt;
        *x3 += k3_tmp;
        func(k3, x3, parameters, t + half_dt);
        // k4
        k4_tmp.clone_from(k3);
        *k4_tmp *= *dt;
        *x4 += k4_tmp;
        func(k4, x4, parameters, t + dt);
        // in-place chaining for (k1 + 2*k2 + 2*k3 + k4) * dt/6
        *k2 *= 2.0;
        *k3 *= 2.0;
        *k1 += k2;
        *k1 += k3;
        *k1 += k4;
        *k1 *= dt / 6.0;
        *x += k1;
        result[idx].clone_from(x);
        time[idx].clone_from(&t);
        idx += 1;
        t += dt;
    }
    (time, result)
}
```
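For completeness, the in-place version leans on mutating operators instead; its bounds would look roughly like this (again an illustration, not my exact traits — I've called it IntegrableInPlace here just to keep the two sketches apart, while in the code above both versions simply use Integrable). The AddAssign<&mut Self> bound is there because the right-hand sides of += in the loop are &mut bindings:

```rust
use std::ops::{AddAssign, MulAssign};

// Stand-in for the bounds the in-place version relies on: the state is only
// ever mutated through &mut, so compound assignment replaces the by-value ops.
pub trait IntegrableInPlace:
    Clone + MulAssign<f64> + for<'a> AddAssign<&'a mut Self>
{
}

impl<T> IntegrableInPlace for T where
    T: Clone + MulAssign<f64> + for<'a> AddAssign<&'a mut T>
{
}
```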