I will start by saying that I don't know a whole lot about parallelism, Rust, or C++.
I wrote a couple of toy programs to test raw numbers for mass addition in Rust and C++:
C++:
// Input data: 960,000,000 elements, value-initialized here; main() fills them with iota.
vector<long long> x(960000000);
// Shared accumulator for the sums; atomic because add() updates it from worker threads.
std::atomic<long long> res = 0ll;
// Single-threaded baseline: sums all of `x`, adds the total to the global
// accumulator `res`, and prints the accumulator's value.
void methodA() {
    const long long total = accumulate(x.begin(), x.end(), 0ll);
    res.fetch_add(total);
    cout << res.load() << '\n';
}
// Sums the half-open index range [a, b) of `x` and atomically adds that
// partial sum to the global accumulator `res`.
//
// a, b: begin/end offsets into `x` (a <= b <= x.size() is required).
// x:    the vector to read from; it is not modified.
void add(long long a, long long b, const vector<long long> &x) {
    // Plain accumulation only: the previous `<< '\n'` here was a left shift
    // by 10 (the value of '\n'), which multiplied every partial sum by 1024.
    res += accumulate(x.begin() + a, x.begin() + b, 0ll);
}
// Multi-threaded sum: resets `res`, splits `x` into one contiguous chunk per
// worker thread, and lets each worker add its partial sum to `res` via add().
void methodC() {
    res = 0ll;
    const long long n_threads = 16; // (16 == thread count == num_cpus::get())
    const long long n = static_cast<long long>(x.size());
    const long long d = n / n_threads;

    // The number of threads launched must match the divisor used for the
    // chunk size, otherwise part of the vector is never summed.
    vector<thread> threads;
    threads.reserve(n_threads);
    for (long long i = 0; i < n_threads; i++) {
        const long long lo = i * d;
        // The last worker also takes any remainder left by the integer division.
        const long long hi = (i + 1 == n_threads) ? n : lo + d;
        threads.emplace_back(add, lo, hi, std::ref(x));
    }
    for (auto &t : threads)
        t.join();
}
// Fills `x` with 1, 2, 3, ... and reports the wall-clock time, in
// milliseconds, taken by methodA() and then by methodC().
int main() {
    using hr_clock = std::chrono::high_resolution_clock;
    using std::chrono::duration_cast;
    using std::chrono::milliseconds;

    iota(x.begin(), x.end(), 1ll);

    const auto a_begin = hr_clock::now();
    methodA();
    const auto a_end = hr_clock::now();
    const auto a_ms = duration_cast<milliseconds>(a_end - a_begin).count();
    cout << "Method A took " << a_ms << "ms" << endl;

    const auto c_begin = hr_clock::now();
    methodC();
    const auto c_end = hr_clock::now();
    const auto c_ms = duration_cast<milliseconds>(c_end - c_begin).count();
    cout << "Method C took " << c_ms << "ms" << endl;
}
Rust:
use std::{sync::Arc, thread, time::Instant};
/// Returns the sum of every element in `x` (0 for an empty slice).
fn add(x: &[u64]) -> u64 {
    let mut total: u64 = 0;
    for &value in x {
        total += value;
    }
    total
}
/// Benchmarks summing the integers `1..=960_000_000` twice: once sequentially
/// on the main thread and once split across one worker thread per CPU,
/// printing the elapsed time and the result of each run.
fn main() {
    let ulimit: u64 = 960_000_000;
    let x = Vec::from_iter(1..=ulimit);
    // methodA
    println!("sequential addition started");
    let mut a = Instant::now();
    let b = add(&x);
    println!("time for sequential : {:#?}", a.elapsed());
    println!("from sequential addition : {}", b);
    // Clamp to at least one worker so the chunk-size division cannot divide by zero.
    let cpus = num_cpus::get().max(1);
    println!("number of cpus : {}", cpus);
    // method C
    println!("parallel addition started");
    a = Instant::now();
    let refr = Arc::new(x);
    let len = refr.len();
    let d = len / cpus;
    // One join handle per worker thread, so reserve `cpus` slots rather than
    // one slot per element of a chunk.
    let mut thread_handles = Vec::with_capacity(cpus);
    for i in 0..cpus {
        let y = Arc::clone(&refr);
        let start = i * d;
        // The last worker also takes the remainder when `len` is not a
        // multiple of `cpus`, so no trailing elements are dropped.
        let end = if i + 1 == cpus { len } else { start + d };
        thread_handles.push(thread::spawn(move || add(&y[start..end])));
    }
    let mut res = 0u64;
    for handle in thread_handles {
        // A join error means the worker panicked; that is a bug, not a recoverable state.
        res += handle.join().expect("worker thread panicked");
    }
    println!("time for parallel : {:#?}", a.elapsed());
    println!("from parallel addition : {}", res);
}
C++ was compiled with :
g++ -pthread -O3 -ffast-math x.cpp
Rust was compiled with:
cargo r -- --release
(with no profile in Cargo.toml)
I am aware of the two major differences between them:
- Use of atomic in C++ code.
- returning u64 in Rust.
Upon executing the binaries, consistently, I find that the C++ implementation is significantly faster (~4x in multi-threaded, ~11x in single).
Sample:
C++:
Method A took 264ms
Method C took 97ms
Rust:
time for sequential : 3.2898265s
time for parallel : 415.4457ms
Both are being run on WSL2 (g++ and stable Rust). Apart from the differences pointed out, is there anything else that could be contributing to the slowness of the Rust program?