Moving an Arc<Vec<_>> across threads is hundreds of times faster than moving a plain Vec<_>. Is the compiler optimizing away the results? If I change the number of vecs moved from 10k to 1k, the behaviour is normal again.
Results for N=10000:
Benchmarking move_vec: Warming up for 3.0000 s
move_vec time: [275.65 ms 282.99 ms 291.80 ms]
Benchmarking move_arc_vec_st: Warming up for 3.0000 s
move_arc_vec_st time: [277.06 ms 280.58 ms 284.38 ms]
Benchmarking move_arc_vec_spawn: Warming up for 3.0000 s
move_arc_vec_spawn time: [1.0580 ms 1.1929 ms 1.2892 ms]
Results for N=1000:
move_vec time: [36.155 ms 36.847 ms 37.509 ms]
move_arc_vec_st time: [35.767 ms 36.193 ms 36.636 ms]
move_arc_vec_spawn time: [36.266 ms 36.568 ms 36.914 ms]
Code:
use std::{sync::Arc, thread};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
/// Number of fixed-size elements in each `Vec` built by `prepare_vec`.
const VEC_SIZE: usize = 500;
/// Size in bytes of a single element (each element is a `[u8; ELEMENT_SIZE]` array).
const ELEMENT_SIZE: usize = 1232;
/// Number of `Vec`s queued into the channel during setup and received by each benchmark routine.
const NUM_VECS: usize = 10000;
use rand::{thread_rng, RngCore};
/// Builds one payload: a `Vec` of `VEC_SIZE` arrays, each `ELEMENT_SIZE` bytes long,
/// with every array overwritten by bytes drawn from `thread_rng()`.
fn prepare_vec() -> Vec<[u8; ELEMENT_SIZE]> {
    let mut rng = thread_rng();
    // Start from zeroed arrays, then overwrite each one with random bytes.
    let mut payload = vec![[0u8; ELEMENT_SIZE]; VEC_SIZE];
    payload
        .iter_mut()
        .for_each(|element| rng.fill_bytes(&mut element[..]));
    payload
}
/// Benchmark `move_vec`: receive `NUM_VECS` plain `Vec`s from a pre-filled channel
/// on a freshly spawned thread.
fn benchmark_move_vec(c: &mut Criterion) {
    c.bench_function("move_vec", |bencher| {
        bencher.iter_batched(
            // Setup: create the channel and queue NUM_VECS Vecs in it. The sender is
            // dropped when this closure returns; only the receiver reaches the routine.
            || {
                let (tx, rx) = crossbeam_channel::bounded::<Vec<[u8; ELEMENT_SIZE]>>(100000);
                (0..NUM_VECS).for_each(|_| tx.send(prepare_vec()).unwrap());
                rx
            },
            // Benchmark: drain the channel on a separate thread and wait for it to finish.
            |rx| {
                thread::spawn(move || {
                    for _ in 0..NUM_VECS {
                        // Each received Vec goes through black_box and is then dropped
                        // right here, on the spawned thread.
                        black_box(rx.recv().unwrap());
                    }
                })
                .join()
                .unwrap();
            },
            criterion::BatchSize::LargeInput,
        )
    });
}
/// Benchmark `move_arc_vec_st`: receive `NUM_VECS` `Arc<Vec<_>>`s from a pre-filled
/// channel on the calling thread (no thread is spawned).
fn benchmark_move_arc_vec_slow(c: &mut Criterion) {
    c.bench_function("move_arc_vec_st", |bencher| {
        bencher.iter_batched(
            // Setup: create the channel and queue NUM_VECS Arc-wrapped Vecs in it. The
            // sender is dropped when this closure returns; only the receiver is passed on.
            || {
                let (tx, rx) =
                    crossbeam_channel::bounded::<Arc<Vec<[u8; ELEMENT_SIZE]>>>(100000);
                (0..NUM_VECS).for_each(|_| tx.send(Arc::new(prepare_vec())).unwrap());
                rx
            },
            // Benchmark: drain the channel in place, on the thread running the benchmark.
            |rx| {
                for _ in 0..NUM_VECS {
                    // Each received Arc goes through black_box and is then dropped here.
                    black_box(rx.recv().unwrap());
                }
            },
            criterion::BatchSize::LargeInput,
        )
    });
}
/// Benchmark `move_arc_vec_spawn`: receive `NUM_VECS` `Arc<Vec<_>>`s from a pre-filled
/// channel on a freshly spawned thread.
fn benchmark_move_arc_vec_fast(c: &mut Criterion) {
    c.bench_function("move_arc_vec_spawn", |bencher| {
        bencher.iter_batched(
            // Setup: create the channel and queue NUM_VECS Arc-wrapped Vecs in it. The
            // sender is dropped when this closure returns; only the receiver is passed on.
            || {
                let (tx, rx) =
                    crossbeam_channel::bounded::<Arc<Vec<[u8; ELEMENT_SIZE]>>>(100000);
                (0..NUM_VECS).for_each(|_| tx.send(Arc::new(prepare_vec())).unwrap());
                rx
            },
            // Benchmark: drain the channel on a separate thread and wait for it to finish.
            |rx| {
                thread::spawn(move || {
                    for _ in 0..NUM_VECS {
                        // Each received Arc goes through black_box and is then dropped
                        // right here, on the spawned thread.
                        black_box(rx.recv().unwrap());
                    }
                })
                .join()
                .unwrap();
            },
            criterion::BatchSize::LargeInput,
        )
    });
}
// Collect the three benchmark functions into the `benches` group and pass that group
// to `criterion_main!`, which expands to the benchmark binary's entry point.
criterion_group!(
benches,
benchmark_move_vec,
benchmark_move_arc_vec_slow,
benchmark_move_arc_vec_fast,
);
criterion_main!(benches);