I just like to squeeze out the last drip of performance out of my laptop,
use std::time::{SystemTime, UNIX_EPOCH};
use std::ptr;
struct CellIter<const N: usize> {
start: [u8; N],
end: [u8; N],
pending: [u8; N],
done: bool,
}
impl<const N: usize> Iterator for CellIter<N> {
type Item = [u8; N];
fn next(&mut self) -> Option<[u8; N]> {
if self.done {
return None;
}
let result = self.pending;
for i in (0..N).rev() {
let inc:u8 = self.pending[i] as u8 + 1;
if inc <= self.end[i] as u8 {
self.pending[i] = inc as u8;
return Some(result);
}
self.pending[i] = self.start[i];
}
self.done = true;
Some(result)
}
fn size_hint(&self) -> (usize, Option<usize>) {
let final_size: usize = self
.start
.iter()
.copied()
.zip(self.end.iter().copied())
.map(|(s, e)| e as usize - s as usize + 1)
.fold(1, std::ops::Mul::mul);
(final_size, Some(final_size))
}
}
impl<const N: usize> ExactSizeIterator for CellIter<N> {}
fn calc_cells<const N: usize>(mut start: [u8; N], mut end: [u8; N]) -> CellIter<N> {
for i in 0..N {
if start[i] > end[i] {
std::mem::swap(&mut start[i], &mut end[i])
}
}
CellIter {
start,
end,
pending: start,
done: false,
}
}
fn current_milliseconds()->u128{
return SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis();
}
fn benchmark(start:[u8;4], end:[u8;4], len:usize){
// A standard way of using CellIter
let t1=current_milliseconds();
let vector0 = calc_cells(start, end).collect::<Vec<_>>();
let t2=current_milliseconds();
println!("Solution A The standard way of using CellIter takes:{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t2 - t1), vector0.len(), vector0[0], vector0[len - 1]);
// But, I'm curious about what if I use CellIter only as a generator of generating all cells,
// and leave the collecting logics to my own way.
let t3=current_milliseconds();
let mut cell_iter1=calc_cells(start, end);
let mut vector1=Vec::with_capacity(len);
let mut cell_option1=cell_iter1.next();
loop{
match cell_option1{
Some(cell)=>vector1.push(cell),
_=>break
}
cell_option1=cell_iter1.next();
}
let t4=current_milliseconds();
println!("Solution B Trying an implementation of collecting takes:{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t4 - t3), vector1.len(), vector1[0], vector1[len - 1]);
// It turns out that my implementation is no better than the unknown collecting logics of the standard way,
// but, why? so I dig around in VSCode, trying to find out the magics under the hood,
// and come up with following codes copied out of vec.rs with minor tweaks.
let t5=current_milliseconds();
let mut cell_iter2=calc_cells(start, end);
let mut vector2:Vec<[u8;4]> = match cell_iter2.next() {
None => Vec::new(),
Some(element) => {
let (lower, _) = cell_iter2.size_hint();
let mut vector2 = Vec::with_capacity(lower.saturating_add(1));
unsafe {
ptr::write(vector2.as_mut_ptr(), element);
vector2.set_len(1);
}
vector2
}
};
while let Some(element) = cell_iter2.next() {
let len = vector2.len();
if len == vector2.capacity() {
let (lower, _) = cell_iter2.size_hint();
vector2.reserve(lower.saturating_add(1));
}
unsafe {
ptr::write(vector2.as_mut_ptr().add(len), element);
vector2.set_len(len + 1);
}
}
let t6=current_milliseconds();
println!("Solution C Using original codes copied out of vec.rs takes:{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t6 - t5), vector2.len(), vector2[0], vector2[len - 1]);
// It turns out that the copied codes are not performing well as the standard way.
// What the hell it's going on? I cannot explain it, I guess I didn't get the right codes which really
// run in the standard way. Anyway, I noticed that all the magics probably come from using 'unsafe mutable pointer'.
// So, I tried the below, but even performs worse than the copied one, why? I just can not explain it!
let t7=current_milliseconds();
let mut cell_iter3=calc_cells(start, end);
let mut vector3:Vec<[u8;4]> = Vec::with_capacity(len+1);
let ptr_vector=vector3.as_mut_ptr();
let mut ind=0;
while let Some(element) = cell_iter3.next() {
unsafe {
*ptr_vector.add(ind)=element;
ind+=1;
vector3.set_len(ind);
}
}
let t8=current_milliseconds();
println!("Solution D Using 'unsafe mutable pointer' in my own way:{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t8 - t7), vector3.len(), vector3[0], vector3[len - 1]);
let t9=current_milliseconds();
let mut cell_iter4=calc_cells(start, end);
let mut vector4:Vec<[u8;4]> = Vec::with_capacity(len+1);
while let Some(element) = cell_iter4.next() {
let len = vector4.len();
// This is the most weird part, it will take about 90ms longer, always,
// if commenting this block of codes out, even if its body is never
// executed, why???
if len == vector4.capacity() {
let (lower, _) = cell_iter4.size_hint();
vector4.reserve(lower.saturating_add(1));
}
unsafe {
ptr::write(vector4.as_mut_ptr().add(len), element);
vector4.set_len(len + 1);
}
}
let t10=current_milliseconds();
println!("Solution E Using partial codes copied out of vec.rs takes:{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t10 - t9), vector3.len(), vector3[0], vector3[len - 1]);
}
fn main() {
let start = [0, 0, 0, 0];
let end = [127, 127, 127, 29];
let len:usize=128*128*128*30;
for i in 0..5 {
println!("===round {}===", i);
benchmark(start, end, len);
}
}
Result:
===round 0===
Solution A The standard way of using CellIter takes:705ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B Trying an implementation of collecting takes:712ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes:792ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D Using 'unsafe mutable pointer' in my own way:653ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E Using partial codes copied out of vec.rs takes:645ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 1===
Solution A The standard way of using CellIter takes:661ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B Trying an implementation of collecting takes:713ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes:810ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D Using 'unsafe mutable pointer' in my own way:659ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E Using partial codes copied out of vec.rs takes:659ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 2===
Solution A The standard way of using CellIter takes:662ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B Trying an implementation of collecting takes:710ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes:795ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D Using 'unsafe mutable pointer' in my own way:644ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E Using partial codes copied out of vec.rs takes:653ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 3===
Solution A The standard way of using CellIter takes:660ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B Trying an implementation of collecting takes:709ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes:803ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D Using 'unsafe mutable pointer' in my own way:652ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E Using partial codes copied out of vec.rs takes:656ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 4===
Solution A The standard way of using CellIter takes:655ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B Trying an implementation of collecting takes:712ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes:810ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D Using 'unsafe mutable pointer' in my own way:660ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E Using partial codes copied out of vec.rs takes:654ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Questions:
- What is the real implementation of 'calc_cells(start, end).collect'? I guess I failed to find out the real one in VSCode.
- Why commenting out the block of 'if len == vector4.capacity()' leads to constant nontrivial performance change in Solution E? I cannot explain it.