What is under the hood of Iterator::collect? I cannot explain the weirdest part

I just like to squeeze out the last drip of performance out of my laptop,

use std::time::{SystemTime, UNIX_EPOCH};
use std::ptr;

struct CellIter<const N: usize> {
    start: [u8; N],
    end: [u8; N],
    pending: [u8; N],
    done: bool,
}

impl<const N: usize> Iterator for CellIter<N> {
    type Item = [u8; N];
    fn next(&mut self) -> Option<[u8; N]> {
        if self.done {
            return None;
        }
        let result = self.pending;
        for i in (0..N).rev() {
            let inc:u8 = self.pending[i] as u8 + 1;
            if inc <= self.end[i] as u8 {
                self.pending[i] = inc as u8;
                return Some(result);
            }
            self.pending[i] = self.start[i];
        }
        self.done = true;
        Some(result)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let final_size: usize = self
            .start
            .iter()
            .copied()
            .zip(self.end.iter().copied())
            .map(|(s, e)| e as usize - s as usize + 1)
            .fold(1, std::ops::Mul::mul);
        (final_size, Some(final_size))
    }
}

impl<const N: usize> ExactSizeIterator for CellIter<N> {}

fn calc_cells<const N: usize>(mut start: [u8; N], mut end: [u8; N]) -> CellIter<N> {
    for i in 0..N {
        if start[i] > end[i] {
            std::mem::swap(&mut start[i], &mut end[i])
        }
    }
    CellIter {
        start,
        end,
        pending: start,
        done: false,
    }
}

fn current_milliseconds()->u128{
    return SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis();
}

fn benchmark(start:[u8;4], end:[u8;4], len:usize){
    
    // A standard way of using CellIter
    let t1=current_milliseconds();
    let vector0 = calc_cells(start, end).collect::<Vec<_>>();
    let t2=current_milliseconds();
    println!("Solution A        The standard way of using CellIter takes´╝Ü{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t2 - t1), vector0.len(), vector0[0], vector0[len - 1]);

    // But, I'm curious about what if I use CellIter only as a generator of generating all cells,
    // and leave the collecting logics to my own way.
    let t3=current_milliseconds();
    let mut cell_iter1=calc_cells(start, end);
    let mut vector1=Vec::with_capacity(len);
    let mut cell_option1=cell_iter1.next();
    loop{
        match cell_option1{
            Some(cell)=>vector1.push(cell),
            _=>break
        }
        cell_option1=cell_iter1.next();
    }
    let t4=current_milliseconds();
    println!("Solution B   Trying an implementation of collecting takes´╝Ü{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t4 - t3), vector1.len(), vector1[0], vector1[len - 1]);
    
    // It turns out that my implementation is no better than the unknown collecting logics of the standard way,
    // but, why? so I dig around in VSCode, trying to find out the magics under the hood,
    // and come up with following codes copied out of vec.rs with minor tweaks.
    let t5=current_milliseconds();
    let mut cell_iter2=calc_cells(start, end);
    let mut vector2:Vec<[u8;4]> = match cell_iter2.next() {
        None => Vec::new(),
        Some(element) => {
            let (lower, _) = cell_iter2.size_hint();
            let mut vector2 = Vec::with_capacity(lower.saturating_add(1));
            unsafe {
                ptr::write(vector2.as_mut_ptr(), element);
                vector2.set_len(1);
            }
            vector2
        }
    };
    while let Some(element) = cell_iter2.next() {
        let len = vector2.len();
        if len == vector2.capacity() {
            let (lower, _) = cell_iter2.size_hint();
            vector2.reserve(lower.saturating_add(1));
        }
        unsafe {
            ptr::write(vector2.as_mut_ptr().add(len), element);
            vector2.set_len(len + 1);
        }
    }
    let t6=current_milliseconds();
    println!("Solution C Using original codes copied out of vec.rs takes´╝Ü{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t6 - t5), vector2.len(), vector2[0], vector2[len - 1]);
    
    // It turns out that the copied codes are not performing well as the standard way.
    // What the hell it's going on? I cannot explain it, I guess I didn't get the right codes which really
    // run in the standard way. Anyway, I noticed that all the magics probably come from using 'unsafe mutable pointer'.
    // So, I tried the below, but even performs worse than the copied one, why? I just can not explain it!
    let t7=current_milliseconds();
    let mut cell_iter3=calc_cells(start, end);
    let mut vector3:Vec<[u8;4]> = Vec::with_capacity(len+1);
    let ptr_vector=vector3.as_mut_ptr();
    let mut ind=0;
    while let Some(element) = cell_iter3.next() {
        unsafe {
            *ptr_vector.add(ind)=element;
            ind+=1;
            vector3.set_len(ind);
        }
    }
    let t8=current_milliseconds();
    println!("Solution D    Using 'unsafe mutable pointer' in my own way´╝Ü{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t8 - t7), vector3.len(), vector3[0], vector3[len - 1]);
    

    let t9=current_milliseconds();
    let mut cell_iter4=calc_cells(start, end);
    let mut vector4:Vec<[u8;4]> = Vec::with_capacity(len+1);
    while let Some(element) = cell_iter4.next() {
        let len = vector4.len();
        // This is the most weird part, it will take about 90ms longer, always, 
        // if commenting this block of codes out, even if its body is never
        // executed, why???
        if len == vector4.capacity() {
            let (lower, _) = cell_iter4.size_hint();
            vector4.reserve(lower.saturating_add(1));
        }
        
        unsafe {
            ptr::write(vector4.as_mut_ptr().add(len), element);
            vector4.set_len(len + 1);
        }
    }
    let t10=current_milliseconds();
    println!("Solution E  Using partial codes copied out of vec.rs takes´╝Ü{}ms, verify: length {}, first cell {:?}, last cell {:?}", (t10 - t9), vector3.len(), vector3[0], vector3[len - 1]);
    
}

fn main() {
    let start = [0, 0, 0, 0];
    let end = [127, 127, 127, 29];
    let len:usize=128*128*128*30;

    for i in 0..5 {
        println!("===round {}===", i);
        benchmark(start, end, len);
    }
}

Result:

===round 0===
Solution A        The standard way of using CellIter takes´╝Ü705ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B   Trying an implementation of collecting takes´╝Ü712ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes´╝Ü792ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D    Using 'unsafe mutable pointer' in my own way´╝Ü653ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E  Using partial codes copied out of vec.rs takes´╝Ü645ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 1===
Solution A        The standard way of using CellIter takes´╝Ü661ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B   Trying an implementation of collecting takes´╝Ü713ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes´╝Ü810ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D    Using 'unsafe mutable pointer' in my own way´╝Ü659ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E  Using partial codes copied out of vec.rs takes´╝Ü659ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 2===
Solution A        The standard way of using CellIter takes´╝Ü662ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B   Trying an implementation of collecting takes´╝Ü710ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes´╝Ü795ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D    Using 'unsafe mutable pointer' in my own way´╝Ü644ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E  Using partial codes copied out of vec.rs takes´╝Ü653ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 3===
Solution A        The standard way of using CellIter takes´╝Ü660ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B   Trying an implementation of collecting takes´╝Ü709ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes´╝Ü803ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D    Using 'unsafe mutable pointer' in my own way´╝Ü652ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E  Using partial codes copied out of vec.rs takes´╝Ü656ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
===round 4===
Solution A        The standard way of using CellIter takes´╝Ü655ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution B   Trying an implementation of collecting takes´╝Ü712ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution C Using original codes copied out of vec.rs takes´╝Ü810ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution D    Using 'unsafe mutable pointer' in my own way´╝Ü660ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]
Solution E  Using partial codes copied out of vec.rs takes´╝Ü654ms, verify: length 62914560, first cell [0, 0, 0, 0], last cell [127, 127, 127, 29]

Questions:

  1. What is the real implementation of 'calc_cells(start, end).collect'? I guess I failed to find out the real one in VSCode.
  2. Why commenting out the block of 'if len == vector4.capacity()' leads to constant nontrivial performance change in Solution E? I cannot explain it.

Please format your post by placing a line with three backticks in a row (```) just before and just after each block of code. Otherwise it's hard to read what you've written.

Thank you, corrected it.

1 Like

Please format your post by using a code block instead of an image.

This benchmark seems very sensitive to optimization settings. For example, here are some typical results I get with different rustc options. I compiled the code with rustc 1.49.0 on x86_64-unknown-linux-gnu. The results were fairly stable from run to run.

                                           A    B    C    D    E
-C opt-level=2                             254  237  377  210  375
-C opt-level=3                             258  377  376  375  396
-C opt-level=3 -C lto                      376  376  375  375  378
-C opt-level=3 -C codegen-units=1          237  237  266  233  236
-C opt-level=3 -C codegen-units=1 -C lto   236  220  236  214  231

Solutions B and D do significantly worse with just opt-level=3 (which is the default for Cargo release builds) than with most other settings.

Solutions C and E do much better with codegen-units=1 than without.

Perhaps most surprisingly, enabling lto (without codegen-units=1) makes some of the solutions much worse. All of the solutions are about equally bad in this configuration!

Perhaps having all of the code in one big function is throwing off some of LLVM's heuristics.

3 Likes

Thank you very much. I tried your rustc options, I agree with you, basically the same settings shows the similar performance pattern on Windows 10 as yours.

Yeah, good idea, replaced.

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.