I read that you like to read code and try to make it more efficient.
So, this is my code, have fun.
extern crate criterion_plot as plot;
extern crate rayon;
extern crate csv;
extern crate chashmap;
extern crate walkdir;
#[macro_use]
extern crate serde_derive;
extern crate statistical;
extern crate itertools_num as itertools;
extern crate palette;
use std::path::PathBuf;
use chashmap::CHashMap;
use rayon::prelude::*;
use walkdir::WalkDir;
use csv::Reader;
use plot::prelude::*;
use itertools::linspace;
use palette::{Hsl, Rgb, FromColor};
type SerialNumber = String;
/// One row of a daily CSV file, restricted to the columns this program reads.
#[derive(Deserialize)]
struct Record {
    /// Value of the `serial_number` column.
    #[serde(rename = "serial_number")]
    serial: SerialNumber,
    /// Value of the `model` column.
    model: String,
    /// Value of the `capacity_bytes` column.
    #[serde(rename = "capacity_bytes")]
    capacity: u64,
    /// Value of the `failure` column; `main` keeps only rows where it is 0.
    failure: u8,
}
/// A `Record` that passed the failure filter, without its `failure` field.
struct Record2 {
    /// Serial number of the drive. It is wrapped in an `Option` so that it can
    /// be taken out of the value while the other fields are moved elsewhere.
    serial: Option<SerialNumber>,
    /// Model name of the drive.
    model: String,
    /// Capacity copied from `Record::capacity` (the `capacity_bytes` column).
    capacity: u64,
}
/// What is accumulated for a single drive across all the daily files.
struct Drive {
    /// Model name of the drive. It is wrapped in an `Option` so that it can be
    /// taken out of the value without consuming the other fields.
    model: Option<String>,
    /// Capacity of the drive, in bytes.
    capacity: u64,
    /// Life of the drive, in days.
    life: u64,
}
/// What is accumulated for one drive model.
struct Model {
    /// Capacity shared by the drives of this model, in bytes.
    capacity: u64,
    /// Life, in days, of every drive of this model (one entry per drive).
    lifes: Vec<f64>,
}
/// Compares some Hard Disk Drives using the data provided by
/// [BackBlaze] (https://www.backblaze.com/b2/hard-drive-test-data.html)
/// in order to extract a survival analysis. Produces a plot for each
/// capacity and an output with all models ordered by capacity and
/// mean life time.
fn main() {
let root = {
let mut args = std::env::args();
let prog_name = args.next().expect("ERROR. Missing executable name.");
args.next()
.expect(format!("ERROR. Usage: {} <root of folder containing csv>",
prog_name).as_str())
};
let drives: CHashMap<SerialNumber, Drive> = CHashMap::new();
let files = WalkDir::new(root).into_iter()
.filter_map(|de| {
de.ok().and_then(|e| {
if e.file_type().is_file() {
Some(e.path().to_path_buf())
} else {
None
}
})
})
.collect::<Vec<_>>();
let days = files.len();
files.into_par_iter()
.filter_map(|pb| Reader::from_path(pb).ok())
.flat_map(|reader|
reader.into_deserialize::<Record>()
.collect::<Vec<_>>())
.filter_map(|rec| {
if let Ok(rec) = rec {
if rec.failure == 0 { return Some(
Record2 { serial: Some(rec.serial),
capacity: rec.capacity,
model: rec.model} )
}}
None
})
.for_each(|mut r| {
drives.upsert(r.serial.take().unwrap(),
move || Drive{ model: Some(r.model),
capacity: r.capacity,
life: 1 },
|d| d.life += 1);
});
let models = CHashMap::new();
drives.into_iter()
.collect::<Vec<_>>()
.into_par_iter()
.for_each(|(_, mut drive)| {
models.upsert(drive.model.take().unwrap(),
|| Model { capacity: drive.capacity,
lifes: vec![drive.life as f64] },
|m| m.lifes.push(drive.life as f64));
});
let base_color:Hsl = Hsl::from_rgb(Rgb::new_u8(0x8a, 0x56, 0xe2));
let ref xs: Vec<_> = linspace::<f64>(0.0, days as f64, days + 1).collect();
let figures = CHashMap::new();
let count_for_cap:CHashMap<u64, (u32, u32)> = CHashMap::new();
let models:Vec<_> = models.into_iter().collect();
models.par_iter().for_each(|&(_, ref data)| {
count_for_cap.upsert(data.capacity,
|| (1, 0),
|c| c.0+=1);
});
println!("Reliability per model:");
let mut ordered: Vec<_> = models.into_par_iter()
.map(|(model, data)| {
let color :Rgb = {
let mut color = base_color;
let mut guard = count_for_cap.get_mut(&data.capacity).unwrap();
let n = guard.0;
let i = guard.1;
let step = 240f32/n as f32;
color.hue = (((color.hue.to_degrees() + step * i as f32)) % 300f32).into();
guard.1 += 1;
Rgb::from_hsl(color)
};
let mean = statistical::mean(data.lifes.as_slice());
let cap = data.capacity;
let model2 = model.clone();
figures.alter(data.capacity,
move |optf| {
let mut f =
if let Some(f) = optf {
f
} else {
let mut f = Figure::new();
let path = format!("./{}.svg", cap);
let mut pb = PathBuf::new();
pb.push(path);
f.set(Title(format!("Survival analysis. {}bytes drives", cap)))
.set(Output(pb))
.set(Size(1920, 1080))
.set(FontSize(8.0))
.configure(Key, |k| k.set(Boxed::Yes));
f};
let mut ys = vec![0; days+1];
let total = data.lifes.len();
let mut alives = total;
for l in data.lifes.into_iter() {
ys[l as usize] +=1;
}
//at this point ys[i] says how much drives died in the i-th day
for y in ys.iter_mut() {
alives -= *y;
*y = alives*1000 / total;
}
f.plot(
Lines {
x: xs,
y: ys
},
|lp| lp.set(Label(model))
.set(Color::Rgb((color.red*255.) as u8,
(color.green*255.) as u8,
(color.blue*255.) as u8))
);
Some(f)
});
(model2, cap, mean)
}).collect();
ordered.par_sort_unstable_by(|&(ref m1, c1, l1), &(ref m2, c2, l2)| {
if c1 == c2 {
if l1 == l2 {
m1.cmp(&m2)
} else {
l2.partial_cmp(&l1).unwrap()
}
} else {
c1.cmp(&c2)
}
});
for (model, capacity, life) in ordered.into_iter() {
println!("{:30}{:15}{:8.3}", model, capacity, life);
}
for (_, mut f) in figures.into_iter() {
f.draw().ok().and_then(|gnuplot| {
gnuplot.wait_with_output().ok()
.and_then(|p| String::from_utf8(p.stderr).ok())
}).expect("ERROR occurred while plotting");
}
}
This code compiles and works.
Using the data from 2013, 2014 and 2015 (8.3 GiB, 996 CSV files), it takes:
real 3m40.479s
user 2m33.840s
sys 0m9.532s
(from the time command, --release version).
So if you think I can do something to improve the performance, I will try it and reply to tell you whether it worked.
Thank you.