Hi everyone,
I am trying to parse a tab-delimited file (21,283,105 lines) line by line, apply a filter, and build a HashMap. My Rust solution (release mode) takes about 9.812s, while the Python version takes about 13.069s, which makes me doubt my Rust solution (I'm new to the language).
Here's the code
Rust
use csv;
use std::path::Path;
use std::collections::HashMap;
/// One row of the tab-delimited bim file.
///
/// `read_bim` reads the file with `has_headers(false)`, so the fields are
/// presumably matched to columns by position; keep the field order in sync
/// with the column order of the file.
#[derive(Debug, serde::Deserialize)]
struct BimRecord {
/// Chromosome number (first column); `read_bim` filters on this value.
chr: i32,
/// Variant identifier (second column), e.g. `rs9605903`.
snp: String,
/// Third column; `0` in every sample row.
/// NOTE(review): presumably the genetic distance — confirm against the file spec.
pos: f64,
/// Fourth column.
/// NOTE(review): presumably the base-pair coordinate — confirm against the file spec.
bp: u32,
/// First allele (fifth column).
a1: String, // alleles are String and not char because of INDELs
/// Second allele (sixth column).
a2: String,
}
/// Reads the tab-delimited, header-less file at `bim_path` and returns the
/// records whose `chr` field equals `chrom`, in file order.
///
/// # Errors
///
/// Returns the first `csv::Error` encountered, whether it comes from opening
/// the file or from reading/deserializing a row.
fn read_bim<P>(chrom: i32, bim_path: P) -> csv::Result<Vec<BimRecord>>
where
    P: AsRef<Path>,
{
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .delimiter(b'\t')
        .from_path(bim_path.as_ref())?;

    let mut kept = Vec::new();
    for row in reader.deserialize::<BimRecord>() {
        // A malformed row aborts the whole read instead of being skipped.
        let row = row?;
        if row.chr == chrom {
            kept.push(row);
        }
    }
    Ok(kept)
}
/// Builds a column-oriented view of the variants on one chromosome.
///
/// Reads `bim_path` through `read_bim`, keeping only the rows whose
/// chromosome equals `chrom`, prints how many rows were kept, and returns a
/// map with three keys — `"SNP"`, `"A1"` and `"A2"` — each holding that
/// column's values in file order. All three vectors have the same length.
///
/// # Panics
///
/// Panics if the file cannot be opened or a row cannot be parsed.
pub fn bim_hash<P>(chrom: i32, bim_path: P) -> HashMap<String, Vec<String>>
where
    P: AsRef<Path>,
{
    let records = read_bim(chrom, bim_path).expect("failed to read or parse the bim file");
    let cont = records.len();
    println!("The bim has {} variants", cont);

    // Reserve the exact size up front and move each field in, rather than
    // allocating a placeholder string per slot only to overwrite it.
    let mut snp_vec = Vec::with_capacity(cont);
    let mut a1_vec = Vec::with_capacity(cont);
    let mut a2_vec = Vec::with_capacity(cont);
    for rec in records {
        snp_vec.push(rec.snp);
        a1_vec.push(rec.a1);
        a2_vec.push(rec.a2);
    }

    HashMap::from([
        ("SNP".to_string(), snp_vec),
        ("A1".to_string(), a1_vec),
        ("A2".to_string(), a2_vec),
    ])
}
/// Entry point: builds the SNP/A1/A2 column map for chromosome 1.
fn main() {
    // 1,653,512 lines are on chr 1
    let chromosome = 1;
    let _chr1_columns = bim_hash(chromosome, "/path/to//input.bim");
}
Python
def parse_bim(bim_file, chrom):
    """Collect the SNP, A1 and A2 columns for the rows on one chromosome.

    Args:
        bim_file: Path to a whitespace-delimited file whose first, second,
            fifth and sixth columns are the chromosome, SNP id and the two
            alleles.
        chrom: Integer chromosome to keep.

    Returns:
        A dict with keys 'SNP', 'A1' and 'A2', each mapping to a list of the
        matching rows' values in file order.

    Raises:
        ValueError: If a row's first column is not an integer.
        IndexError: If a matching row has fewer than six columns.
    """
    bim_dict = {'SNP': [], 'A1': [], 'A2': []}
    with open(bim_file) as ff:
        for line in ff:
            # split() with no argument already ignores leading/trailing whitespace.
            ll = line.split()
            if not ll:
                # Blank line (e.g. an empty line at the end of the file).
                continue
            if int(ll[0]) == chrom:
                bim_dict['SNP'].append(ll[1])
                bim_dict['A1'].append(ll[4])
                bim_dict['A2'].append(ll[5])
    return bim_dict
# Example call: collect the SNP/A1/A2 columns for chromosome 1.
bim = parse_bim("/path/to//input.bim", 1)
Here's what the file looks like:
22 rs9605903 0 17054720 C T
22 rs5746647 0 17057138 G T
22 rs5747999 0 17075353 C A
22 rs2845380 0 17203103 A G
22 rs2247281 0 17211075 G A
22 rs2845346 0 17214252 C T
22 rs2845347 0 17214669 C T
22 rs1807512 0 17221495 C T
22 rs5748593 0 17227461 T C
22 rs9606468 0 17273728 C T
Any help on how I can optimize my solution and write it better would be appreciated. Thanks!