Hi all, I'm new to Rust, so I wrote a little script as a learning exercise. The goal is to process a file as fast as possible. Currently it takes about 8 minutes to process the file. The file is about 11 GB, my CPU has 10 cores, and I have about 16 GB of RAM. This is my code:
use std::fs;
use std::sync::Arc;
use regex::Regex;
use std::time::Instant;
use std::thread;
use rayon::prelude::*;
fn main() {
// read the file
let start_reading_time = Instant::now();
let file_as_str = match fs::read_to_string("testing.txt") {
Ok(data) => data,
Err(e) => {
eprintln!("Error reading file: {}", e);
return;
}
};
let end_reading_time = Instant::now();
let elapsed_reading_time = end_reading_time - start_reading_time;
println!("elapsed reading time : {:?}", elapsed_reading_time);
// split string into array lines
let start_split_time = Instant::now();
let lines: Vec<String> = file_as_str.lines().par_bridge().map(|f|{
f.to_string()
}).collect();
let end_split_time = Instant::now();
let elapsed_split_time = end_split_time - start_split_time;
println!("elapsed split time : {:?}", elapsed_split_time);
// main process
let start_time = Instant::now();
let arc_lines = Arc::new(lines);
let lines_handle = {
let arc_lines = Arc::clone(&arc_lines);
thread::spawn(move || {
println!("count_lines : {}", count_lines(arc_lines));
})
};
let spaces_handle = {
let arc_lines = Arc::clone(&arc_lines);
thread::spawn(move || {
println!("count_spaces : {}", count_spaces(arc_lines));
})
};
let words_handle = {
let arc_lines = Arc::clone(&arc_lines);
thread::spawn(move || {
println!("count_words : {}", count_words(arc_lines));
})
};
let paragraphs_handle = {
let arc_lines = Arc::clone(&arc_lines);
thread::spawn(move || {
println!("count_paragraphs : {}",count_paragraphs(arc_lines));
})
};
lines_handle.join().expect("Thread panicked");
spaces_handle.join().expect("Thread panicked");
words_handle.join().expect("Thread panicked");
paragraphs_handle.join().expect("Thread panicked");
let end_time = Instant::now();
let elapsed_time = end_time - start_time;
println!("elapsed main process time : {:?}", elapsed_time);
}
/// Returns the number of lines, i.e. the number of elements in the shared vector.
///
/// The vector already stores its length, so no iteration (parallel or otherwise)
/// is needed.
fn count_lines(file_as_str: Arc<Vec<String>>) -> usize {
    file_as_str.len()
}
fn count_spaces(file_as_str: Arc<Vec<String>>) -> usize {
let re = Regex::new(r"\s+").unwrap();
file_as_str.par_iter().map(|x|{
re.find_iter(x).count()
}).sum()
}
/// Returns the total number of words across all lines, where a word is a maximal
/// run of `\w` characters.
///
/// Lines are scanned in parallel and the per-line match counts are summed.
fn count_words(file_as_str: Arc<Vec<String>>) -> usize {
    // A greedy `\w+` match always begins and ends on a word boundary, so the
    // surrounding `\b` assertions of the earlier pattern were redundant; dropping
    // them yields the same matches without the cost of boundary checks.
    let word = Regex::new(r"\w+").unwrap();
    file_as_str
        .par_iter()
        .map(|line| word.find_iter(line).count())
        .sum()
}
/// Returns the number of paragraphs, where a paragraph is a maximal run of
/// consecutive non-blank lines (a blank line is empty or whitespace-only).
///
/// The input is already split into lines, so no element contains a `\n`; a
/// newline-based pattern applied per line can never match and would always yield 0.
/// Paragraph boundaries are therefore detected from blank lines instead.
///
/// The result is only meaningful when the vector holds the lines in file order.
fn count_paragraphs(file_as_str: Arc<Vec<String>>) -> usize {
    let mut paragraphs = 0;
    let mut in_paragraph = false;
    for line in file_as_str.iter() {
        let is_blank = line.chars().all(char::is_whitespace);
        // A paragraph starts at the first non-blank line after a blank one
        // (or at the very first non-blank line).
        if !is_blank && !in_paragraph {
            paragraphs += 1;
        }
        in_paragraph = !is_blank;
    }
    paragraphs
}
Is it still possible to optimize my code? If so, may I have some hints? Thanks, all.