How can I optimize this Rust code so that it runs faster than the equivalent Go version?
main.rs
use std::fmt::Write;
use std::time::{Instant};
fn main() {
    let start = Instant::now();
    // FxHash is much faster than the default SipHash for short keys.
    // Box<str> drops the capacity word a String would carry.
    // Pre-size the map so it never rehashes during insertion.
    let mut m = fxhash::FxHashMap::<Box<str>, Box<str>>::with_capacity_and_hasher(10_000_000, Default::default());
    // Reuse one scratch String across loop iterations.
    let mut k = String::with_capacity(16);
    for i in 0..10_000_000 {
        // clear + write! formats in place, like an allocation-free format!.
        k.clear();
        let _ = write!(k, "{}", i);
        // Box::<str>::from(&str) allocates exactly `len` bytes in one step.
        // The previous k.clone().into_boxed_str() first copied the
        // 16-byte-capacity String and then had to shrink it — an extra
        // allocation/copy per iteration.
        let key: Box<str> = Box::from(&*k);
        let val: Box<str> = Box::from(&*k);
        m.insert(key, val);
    }
    let duration = start.elapsed();
    println!("Time elapsed in expensive_function() is: {:?}", duration);
}
main.go
package main
import (
"fmt"
"strconv"
"time"
)
func main() {
	begin := time.Now()
	// Pre-size the map so it never has to grow while inserting.
	table := make(map[string]string, 10000000)
	for n := 0; n < 10000000; n++ {
		s := strconv.Itoa(n)
		table[s] = s
	}
	elapsed := time.Since(begin)
	fmt.Printf("time cost = %v\n", elapsed)
}
use arrayvec::ArrayString;
// ArrayString<9> keeps up to 9 bytes inline on the stack — no heap
// allocation at all. 9 bytes is enough for every value in 0..10_000_000
// (max 7 digits). NOTE(review): this is a fragment — it needs the usual
// Instant/Write imports and a fn main wrapper to compile.
let mut m = fxhash::FxHashMap::<ArrayString<9>, ArrayString<9>>::with_capacity_and_hasher(10_000_000, Default::default());
for i in 0..10_000_000 {
// Fresh stack-only buffer each iteration; the allocator is never touched.
let mut k = ArrayString::new();
let _ = write!(k, "{}", i);
// Key and value are both small inline copies; the clone is a plain memcpy.
m.insert(k.clone(), k);
}
use itoa;
use std::time::{Instant};
fn main() {
    let start = Instant::now();
    // FxHash beats the default SipHash on short keys, and Box<str>
    // avoids storing a capacity word per string. The map is pre-sized
    // so it never rehashes during the fill.
    let mut map = fxhash::FxHashMap::<Box<str>, Box<str>>::with_capacity_and_hasher(10_000_000, Default::default());
    // itoa formats integers without going through the fmt machinery.
    let mut digits = itoa::Buffer::new();
    for i in 0..10_000_000 {
        let key: Box<str> = digits.format(i).to_string().into_boxed_str();
        let value = key.clone();
        map.insert(key, value);
    }
    let duration = start.elapsed();
    println!("Time elapsed in expensive_function() is: {:?}", duration);
}
Thanks kornel — that gave significant performance gains, but it's still not enough to beat the Go version. Is there still room to improve performance?
1.489s - 1.504s
use std::time::{Instant};
use arrayvec::ArrayString;
use std::fmt::Write;
fn main() {
    let start = Instant::now();
    // FxHash: a fast non-cryptographic hasher, fine for short keys.
    let mut map = fxhash::FxHashMap::<ArrayString<9>, ArrayString<9>>::with_capacity_and_hasher(10_000_000, Default::default());
    for i in 0..10_000_000 {
        // ArrayString keeps the digits inline on the stack,
        // so no heap allocation happens anywhere in this loop.
        let mut digits = ArrayString::new();
        let _ = write!(digits, "{}", i);
        map.insert(digits.clone(), digits);
    }
    let duration = start.elapsed();
    println!("Time elapsed in expensive_function() is: {:?}", duration);
}
Running this rust version (note the Rc instead of Box):
use fxhash::FxHashMap;
use itoa;
use std::{rc::Rc, time::Instant};
fn main() {
    let start = Instant::now();
    let n = 10_000_000;
    let mut m = FxHashMap::with_capacity_and_hasher(n, Default::default());
    // itoa formats integers without the fmt::Display machinery.
    let mut buffer = itoa::Buffer::new();
    for i in 0..n {
        // Rc<str> stores the refcount and the bytes in ONE allocation.
        // The previous Rc<String> needed two per entry: one for the Rc
        // box and one for the String's heap data.
        let s: Rc<str> = Rc::from(buffer.format(i));
        m.insert(s.clone(), s);
    }
    let duration = start.elapsed();
    println!("Time elapsed in expensive_function() is: {:?}", duration);
}
it's ~0.45s faster than the Go version on my 5900X with jemalloc (and about the same without):
➜ rust_bench git:(main) ✗ cargo run --release
Finished release [optimized] target(s) in 0.00s
Running `target/release/rust_bench`
Time elapsed in expensive_function() is: 1.558743062s
➜ rust_bench git:(main) ✗ LD_PRELOAD=/usr/lib/libjemalloc.so cargo run --release
Finished release [optimized] target(s) in 0.00s
Running `target/release/rust_bench`
Time elapsed in expensive_function() is: 1.095548203s
➜ rust_bench git:(main) ✗ ./main
time cost = 1.542023743s
➜ rust_bench git:(main) ✗ LD_PRELOAD=/usr/lib/libjemalloc.so ./main
time cost = 1.550511912s
(note that Zen 3 has some weird quirks, which is why I used jemalloc to avoid them; see here for more information; this will likely not be necessary on Windows)
And here's a stupid parallel version that takes 600ms
use fxhash::FxHashMap;
use itoa;
use rayon::prelude::*;
use std::{sync::Arc, time::Instant};
fn main() {
    let start = Instant::now();
    let n = 10_000_000;
    // Build the map in parallel with rayon; Arc (not Rc) is required
    // because the shared strings cross thread boundaries in collect().
    let m: FxHashMap<_, _> = (0..n)
        .into_par_iter()
        .map(|i| {
            // One itoa buffer per item keeps the closure self-contained.
            let mut buffer = itoa::Buffer::new();
            // Arc<str> keeps refcounts + bytes in a single allocation;
            // Arc<String> would cost two allocations per entry.
            let s: Arc<str> = Arc::from(buffer.format(i));
            (s.clone(), s)
        })
        .collect();
    let duration = start.elapsed();
    println!("Time elapsed in expensive_function() is: {:?}", duration);
    // Consume the map: silences the unused-variable warning the original
    // emitted and prevents the whole build from being optimized away.
    assert_eq!(m.len(), n);
}
Output:
➜ rust_bench git:(main) ✗ time LD_PRELOAD=/usr/lib/libjemalloc.so cargo run --release
warning: unused variable: `m`
--> src/main.rs:9:9
|
9 | let m: FxHashMap<_, _> = (0..n)
| ^ help: if this is intentional, prefix it with an underscore: `_m`
|
= note: `#[warn(unused_variables)]` on by default
warning: `rust_bench` (bin "rust_bench") generated 1 warning (run `cargo fix --bin "rust_bench"` to apply 1 suggestion)
Finished release [optimized] target(s) in 0.00s
Running `target/release/rust_bench`
Time elapsed in expensive_function() is: 608.236324ms
LD_PRELOAD=/usr/lib/libjemalloc.so cargo run --release 1.92s user 0.29s system 128% cpu 1.719 total
Note that the actual runtime of the process is longer because it's still busy running destructors at the end of main. You can skip that by calling exit(0).
If you aren't already, this is a friendly reminder to use --release to get optimizations; the performance shouldn't be over a 2x factor different between targets.
And even if you are, you can probably still push a bit more performance by adding -Ctarget-cpu=native to the RUSTFLAGS environment variable to optimize for specifically the CPU you have rather than a generic older CPU.
Wow that's terrible. I just tested it on windows and can reproduce that it's almost twice as slow as on Linux.
Replacing the default allocator with mimalloc gets me back to 1.6 seconds (versus the 1.8s to 2.1s that I get with Go).
Well, that's unfortunate. I guess this is hitting a pathological case in the windows system allocator. Minor notes on alloc behavior that might have an impact:
Reusing k as a buffer for write!ing into probably isn't going to do much, since you're immediately cloning the String afterwards anyway. (It might mean the string that's converted into the box is already tight and doesn't need to reallocate, though; I'm not sure.)
A 2x performance factor to Go on the same target is probably explainable by the Rust code having 2x the allocations. Go's GC will be able to share the strings between key and value in the map, whereas the Rust code gives each a separate allocation.
This (tiny strings as map key/values) is exactly the use case of a smart string which stores small strings (up to ~20 bytes) without a heap allocation. This would allow eliminating most of the allocation pressure in the code.
This seems to be an allocator-benchmark, so one nitpick: Rc<String> has two allocations: one for the Rc and one for the string-data. You should use Rc<str> instead. It is probably not possible to do this without copying (this from impl works, but copies).
Go probably has some advantage here because it does arena allocation by default and has no extra allocation for reference counting. A bumpalo-style arena will most certainly speed things up.
Finally, to mirror the Go code more closely, the map should contain shared references, not the Rc. If you want to be able to mutate the keys afterwards, use Cell<&'alloc str>
/*
[dependencies]
itoa = "*"
fxhash = "*"
bumpalo = {version = "*", features = ["collections"]}
*/
use fxhash::FxHashMap;
use itoa;
use std::time::Instant;
use bumpalo::{Bump, collections::String};
fn main() {
let n = 10_000usize;
let arena = Bump::with_capacity(n*(n.ilog10() as usize));
let start = Instant::now();
let mut m = FxHashMap::with_capacity_and_hasher(n, Default::default());
let mut buffer = itoa::Buffer::new();
for i in 0..n {
let s = buffer.format(i);
let s = String::from_str_in(s, &arena).into_bump_str();
m.insert(s, s);
}
let duration = start.elapsed();
println!("Time elapsed in expensive_function() is: {:?}", duration);
}
This should now be fast. (n reduced to be able to execute on RE)
error[E0433]: failed to resolve: could not find `collections` in `bumpalo`
--> src\main.rs:4:21
|
4 | use bumpalo::{Bump, collections::string::String};
| ^^^^^^^^^^^ could not find `collections` in `bumpalo`
You need to enable the collections feature in the Cargo.toml for the bumpalo crate. @samuelpilz also wrote the required line in the comment above the code:
bumpalo = {version = "*", features = ["collections"]}
You can also do it by running cargo add bumpalo --features collections
See here for documentation on that feature in bumpalo.
See here for documentation on features.
use fxhash::FxHashMap;
use itoa;
use std::time::Instant;
use bumpalo::{Bump, collections::string::String};
fn main() {
let start = Instant::now();
let n = 10_000_000usize;
let arena = Bump::with_capacity(n*(n.ilog10() as usize));
let mut m = FxHashMap::with_capacity_and_hasher(n, Default::default());
let mut buffer = itoa::Buffer::new();
for i in 0..n {
let s = buffer.format(i);
let s = String::from_str_in(s, &arena).into_bump_str();
m.insert(s, s);
}
let duration = start.elapsed();
println!("Time elapsed in expensive_function() is: {:?}", duration);
println!("Time elapsed in expensive_function() is: {:?}", m.keys().len());
}
Besides allocation, it's also the hash function and the hash map that are being tested. (Edit: and I guess cache locality matters too.)
Replacing the FxHashMap with an array of MaybeUninits results in a factor 10 speedup::
use itoa;
use std::{time::Instant, mem::MaybeUninit};
use bumpalo::{Bump, collections::String};
fn main() {
    let n = 10_000_000usize;
    // Arena sized up front for roughly all formatted integers.
    let arena = Bump::with_capacity(n * (n.ilog10() as usize));
    let start = Instant::now();
    // Indexed stores into pre-sized storage instead of the FxHashMap used
    // in the earlier versions — this removes hashing entirely, isolating
    // the formatting/arena cost from the map cost.
    let mut slots = vec![MaybeUninit::uninit(); n];
    let mut digits = itoa::Buffer::new();
    for i in 0..n {
        let text = digits.format(i);
        let text = String::from_str_in(text, &arena).into_bump_str();
        slots[i] = MaybeUninit::new(text);
    }
    let duration = start.elapsed();
    println!("Time elapsed in expensive_function() is: {:?}", duration);
}
hyperfine benchmark:
➜ rust_bench git:(main) ✗ RUSTFLAGS="-C target-cpu=native" cargo build --release; LD_PRELOAD=/usr/lib/libjemalloc.so hyperfine ./target/release/rust_bench
Compiling rust_bench v0.1.0 (/home/projects/rust_bench)
Finished release [optimized] target(s) in 0.10s
Benchmark 1: ./target/release/rust_bench
Time (mean ± σ): 136.1 ms ± 5.1 ms [User: 120.0 ms, System: 14.8 ms]
Range (min … max): 131.5 ms … 149.9 ms 21 runs