This issue is about optimizing the performance of counting the occurrences of instances of a collection of u32 values.
Using the Rust standard library, such feature may be implemented as a map to a counter. The Rust standard library has two kinds of maps: std::collections::HashMap
and std::collections::BTreeMap
.
Using the first one, I wrote the following Rust program:
fn main() {
let mut m = std::collections::HashMap::<u32, u32>::new();
for i in 0..200_000_000 {
*m.entry(i % 100_000).or_insert(0) += 1;
}
println!("{}", m[&40000]);
}
It prints β2000
β.
If βHashMap
β is replaced by βBTreeMap
β, an equivalent program is obtained.
Then I wrote the following equivalent C++ program, that uses also a GCC-only library extension, contained in the "__gnu_pbds
" namespace:
#include <iostream>
#include <ctime>
#include <map>
#include <unordered_map>
#include <ext/pb_ds/assoc_container.hpp>
using namespace std;
template<typename T>
void f() {
clock_t start = clock();
T m;
for (int i = 0; i < 200000000; ++i) {
auto j = i % 100000;
auto pos = m.find(j);
if (pos == m.end()) m[j] = 1;
else ++pos->second;
}
cout << m[40000] << " " << float(clock() - start)
/ CLOCKS_PER_SEC << " s" << endl;
}
int main() {
f<map<unsigned int, unsigned int>>();
f<unordered_map<unsigned int, unsigned int>>();
f<__gnu_pbds::cc_hash_table<unsigned int, unsigned int>>();
}
The running times that I got in one computer are the following ones:
-
std::collections::HashMap
: 8.9 seconds; -
std::collections::BTreeMap
: 12.6 seconds; -
std::map
: 20.2 seconds; -
std::unordered_map
: 2.61 seconds; -
__gnu_pbds::cc_hash_table
: 0.83 seconds.
It appears that the Rust program using BTreeMap
outperforms the C++ program using map
(that uses a red-black tree), but it appears also that the fastest Rust program, that is the one that uses HashMap
, takes more than 3 times the time taken by the program using unordered_map
, and more than 10 times the time taken by the program using cc_hash_table
.
To demonstrate that it is not a defect in the test program but an inefficiency of the library, I encapsulated the C++ collections in a C++ library, and then called such library from a Rust program using FFI.
Here is the C++ source code of the library:
#include <unordered_map>
#include <ext/pb_ds/assoc_container.hpp>
using namespace std;
static unordered_map<unsigned int, unsigned int> um;
static __gnu_pbds::cc_hash_table<unsigned int, unsigned int> ccht;
extern "C" void um_create() { }
extern "C" void ccht_create() { }
extern "C" void um_destroy() { um.clear(); }
extern "C" void ccht_destroy() { ccht.clear(); }
extern "C" void um_increment_count(unsigned int i) {
try {
auto pos = um.find(i);
if (pos == um.end()) um[i] = 1;
else ++pos->second;
}
catch (...) { }
}
extern "C" void ccht_increment_count(unsigned int i) {
try {
auto pos = ccht.find(i);
if (pos == ccht.end()) ccht[i] = 1;
else ++pos->second;
}
catch (...) { }
}
extern "C" unsigned int um_get_count(unsigned int i) {
try {
return um[i];
}
catch (...) { }
return 0;
}
extern "C" unsigned int ccht_get_count(unsigned int i) {
try {
return ccht[i];
}
catch (...) { }
return 0;
}
Here is the Rust source code of the program using such library:
use std::time::Instant;
#[link(name = "stdc++", kind = "static")]
#[link(name = "hash", kind = "static")]
extern {
fn um_create();
fn ccht_create();
fn um_destroy();
fn ccht_destroy();
fn um_increment_count(i: i32);
fn ccht_increment_count(i: i32);
fn um_get_count(i: u32) -> i32;
fn ccht_get_count(i: u32) -> i32;
}
fn main() {
let t0 = Instant::now();
unsafe {
um_create();
for i in 0..200_000_000 {
um_increment_count(i % 100_000);
}
println!("{}", um_get_count(40000));
um_destroy();
}
let d1 = t0.elapsed();
println!("{}", d1.as_secs() as f64 + d1.subsec_nanos() as f64 / 1e9);
let t1 = Instant::now();
unsafe {
ccht_create();
for i in 0..200_000_000 {
ccht_increment_count(i % 100_000);
}
println!("{}", ccht_get_count(40000));
ccht_destroy();
}
let d2 = t1.elapsed();
println!("{}", d2.as_secs() as f64 + d2.subsec_nanos() as f64 / 1e9);
}
And here is the Bash script to build the library and the program, and to run the generated program:
g++ -std=c++11 -fPIC -c -O3 hash2.cpp -o hash2.o && \
ar rcs libhash.a hash2.o && \
rustc -O -L. hash2.rs && \
./hash2
The resulting times are:
-
std::unordered_map
: 2.69 seconds; -
__gnu_pbds::cc_hash_table
: 0.83 seconds.
The resulting times are the same of the C++-only program.
I donβt know why the HashMap
collection of the Rust standard library is performing so poorly, compared to map
, and why map
is performing so poorly, compared to cc_hash_table
, but I think these differences such be seriously taken into account to improve the performance of the HashMap
collection, or to provide an alternative Rust collection.