One could also use scoped_tls to inject the pool in the Hash without needing extra fields (nor RefCell), like this:
scoped_tls::scoped_thread_local! {
static STRING_POOL: String
}
use std::borrow::Borrow;
use std::collections::HashSet;
use std::fmt::Display;
use std::hash::Hash;
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub struct Substr {
start: usize,
len: usize,
}
impl std::ops::Index<Substr> for str {
type Output = str;
fn index(&self, k: Substr) -> &str {
&self[k.start..][..k.len]
}
}
impl std::ops::Index<Substr> for String {
type Output = str;
fn index(&self, k: Substr) -> &str {
&self[..][k]
}
}
/// newtype around [`Substr`], for new Hash implementation
//
// The simple PartialEq & Eq are still
// logically correct if interning works as intended.
#[derive(PartialEq, Eq)]
struct SubstrInPool(Substr);
impl SubstrInPool {
/// # Panics
/// Will panic if used without initialized `STRING_POOL` context
fn with_str<R>(&self, callback: impl FnOnce(&str) -> R) -> R {
STRING_POOL.with(|pool| callback(&pool[self.0]))
}
}
/// # Panics
/// Will panic if used without initialized `STRING_POOL` context
impl Hash for SubstrInPool {
/// # Panics
/// Will panic if used without initialized `STRING_POOL` context
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.with_str(|s| s.hash(state));
}
}
/// # Panics
/// Will panic if used without initialized `STRING_POOL` context
impl Display for SubstrInPool {
/// # Panics
/// Will panic if used without initialized `STRING_POOL` context
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.with_str(|s| f.write_str(s))
}
}
impl std::fmt::Debug for SubstrInPool {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut d = f.debug_struct("SubstrInPool");
d.field("start", &self.0.start);
d.field("len", &self.0.len);
if STRING_POOL.is_set() {
self.with_str(|s| d.field("value", &s));
} else {
d.field(
"value",
&std::fmt::from_fn(|f| f.write_str("(unknown outside STRING_POOL context)")),
);
}
d.finish()
}
}
// helper trait to deal with lack of `Equivalent` in `HashSet` API
trait WithStrDyn {
fn with_str_dyn(&self, callback: &mut dyn FnMut(&str));
}
impl WithStrDyn for SubstrInPool {
fn with_str_dyn(&self, callback: &mut dyn FnMut(&str)) {
self.with_str(callback);
}
}
impl WithStrDyn for &str {
fn with_str_dyn(&self, callback: &mut dyn FnMut(&str)) {
callback(self)
}
}
impl<'a> Borrow<dyn WithStrDyn + 'a> for SubstrInPool {
fn borrow(&self) -> &(dyn WithStrDyn + 'a) {
self
}
}
impl Hash for dyn WithStrDyn + '_ {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.with_str_dyn(&mut |s| s.hash(state));
}
}
impl PartialEq for dyn WithStrDyn + '_ {
fn eq(&self, other: &Self) -> bool {
let mut r = false;
self.with_str_dyn(&mut |s1| other.with_str_dyn(&mut |s2| r = s1 == s2));
r
}
}
impl Eq for dyn WithStrDyn + '_ {}
// newtype to prevent unwanted (panicky) access outside `intern_string`
// with local pool unset for users of `intern_string`
#[derive(Debug, Default)]
pub struct Index(HashSet<SubstrInPool>);
pub fn intern_string(storage: &mut String, index: &mut Index, s: &str) -> Substr {
if let Some(ss) = STRING_POOL.set(storage, || index.0.get::<dyn WithStrDyn + '_>(&s)) {
return ss.0;
}
let start = storage.find(s).unwrap_or(storage.len());
if start == storage.len() {
storage.push_str(s);
}
let ss = Substr {
start,
len: s.len(),
};
STRING_POOL.set(storage, || {
if !index.0.insert(SubstrInPool(ss)) {
panic!("should never get here due to get() above");
}
});
ss
}
fn main() {
let mut storage = String::new();
let mut index = Index::default();
let mut interned = vec![];
for x in [
"foobar", "bar", "foo", "baz", "bar", "qux", "foo", "q", "azqu", "q", "foo", "wow",
] {
interned.push(intern_string(&mut storage, &mut index, x));
}
STRING_POOL.set(&storage, || {
dbg!(&index);
});
drop(index); // index no longer needed
println!("\nstorage: {storage:?}\n");
for &x in &interned {
println!("{}", &storage[x]);
}
}
(no playground link due to lack of scoped_tls in playground)