A crate to process serialized sequences without allocating a Vec to store them

Hi!

I've just released a crate allowing you to run aggregating functions (think fold, find, for_each) on serialized sequences (which may be located inside a more complex structure) without needing to first deserialize all elements and store them in a Vec.

Examples

Given the following JSON:

[
    {"id": 0, "name": "bob", "subscribed_to": ["rust", "knitting", "cooking"]},
    {"id": 1, "name": "toby 🐶", "subscribed_to": ["sticks", "tennis-balls"]},
    {"id": 2, "name": "alice", "subscribed_to": ["rust", "hiking", "paris"]},
    {"id": 3, "name": "mark", "subscribed_to": ["rust", "rugby", "doctor-who"]},
    {"id": 4, "name": "vera", "subscribed_to": ["rust", "mma", "philosophy"]}
]

we can process it without allocating a five-element vector of items, as follows:

use serde_deser_iter::top_level::DeserializerExt;

/// The type each item in the sequence will be deserialized to.
///
/// Only the field this example reads is declared; the other keys present in
/// the sample JSON (`id`, `name`) have no corresponding field here.
#[derive(serde::Deserialize)]
struct DataEntry {
    // Not all fields are needed, but we could add "name"
    // and "id".
    /// Channel names taken from the entry's `subscribed_to` JSON array.
    subscribed_to: Vec<String>,
}

/// Streams the top-level JSON array from the example file and prints the set
/// of all channels that appear in any entry's `subscribed_to` list.
///
/// Each `DataEntry` is handed to the closure as it is deserialized, so the
/// entries themselves are never collected into a `Vec`.
///
/// # Errors
///
/// Propagates any error from opening the file or from `for_each`.
fn main() -> anyhow::Result<()> {
    let file = File::open(example_json_path)?;
    let reader: BufReader<File> = BufReader::new(file);
    let mut deserializer = serde_json::Deserializer::from_reader(reader);

    // Accumulates every distinct channel name seen across all entries.
    let mut channels = HashSet::new();
    deserializer.for_each(|entry: DataEntry| channels.extend(entry.subscribed_to))?;

    println!("All existing channels:");
    channels
        .into_iter()
        .for_each(|channel| println!("  - {channel}"));
    Ok(())
}

Or a more complex case where the sequence is not at the top-level:

{
    "api_version": "x.y.z",
    "result" : [
        {"id": 0, "name": "bob", "subscribed_to": ["rust", "knitting", "cooking"]},
        {"id": 1, "name": "toby 🐶", "subscribed_to": ["good-boy-lifestyle", "sticks", "tennis-balls"]},
        {"id": 2, "name": "alice", "subscribed_to": ["rust", "hiking", "paris"]},
        {"id": 3, "name": "mark", "subscribed_to": ["rust", "rugby", "doctor-who"]},
        {"id": 4, "name": "vera", "subscribed_to": ["rust", "mma", "philosophy"]}
    ]
}

we can use the `deep` module:


/// The type each item of the nested `result` sequence is deserialized to.
///
/// Only the field this example reads is declared; the other keys present in
/// the sample JSON (`id`, `name`) have no corresponding field here.
#[derive(serde::Deserialize)]
struct DataEntry {
    /// Channel names taken from the entry's `subscribed_to` JSON array.
    subscribed_to: Vec<String>,
}

/// Field-less marker type that carries the `FoldAggregator` implementation
/// used to fold the `result` sequence.
struct Imp;

/// Fold definition that gathers every channel name from the visited entries
/// into a single set.
impl serde_deser_iter::deep::FoldAggregator for Imp {
    /// Element type of the sequence being folded.
    type Item = DataEntry;
    /// Accumulator: the set of distinct channel names seen so far.
    type Acc = HashSet<String>;

    /// Starts the fold from an empty set.
    fn init() -> Self::Acc {
        HashSet::default()
    }

    /// Adds all of `entry`'s subscriptions to `acc` and hands the set back.
    fn f(mut acc: Self::Acc, entry: Self::Item) -> Self::Acc {
        acc.extend(entry.subscribed_to);
        acc
    }
}

/// Shape of the top-level JSON object in this example.
///
/// Only `result` is declared; the `api_version` key from the sample document
/// has no corresponding field here.
#[derive(serde::Deserialize)]
struct Data {
    /// The `result` sequence, wrapped so that it is folded with `Imp`; the
    /// folded value is retrieved afterwards with `into_inner()`.
    // NOTE(review): per the crate's stated goal, the fold presumably runs while
    // deserializing, without storing the items in a `Vec` — confirm in the
    // crate documentation.
    result: serde_deser_iter::deep::StreamSeqDeser<serde_deser_iter::deep::Fold<Imp>>,
}

/// Deserializes the example file into `Data`, then prints the set of channels
/// obtained from the `result` field.
///
/// # Errors
///
/// Propagates any error from opening the file or from `serde_json::from_reader`.
fn main() -> anyhow::Result<()> {
    let file = File::open(example_json_path)?;
    let reader: BufReader<File> = BufReader::new(file);

    let parsed: Data = serde_json::from_reader(reader)?;
    // Unwrap the value that the `result` field holds after deserialization.
    let channels = parsed.result.into_inner();

    println!("All existing channels:");
    channels
        .into_iter()
        .for_each(|channel| println!("  - {channel}"));
    Ok(())
}

Links

2 Likes

This topic was automatically closed 90 days after the last reply. We invite you to open a new topic if you have further questions or comments.