JSON Parser Code Review

Hi! I started learning Rust recently and, to test my understanding, made a small JSON parser that does no string allocations besides the initial read of the JSON contents. I'd really appreciate some feedback on my implementation.

use core::panic;
use std::{collections::HashMap, env, fs, iter, str};

#[derive(Debug)]
struct Lexer<'a> {
    contents: &'a str,
    current: (usize, char),
}
impl<'a> Lexer<'a> {
    fn new(contents: &'a str) -> Self {
        Self {
            contents: contents,
            current: (0, '\0'),
        }
    }

    fn tokenize(&mut self) -> Vec<Token<'a>> {
        let mut iterator = self.contents.char_indices().peekable();
        let mut tokens = Vec::<Token>::new();
        while self.advance(&mut iterator) {
            match self.current.1 {
                '{' => tokens.push(Token::Bracket(LEFT)),
                '}' => tokens.push(Token::Bracket(RIGHT)),
                '[' => tokens.push(Token::Brace(LEFT)),
                ']' => tokens.push(Token::Brace(RIGHT)),
                ',' => tokens.push(Token::Comma),
                ':' => tokens.push(Token::Colon),
                '"' => {
                    let start = self.current.0 + 1;
                    while self.advance(&mut iterator) {
                        if self.current.1 == '"' {
                            let end = self.current.0;
                            let s = &self.contents[start..end];
                            tokens.push(Token::String(s));
                            break;
                        }
                    }
                    self.expect('"'); // we should be at the ending quote
                }
                c if c.is_whitespace() => self.skip_whitespace(&mut iterator),
                't' => {
                    // expect "true"
                    self.expect('t');
                    self.advance(&mut iterator);
                    self.expect('r');
                    self.advance(&mut iterator);
                    self.expect('u');
                    self.advance(&mut iterator);
                    self.expect('e');
                    tokens.push(Token::Bool(true));
                }
                'f' => {
                    // expect "false"
                    self.advance(&mut iterator);
                    self.expect('a');
                    self.advance(&mut iterator);
                    self.expect('l');
                    self.advance(&mut iterator);
                    self.expect('s');
                    self.advance(&mut iterator);
                    self.expect('e');
                    tokens.push(Token::Bool(false));
                }
                'n' => {
                    // expect "null"
                    self.advance(&mut iterator);
                    self.expect('u');
                    self.advance(&mut iterator);
                    self.expect('l');
                    self.advance(&mut iterator);
                    self.expect('l');
                    tokens.push(Token::Null);
                }
                c if c.is_digit(10) || c == '-' => {
                    let start = self.current.0;
                    let mut float = false;
                    while let Some((_, c)) = iterator.peek() {
                        if !c.is_digit(10) {
                            if *c == '.' {
                                if float {
                                    panic!("Invalid number format, multiple decimal points");
                                }
                                float = true;
                            } else {
                                break;
                            }
                        }
                        self.advance(&mut iterator);
                    }
                    // expect last char to be a digit, handling cases like "123." or "-".
                    self.expect_fn(self.current.1, |c| c.is_digit(10));

                    let end = self.current.0 + 1;
                    let number_str = &self.contents[start..end];
                    if float {
                        tokens.push(Token::Float(
                            number_str.parse().expect("Invalid float format"),
                        ));
                    } else {
                        tokens.push(Token::Int(
                            number_str.parse().expect("Invalid integer format"),
                        ));
                    }
                }
                _ => panic!("unexpected token {}", self.current.1),
            }
        }
        return tokens;
    }

    fn advance(&mut self, iterator: &mut iter::Peekable<str::CharIndices>) -> bool {
        if let Some((i, c)) = iterator.next() {
            self.current = (i, c);
            return true;
        }
        false
    }

    fn expect_fn(&self, expected: char, func: impl FnOnce(char) -> bool) {
        if !func(self.current.1) {
            panic!("Expected {expected} but found {}", self.current.1);
        }
    }

    fn expect(&self, expected: char) {
        if self.current.1 != expected {
            panic!("Expected {expected} but found {}", self.current.1);
        }
    }

    // Skips whitespace characters, leaving the iterator at the last whitespace character
    fn skip_whitespace(&mut self, iterator: &mut iter::Peekable<str::CharIndices>) {
        if !self.current.1.is_whitespace() {
            return;
        }
        while let Some((_, c)) = iterator.peek() {
            if !c.is_whitespace() {
                return;
            }
            self.advance(iterator);
        }
    }
}

#[derive(Debug)]
struct Parser {}

type Direction = bool;
const LEFT: Direction = false;
const RIGHT: Direction = true;

#[derive(Debug)]
enum Token<'a> {
    Brace(Direction),   // []
    Bracket(Direction), // {}
    String(&'a str),
    Comma,
    Int(i64),
    Float(f64),
    Colon,
    Bool(bool),
    Null,
}

#[derive(Debug)]
enum Node<'a> {
    Object(HashMap<&'a str, Node<'a>>),
    Array(Vec<Node<'a>>),
    String(&'a str),
    Int(i64),
    Float(f64),
    Bool(bool),
    Null,
}

impl Parser {
    fn parse_object<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {
        let mut map = HashMap::new();
        let mut key: Option<&'a str> = None;
        let mut value: Option<Node<'a>> = None;
        let mut in_key = true;
        while let Some(token) = tokens.next() {
            if matches!(token, Token::Bracket(dir) if *dir == RIGHT) {
                if key.is_none() || value.is_none() {
                    panic!("Unexpected closing bracket in object");
                }
                map.insert(key.take().unwrap(), value.take().unwrap());
                break;
            }
            if in_key {
                if key.is_none() && !matches!(token, Token::String(_)) {
                    panic!("Expected string key in object, found {:?}", token);
                } else if key.is_some() && !matches!(token, Token::Colon) {
                    panic!("Expected colon after key in object, found {:?}", token);
                }
            }
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        value = Some(Parser::parse_object(tokens));
                    } else {
                        panic!("Unexpected closing bracket in object");
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        value = Some(Parser::parse_array(tokens));
                    } else {
                        panic!("Unexpected closing brace in object");
                    }
                }
                Token::Colon => in_key = false,
                Token::Comma => {
                    if key.is_none() || value.is_none() {
                        panic!("Unexpected comma in object");
                    }
                    map.insert(key.take().unwrap(), value.take().unwrap());
                    in_key = true;
                }
                Token::String(s) => {
                    if in_key {
                        key = Some(s);
                    } else {
                        value = Some(Node::String(s));
                    }
                }
                Token::Int(i) => value = Some(Node::Int(*i)),
                Token::Float(f) => value = Some(Node::Float(*f)),
                Token::Bool(b) => value = Some(Node::Bool(*b)),
                Token::Null => value = Some(Node::Null),
            }
        }
        return Node::Object(map);
    }

    fn parse_array<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {
        let mut vec = Vec::new();
        while let Some(token) = tokens.next() {
            if matches!(token, Token::Brace(dir) if *dir == RIGHT) {
                break;
            }
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        vec.push(Parser::parse_object(tokens));
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        vec.push(Parser::parse_array(tokens));
                    } else {
                        panic!("Unexpected closing brace in array");
                    }
                }
                Token::Colon => panic!("Unexpected colon in array"),
                Token::Comma => continue,
                Token::String(s) => vec.push(Node::String(s)),
                Token::Int(i) => vec.push(Node::Int(*i)),
                Token::Float(f) => vec.push(Node::Float(*f)),
                Token::Bool(b) => vec.push(Node::Bool(*b)),
                Token::Null => vec.push(Node::Null),
            }
        }
        return Node::Array(vec);
    }

    fn parse<'a>(tokens: &'a [Token<'a>]) -> Node<'a> {
        let mut iter = tokens.iter().peekable();
        let mut root: Option<Node<'a>> = None;
        while let Some(token) = iter.next() {
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        root = Some(Parser::parse_object(&mut iter));
                    } else {
                        panic!("Unexpected closing bracket");
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        root = Some(Parser::parse_array(&mut iter));
                    } else {
                        panic!("Unexpected closing brace");
                    }
                }
                _ => {
                    panic!("End of file expected, instead got: {:?}", token);
                }
            }
        }
        root.expect("No root object found")
    }
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let file_path: &String = &args[1];
    println!("Reading {file_path}");
    let contents: String =
        fs::read_to_string(file_path).expect("Should have been able to read the json file");
    let mut lexer = Lexer::new(&contents);
    let tokens = lexer.tokenize();
    let result = Parser::parse(&tokens);
    println!("{:#?}", result);
}
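
For reference, an end-to-end sanity check can be written as a unit test like this (the sample document and assertions below are made up just for illustration):

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_a_small_document() {
        // Made-up sample input, just to exercise the lexer and parser end to end.
        let contents =
            r#"{"name": "ferris", "tags": ["rust", "json"], "count": 3, "ok": true, "nothing": null}"#;
        let mut lexer = Lexer::new(contents);
        let tokens = lexer.tokenize();
        match Parser::parse(&tokens) {
            Node::Object(map) => {
                match map.get("name") {
                    Some(Node::String(s)) => assert_eq!(*s, "ferris"),
                    other => panic!("unexpected value for \"name\": {:?}", other),
                }
                match map.get("count") {
                    Some(Node::Int(n)) => assert_eq!(*n, 3),
                    other => panic!("unexpected value for \"count\": {:?}", other),
                }
            }
            other => panic!("expected an object at the root, got {:?}", other),
        }
    }
}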

Additionally, while implementing the parser I came across a compiler error that I didn't quite understand how to handle. At first I wanted the Parser::parse function to receive the contents string slice as an argument and tokenize the contents within the parse function, like this:

fn parse<'a>(contents: &'a str) -> Node<'a> {
        let mut lexer = Lexer::new(contents);
        let tokens: Vec<Token<'a>> = lexer.tokenize();
        let mut iter = tokens.iter().peekable(); // ERROR: `tokens` does not live long enough, borrowed value does not live long enough
        let mut root: Option<Node<'a>> = None;
        while let Some(token) = iter.next() {
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        root = Some(Parser::parse_object(&mut iter));
                    } else {
                        panic!("Unexpected closing bracket");
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        root = Some(Parser::parse_array(&mut iter));
                    } else {
                        panic!("Unexpected closing brace");
                    }
                }
                _ => {
                    panic!("Unexpected token at top level: {:?}", token);
                }
            }
        }
        root.expect("No root object found")
    }

This caused the following compiler error

`tokens` does not live long enough
borrowed value does not live long enough

Correct me if I'm wrong, but I think this is because the compiler thinks I might hold a reference to tokens in my root Node, which would be dropped once parse returns. But I don't understand why: the Nodes (of the String variant) can only hold references to the original contents slice, same as the Tokens, so their lifetime should match the lifetime of the contents slice, which outlives the parse function.

A tip when writing questions about code and error messages: try to include the full rustc output for the error/diagnostic message, as obtained e.g. by running cargo check in the terminal, not just the overly short snippet your IDE might give you.

So the error looks like this

error[E0597]: `tokens` does not live long enough
   --> src/main.rs:297:24
    |
294 |     fn parse<'a>(contents: &'a str) -> Node<'a> {
    |              -- lifetime `'a` defined here
295 |         let mut lexer = Lexer::new(contents);
296 |         let tokens: Vec<Token<'a>> = lexer.tokenize();
    |             ------ binding `tokens` declared here
297 |         let mut iter = tokens.iter().peekable();
    |                        ^^^^^^ borrowed value does not live long enough
298 |         let mut root: Option<Node<'a>> = None;
    |                       ---------------- type annotation requires that `tokens` is borrowed for `'a`
...
321 |     }
    |     - `tokens` dropped here while still borrowed

It’s a bit surprising that the lifetime of the Vec should be tied to that of the &'a str, as this error message suggests.

Turns out, the problem was merely in your own function signatures.

If you turn

    fn parse_array<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {

into

    fn parse_array<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'_, Token<'a>>>) -> Node<'a> {

and

    fn parse_object<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {

into

    fn parse_object<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'_, Token<'a>>>) -> Node<'a> {

to untie the 'a in Token<'a> (lifetime of the &'a str values inside) from the lifetime of the slice::Iter (which is the lifetime that the Vec<Token> is borrowed for)

then it works :wink:

I haven't looked at your algorithm at all; I'm just addressing the lifetime error you got.


Let's take a closer look at the iter method on Vec<T>:

pub fn iter(&self) -> Iter<'_, T>
// With less elision
pub fn iter<'v>(&'v self) -> Iter<'v, T>

The meaning of this signature is that the Iter<'v, T> keeps the Vec<T> borrowed so long as it exists. You can think of it as the iterator holding on to the &'v Vec<T> (though it doesn't have to be implemented that way). The iterator gives you items of type &'v T.
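
A standalone sketch (not taken from your parser) of what that borrow means in practice:

// The &'v i32 handed out by `iter()` borrows from the Vec, so the Vec has to
// stay alive (and un-moved) for all of 'v.
fn first<'v>(v: &'v Vec<i32>) -> Option<&'v i32> {
    v.iter().next()
}

fn main() {
    let nums = vec![1, 2, 3];
    let r = first(&nums);
    // drop(nums); // uncommenting this is rejected: `nums` is still borrowed by `r`
    println!("{:?}", r);
}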

Then over here:

    // A caller chosen lifetime which denotes the borrow duration
    // of `*contents` -- necessarily longer than the function body
    //       vv             vv              vv
    fn parse<'a>(contents: &'a str) -> Node<'a> {
        let mut lexer = Lexer::new(contents);
        let tokens: Vec<Token<'a>> = lexer.tokenize();
        let mut iter = tokens.iter().peekable();

You're creating an Iter<'v, Token<'a>>, and the local tokens must be borrowed for 'v. Then you try to pass it here...

Parser::parse_object(&mut iter)

...to a function with this signature...

//                                                               vv !
fn parse_object<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {

...which is only possible if 'v = 'a.[1]

The compiler sees this as you trying to borrow tokens for the duration 'a:

fn parse<'a>(contents: &'a str) -> Node<'a> {
    // ...
    let mut iter = tokens.iter().peekable();

But you can never borrow a local variable for a caller-chosen lifetime -- a borrow duration which is necessarily longer than the function body.
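
Here's a minimal standalone version of the same error; this sketch deliberately does not compile:

// 'a is chosen by the caller, so it necessarily outlives the function body,
// but `local` only lives until the end of the body.
fn demo<'a>(_contents: &'a str) {
    let local = vec![1, 2, 3];
    let _borrowed_for_a: &'a Vec<i32> = &local;
    // error[E0597]: `local` does not live long enough
}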

And as @steffahn explained, you can change your signatures so that you no longer force the lifetimes to be the same.


So, your iterators return a &'v Token<'a>. Looking at Token we see...

type Direction = bool;
enum Token<'a> {
    Brace(Direction),   // []
    Bracket(Direction), // {}
    String(&'a str),
    Comma,
    Int(i64),
    Float(f64),
    Colon,
    Bool(bool),
    Null,
}

So your iterator items are, at times, nested references akin to a &'v &'a str. It's usually best to not force the lifetimes of nested references to be the same.
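
A small standalone sketch of why, analogous to your local Vec<Token> (the names here are made up):

// With two independent lifetimes, the inner &'inner str can be copied out of
// a short-lived outer container and outlive it.
fn first_str<'outer, 'inner>(items: &'outer [&'inner str]) -> &'inner str {
    items[0]
}

fn main() {
    let owned = String::from("hello");
    let result = {
        let items = vec![owned.as_str()]; // local Vec, dropped at the end of this block
        first_str(&items)                 // fine: `result` borrows `owned`, not `items`
    };
    // With `fn first_str<'a>(items: &'a [&'a str]) -> &'a str` instead, `result`
    // would also keep `items` borrowed for 'a, and `items` doesn't live past the block.
    println!("{result}");
}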

And one other thing that stands out to me is that Token<'a> could implement Copy, in which case you could avoid the nested references by doing something like...

        //                          vvvvvvvvv
        let mut iter = tokens.iter().copied().peekable();

...and then your functions can take iterators where the items are just Token<'a>.

(You would have to rewrite the signatures. And if you later change Token<'a> so that it's no longer copyable -- maybe you need to do so to support unescaping or something -- you'll have to change more things again later. However, it would have avoided the nested lifetime problem.)
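
A sketch of what that could look like, using a trimmed-down Token and a made-up helper just for illustration:

use std::iter;

// Illustrative subset of the real Token; deriving Copy works because &str and
// i64 are Copy themselves.
#[derive(Debug, Clone, Copy)]
enum Token<'a> {
    String(&'a str),
    Int(i64),
    Null,
}

// Items are now Token<'a> by value, so only the &str lifetime appears in the
// signature -- no second lifetime for the slice borrow.
fn next_string<'a>(tokens: &mut iter::Peekable<impl Iterator<Item = Token<'a>>>) -> Option<&'a str> {
    while let Some(token) = tokens.next() {
        if let Token::String(s) = token {
            return Some(s);
        }
    }
    None
}

fn main() {
    let tokens = vec![Token::Null, Token::String("hello"), Token::Int(1)];
    let mut iter = tokens.iter().copied().peekable();
    assert_eq!(next_string(&mut iter), Some("hello"));
}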


  1. Or 'v is longer than 'a, but that's not useful here. ↩︎

Will do, thanks for the tip!

Looking at the Lexer implementation, here are a few things that stick out to me:

  • A lot of the token parsing would be easier if you were working with the unparsed tail of the contents as a &str (e.g.: if unparsed.starts_with("true"))
  • You return a Vec<_>, but all Parser wants is an iterator
  • You could take a stab at returning Result<_, _>s instead of panicking

So as a further exercise, I suggest that you make Lexer an iterator yourself instead:

impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token<'a>, Error>;
    fn next(&mut self) -> Option<Result<Token<'a>, Error>> {
        // ...
    }
}

// Or if you don't want to tackle the errors yet
impl<'a> Iterator for Lexer<'a> {
    type Item = Token<'a>;
    fn next(&mut self) -> Option<Token<'a>> {
        // ...
    }
}
 impl<'a> Lexer<'a> {
+    // Stop-gap until we adjust the rest of our code
     fn tokenize(&mut self) -> Vec<Token<'a>> {
-        ...
+        self.collect::<Result<Vec<_>, _>>().unwrap()
+        // Or without error handling, just
+        // self.collect()
     }

And to work with &strs more in Lexer too:

// You could do this...
struct Lexer<'a> {
    /// Byte offset of our unparsed content
    offset: usize,
    unparsed_content: &'a str,
}

// ...or you could do this and use the `as_str` and `offset` methods
// (you can use `as_str` instead of `Peekable`)
struct Lexer<'a> {
    cidx: CharIndices<'a>,
}

// Suggested helpers
impl<'a> Lexer<'a> {
    /// To be called when the unparsed content starts with `'-'` or a digit
    fn next_number(&mut self) -> Result<Token<'a>, Error> { ... }
    /// To be called when the unparsed content starts with `'"'`
    fn next_str(&mut self) -> Result<Token<'a>, Error> { ... }
    /// Trim leading whitespace and return the unparsed content.
    fn skip_whitespace(&mut self) -> &'a str { ... }
    /// Advance the unparsed content `len` bytes.
    /// (See `char::len_utf8` if you use this to trim leading `char`s.)
    fn advance(&mut self, len: usize) {
        self.offset += len;
        self.unparsed_content = &self.unparsed_content[len..];
        // Or: let _ = (&mut self.cidx).take(len).last();
    }
}

(I suggest keeping the offset for the sake of better error reports. Line numbers would be even better, but also more of a pain to track.)
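
For example, line and column can also be recovered lazily from the byte offset at the point where an error message is built, instead of being tracked during lexing -- a sketch with a hypothetical helper:

// Derive a 1-based line and (byte-based) column from a byte offset into the
// original input, only when an error message is actually formatted.
fn line_col(input: &str, offset: usize) -> (usize, usize) {
    let upto = &input[..offset.min(input.len())]; // offsets from the lexer land on char boundaries
    let line = upto.matches('\n').count() + 1;
    let col = upto.rfind('\n').map_or(offset, |nl| offset - nl - 1) + 1;
    (line, col)
}

fn main() {
    let s = "{\n  \"a\": 1,\n  \"b\": tru\n}";
    assert_eq!(line_col(s, 0), (1, 1));
    assert_eq!(line_col(s, s.find("tru").unwrap()), (3, 8));
}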

After that you could start adjusting Parser to work with iterators that yield Token<'a> or Result<Token<'a>, Error>...

// If you're not ready for `Result` in `Parser` yet, this will work for both...
fn parse_object<'a>(tokens: &mut iter::Peekable<impl Iterator<Item = Token<'a>>>) -> Node<'a> {
// ...the case where you did use `Result` above
// let mut iter = lexer.map(|r| r.unwrap()).peekable();
// ...and that case where you didn't
// let mut iter = lexer.peekable();

// Or if you are ready for `Result` in `Parser`, this will work.
fn parse_object<'a>(
    tokens: &mut iter::Peekable<impl Iterator<Item = Result<Token<'a>, Error>>>
) -> Result<Node<'a>, Error> {

...eventually using Peekable<Lexer<'a>> if you care to be non-generic.

(I still haven't looked at the Parser implementation or looked for logic bugs.)

I see, what I'm doing wrong is basically saying that the lifetimes of the tokens that the iterator holds references to are the same as the lifetime of the contents slice, therefore they should live for the same duration, which is impossible since they are dropped at the end of the function, being local variables. So it's that last part that triggers the compiler error, correct? If I were to define the parse_* functions as

fn parse_object<'a,'b>(tokens: &mut std::slice::Iter<'b, Token<'a>>) -> Node<'a>
fn parse_array<'a,'b>(tokens: &mut std::slice::Iter<'b, Token<'a>>) -> Node<'a>

This would work, right?
And the copied() version works because it prevents me from writing the function signature with the wrong lifetime, right?

You are pretty much correct on all counts, yep!


I will suggest a mental adjustment: Think of Rust lifetimes -- those '_ things -- as the duration of borrows. You can't be borrowing something when it drops, so there is a connection between where values may drop and Rust lifetimes. But the Rust lifetimes don't directly correspond to the drop scope or liveness of values, despite the name. (There are other things you can't do with a borrowed value too -- take a &mut to it, overwrite it, or move it. These will also give you borrow checker errors.)
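
For example, this standalone sketch is rejected by the borrow checker (it does not compile):

fn main() {
    let mut v = vec![1, 2, 3];
    let first = &v[0]; // `v` is borrowed here...
    v.push(4);         // error[E0502]: cannot borrow `v` as mutable because it is also borrowed as immutable
    println!("{first}"); // ...and the borrow is still in use here
}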

is basically saying that the lifetimes of the ~~tokens~~ token type that the iterator holds references to are the same as the lifetime of the contents slice type, therefore ~~they should live for~~ the borrow of tokens must be the same duration, which is impossible since ~~they are~~ tokens is dropped at the end of the function

If the distinction doesn't make sense now, just keep it in the back of your mind as you continue learning. (Conflating Rust lifetimes and the liveness of values trips up a lot of newcomers, and learning material isn't very good at clearing up the distinction either.)

Okay, I took your advice and reimplemented the Lexer as an Iterator, wrapping the iterator's Item in a Result:

#[derive(Debug)]
struct Lexer<'a> {
    contents: &'a str,
    offset: usize,
    line: usize,
    line_offset: usize,
}

impl<'a> std::iter::Iterator for Lexer<'a> {
    type Item = Result<Token<'a>, String>;
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}

impl<'a> Lexer<'a> {
    fn new(contents: &'a str) -> Self {
        Self {
            contents: contents,
            offset: 0,
            line: 1,
            line_offset: 0,
        }
    }

    fn fmt_error(&self, message: &str) -> String {
        format!("At line {}, offset {}: {}", self.line, self.line_offset, message)
    }

    fn next_token(&mut self) -> Option<Result<Token<'a>, String>> {
        self.skip_whitespace();
        let next = self.contents.chars().next();
        if next.is_none() {
            return None;
        }
        let next = next.unwrap();
        let result;
        match next {
            '{' => {
                result = Ok(Token::Bracket(LEFT));
                self.advance_char('{');
            }
            '}' => {
                result = Ok(Token::Bracket(RIGHT));
                self.advance_char('}');
            }
            '[' => {
                result = Ok(Token::Brace(LEFT));
                self.advance_char('[');
            }
            ']' => {
                result = Ok(Token::Brace(RIGHT));
                self.advance_char(']');
            }
            ',' => {
                result = Ok(Token::Comma);
                self.advance_char(',');
            }
            ':' => {
                result = Ok(Token::Colon);
                self.advance_char(':');
            }
            '"' => {
                result = self.next_str();
            }
            c if c.is_digit(10) || c == '-' => {
                result = self.next_number();
            }
            c if c == 't' => {
                // expect "true"
                if !self.contents.starts_with("true") {
                    result = Err(self.fmt_error("Invalid token"));
                } else {
                    result = Ok(Token::Bool(true));
                    self.advance_str("true");
                }
            }
            c if c == 'f' => {
                // expect "false"
                if !self.contents.starts_with("false") {
                    result = Err(self.fmt_error("Invalid token"));
                } else {
                    result = Ok(Token::Bool(false));
                    self.advance_str("false");
                }
            }
            c if c == 'n' => {
                // expect "null"
                if !self.contents.starts_with("null") {
                    result = Err(self.fmt_error("Invalid token"));
                } else {
                    result = Ok(Token::Null);
                    self.advance_str("null");
                }
            }
            _ => result = Err(self.fmt_error("Unexpected token")),
        }
        return Some(result);
    }

    fn next_number(&mut self) -> Result<Token<'a>, String> {
        let mut float = false;
        let mut size = 0;
        let mut chars = self.contents.chars();
        while let Some(c) = chars.next() {
            if !c.is_digit(10) {
                match c {
                    '.' => {
                        if float {
                            return Err(self.fmt_error("Invalid number format, multiple '.' found"));
                        }
                        float = true;
                    }
                    '-' => {
                        if size != 0 {
                            return Err(self.fmt_error("Invalid number format, unexpected '-'"));
                        }
                    }
                    _ => break,
                }
            }
            size += c.len_utf8();
        }

        let result;
        let number_str = &self.contents[..size];
        if float {
            result = number_str.parse().map(|f| {
                Token::Float(f)
            }).map_err(|e| {
                return format!("{}, {}", self.fmt_error("Invalid float"), e);
            });
        } else {
            result = number_str.parse().map(|i| {
                Token::Int(i)
            }).map_err(|e| {
                format!("{}: {}", self.fmt_error("Invalid int"), e)
            });
        }
        self.advance(size);
        return result;
    }

    fn next_str(&mut self) -> Result<Token<'a>, String> {
        self.advance_char('"');
        let mut size = 0;
        let mut chars = self.contents.chars();
        while let Some(c) = chars.next() {
            if c == '"' {
                let s = &self.contents[..size];
                self.advance(size + '"'.len_utf8());
                return Ok(Token::String(s));
            }
            size += c.len_utf8();
        }
        return Err(self.fmt_error("Unterminated string literal"));
    }

    fn advance_char(&mut self, c: char) {
        self.advance(c.len_utf8());
    }
    fn advance_str(&mut self, str: &str) {
        self.advance(str.len());
    }
    fn advance(&mut self, len: usize) {
        self.offset += len;
        self.line_offset += len;
        self.contents = &self.contents[len..];
    }

    fn skip_whitespace(&mut self) {
        while let Some(c) = self.contents.chars().next() {
            if !c.is_whitespace() {
                break;
            }
            self.advance_char(c);
            if c == '\n' {
                self.line += 1;
                self.line_offset = 0;
            }
        }
    }
}

The error messages could use some work, but I prioritized the Iterator and Result. Also, is &str.chars().next() the "natural" way of traversing a slice? In this case I just wanted to get the first char in the slice, so it didn't feel very natural to use an iterator.