Hi! I started learning Rust recently and, to test my understanding, wrote a small JSON parser that performs no string allocations besides the initial read of the JSON file's contents. I would greatly appreciate any feedback on my implementation.
use core::panic;
use std::{collections::HashMap, env, fs, iter, str};
/// Hand-written JSON lexer that produces tokens borrowing from the input,
/// so string tokens are zero-copy slices of the original text.
#[derive(Debug)]
struct Lexer<'a> {
    // The full input text; `Token::String` values are slices into this.
    contents: &'a str,
    // Byte index and value of the most recently consumed character.
    // Starts as a dummy (0, '\0') until the first `advance`.
    current: (usize, char),
}
impl<'a> Lexer<'a> {
    /// Creates a lexer over `contents`. `current` holds a dummy value
    /// until the first call to `advance`.
    fn new(contents: &'a str) -> Self {
        Self {
            contents, // field-init shorthand (was `contents: contents`)
            current: (0, '\0'),
        }
    }

    /// Scans the entire input and returns the token stream.
    ///
    /// String tokens borrow slices of `self.contents`, so no per-token
    /// allocation happens.
    ///
    /// # Panics
    /// On malformed input: unexpected characters, unterminated strings,
    /// or malformed keywords/numbers.
    fn tokenize(&mut self) -> Vec<Token<'a>> {
        let mut iterator = self.contents.char_indices().peekable();
        let mut tokens = Vec::<Token>::new();
        while self.advance(&mut iterator) {
            match self.current.1 {
                '{' => tokens.push(Token::Bracket(LEFT)),
                '}' => tokens.push(Token::Bracket(RIGHT)),
                '[' => tokens.push(Token::Brace(LEFT)),
                ']' => tokens.push(Token::Brace(RIGHT)),
                ',' => tokens.push(Token::Comma),
                ':' => tokens.push(Token::Colon),
                '"' => {
                    // NOTE(review): escape sequences (`\"`, `\\`, `\n`, ...)
                    // are not handled; a string containing `\"` ends early.
                    let start = self.current.0 + 1; // skip the opening quote
                    while self.advance(&mut iterator) {
                        if self.current.1 == '"' {
                            tokens.push(Token::String(&self.contents[start..self.current.0]));
                            break;
                        }
                    }
                    // If the input ended before a closing quote, `current`
                    // is the last char read and this panics.
                    self.expect('"');
                }
                c if c.is_whitespace() => self.skip_whitespace(&mut iterator),
                // The first letter was matched by the arm itself, so only
                // the remainder of each keyword needs to be verified.
                't' => {
                    self.expect_keyword(&mut iterator, "rue");
                    tokens.push(Token::Bool(true));
                }
                'f' => {
                    self.expect_keyword(&mut iterator, "alse");
                    tokens.push(Token::Bool(false));
                }
                'n' => {
                    self.expect_keyword(&mut iterator, "ull");
                    tokens.push(Token::Null);
                }
                c if c.is_digit(10) || c == '-' => {
                    let start = self.current.0;
                    let mut float = false;
                    // Consume digits and at most one decimal point; stop at
                    // the first character belonging to the next token.
                    while let Some((_, c)) = iterator.peek() {
                        if !c.is_digit(10) {
                            if *c == '.' {
                                if float {
                                    panic!("Invalid number format, multiple decimal points");
                                }
                                float = true;
                            } else {
                                break;
                            }
                        }
                        self.advance(&mut iterator);
                    }
                    // The number must end in a digit; rejects "123." and "-".
                    self.expect_fn(self.current.1, |c| c.is_digit(10));
                    // `current.1` is an ASCII digit here, so +1 is a valid
                    // byte index one past the end of the number.
                    let number_str = &self.contents[start..self.current.0 + 1];
                    let token = if float {
                        Token::Float(number_str.parse().expect("Invalid float format"))
                    } else {
                        Token::Int(number_str.parse().expect("Invalid integer format"))
                    };
                    tokens.push(token);
                }
                _ => panic!("unexpected token {}", self.current.1),
            }
        }
        tokens
    }

    /// Consumes the characters of `rest` one by one, panicking if the input
    /// ends or deviates. Replaces the duplicated advance/expect chains that
    /// previously lexed `true`, `false` and `null` (one of which also did a
    /// redundant `expect('t')` on an already-matched character).
    fn expect_keyword(&mut self, iterator: &mut iter::Peekable<str::CharIndices>, rest: &str) {
        for expected in rest.chars() {
            // BUGFIX: the old code ignored `advance`'s return value, so
            // hitting EOF mid-keyword compared against a stale character.
            if !self.advance(iterator) {
                panic!("Unexpected end of input, expected {expected}");
            }
            self.expect(expected);
        }
    }

    /// Steps to the next character, updating `self.current`.
    /// Returns `false` (leaving `current` untouched) at end of input.
    fn advance(&mut self, iterator: &mut iter::Peekable<str::CharIndices>) -> bool {
        if let Some((i, c)) = iterator.next() {
            self.current = (i, c);
            return true;
        }
        false
    }

    /// Panics unless `func` accepts the current character.
    /// `expected` is only used in the panic message.
    fn expect_fn(&self, expected: char, func: impl FnOnce(char) -> bool) {
        if !func(self.current.1) {
            panic!("Expected {expected} but found {}", self.current.1);
        }
    }

    /// Panics unless the current character equals `expected`.
    fn expect(&self, expected: char) {
        if self.current.1 != expected {
            panic!("Expected {expected} but found {}", self.current.1);
        }
    }

    /// Skips whitespace, leaving the iterator positioned on the last
    /// whitespace character so the main loop's next `advance` lands on
    /// the first non-whitespace one.
    fn skip_whitespace(&mut self, iterator: &mut iter::Peekable<str::CharIndices>) {
        if !self.current.1.is_whitespace() {
            return;
        }
        while let Some((_, c)) = iterator.peek() {
            if !c.is_whitespace() {
                return;
            }
            self.advance(iterator);
        }
    }
}
/// Namespace for the recursive-descent parsing functions; carries no state.
/// (A unit struct `struct Parser;` or plain free functions would work too.)
#[derive(Debug)]
struct Parser {}
/// Which side of a paired delimiter (`{}`/`[]`) a token represents.
///
/// An enum instead of a `bool` alias makes call sites self-documenting and
/// prevents arbitrary booleans from being passed where a direction is meant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Direction {
    Left,
    Right,
}
/// Opening delimiter (`{` or `[`). Kept as a const so existing call sites
/// (`Token::Bracket(LEFT)`, `*dir == LEFT`) keep compiling unchanged.
const LEFT: Direction = Direction::Left;
/// Closing delimiter (`}` or `]`).
const RIGHT: Direction = Direction::Right;
/// A lexical token; string tokens borrow from the input text.
///
/// NOTE(review): the names are swapped relative to common usage —
/// conventionally "brace" means `{}` and "bracket" means `[]`, but here
/// `Bracket` is produced for `{`/`}` and `Brace` for `[`/`]`. The usage is
/// consistent throughout the file, so only the naming is surprising.
#[derive(Debug)]
enum Token<'a> {
    Brace(Direction),   // `[` or `]`
    Bracket(Direction), // `{` or `}`
    String(&'a str),    // contents without the surrounding quotes
    Comma,
    Int(i64),
    Float(f64),
    Colon,
    Bool(bool),
    Null,
}
/// A parsed JSON value. Object keys and string values borrow from the
/// original input, so building the tree only allocates for the containers.
#[derive(Debug)]
enum Node<'a> {
    Object(HashMap<&'a str, Node<'a>>),
    Array(Vec<Node<'a>>),
    String(&'a str),
    Int(i64),
    Float(f64),
    Bool(bool),
    Null,
}
impl Parser {
    /// Parses an object body; the opening `{` has already been consumed.
    /// Consumes tokens up to and including the matching `}`.
    ///
    /// # Panics
    /// On malformed input: non-string keys, missing colons, stray commas,
    /// or unexpected closing delimiters.
    fn parse_object<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {
        let mut map = HashMap::new();
        let mut key: Option<&'a str> = None;
        let mut value: Option<Node<'a>> = None;
        let mut in_key = true; // true while expecting a key or its colon
        while let Some(token) = tokens.next() {
            if matches!(token, Token::Bracket(dir) if *dir == RIGHT) {
                match (key.take(), value.take()) {
                    // Flush the final completed pair.
                    (Some(k), Some(v)) => {
                        map.insert(k, v);
                    }
                    // BUGFIX: `{}` is a valid empty object; the old code
                    // panicked on it because it required a pending pair.
                    (None, None) if map.is_empty() => {}
                    // A dangling key or a trailing comma remains an error.
                    _ => panic!("Unexpected closing bracket in object"),
                }
                break;
            }
            if in_key {
                // Before the value, only a string key (when none is pending)
                // or a colon (when one is) may appear.
                if key.is_none() && !matches!(token, Token::String(_)) {
                    panic!("Expected string key in object, found {:?}", token);
                } else if key.is_some() && !matches!(token, Token::Colon) {
                    panic!("Expected colon after key in object, found {:?}", token);
                }
            }
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        value = Some(Parser::parse_object(tokens));
                    } else {
                        // Unreachable: RIGHT is handled by the break above.
                        panic!("Unexpected closing bracket in object");
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        value = Some(Parser::parse_array(tokens));
                    } else {
                        panic!("Unexpected closing brace in object");
                    }
                }
                Token::Colon => in_key = false,
                Token::Comma => {
                    // A comma flushes the completed key/value pair.
                    if key.is_none() || value.is_none() {
                        panic!("Unexpected comma in object");
                    }
                    map.insert(key.take().unwrap(), value.take().unwrap());
                    in_key = true;
                }
                Token::String(s) => {
                    if in_key {
                        key = Some(s);
                    } else {
                        value = Some(Node::String(s));
                    }
                }
                Token::Int(i) => value = Some(Node::Int(*i)),
                Token::Float(f) => value = Some(Node::Float(*f)),
                Token::Bool(b) => value = Some(Node::Bool(*b)),
                Token::Null => value = Some(Node::Null),
            }
        }
        Node::Object(map)
    }

    /// Parses an array body; the opening `[` has already been consumed.
    /// Consumes tokens up to and including the matching `]`.
    fn parse_array<'a>(tokens: &mut iter::Peekable<std::slice::Iter<'a, Token<'a>>>) -> Node<'a> {
        let mut vec = Vec::new();
        while let Some(token) = tokens.next() {
            if matches!(token, Token::Brace(dir) if *dir == RIGHT) {
                break;
            }
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        vec.push(Parser::parse_object(tokens));
                    } else {
                        // BUGFIX: a stray `}` inside an array was silently
                        // ignored (the old `if` had no else branch here).
                        panic!("Unexpected closing bracket in array");
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        vec.push(Parser::parse_array(tokens));
                    } else {
                        // Unreachable: RIGHT is handled by the break above.
                        panic!("Unexpected closing brace in array");
                    }
                }
                Token::Colon => panic!("Unexpected colon in array"),
                // NOTE(review): commas are skipped without position checks,
                // so `[1,,2]` and `[,1]` are accepted.
                Token::Comma => continue,
                Token::String(s) => vec.push(Node::String(s)),
                Token::Int(i) => vec.push(Node::Int(*i)),
                Token::Float(f) => vec.push(Node::Float(*f)),
                Token::Bool(b) => vec.push(Node::Bool(*b)),
                Token::Null => vec.push(Node::Null),
            }
        }
        Node::Array(vec)
    }

    /// Parses a token stream whose root is an object or an array.
    ///
    /// # Panics
    /// If the stream is empty, the root is a bare scalar, or more than one
    /// root value is present.
    fn parse<'a>(tokens: &'a [Token<'a>]) -> Node<'a> {
        let mut iter = tokens.iter().peekable();
        let mut root: Option<Node<'a>> = None;
        while let Some(token) = iter.next() {
            // BUGFIX: a second root (e.g. `{} []`) used to silently replace
            // the first; it is now rejected.
            if root.is_some() {
                panic!("End of file expected, instead got: {:?}", token);
            }
            match token {
                Token::Bracket(dir) => {
                    if *dir == LEFT {
                        root = Some(Parser::parse_object(&mut iter));
                    } else {
                        panic!("Unexpected closing bracket");
                    }
                }
                Token::Brace(dir) => {
                    if *dir == LEFT {
                        root = Some(Parser::parse_array(&mut iter));
                    } else {
                        panic!("Unexpected closing brace");
                    }
                }
                _ => {
                    panic!("End of file expected, instead got: {:?}", token);
                }
            }
        }
        root.expect("No root object found")
    }
}
/// Entry point: reads the JSON file named by the first CLI argument,
/// tokenizes and parses it, and pretty-prints the resulting tree.
fn main() {
    let mut args = env::args();
    let program = args.next().unwrap_or_else(|| "json-parser".to_string());
    // BUGFIX: `&args[1]` panicked with an index-out-of-bounds when no
    // argument was given; print a usage message instead.
    let file_path = args.next().unwrap_or_else(|| {
        eprintln!("Usage: {program} <file.json>");
        std::process::exit(1);
    });
    println!("Reading {file_path}");
    let contents: String =
        fs::read_to_string(&file_path).expect("Should have been able to read the json file");
    let mut lexer = Lexer::new(&contents);
    let tokens = lexer.tokenize();
    let result = Parser::parse(&tokens);
    println!("{:#?}", result);
}
Additionally, while implementing the parser, I came across a compiler error that I didn't quite understand. At first I wanted the Parser::parse function to receive the contents string slice as an argument and tokenize the contents inside the parse function itself, like this:
// NOTE(review): this version does not compile, and the culprit is the
// signature of `parse_object`/`parse_array`:
//     fn parse_object<'a>(tokens: &mut Peekable<Iter<'a, Token<'a>>>) -> Node<'a>
// It uses ONE lifetime `'a` both for the references INTO the token slice
// (`Iter<'a, ...>`) and for the tokens' borrow of the contents (`Token<'a>`).
// Calling it therefore forces the borrow of the local `tokens` vector to live
// for `'a` — the caller-supplied contents lifetime — but `tokens` is dropped
// when `parse` returns, hence "`tokens` does not live long enough". Giving
// the iterator its own, shorter lifetime, e.g.
//     tokens: &mut Peekable<Iter<'_, Token<'a>>>
// lets the returned `Node<'a>` borrow only from the contents (TODO confirm:
// this is the standard fix; re-check against the exact compiler output).
fn parse<'a>(contents: &'a str) -> Node<'a> {
    let mut lexer = Lexer::new(contents);
    let tokens: Vec<Token<'a>> = lexer.tokenize();
    let mut iter = tokens.iter().peekable(); // ERROR: `tokens` does not live long enough, borrowed value does not live long enough
    let mut root: Option<Node<'a>> = None;
    while let Some(token) = iter.next() {
        match token {
            Token::Bracket(dir) => {
                if *dir == LEFT {
                    root = Some(Parser::parse_object(&mut iter));
                } else {
                    panic!("Unexpected closing bracket");
                }
            }
            Token::Brace(dir) => {
                if *dir == LEFT {
                    root = Some(Parser::parse_array(&mut iter));
                } else {
                    panic!("Unexpected closing brace");
                }
            }
            _ => {
                panic!("Unexpected token at top level: {:?}", token);
            }
        }
    }
    root.expect("No root object found")
}
This caused the following compiler error
`tokens` does not live long enough
borrowed value does not live long enough
Which — correct me if I'm wrong — could be because the compiler thinks I might hold a reference to `tokens` in my root Node, a reference that would dangle once `tokens` is dropped when parse returns. But I don't understand why it thinks so: the Nodes (of the String variant) can only hold references to the original contents slice, and the same is true of the Tokens, so their lifetime should match the lifetime of the contents slice, which outlives the parse function.