Problems with a recursive parser


#1

I am writing a recursive parser using a Vec as a stack to store the current state but I am running into major problems with the borrow checker. Any help will be appreciated. What I don’t understand is since the lifetime of my Poly exceeds the lifetime of the Parser this should be totally safe but Rust does not think so.

extern crate std;

use std::fs::File;
use std::string::String;
use std::vec::Vec;
use std::result::Result;
use std::boxed::Box;
use std::str::{FromStr, Chars};
use std::io::{Read, Error, ErrorKind};
use std::io::Result as IoResult;
use std::num::ParseIntError;

struct Point
{
    x : i32,
    y : i32,
}

struct Path {
    points : Vec<Point>
}

struct Poly {
    paths : Vec<Path>
}

impl Poly {
    fn new() -> Poly {
        Poly { paths : Vec::new() }
    }
}

impl Path {
    fn new() -> Path {
        Path { points : Vec::new() }
    }
}

impl Point {
    fn new() -> Point {
        Point { x : 0, y : 0 }
    }
}

trait ParserBase<'a> {
    fn parseChar(&'a mut self, ch : char) -> ParseCharResult<'a>;
}

enum ParseCharResult<'a> {
    Push(Box<ParserBase<'a> + 'a>, bool /*next char */), 
    Continue,
    Pop(bool /*next char*/),
    Error(&'static str /*message*/),
}

impl<'a> ParseCharResult<'a> {
    fn new() -> ParseCharResult<'a> {
        ParseCharResult::Error("unused")
    }
}


struct ParsePoly<'a> {

    poly : &'a mut Poly
}

struct ParsePath<'a> {
    path : &'a mut Path
}

enum ParsePointState {
    Start,
    X,
    Y
}

struct ParsePoint<'a> {
    point : &'a mut Point,
    state : ParsePointState
}

struct ParseInt<'a> {
    value : &'a mut i32,
    tmpStr : String
}

impl<'a> ParsePoly<'a> {
    fn new(p : &'a mut Poly) -> ParsePoly<'a> {
        ParsePoly{ poly : p }
    }
}

impl<'a> ParsePath<'a> {
    fn new(p : &'a mut Path) -> ParsePath<'a> {
        ParsePath { path : p }
    }
}

impl<'a>  ParsePoint<'a> {
    fn new(p : &'a mut Point) -> ParsePoint<'a> {
        ParsePoint { point : p , state : ParsePointState::Start }
    }
}

impl<'a> ParseInt<'a>  {
    fn new(v : &'a mut i32) -> ParseInt<'a> {
        ParseInt {value : v, tmpStr : String::new() }
    }
}


impl<'a> ParserBase<'a> for ParsePoly<'a>
{
    fn parseChar(&'a mut self, ch : char) -> ParseCharResult<'a> {
        if ch.is_whitespace() {
            ParseCharResult::Continue
        } else if ch == '{' {
            self.poly.paths.push(Path::new());
            let path : &'a mut Path  = self.poly.paths.last_mut().unwrap();
            let res : ParseCharResult<'a> = ParseCharResult::Push(Box::new(ParsePath::new(path)), true);
            res

        } else {
            ParseCharResult::Error("unexpected character")
        }
    }
}

impl<'a> ParserBase<'a> for ParsePath<'a>
{
    fn parseChar(&'a mut self, ch : char) -> ParseCharResult {
        if ch.is_whitespace() {
            ParseCharResult::Continue
        } else if ch == '}' {
            ParseCharResult::Pop(true)
        } else {
            self.path.points.push(Point::new());
            let point = self.path.points.last_mut().unwrap();
            ParseCharResult::Push(Box::new(ParsePoint::new(point)), false)
        }
    }
}

impl<'a> ParserBase<'a> for ParsePoint<'a>
{
    fn parseChar(&'a mut self, ch : char) -> ParseCharResult {
        match self.state {
            ParsePointState::Start => {
                if ch.is_whitespace() {
                    ParseCharResult::Continue
                } else {
                    self.state = ParsePointState::X;
                    ParseCharResult::Push(Box::new(ParseInt::new(&mut self.point.x)), false)
                }
            }
            ParsePointState::X => {
                if ch == ' ' || ch == ',' {
                    ParseCharResult::Continue
                } else {
                    self.state = ParsePointState::Y;
                    ParseCharResult::Push(Box::new(ParseInt::new(&mut self.point.y)), false)
                }
            }
            ParsePointState::Y => {
                if ch == '\n' {
                    ParseCharResult::Pop(true)
                }
                else if ch.is_whitespace() {
                    ParseCharResult::Continue
                } else {
                    ParseCharResult::Error("ParsePoint unexpected character")
                }
            }
        }
    }
}

impl<'a> ParserBase<'a> for ParseInt<'a>
{
    fn parseChar(&'a mut self, ch : char) -> ParseCharResult {
        if ch.is_whitespace() || ch == ',' {
            let res : Result<i32, ParseIntError>  = self.tmpStr.parse();
            match res {
                Result::Err(_) => ParseCharResult::Error("An error occurred parsing the int"),
                Result::Ok(val) => {
                    *self.value = val;
                    ParseCharResult::Pop(true)
                }
            }
        }
        else {
            self.tmpStr.push(ch);
            ParseCharResult::Continue
        }
    }
}

struct Parser<'a> {
    stack : Vec<Box<ParserBase<'a> + 'a>>
}

impl<'a> Parser<'a> {
    fn new(poly : &'a mut Poly) -> Parser<'a> {
        let mut ret = Parser{ stack : Vec::new() };
        ret.stack.push( Box::new(ParsePoly::new(poly)));
        ret
    }

    fn parse(&'a mut self, data : &str) -> IoResult<()> {
        let mut it : Chars = data.chars();
        let mut val : Option<char> = it.next();
        
        while let Option::Some(ch) = val {
            let mut res = ParseCharResult::Error("unused");
            {
                if let Some(last) = self.stack.last_mut() {
                   res = (*last).parseChar(ch);
                } else {
                    return Result::Ok(())
                }
            }
            match res {
                ParseCharResult::Push(item, inc) => {
                    self.stack.push(item);
                    if inc {
                        val = it.next();
                    }
                }
                ParseCharResult::Continue => { 
                    val = it.next(); 
                }
                ParseCharResult::Pop(inc) => {
                    {
                        self.stack.pop();
                    }
                    if inc {
                        val = it.next();
                    }
                }
                ParseCharResult::Error(message) =>  return Err(std::io::Error::new(ErrorKind::Other, message))
            }
        }
        Result::Err(std::io::Error::new(ErrorKind::UnexpectedEof, "Needs more data"))
    }
}

impl Poly {
    fn load(filename : &str) -> IoResult<Poly> {
        let mut file = try!(File::open(filename));
        let mut s = String::new();
        try!(file.read_to_string(&mut s));
        let mut poly= Poly::new();
        {
          let mut parser = Parser::new(&mut poly);
          try!(parser.parse(s.as_str()));
        }
        Ok(poly)
    }
}

This give the following errors.

error[E0499]: cannot borrow `self.stack` as mutable more than once at a time
   --> src/poly.rs:235:25
    |
217 |                 if let Some(last) = self.stack.last_mut() {
    |                                     ---------- first mutable borrow occurs here
...
235 |                         self.stack.pop();
    |                         ^^^^^^^^^^ second mutable borrow occurs here
...
245 |     }
    |     - first borrow ends here

error: `parser` does not live long enough
   --> src/poly.rs:256:16
    |
256 |           try!(parser.parse(s.as_str()));
    |                ^^^^^^
src/poly.rs:256:11: 256:42 note: in this expansion of try! (defined in )
    |
note: reference must be valid for the block at 254:8...
   --> src/poly.rs:254:9
    |
254 |         {
    |         ^
note: ...but borrowed value is only valid for the block suffix following statement 0 at 255:50
   --> src/poly.rs:255:51
    |
255 |           let mut parser = Parser::new(&mut poly);
    |                                                   ^

error: aborting due to 5 previous errors

error: Could not compile `test`.

#2

So I tried to minimize the code example using my phone, preserving only Poly::load and signatures immediately visible to it.

After I was done, however, the error had vanished! Playpen: https://is.gd/cYutgp

In response I’ve decided I’m not touching this again until I am on an actual computer.

(note: Unfortunately the original code is far too big to fit into a sharable playpen link)

EDIT: A successful minimization: https://is.gd/rTncmV


#3

Thanks ExpHP, I am from a C++ background and I see lots to like in Rust, unfortunately every time I try to do something non-trivial that would be easy in C++, I get unstuck like in this case.


#4

I’m far from a lifetime expert, but I think the signature of parseChar is strange:

trait ParserBase<'a> {
    fn parseChar(&'a mut self, ch : char) -> ParseCharResult<'a>;
}

As far as I understand it, this keeps self borrowed basically for ever.

From my own experience, using some other lifetime than &mut self is almost never the right thing. I know, the compiler keeps suggesting it, but it’s the wrong direction.

EDIT:
Actually, you need a named lifetime for the return value, but I think it should rather be:

trait ParserBase {
    fn parseChar<'a>(&'a mut self, ch : char) -> ParseCharResult<'a>;
}

#5

On a second thought, that whole “store mutable references everywhere” business feels very unidiomatic and will result in very complex lifetimes.

I’d rather go a different route instead and return the already parsed subexpressions by value everywhere.


#6

My idea is to build up the Poly in place hence the references. Storing the components of the poly means we will have to move those components up the stack which if one was doing in C++ would involve some sort of down-casting. Safe Rust does not support casting.


#7

Just use an enum, e.g:

enum Component {
    Point(Point),
    Path(Path),
    Poly(Poly),
}

I’m don’t exactly understand how the algorithm works, but I imagine you can use a variant of the ParseCharResult enum to return the actual component instead of just signaling successful parsing of a subcomponent.
In other words, if a sub-parser has done its job, it is “converted” into the actual subcomponent and the data structure is built bottom-up.
Because of move semantics, this is just as “in-place”.


#8

Okay, so after minimizing things a bit more I think I see what is causing the error. The error is legitimate, but surprising! (I would have expected it to fail compilation at an earlier step, as I will explain)

The following code exemplifies the error: (link)

use std::fmt::Debug;

struct Parser<'a> {
    stack: Vec<Box<Debug + 'a>>
}

impl<'a> Parser<'a> {
    fn new(poly: &'a mut ()) -> Parser<'a> {
        Parser{ stack: Vec::new() }
    }

    fn parse(&'a mut self) { }
}

fn load() {
    let mut poly = ();
    let mut parser = Parser::new(&mut poly);
    parser.parse();
}

Changing a single line eliminates the error: (link)
The only difference is the type of Parser’s member.

struct Parser<'a> {
//  stack: Vec<Box<Debug + 'a>> // old
    stack: Vec<&'a mut ()>      // new
}

What’s happening here is that Boxes own their data, and because of this, the type contained inside a box must have a 'static lifetime. As a result, it appears that the compiler has implicitly added the requirement that 'a: 'static.

That said, I feel the compiler is at fault here. To be honest, I am surprised that the definition of struct Parser even compiles! And in any case, the compiler has completely failed to communicate the 'static requirement that it has inferred.

(Edit: Actually, take the above with a grain of salt; I’m no expert on this! I myself seldom use Boxes, and looking around the web I see posts from only a year ago suggesting that Box<Trait + 'a> does not imply 'a: 'static.)


#9

In terms of how to fix the code, I agree with the general sentiment that mutable references are not to be thrown around lightly. Ownership is generally a much more powerful tool for getting things done.

The type of stack appears to be doomed to fail from the start; it sounds like you have a vector of mutable references that should have increasingly narrow lifetimes like 'a > 'b > 'c > ..., but because they are stored in a vector their lifetimes are inevitably forced to be equal; 'a = 'a = 'a = ....

If you really want to model some sort of narrowing chain of lifetimes, you would at the very least require some kind of recursive type, so that each element gets its own lifetime parameter. But even then, I’m not even sure that it is possible, as I have never tried (or needed) to create such a type!


#10

So it seems like there are two ways to implementing my parser.

Instead of building Poly using externally references components we can do so with owned components. In such a scenario we will need implement the following changes:

enum ParserPopReturn {
    Poly(Poly),
    Path(Path),
    Point(Point),
    Int(Int)
}

trait ParserBase {
    fn parseChar(&mut self, ch : char) -> ParseCharResult;
    fn doPop(&mut self, child : ParserPopReturn);
}

The structs implementing the ParseBase traits will then store the component they are parsing.

The problem with this approach is that we ParserPopReturn will need an exhaustive list of all possible return types. This is not terrible but does not lend itself to totally clean abstraction.

The second approach is to replace the references with pointers. This will bypass the borrow checker but since Poly outlives the Parser, should be totally safe.


#11

I’m not so sure about that.
If you modify the structure (which you do), references/pointers to the subcomponents can be invalidated.


#12

Forgot to add that I am only adding, not removing subcomponents


#13

Still, if Vec reallocates, references and poiners are invalidated.
The lifetime system exists exactly to prevent such errors.


#14

So this is my implementation of the first idea, the code works but as mentioned, we need an exhaustive list of all the parser components. Please note that most of the lifetimes and methods have been removed

extern crate std;

use std::fs::File;
use std::string::String;
use std::vec::Vec;
use std::result::Result;
use std::boxed::Box;
use std::str::Chars;
use std::io::{Read, ErrorKind};
use std::io::Result as IoResult;
use std::num::ParseIntError;

#[derive(Debug)]
struct Point
{
    x : i32,
    y : i32,
}

#[derive(Debug)]
struct Path {
    points : Vec<Point>
}

#[derive(Debug)]
struct Poly {
    paths : Vec<Path>
}

impl Poly {
    fn new() -> Poly {
        Poly { paths : Vec::new() }
    }
}

impl Path {
    fn new() -> Path {
        Path { points : Vec::new() }
    }
}

impl Point {
    fn new() -> Point {
        Point { x : 0, y : 0 }
    }
}

enum Component{
    Poly(Poly),
    Path(Path),
    Point(Point),
    Int(i32)
}

trait ParserBase {
    fn parse_char(&mut self, ch : char) -> ParseCharResult;
    fn do_pop(&mut self, child : Component);
    fn get_value(self : Box<Self>) -> Component { panic!("Cannot get value"); }

}

enum ParseCharResult {
    Push(Box<ParserBase>, bool /*next char */), 
    Continue,
    Pop(bool /*next char*/),
    Error(&'static str /*message*/),
}


struct ParsePoly {

    poly : Poly
}

struct ParsePath {
    path : Path
}

enum ParsePointState {
    Start,
    X,
    Y
}

struct ParsePoint {
    point : Point,
    state : ParsePointState
}

struct ParseInt {
    value : i32,
    tmp_str : String
}

impl ParsePoly {
    fn new() -> ParsePoly {
        ParsePoly{ poly : Poly::new() }
    }
}

impl ParsePath {
    fn new() -> ParsePath {
        ParsePath { path : Path::new() }
    }
}

impl ParsePoint {
    fn new() -> ParsePoint {
        ParsePoint { point : Point::new() , state : ParsePointState::Start }
    }
}

impl ParseInt  {
    fn new() -> ParseInt {
        ParseInt {value : i32::max_value(), tmp_str : String::new() }
    }
}


impl ParserBase for ParsePoly
{
    fn parse_char(& mut self, ch : char) -> ParseCharResult {
        if ch.is_whitespace() {
            ParseCharResult::Continue
        } else if ch == '{' {
            let res : ParseCharResult = ParseCharResult::Push(Box::new(ParsePath::new()), true);
            return res
        } else {
            ParseCharResult::Error("unexpected character")
        }
    }
    fn do_pop(&mut self, child : Component)
    {
        if let Component::Path(path) = child {
            self.poly.paths.push(path);
        } else {
            panic!("Unexpected return type");
        }
    }
    fn get_value(self : Box<Self>) -> Component {
        Component::Poly(self.poly)
    }

}

impl ParserBase for ParsePath
{
    fn parse_char(&mut self, ch : char) -> ParseCharResult {
        if ch.is_whitespace() {
            ParseCharResult::Continue
        } else if ch == '}' {
            ParseCharResult::Pop(true)
        } else {
            ParseCharResult::Push(Box::new(ParsePoint::new()), false)
        }
    }
    fn do_pop(&mut self, child : Component)
    {
        if let Component::Point(point) = child {
            self.path.points.push(point);
        } else {
            panic!("Unexpected return type");
        }
    }
    fn get_value(self : Box<Self>) -> Component {
        Component::Path(self.path)
    }
}

impl ParserBase for ParsePoint
{
    fn parse_char(&mut self, ch : char) -> ParseCharResult {
        match self.state {
            ParsePointState::Start => {
                if ch.is_whitespace() {
                    ParseCharResult::Continue
                } else {
                    self.state = ParsePointState::X;
                    ParseCharResult::Push(Box::new(ParseInt::new()), false)
                }
            }
            ParsePointState::X => {
                if ch == ' ' || ch == ',' {
                    ParseCharResult::Continue
                } else {
                    self.state = ParsePointState::Y;
                    ParseCharResult::Push(Box::new(ParseInt::new()), false)
                }
            }
            ParsePointState::Y => {
                if ch == '\n' {
                    ParseCharResult::Pop(true)
                }
                else if ch.is_whitespace() {
                    ParseCharResult::Continue
                } else {
                    ParseCharResult::Error("ParsePoint unexpected character")
                }
            }
        }
    }
    fn do_pop(&mut self, child : Component)
    {
        let mut val : i32 = i32::max_value();
        if let Component::Int(i)  = child {
            val  = i;
        } else {
            panic!("Unexpected return type");
        }
        match self.state {
            ParsePointState::X => {
                self.point.x = val;
            }
            ParsePointState::Y => {
                self.point.y = val;
            }
            _ => {
                panic!("Unexpected int state");
            }
        }
    }
    fn get_value(self : Box<Self>) -> Component {
        Component::Point(self.point)
    }
}

impl ParserBase for ParseInt
{
    fn parse_char(& mut self, ch : char) -> ParseCharResult {
        if ch.is_whitespace() || ch == ',' {
            let res : Result<i32, ParseIntError>  = self.tmp_str.parse();
            match res {
                Result::Err(_) => ParseCharResult::Error("An error occurred parsing the int"),
                Result::Ok(val) => {
                    self.value = val;
                    ParseCharResult::Pop(false)
                }
            }
        }
        else {
            self.tmp_str.push(ch);
            ParseCharResult::Continue
        }
    }
    fn do_pop(&mut self, child : Component) {
        panic!("We cannot pop the bottom  of the stack");
    }
    fn get_value(self : Box<Self>) -> Component {
        Component::Int(self.value)
    }
}

struct Parser {
    stack : Vec<Box<ParserBase>>
}

impl Parser {
    fn new() -> Parser {
        let mut ret = Parser{ stack : Vec::new() };
        ret.stack.push( Box::new(ParsePoly::new()));
        ret
    }

    fn parse(&mut self, data : &str) -> IoResult<()> {
        let mut it : Chars = data.chars();
        let mut val : Option<char> = it.next();
        
        while let Option::Some(ch) = val {
            let res = self.stack.last_mut().unwrap().parse_char(ch);
            match res {
                ParseCharResult::Push(item, inc) => {
                    self.stack.push(item);
                    if inc {
                        val = it.next();
                    }
                }
                ParseCharResult::Continue => { 
                    val = it.next(); 
                }
                ParseCharResult::Pop(inc) => {
                    {
                        let popped = self.stack.pop().unwrap().get_value();
                        if let Some(prev) = self.stack.last_mut() {
                            prev.do_pop(popped);
                        } else {
                            panic!("Nothing left on the parser stack");
                        }

                    }
                    if inc {
                        val = it.next();
                    }
                }
                ParseCharResult::Error(message) =>  return Err(std::io::Error::new(ErrorKind::Other, message))
            }
        }
        Result::Ok(())
    }
    fn get_poly(&mut self) -> Poly {
        if let Component::Poly(poly) =  self.stack.pop().unwrap().get_value() {
            return poly
        } else {
            panic!("Unexpected return value");
        }
    }
}

impl Poly {
    fn load(filename : &str) -> IoResult<Poly> {
        let mut file = try!(File::open(filename));
        let mut s = String::new();
        try!(file.read_to_string(&mut s));
        let mut parser = Parser::new();
        try!(parser.parse(s.as_str()));
        Result::Ok(parser.get_poly())
    }
}

#[test]
fn test_parser() {
    let test_str = "
    {
        21 43
        54 12
        76 26
    }
    {
        43 12
        32 45
        12 12
    }
    ";
    let mut parser = Parser::new();
    parser.parse(test_str).unwrap();
    let poly = parser.get_poly();
    println!("The parsed value is : {:?}", poly);
}


#15

That’s true, but OTOH the data structure isn’t extensible.
If it were, you would need boxing anyway in which case you can get rid of the enum end return an Box<Trait>.

I think it could be possible to get your original code working, by using a recursive data structure instead of a stack and by carefully chosing lifetimes. But this is currently over my head.
I believe that in your original implementation you ended up with one single giant lifetime for everything, the input, the stack and the data structure.

PS: You get Rust synax highlighting here in the forum if you use:

```rust
fn main () { }
```

#16

BTW I have managed to improve on this a bit by Converting my Component enum into a Box<Any>

I am now using:

trait ParserBase {
    fn parse_char(&mut self, ch : char) -> ParseCharResult;
    fn do_pop(&mut self, child : Box<Any>);
    fn get_value(self : Box<Self>) -> Box<Any>;
}

this yields the nice general solution I was looking for.

So thanks a lot for the help.It is this type of help that will help me get a handle on a new language without throwing my hands up in despair.