Help with building parser using nom for custom data format with nested list

I have this format, the result should be of Item data type,

test 1: "<A \"Test\">"

test 2: r#"<A "Test">"#

Result: Item { item_type: TEXT, ascii_data: Some("Test") }

test 3: <A>

Result: Item { item_type: TEXT, ascii_data: None }

For test 1 and test 2 the following code parses correctly, but for test 3 it fails.

It also contains nested types.

<L 
    <A "Test1">
    <L
        <A>
        <A "Test2">
    >
    <A "Test3">
>

Result:

Item {
    item_type: LIST,
    sub_items: [
        Item {
            item_type: TEXT,
            ascii_data: Some("Test1"),
        },
        Item {
            item_type: LIST,
            sub_items: [
                Item {
                    item_type: TEXT,
                    ascii_data: None,
                },
                Item {
                    item_type: TEXT,
                    ascii_data: Some("Test2"),
                },
            ],
        },
        Item {
            item_type: TEXT,
            ascii_data: Some("Test3"),
        },
    ],
}

Rust Playground link

use nom::{
    branch::alt,
    bytes::complete::{tag, take_until},
    character::complete::{digit1, multispace0},
    combinator::{map, opt},
    multi::many0,
    sequence::{delimited, preceded},
    IResult,
};

/// The kind of a parsed item.
#[derive(Clone, Debug, PartialEq)]
enum ItemType {
    /// A list item, written `<L ...>`; set by `parse_list_item`.
    LIST,
    /// A text item, written `<A ...>`; set by `parse_ascii_item`.
    TEXT,
    /// No kind assigned; this is what `Item::default()` uses.
    NONE,
}

/// A single parsed item: its kind plus the payload that goes with that kind.
#[derive(Clone, Debug, PartialEq)]
struct Item {
    /// Which kind of item this is.
    item_type: ItemType,
    /// Child items; populated by `parse_list_item`, `None` otherwise.
    sub_items: Option<Vec<Item>>,
    /// String payload; populated by `parse_ascii_item`, `None` otherwise.
    ascii_data: Option<String>,
}

impl Default for Item {
    /// Returns an item of kind `NONE` with no children and no string payload.
    ///
    /// The parsers use this as the base for struct-update syntax
    /// (`..Default::default()`), overriding only the fields they fill in.
    fn default() -> Self {
        Self {
            ascii_data: None,
            sub_items: None,
            item_type: ItemType::NONE,
        }
    }
}

/// Parses a double-quoted string and returns its contents without the quotes.
///
/// The contents may be empty. No escape sequences are recognised: the string
/// ends at the first `"` that follows the opening quote.
fn parse_ascii_data(input: &str) -> IResult<&str, String> {
    let (rest, _open) = tag("\"")(input)?;
    let (rest, contents) = take_until("\"")(rest)?;
    let (rest, _close) = tag("\"")(rest)?;

    Ok((rest, String::from(contents)))
}

// Parse <A> or <A "string">, if no string then return empty string then return none
fn parse_ascii_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = tag("<A")(input)?;
    let (input, _) = multispace0(input)?;
    let (input, ascii_data) = alt((parse_ascii_data, map(tag("\"\""), |_| "".to_string())))(input)?;
    let (input, _) = tag(">")(input)?;

    Ok((
        input,
        Item {
            item_type: ItemType::TEXT,
            ascii_data: Some(ascii_data),
            ..Default::default()
        },
    ))
}

// Parse <L> or <L <A "string">>, if no string then return empty string then return none
fn parse_list_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = tag("<L")(input)?;
    let (input, _) = multispace0(input)?;
    let (input, sub_items) = alt((parse_ascii_item, map(tag("<>"), |_| Item::default())))(input)?;
    let (input, _) = tag(">")(input)?;

    Ok((
        input,
        Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![sub_items]),
            ..Default::default()
        },
    ))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Text items: escaped-quote literal, raw-string literal, and a bare `<A>`.
    #[test]
    fn test_parse_ascii_item() {
        let input = "<A \"Test\">";

        let expected_item = Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test".to_string()),
            ..Default::default()
        };

        assert_eq!(parse_ascii_item(input), Ok(("", expected_item)));

        let input = r#"<A "Test">"#;

        let expected_item = Item {
            item_type: ItemType::TEXT,
            ascii_data: Some("Test".to_string()),
            ..Default::default()
        };

        assert_eq!(parse_ascii_item(input), Ok(("", expected_item)));

        // A bare `<A>` carries no string at all.
        assert_eq!(
            parse_ascii_item("<A>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::TEXT,
                    ascii_data: None,
                    ..Default::default()
                }
            ))
        );
    }

    /// Flat lists: one text child, and the empty list.
    #[test]
    fn test_parse_list_item() {
        let input = "<L <A \"Test\">>";

        let expected_item = Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![Item {
                item_type: ItemType::TEXT,
                ascii_data: Some("Test".to_string()),
                ..Default::default()
            }]),
            ..Default::default()
        };

        assert_eq!(parse_list_item(input), Ok(("", expected_item)));

        // An empty list is represented by a single default placeholder item.
        assert_eq!(
            parse_list_item("<L>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item::default()]),
                    ..Default::default()
                }
            ))
        );
    }

    /// Nested lists: a text child followed by an inner list with its own child.
    #[test]
    fn test_parse_nested_list_item() {
        let input = "<L \n    <A \"Test1\">\n    <L\n        <A \"Test2\">\n    >\n>";

        // The expected tree mirrors the input: outer list -> [Test1, inner list -> [Test2]].
        let expected_item = Item {
            item_type: ItemType::LIST,
            sub_items: Some(vec![
                Item {
                    item_type: ItemType::TEXT,
                    ascii_data: Some("Test1".to_string()),
                    ..Default::default()
                },
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item {
                        item_type: ItemType::TEXT,
                        ascii_data: Some("Test2".to_string()),
                        ..Default::default()
                    }]),
                    ..Default::default()
                },
            ]),
            ..Default::default()
        };

        assert_eq!(parse_list_item(input), Ok(("", expected_item)));

        assert_eq!(
            parse_list_item("<L>"),
            Ok((
                "",
                Item {
                    item_type: ItemType::LIST,
                    sub_items: Some(vec![Item::default()]),
                    ..Default::default()
                }
            ))
        );
    }
}

To parse the optional ascii_data field, you can use the opt combinator:

    // Matches `<A`, then an optional (whitespace + quoted string), then `>`.
    // `opt` makes `ascii_data` an `Option`: `None` when no string is present.
    let (input, ascii_data) = delimited(
        tag("<A"),
        opt(preceded(multispace0, parse_ascii_data)),
        tag(">")
    )(input)?;

For a list of nested items, you can use the many0 combinator:

    // Matches `<L`, then zero or more items (each may be preceded by
    // whitespace), then optional whitespace and the closing `>`.
    // `many0` collects the parsed items into a `Vec`.
    let (input, sub_items) = delimited(
        tag("<L"),
        many0(preceded(multispace0, parse_item)),
        preceded(multispace0, tag(">"))
    )(input)?;

To allow those nested items to be either list items or ascii items, you can use the alt combinator:

/// Parses a single item of either kind.
///
/// The text-item parser is tried first; if it fails, the list-item parser is
/// tried on the same input.
fn parse_item(input: &str) -> IResult<&str, Item> {
    let mut any_item = alt((parse_ascii_item, parse_list_item));
    any_item(input)
}

Playground

2 Likes

Just to show off an alternate way of using Nom, you can also shorten the entire parser to this (playground):

/// Parses one item, either a text item (`<A>` / `<A "string">`) or a list
/// item (`<L ... >`) whose children are parsed recursively with this same
/// function.
fn parse_item(input: &str) -> IResult<&str, Item> {
    // `"..."` -> the owned contents, with the surrounding quotes stripped.
    let quoted = map(
        delimited(tag("\""), take_until("\""), tag("\"")),
        str::to_string,
    );

    // `<A` + optional (whitespace, quoted string) + `>`.
    let text_item = map(
        delimited(tag("<A"), opt(preceded(multispace0, quoted)), tag(">")),
        Item::text,
    );

    // Zero or more child items, each optionally preceded by whitespace.
    let children = many0(preceded(multispace0, parse_item));

    // `<L` + children + optional whitespace + `>`.
    let list_item = map(
        delimited(tag("<L"), children, preceded(multispace0, tag(">"))),
        Item::list,
    );

    alt((text_item, list_item))(input)
}

impl Item {
    /// Builds a `TEXT` item carrying the given optional string payload.
    fn text(ascii_data: Option<String>) -> Self {
        Self {
            ascii_data,
            item_type: ItemType::TEXT,
            ..Self::default()
        }
    }

    /// Builds a `LIST` item holding the given child items.
    // NOTE(review): `sub_items` is stored exactly as passed (a `Vec<Item>`), which
    // presumes the field is declared as `Vec<Item>` rather than
    // `Option<Vec<Item>>` — confirm against the struct definition in use.
    fn list(sub_items: Vec<Item>) -> Self {
        Self {
            sub_items,
            item_type: ItemType::LIST,
            ..Self::default()
        }
    }
}
2 Likes

Lastly, consider combining Item and ItemType into a single enum:

/// A parsed item, with the kind and its payload folded into one enum.
enum Item {
    /// A list item holding its child items.
    List(Vec<Item>),
    /// A text item with an optional string payload
    /// (presumably `None` for a bare `<A>` — confirm against the parser).
    Text(Option<String>),
}

This can simplify the code considerably: Playground

1 Like

This method works for parsing the list, but what changes need to be made to also parse the length given in brackets, e.g. [3] for the first L and [2] for the second?

<L [3]
    <A "Test1">
    <L [2]
        <A>
        <A "Test2">
    >
    <A "Test3">
>

Edit: Got it working now,

pub fn parse_list_item(input: &str) -> IResult<&str, Item> {
    let (input, _) = multispace0(input)?;
    let (input, _) = tag("<L")(input)?;

    let (input, _) = multispace0(input)?;
    let (input, _) = opt(tag("["))(input)?;
    let (input, count) = opt(digit1)(input)?;
    let (input, _) = opt(tag("]"))(input)?;

    let (input, mut sub_items) = many0(|input| {
        let (input, _) = multispace0(input)?;
        alt((parse_ascii_item, parse_list_item))(input)
    })(input)?;
    let (input, _) = multispace0(input)?;
    let (input, _) = tag(">")(input)?;

    if sub_items.is_empty() {
        sub_items.push(Item::default());
    }

    Ok((
        input,
        Item {
            item_type: ItemType::LIST,
            sub_items: Some(sub_items),
            ..Default::default()
        },
    ))
}

You probably want to use `many_m_n` (from `nom::multi`) when the number of items is known, to ensure the data is valid.

2 Likes

Also, this code:

    // Each of the three pieces below is optional independently of the others.
    let (input, _) = opt(tag("["))(input)?;
    let (input, count) = opt(digit1)(input)?;
    let (input, _) = opt(tag("]"))(input)?;

Will allow malformed input like "<L []>" or "<L [0 >" or "<L ] >".

You should instead use:

    // `[`, the digits and `]` are matched together as a single optional unit;
    // `count` is the digit string when the whole group is present.
    let (input, count) = opt(delimited(tag("["), digit1, tag("]")))(input)?;
1 Like