zig-yaml
zig-yaml copied to clipboard
Missing support for block notation for scalars
The following example from the YAML 1.2.x docs does not parse:
# ASCII Art
--- |
  \//||\/||
  // ||  ||__
Debug info
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.comment, .start = 0, .end = 11 }
debug(tokenizer): | # ASCII Art
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 11, .end = 12 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.doc_start, .start = 12, .end = 15 }
debug(tokenizer): | ---
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 15, .end = 16 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 16, .end = 17 }
debug(tokenizer): | |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 17, .end = 18 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 18, .end = 20 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 20, .end = 29 }
debug(tokenizer): | \//||\/||
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 29, .end = 30 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 30, .end = 32 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 32, .end = 34 }
debug(tokenizer): | //
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 34, .end = 35 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 35, .end = 37 }
debug(tokenizer): | ||
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 37, .end = 39 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 39, .end = 43 }
debug(tokenizer): | ||__
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 43, .end = 44 }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.eof, .start = 44, .end = 44 }
debug(tokenizer): |
debug(parse): eatCommentsAndSpace
debug(parse): (token 'comment')
debug(parse): (token 'new_line')
debug(parse): (token 'doc_start')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'doc_start')
debug(parse): (main) next doc_start@2
debug(parse): (doc) begin doc_start@2
debug(parse): eatToken('doc_start')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'doc_start')
debug(parse): (found at 2)
debug(parse): eatToken('tag')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'space')
debug(parse): (token 'literal')
debug(parse): (not found)
debug(parse): eatCommentsAndSpace
debug(parse): (token 'literal')
debug(parse): next literal@4
debug(parse): eatToken('map_value_ind')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'new_line')
debug(parse): (not found)
debug(parse): (leaf) |
debug(parse): eatToken('doc_end')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'new_line')
debug(parse): (token 'space')
debug(parse): (token 'literal')
debug(parse): (not found)
debug(parse): eatToken('doc_start')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'literal')
debug(parse): (not found)
debug(parse): eatToken('eof')
debug(parse): eatCommentsAndSpace
debug(parse): (token 'literal')
debug(parse): (not found)
error: UnexpectedToken
I can help fix this; any pointers to the exact places that need changing would be appreciated.
Hi @e-ivkov, thanks for submitting the issue! The first thing I would do is figure out what this YAML should tokenize as, and then compare that with what zig-yaml actually produces. Once that matches your expectation, I would focus on the (probably) missing parser rule(s).
Here is an alternative test, taking advantage of improved error reports.
note: |
    This is a note
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 0, .end = 4 } }
debug(tokenizer): | note
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.map_value_ind, .loc = Tokenizer.Token.Loc{ .start = 4, .end = 5 } }
debug(tokenizer): | :
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 5, .end = 6 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 6, .end = 7 } }
debug(tokenizer): | |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .loc = Tokenizer.Token.Loc{ .start = 7, .end = 8 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 8, .end = 12 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 12, .end = 16 } }
debug(tokenizer): | This
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 16, .end = 17 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 17, .end = 19 } }
debug(tokenizer): | is
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 19, .end = 20 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 20, .end = 21 } }
debug(tokenizer): | a
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 21, .end = 22 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 22, .end = 26 } }
debug(tokenizer): | note
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .loc = Tokenizer.Token.Loc{ .start = 26, .end = 27 } }
debug(tokenizer): |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.eof, .loc = Tokenizer.Token.Loc{ .start = 27, .end = 27 } }
debug(tokenizer): |
debug(parser): eatCommentsAndSpace
debug(parser): (token 'literal')
debug(parser): eatCommentsAndSpace
debug(parser): (token 'literal')
debug(parser): (main) next literal@0
debug(parser): (doc) begin literal@Tokenizer.Token.Index(0)
debug(parser): eatToken('doc_start')
debug(parser): eatCommentsAndSpace
debug(parser): (token 'literal')
debug(parser): (not found)
debug(parser): eatCommentsAndSpace
debug(parser): (token 'literal')
debug(parser): next literal@Tokenizer.Token.Index(0)
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser): (token 'map_value_ind')
debug(parser): (found at Tokenizer.Token.Index(1))
debug(parser): (map) begin literal@Tokenizer.Token.Index(0)
debug(parser): eatCommentsAndSpace
debug(parser): (token 'literal')
debug(parser): (map) key literal@Tokenizer.Token.Index(0)
debug(parser): expectToken('map_value_ind')
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser): (token 'map_value_ind')
debug(parser): (found at Tokenizer.Token.Index(1))
debug(parser): eatCommentsAndSpace
debug(parser): (token 'space')
debug(parser): (token 'literal')
debug(parser): next literal@Tokenizer.Token.Index(3)
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser): (token 'new_line')
debug(parser): (not found)
debug(parser): (leaf) |
debug(parser): eatCommentsAndSpace
debug(parser): (token 'new_line')
debug(parser): (token 'space')
debug(parser): (token 'literal')
debug(parser): (map) key literal@Tokenizer.Token.Index(6)
debug(parser): expectToken('map_value_ind')
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser): (token 'space')
debug(parser): (token 'literal')
debug(parser): (not found)
(memory):2:10: error: expected map separator ':'
This is a note
~~~~~^~~~~~~~~
error: ParseFailure
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:605:5: 0x11ae408 in expectToken__anon_27513 (lib.zig)
return self.eatToken(id, exclusions) orelse error.UnexpectedToken;
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:740:5: 0x11e24a7 in fail__anon_29162 (lib.zig)
return error.ParseFailure;
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:287:13: 0x11c7e8e in map (lib.zig)
return self.fail(gpa, self.token_it.pos, "expected map separator ':'", .{});
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:150:13: 0x11ae6ef in value (lib.zig)
return self.map(gpa);
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:203:25: 0x119b03d in doc (lib.zig)
const value_index = try self.value(gpa);
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:87:36: 0x11842b2 in parse (lib.zig)
const node_index = try self.doc(gpa);
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Yaml.zig:42:13: 0x117be62 in load (lib.zig)
return error.ParseFailure;
^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/examples/yaml.zig:103:13: 0x117f537 in main (yaml.zig)
return error.ParseFailure;
^
The problem is that the tokens |, > and % are not supported (% is not a priority).
Additionally, I think that the comma token should also include the lexeme ' for consistency.
See https://yaml.org/spec/1.2.2/#53-indicator-characters.
Is there any progress on adding support for | or > for block notation?