
Missing support for block notation for scalars

Open · e-ivkov opened this issue 1 year ago · 5 comments

The following example from the YAML 1.2.x docs does not parse:

# ASCII Art
--- |
  \//||\/||
  // ||  ||__
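
For reference, the | here starts a literal block scalar, so under YAML 1.2 (with the default clip chomping, which keeps a single trailing newline) this document should evaluate to one scalar string. An equivalent document in double-quoted flow style would be:

--- "\\//||\\/||\n// ||  ||__\n"

As the tokenizer trace below shows, the | is currently emitted as an ordinary literal token instead.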

Debug info

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.comment, .start = 0, .end = 11 }
debug(tokenizer):     | # ASCII Art
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 11, .end = 12 }
debug(tokenizer):     |

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.doc_start, .start = 12, .end = 15 }
debug(tokenizer):     | ---
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 15, .end = 16 }
debug(tokenizer):     |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 16, .end = 17 }
debug(tokenizer):     | |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 17, .end = 18 }
debug(tokenizer):     |

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 18, .end = 20 }
debug(tokenizer):     |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 20, .end = 29 }
debug(tokenizer):     | \//||\/||
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 29, .end = 30 }
debug(tokenizer):     |

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 30, .end = 32 }
debug(tokenizer):     |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 32, .end = 34 }
debug(tokenizer):     | //
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 34, .end = 35 }
debug(tokenizer):     |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 35, .end = 37 }
debug(tokenizer):     | ||
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .start = 37, .end = 39 }
debug(tokenizer):     |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .start = 39, .end = 43 }
debug(tokenizer):     | ||__
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .start = 43, .end = 44 }
debug(tokenizer):     |

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.eof, .start = 44, .end = 44 }
debug(tokenizer):     |
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'comment')
debug(parse):   (token 'new_line')
debug(parse):   (token 'doc_start')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'doc_start')
debug(parse): (main) next doc_start@2
debug(parse): (doc) begin doc_start@2
debug(parse): eatToken('doc_start')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'doc_start')
debug(parse):   (found at 2)
debug(parse): eatToken('tag')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'space')
debug(parse):   (token 'literal')
debug(parse):   (not found)
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'literal')
debug(parse):   next literal@4
debug(parse): eatToken('map_value_ind')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'new_line')
debug(parse):   (not found)
debug(parse): (leaf) |
debug(parse): eatToken('doc_end')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'new_line')
debug(parse):   (token 'space')
debug(parse):   (token 'literal')
debug(parse):   (not found)
debug(parse): eatToken('doc_start')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'literal')
debug(parse):   (not found)
debug(parse): eatToken('eof')
debug(parse): eatCommentsAndSpace
debug(parse):   (token 'literal')
debug(parse):   (not found)
error: UnexpectedToken

e-ivkov · Oct 13 '24 22:10

I can help fix this, though any pointers to the exact places that need fixing would be appreciated.

e-ivkov · Oct 13 '24 22:10

Hi @e-ivkov, thanks for submitting the issue! The first thing I would try to figure out is what this YAML should tokenize as, and then compare that with what zig-yaml actually produces. Once that matches your expectation, I would focus on the (probably) missing parser rule(s).
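
A minimal sketch of that kind of distinction, with hypothetical names (BlockScalarStyle and classify are made up for illustration and are not zig-yaml's actual Token.Id enum or API; the point is just that | and > would be classified as block scalar indicators rather than plain literals):

const std = @import("std");

// Hypothetical types and helpers, for illustration only.
const BlockScalarStyle = enum {
    literal, // '|' : line breaks in the body are kept verbatim
    folded, // '>' : single line breaks in the body are folded into spaces
};

fn classify(c: u8) ?BlockScalarStyle {
    return switch (c) {
        '|' => .literal,
        '>' => .folded,
        else => null, // anything else is not a block scalar indicator
    };
}

test "block scalar indicators are distinguished from plain literals" {
    try std.testing.expectEqual(BlockScalarStyle.literal, classify('|').?);
    try std.testing.expectEqual(BlockScalarStyle.folded, classify('>').?);
    try std.testing.expect(classify('a') == null);
}

With a dedicated token for the indicator, the parser could dispatch to a block scalar rule when it sees one where a value is expected, instead of producing a literal leaf.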

kubkon · Feb 09 '25 12:02

Here is an alternative test, taking advantage of the improved error reports.

note: |
    This is a note
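
For reference, under YAML 1.2 this should evaluate to a single-entry mapping whose value keeps its trailing newline (default clip chomping); in equivalent flow style:

note: "This is a note\n"
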
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 0, .end = 4 } }
debug(tokenizer):     | note
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.map_value_ind, .loc = Tokenizer.Token.Loc{ .start = 4, .end = 5 } }
debug(tokenizer):     | :
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 5, .end = 6 } }
debug(tokenizer):     |  
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 6, .end = 7 } }
debug(tokenizer):     | |
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .loc = Tokenizer.Token.Loc{ .start = 7, .end = 8 } }
debug(tokenizer):     | 

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 8, .end = 12 } }
debug(tokenizer):     |     
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 12, .end = 16 } }
debug(tokenizer):     | This
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 16, .end = 17 } }
debug(tokenizer):     |  
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 17, .end = 19 } }
debug(tokenizer):     | is
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 19, .end = 20 } }
debug(tokenizer):     |  
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 20, .end = 21 } }
debug(tokenizer):     | a
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.space, .loc = Tokenizer.Token.Loc{ .start = 21, .end = 22 } }
debug(tokenizer):     |  
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.literal, .loc = Tokenizer.Token.Loc{ .start = 22, .end = 26 } }
debug(tokenizer):     | note
debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.new_line, .loc = Tokenizer.Token.Loc{ .start = 26, .end = 27 } }
debug(tokenizer):     | 

debug(tokenizer): Tokenizer.Token{ .id = Tokenizer.Token.Id.eof, .loc = Tokenizer.Token.Loc{ .start = 27, .end = 27 } }
debug(tokenizer):     | 
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'literal')
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'literal')
debug(parser): (main) next literal@0
debug(parser): (doc) begin literal@Tokenizer.Token.Index(0)
debug(parser): eatToken('doc_start')
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'literal')
debug(parser):   (not found)
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'literal')
debug(parser):   next literal@Tokenizer.Token.Index(0)
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'map_value_ind')
debug(parser):   (found at Tokenizer.Token.Index(1))
debug(parser): (map) begin literal@Tokenizer.Token.Index(0)
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'literal')
debug(parser): (map) key literal@Tokenizer.Token.Index(0)
debug(parser): expectToken('map_value_ind')
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'map_value_ind')
debug(parser):   (found at Tokenizer.Token.Index(1))
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'space')
debug(parser):   (token 'literal')
debug(parser):   next literal@Tokenizer.Token.Index(3)
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'new_line')
debug(parser):   (not found)
debug(parser): (leaf) |
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'new_line')
debug(parser):   (token 'space')
debug(parser):   (token 'literal')
debug(parser): (map) key literal@Tokenizer.Token.Index(6)
debug(parser): expectToken('map_value_ind')
debug(parser): eatToken('map_value_ind')
debug(parser): eatCommentsAndSpace
debug(parser):   (token 'space')
debug(parser):   (token 'literal')
debug(parser):   (not found)
(memory):2:10: error: expected map separator ':'
    This is a note
    ~~~~~^~~~~~~~~
error: ParseFailure
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:605:5: 0x11ae408 in expectToken__anon_27513 (lib.zig)
    return self.eatToken(id, exclusions) orelse error.UnexpectedToken;
    ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:740:5: 0x11e24a7 in fail__anon_29162 (lib.zig)
    return error.ParseFailure;
    ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:287:13: 0x11c7e8e in map (lib.zig)
            return self.fail(gpa, self.token_it.pos, "expected map separator ':'", .{});
            ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:150:13: 0x11ae6ef in value (lib.zig)
            return self.map(gpa);
            ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:203:25: 0x119b03d in doc (lib.zig)
    const value_index = try self.value(gpa);
                        ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Parser.zig:87:36: 0x11842b2 in parse (lib.zig)
                const node_index = try self.doc(gpa);
                                   ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/src/Yaml.zig:42:13: 0x117be62 in load (lib.zig)
            return error.ParseFailure;
            ^
/home/manlio/src/contrib/zig/github.com/kubkon/zig-yaml/examples/yaml.zig:103:13: 0x117f537 in main (yaml.zig)
            return error.ParseFailure;
            ^

perillo · Apr 14 '25 15:04

The problem is that the tokens |, > and % are not supported (% is not a priority). Additionally, I think that the comma token should also include the lexeme ' for consistency.

See https://yaml.org/spec/1.2.2/#53-indicator-characters.
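
Reading the trace above: because the | after note: is tokenized as a plain literal, the parser emits it as a leaf value ((leaf) |) and then tries to read the indented This is a note as the next mapping key, which is where the "expected map separator ':'" error comes from. A block scalar rule will also need to handle the header that may follow the indicator: an optional indentation indicator and an optional chomping indicator (- or +). For illustration (spec behaviour only, not tied to zig-yaml internals), the comments show the strings each form should produce:

clip: |     # -> "one\ntwo\n"  (default chomping keeps the final line break)
  one
  two
strip: |-   # -> "one\ntwo"    ('-' drops the final line break)
  one
  two
folded: >   # -> "one two\n"   (single line breaks are folded into spaces)
  one
  two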

perillo · Apr 14 '25 15:04

Is there any progress on adding support for | or > for block notation?

BearzRobotics · Jun 23 '25 01:06