sauron icon indicating copy to clipboard operation
sauron copied to clipboard

parse_html ignores white-space and newlines inside <pre><code> ... </code></pre> HTML

Open qknight opened this issue 9 months ago • 3 comments

When using parse_html(), it seems that <pre><code> sections are only parsed correctly when they contain only text nodes and no nested tags. As soon as a nested HTML element such as a <span>...</span> is used, the formatting is lost: spaces and newlines (and probably tabs, too) are dropped.

After checking that the parser works according to html specification at https://github.com/fefit/rphtml/issues/4 I think that the error I'm seeing comes from process_node(...) now.

I added the following tests to html_parser_test.rs

After a bunch of tests I discovered:

  • When the first element in the <pre><code> is a string, formatting works correctly.
  • If the next element is a <p>2</p> (so any tag), it still works.
  • However, if the one after that is again a tag, like <p>3</p>, it fails to indent,
  • but it works with a text node. So the tag parser somehow introduces state.

test 1

/// Parses a `<div>` containing a `<p>` and a `<pre><code>` block, then checks
/// that the rendered output keeps the newlines and two-space indentation
/// inside `<code>`.
///
/// The `<code>` body mixes plain text lines with a single nested `<p>` element.
#[test]
fn test_pre_code() {
    // Input: note the newline between `</p>` and `<pre>` and the one between
    // `</pre>` and `</div>`.
    let html = r#"<div><p> test </p>
<pre><code>
0
  1
  <p>foo</p>
  2
3</code></pre>
</div>"#;
// Expected render: identical to `html` inside `<code>`, but without the two
// newlines that sit outside the `<pre>` element.
let expected = r#"<div><p> test </p><pre><code>
0
  1
  <p>foo</p>
  2
3</code></pre></div>"#;
    // `.ok().flatten()` collapses the parse result into a single `Option`;
    // `expect` panics if parsing failed or produced no node.
    let node: Node<()> = parse_html(html).ok().flatten().expect("must parse");
    //println!("node: {:#?}", node);
    // Print input and render so they show up in the output of a failing run.
    println!("html: {}", html);
    println!("render: {}", node.render_to_string());
    assert_eq!(expected, node.render_to_string());
}

result:

cargo test -p sauron --test html_parser_test
test test_pre_code ... ok

test 2

/// Parses a bare `<pre><code>` block whose body is a text line followed by
/// three `<span>` elements on separate lines (the last two indented by two
/// spaces) and compares the render against `expected`.
///
/// NOTE(review): `html` contains a `0` line directly after `<code>`, but
/// `expected` does not, so the two strings differ even before any whitespace
/// handling comes into play — confirm whether the `0` was dropped from
/// `expected` by accident.
#[test]
fn test_pre_code_2() {
    // Input: text node `0`, then three sibling `<span>` elements separated
    // only by newlines and indentation.
    let html = r#"<pre><code>
0
<span>asdf</span>
  <span>asdf</span>
  <span>asdf</span>
</code></pre>"#;
// Expected render: the spans keep their line breaks and indentation
// (no `0` line here — see the review note above).
let expected = r#"<pre><code>
<span>asdf</span>
  <span>asdf</span>
  <span>asdf</span>
</code></pre>"#;

    // `.ok().flatten()` collapses the parse result into a single `Option`;
    // `expect` panics if parsing failed or produced no node.
    let node: Node<()> = parse_html(html).ok().flatten().expect("must parse");
    //println!("node: {:#?}", node);
    // Print input and render so they show up in the output of a failing run.
    println!("html: {}", html);
    println!("render: {}", node.render_to_string());
    assert_eq!(expected, node.render_to_string());
}

result

cargo test -p sauron --test html_parser_test
test test_pre_code2 ... FAILED

failures:

---- test_pre_code_2 stdout ----
html: <pre><code>
0
<span>asdf</span>
  <span>asdf</span>
  <span>asdf</span>
</code></pre>
render: <pre><code>
0
<span>asdf</span><span>asdf</span><span>asdf</span></code></pre>
thread 'test_pre_code_2' panicked at tests/html_parser_test.rs:97:5:
assertion `left == right` failed
  left: "<pre><code>\n<span>asdf</span>\n  <span>asdf</span>\n  <span>asdf</span>\n</code></pre>"
 right: "<pre><code>\n0\n<span>asdf</span><span>asdf</span><span>asdf</span></code></pre>"
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

test 3

/// Parses a `<div>` whose `<pre><code>` block contains only text lines (no
/// nested elements) and checks that the render keeps every newline and the
/// two-space indentation inside `<code>`.
#[test]
fn test_pre_code3() {
    // Input: text-only `<code>` body; a newline separates `</pre>` from
    // `</div>`.
    let html = r#"<div><p> test </p><pre><code>
0
  1
  2
3
</code></pre>
</div>"#;
// Expected render: same as `html` except that the newline between `</pre>`
// and `</div>` is gone.
let expected = r#"<div><p> test </p><pre><code>
0
  1
  2
3
</code></pre></div>"#;

    // `.ok().flatten()` collapses the parse result into a single `Option`;
    // `expect` panics if parsing failed or produced no node.
    let node: Node<()> = parse_html(html).ok().flatten().expect("must parse");
    //println!("node: {:#?}", node);
    // Print input and render so they show up in the output of a failing run.
    println!("html: {}", html);
    println!("render: {}", node.render_to_string());
    assert_eq!(expected, node.render_to_string());
}

result

cargo test -p sauron --test html_parser_test
test test_pre_code3 ... ok

test 4

/// Parses a `<pre><code>` block whose body alternates text lines and `<p>`
/// elements (text, `<p>`, text, `<p>`, text) and checks that the render keeps
/// the newlines and indentation between them.
#[test]
fn test_pre_code3_paragraphs_mix() {
    // Input: every `<p>` is both preceded and followed by a text node; a
    // newline separates `</pre>` from `</div>`.
    let html = r#"<div><p> test </p><pre><code>
  0
  <p>1</p>
  2
<p>3</p>
  4
</code></pre>
</div>"#;
// Expected render: same as `html` except that the newline between `</pre>`
// and `</div>` is gone.
let expected = r#"<div><p> test </p><pre><code>
  0
  <p>1</p>
  2
<p>3</p>
  4
</code></pre></div>"#;

    // `.ok().flatten()` collapses the parse result into a single `Option`;
    // `expect` panics if parsing failed or produced no node.
    let node: Node<()> = parse_html(html).ok().flatten().expect("must parse");
    //println!("node: {:#?}", node);
    // Print input and render so they show up in the output of a failing run.
    println!("html: {}", html);
    println!("render: {}", node.render_to_string());
    assert_eq!(expected, node.render_to_string());
    // NOTE(review): the line below is presumably the `right` value from an
    // earlier failing run; it does not match the current `html` (it shows
    // `<p>2</p>` and has no `4` line) — confirm or remove.
    // right "<div><p> test </p><pre><code>\n  0\n  <p>1</p><p>2</p><p>3</p></code></pre></div>"
}

result

test test_pre_code3_paragraphs_mix ... ok

qknight avatar Mar 04 '25 18:03 qknight

Sauron uses parse_html and with this input:

let html = r#"<div><p> test </p>
<pre><code>
0
  1
  <p>foo</p>
  2
3</code></pre>
</div>"#;

In this code passage:

pub fn parse_html<MSG>(html: &str) -> Result<Option<Node<MSG>>, ParseError> {
    let doc = Doc::parse(
        html,
        ParseOptions {
            case_sensitive_tagname: false,
            allow_self_closing: true,
            auto_fix_unclosed_tag: true,
            auto_fix_unexpected_endtag: true,
            auto_fix_unescaped_lt: true,
        },
    )?;
    println!("xxxx: {}", doc.render(&Default::default()));
    process_node(doc.get_root_node().borrow().deref())

Output of the "xxxx:" debug print, i.e. doc.render(&Default::default()):

<div><p> test </p>
<pre><code>
0
  1
  <p>foo</p>
  2
3</code></pre>
</div>

this looks good

node AST: parse_html output

node: Element(
    Element {
        namespace: None,
        tag: "div",
        attrs: [],
        children: [
            Element(
                Element {
                    namespace: None,
                    tag: "p",
                    attrs: [],
                    children: [
                        Leaf(
                            Text(
                                " test ",
                            ),
                        ),
                    ],
                    self_closing: false,
                },
            ),
            Element(
                Element {
                    namespace: None,
                    tag: "pre",
                    attrs: [],
                    children: [
                        Element(
                            Element {
                                namespace: None,
                                tag: "code",
                                attrs: [],
                                children: [
                                    Leaf(
                                        Text(
                                            "\n0\n  1\n  ",
                                        ),
                                    ),
                                    Element(
                                        Element {
                                            namespace: None,
                                            tag: "p",
                                            attrs: [],
                                            children: [
                                                Leaf(
                                                    Text(
                                                        "foo",
                                                    ),
                                                ),
                                            ],
                                            self_closing: false,
                                        },
                                    ),
                                    Leaf(
                                        Text(
                                            "\n  2\n3",
                                        ),
                                    ),
                                ],
                                self_closing: false,
                            },
                        ),
                    ],
                    self_closing: false,
                },
            ),
        ],
        self_closing: false,
    },
)

Took me quite some time to understand this issue, but here is what I know now:

  • The Element { namespace: None, tag: "code", ... } node is already wrong: it must be a text node and contain everything up to (and including) the </code>.
  • rphtml gets the rendered output right, but I'm not yet sure whether it also parses the tags inside <pre>...</pre>.

qknight avatar Mar 06 '25 10:03 qknight

rphtml is at fault! It parses <p> as node_type: Tag and it should be of type node_type: Text because of the <pre>

/// Debugging aid: parses `<pre><p>aaa</p></pre>` and prints the `Debug`
/// representation of every child of the root node.
///
/// The test can never pass: it ends in `assert_eq!(1,2)`, presumably so that
/// the test harness shows the captured `println!` output.
#[test]
fn test_childs() -> HResult {
	// A `<p>` element nested directly inside `<pre>`.
	let code = r##"<pre><p>aaa</p></pre>"##;
	let doc = parse(code)?;
	let root = doc.get_root_node();
	// `childs` is optional; the `unwrap` panics if the root has no child list.
	let childs = &root.borrow().childs;
	let childs = childs.as_ref().unwrap();

  // Dump each top-level child so its `node_type` can be inspected.
  for child in childs {
    println!(" - child: {:#?}\n", child);
  }

  // Always fails (1 != 2).
  assert_eq!(1,2);
	Ok(())
}
---- test_childs stdout ----
 - child: RefCell {
    value: Node {
        index: 0,
        node_type: Tag,
        begin_at: 0,
        end_at: 0,
        content: None,
        childs: Some(
            [
                RefCell {
                    value: Node {
                        index: 0,
                        node_type: Tag,
                        begin_at: 5,
                        end_at: 5,
                        content: None,
                        childs: Some(
                            [
                                RefCell {
                                    value: Node {
                                        index: 0,
                                        node_type: Text,
                                        begin_at: 8,
                                        end_at: 11,
                                        content: Some(
                                            [
                                                'a',
                                                'a',
                                                'a',
                                            ],
                                        ),
                                        childs: None,
                                        meta: None,
                                        end_tag: None,
                                        parent: true,
                                        root: true,
                                        document: false,
                                    },
                                },
                            ],
                        ),
                        meta: Some(
                            RefCell {
                                value: TagMeta {
                                    code_in: Wait,
                                    is_void: false,
                                    self_closed: false,
                                    auto_fix: false,
                                    name: [
                                        'p',
                                    ],
                                    attrs: [],
                                    lc_name_map: {},
                                },
                            },
                        ),
                        end_tag: Some(
                            RefCell {
                                value: Node {
                                    index: 0,
                                    node_type: TagEnd,
                                    begin_at: 11,
                                    end_at: 15,
                                    content: Some(
                                        [
                                            'p',
                                        ],
                                    ),
                                    childs: None,
                                    meta: None,
                                    end_tag: None,
                                    parent: true,
                                    root: false,
                                    document: false,
                                },
                            },
                        ),
                        parent: true,
                        root: true,
                        document: false,
                    },
                },
            ],
        ),
        meta: Some(
            RefCell {
                value: TagMeta {
                    code_in: Wait,
                    is_void: false,
                    self_closed: false,
                    auto_fix: false,
                    name: [
                        'p',
                        'r',
                        'e',
                    ],
                    attrs: [],
                    lc_name_map: {},
                },
            },
        ),
        end_tag: Some(
            RefCell {
                value: Node {
                    index: 0,
                    node_type: TagEnd,
                    begin_at: 15,
                    end_at: 21,
                    content: Some(
                        [
                            'p',
                            'r',
                            'e',
                        ],
                    ),
                    childs: None,
                    meta: None,
                    end_tag: None,
                    parent: true,
                    root: false,
                    document: false,
                },
            },
        ),
        parent: true,
        root: true,
        document: false,
    },
}

qknight avatar Mar 07 '25 13:03 qknight

I've tried to replace rphtml with html5ever parser in this branch: https://github.com/qknight/sauron/commits/public-dom-patch/

Turns out this works great, except that the parser only supports complete HTML documents and can't work on fragments of the DOM tree. See https://github.com/servo/html5ever/issues/583 for details.

I spent quite some time figuring out why it does not work, but both parsers are very complex and I don't have a solution other than to document the issues. Until that is fixed I will probably wrap https://www.npmjs.com/package/diff-dom/v/1.0.0 in Rust.

qknight avatar Mar 14 '25 17:03 qknight