node-htmlparser
node-htmlparser copied to clipboard
1.x: Less-than and greater-than signs inside attribute values break the parser
Based on the example in http://www.whatwg.org/specs/web-apps/current-work/#attr-iframe-srcdoc :
var rawHtml = '<iframe srcdoc="<p>Yeah, you can see it <a href="/gallery?mode=cover&amp;page=1">in my gallery</a>."></iframe>',
htmlparser = require('./lib/htmlparser'),
handler = new htmlparser.DefaultHandler(),
parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
console.warn(require('util').inspect(handler.dom, false, null));
Output:
[ { raw: 'iframe srcdoc="',
data: 'iframe srcdoc="',
type: 'tag',
name: 'iframe',
attribs: { srcdoc: 'srcdoc' },
children:
[ { raw: 'p',
data: 'p',
type: 'tag',
name: 'p',
children:
[ { raw: 'Yeah, you can see it ',
data: 'Yeah, you can see it ',
type: 'text' },
{ raw: 'a href="/gallery?mode=cover&amp;page=1"',
data: 'a href="/gallery?mode=cover&amp;page=1"',
type: 'tag',
name: 'a',
attribs: { href: '"/gallery?mode=cover&amp;page=1"' },
children: [ { raw: 'in my gallery', data: 'in my gallery', type: 'text' } ] },
{ raw: '."', data: '."', type: 'text' } ] } ] } ]
Expected output:
[ { raw: 'iframe srcdoc="<p>Yeah, you can see it <a href="/gallery?mode=cover&amp;page=1">in my gallery</a>."',
data: 'iframe srcdoc="<p>Yeah, you can see it <a href="/gallery?mode=cover&amp;page=1">in my gallery</a>."',
type: 'tag',
name: 'iframe',
attribs: { srcdoc: '<p>Yeah, you can see it <a href="/gallery?mode=cover&amp;page=1">in my gallery</a>.' } } ]
It works if I escape the less-than and greater-than signs as entities (&lt; and &gt;).
Oh, I didn't realize that 2.0 was out. I'm using node-htmlparser
with jsdom, which still has it pegged at 1.x.
Yeah, I just pushed 2.0 to GitHub, but I have not released a new npm package yet; I want to do some more verification and add more tests first. You are more than welcome to submit a pull request if you have a fix for the issue in 1.x.
I will be accepting and applying fixes for 1.x as time permits.