readabilitySAX
readabilitySAX copied to clipboard
Take into account text in divs
It's very useful for some sites to handle this case:
<div class="content">
Some long text...
<a href="...">Link</a>
...
</div>
Competing libraries already handle this: https://github.com/mozilla/readability/blob/master/Readability.js#L677-L688 https://github.com/luin/readability/blob/master/src/helpers.js#L108-L119
I think this should be optional (controlled by a setting). Something like:
if(tagName === "p" || tagName === "pre" || tagName === "td");
else if(tagName === "div"){
var done = false;
//check if div should be converted to a p
for(i = 0, j = divToPElements.length; i < j; i++){
if(divToPElements[i] in elem.info.tagCount){
done = true;
break;
}
}
if(done && this._settings.strayText){
for(i = 0, j = elem.children.length; i < j; i++) {
var child = elem.children[i];
if(typeof child !== 'string')
continue;
var textLength = child.trim().length;
if(textLength > 24 && elem.parent){
elem.isCandidate = elem.parent.isCandidate = true;
if(re_commas.test(child)) var commas = child.split(re_commas).length - 1;
var addScore = 1 + commas + Math.min( Math.floor( textLength / 100 ), 3);
elem.tagScore += addScore;
elem.parent.tagScore += addScore / 2;
}
}
}
if(done)
return;
elem.name = "p";
}
else return;
What do you think about it?