node-summary
node-summary copied to clipboard
Summary output short / wrong html text
Hi, I am trying to get a summary text for a given webpage, but the returned summary is short / bad and I can't figure out why.
Here are my steps:
- Use needle to get the url html data
- Run the data through readability so only the core of the html is extracted.
- Sanitize the html to only return the text.
- Run summarize...
Why am I getting only one or two lines of summary? Can I control how long the summary output should be? I want the output to be something like http://smmry.com
Here is the code:
var needle = require('needle');
var read = require('node-readability');
var sanitizeHtml = require('sanitize-html');
var SummaryTool = require('node-summary');
// Article to summarize. The URL must start with the scheme (no leading whitespace).
var url = "http://www.inc.com/gene-marks/the-one-way-to-tell-if-you-re-a-successful-entrepreneur.html?cid=sf01001";

// Pipeline: fetch the page -> extract the main article -> strip all markup -> summarize.
needle.get(url, function(error, response) {
  // Report fetch failures instead of silently doing nothing.
  if (error || response.statusCode !== 200) {
    console.error("Failed to fetch " + url, error || ("HTTP " + response.statusCode));
    return;
  }

  read(response.body, function(err, article) {
    // Bail out before touching article.content if extraction failed.
    // NOTE(review): node-readability reportedly sets content to false when it
    // cannot find an article body - confirm against the library docs.
    if (err || !article || !article.content) {
      console.error("Could not extract article content", err);
      return;
    }

    // An empty allow-list makes sanitize-html drop every tag and attribute,
    // leaving only the text.
    var cleanCont = sanitizeHtml(article.content, {
      allowedTags: [],
      allowedAttributes: {}
    });

    SummaryTool.summarize(article.title, cleanCont, function(err, summary) {
      if (err) {
        console.log("Something went wrong man!");
        // Do not fall through: summary may not be usable when err is set.
        return;
      }

      var originalLength = article.title.length + cleanCont.length;

      console.log(summary);
      console.log("Original Length " + originalLength);
      console.log("Summary Length " + summary.length);
      // Percentage by which the summary is shorter than title + text.
      console.log("Summary Ratio: " + (100 - (100 * (summary.length / originalLength))));
    });
  });
});
Thanks Chris
Well, some progress: I added line breaks after the HTML </p> and <br> tags so the algorithm knows where the text separations are. Not perfect, but better. Any ideas?
var needle = require('needle');
var read = require('node-readability');
var sanitizeHtml = require('sanitize-html');
var SummaryTool = require('node-summary');
// Article to summarize. The URL must start with the scheme (no leading whitespace).
var url = "http://www.inc.com/gene-marks/the-one-way-to-tell-if-you-re-a-successful-entrepreneur.html?cid=sf01001";

// Pipeline: fetch the page -> extract the main article -> mark paragraph and
// line boundaries with newlines -> strip all markup -> summarize.
needle.get(url, function(error, response) {
  // Report fetch failures instead of silently doing nothing.
  if (error || response.statusCode !== 200) {
    console.error("Failed to fetch " + url, error || ("HTTP " + response.statusCode));
    return;
  }

  read(response.body, function(err, article) {
    // Bail out before touching article.content if extraction failed.
    // NOTE(review): node-readability reportedly sets content to false when it
    // cannot find an article body - confirm against the library docs.
    if (err || !article || !article.content) {
      console.error("Could not extract article content", err);
      return;
    }

    // Insert newlines after block boundaries so that, once the tags are
    // stripped, the text still shows where paragraphs and lines end.
    // "$&" re-inserts the matched tag; the single <br> pattern covers
    // <br>, <br/>, <br /> and upper-case variants.
    var str = article.content
      .replace(/<\/p>/gi, '$&\n\n')
      .replace(/<br\s*\/?>/gi, '$&\n');

    // An empty allow-list makes sanitize-html drop every tag and attribute,
    // leaving only the text (and the newlines added above).
    var cleanCont = sanitizeHtml(str, {
      allowedTags: [],
      allowedAttributes: {}
    });

    SummaryTool.summarize(article.title, cleanCont, function(err, summary) {
      if (err) {
        console.log("Something went wrong man!");
        // Do not fall through: summary may not be usable when err is set.
        return;
      }

      var originalLength = article.title.length + cleanCont.length;

      console.log(summary);
      console.log("Original Length " + originalLength);
      console.log("Summary Length " + summary.length);
      // Percentage by which the summary is shorter than title + text.
      console.log("Summary Ratio: " + (100 - (100 * (summary.length / originalLength))));
    });
  });
});
@chrisribe I'm getting the same. I write a good couple of lines of content and try to summarize it, but I only get about one line back. The example shows lots of lines and summarizes fine, though. How can we get it to summarize better?
I never got it to work exactly as I wanted. I moved to another project and put this aside, so...
Sorry. Chris