metascraper
metascraper copied to clipboard
Add `body` property
Returning the cleaned body of the article.
I'd be super interested in this!
Yes please!
+1
I modified the code a bit and saved it into a separate function and it works like a charm:
const Readability = require('readability')
const jsdom = require('jsdom')
const { JSDOM, VirtualConsole } = jsdom
/**
 * Builds a rule bundle whose values are extracted with Readability.
 *
 * Every rule renders the incoming DOM to HTML, parses it with Readability and
 * resolves to one field of the parsed article.
 *
 * @returns {Object} Object keyed by output property (`description`,
 *   `publisher`, `author`, `title`, `dir`, `length`, `body`); each value is an
 *   async rule `({ htmlDom, url }) => value`. A rule resolves to `undefined`
 *   when Readability produces no article for the page.
 */
export const readabilityScraper = () => {
  // Local stand-in for the stock composeRule helper: it hands back the
  // requested article field untouched (no validation step), which is what
  // lets a non-standard key such as `body` through.
  const composeRule = fn => ({ from }) => async ({ htmlDom, url }) => {
    const data = await fn(htmlDom, url)
    // parse() yields no article for pages Readability cannot process;
    // without this guard `data[from]` throws a TypeError on such pages.
    if (!data) {
      return undefined
    }
    return data[from]
  }

  // NOTE(review): `memoizeOne` is not defined in this snippet; it has to be in
  // scope already (presumably metascraper's helper of that name — confirm).
  const readability = memoizeOne(($, url) => {
    // The VirtualConsole is created with no listeners attached.
    const dom = new JSDOM($.html(), { url, virtualConsole: new VirtualConsole() })
    const reader = new Readability(dom.window.document)
    /*
      The article object will contain the following properties:
        title:   article title
        content: HTML string of processed article content
        length:  length of an article, in characters
        excerpt: article description, or short excerpt from the content
        byline:  author metadata
        dir:     content direction
      The rules below additionally read `siteName`.
    */
    return reader.parse()
  })

  const getReadability = composeRule(readability)

  // The object key is the property exposed to the caller; `from` names the
  // Readability article field it is read from.
  return {
    description: getReadability({ from: 'excerpt' }),
    publisher: getReadability({ from: 'siteName' }),
    author: getReadability({ from: 'byline' }),
    title: getReadability({ from: 'title' }),
    dir: getReadability({ from: 'dir' }),
    length: getReadability({ from: 'length' }),
    body: getReadability({ from: 'content' })
  }
}
and then when I run metascraper:
// Initialise metascraper with the Readability-backed rules as its only rule bundle.
const metascraper = require('metascraper')([readabilityScraper()])
I had to modify `composeRule` because the regular helper has a validator
function that filters out "foreign" keys like `body`.
I made a slight change to @janzheng's code: return `data[from]` only if `data` exists.
Without this check, the function crashes on non-article URLs.
/**
 * Turns an extractor into a factory of single-field rules.
 *
 * @param {Function} fn - Called as `fn(htmlDom, url)`; its result is awaited.
 * @returns {Function} Factory taking `{ from }` and returning an async rule
 *   `({ htmlDom, url }) => value`. The rule resolves to `data[from]` when the
 *   extractor produced a truthy result, and to `undefined` otherwise.
 */
const composeRule = fn => ({ from, to = from, ...opts }) => {
  // `to` and `opts` are accepted in the options object but not used here.
  return async ({ htmlDom, url }) => {
    const extracted = await fn(htmlDom, url)
    // A falsy result means the extractor produced nothing; there is no field to read.
    return extracted ? extracted[from] : undefined
  }
}