[Feature Request] Scraper
This is the script I use for scraping content:
#!/usr/bin/env python2
# coding: utf-8
from lxml import etree
import argparse
import datetime
import feedgenerator
#import requests
from sys import stdin
import sys
# command line arguments
parser = argparse.ArgumentParser(
    description='Scrape HTML pages into Atom Syndication Format '
                'feeds (RFC 4287)')
parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.1')
# debugging
parser.add_argument('--debug', metavar='TEXT',
                    dest='debug', help='Print raw data to <!-- DATA -->')
# content metadata
parser.add_argument('-c', '--charset', metavar='TEXT',
                    dest='charset', help='Specify data charset (Use this '
                    'option only when a given page has an exotic charset. '
                    'Default is UTF-8)')
# feed properties
parser.add_argument('-D', '--description', metavar='TEXT',
                    dest='feed_description', help='Set feed description. '
                    'If not specified, %(prog)s will attempt to fetch a '
                    'description from the given page with the XPath '
                    'expression //html/head//meta[@name="description"]/@content')
parser.add_argument('-L', '--language', metavar='xx-XX', default='en',
                    dest='feed_language_code', help='Set feed language code. '
                    'Default language code is "en"')
parser.add_argument('-R', '--rights', metavar='TEXT',
                    dest='feed_rights', help='Set feed rights')
parser.add_argument('-S', '--subtitle', metavar='TEXT',
                    dest='feed_subtitle', help='Set feed subtitle. '
                    'If not specified, %(prog)s will generate a subtitle '
                    'with the headline count and the time of the last update')
parser.add_argument('-T', '--title', metavar='TEXT',
                    dest='feed_title', help='Set feed title. '
                    'If not specified, %(prog)s will attempt to fetch the '
                    'title from the given page with the XPath expression '
                    '//html/head//title/text()')
parser.add_argument('-U', '--url', metavar='URL',
                    dest='feed_url', help='Set feed link')
# start points
parser.add_argument('-r', '--root', metavar='XPATH',
                    dest='base_level', help='XPath to the base nodes, from '
                    'each of which entry properties will be retrieved',
                    required=True)
# entry characterization
# TODO all entry arguments should be array/collection/list
# for joining several strings to one
parser.add_argument('-a', '--entry-author', metavar='XPATH',
                    dest='entry_author', help='Path to entry author')
parser.add_argument('-e', '--entry-email', metavar='XPATH',
                    dest='entry_email', help='Path to entry author email')
# TODO use array/list instead of 2 date arguments [XPATH, DATE]
parser.add_argument('-d', '--entry-date', metavar='XPATH',
                    dest='entry_date', help='Path to entry date',
                    required=False)
parser.add_argument('-f', '--date-format', metavar='DATE',
                    dest='date_format', help='strptime format of the scraped '
                    'date (e.g. if the date reads "2016-05-31", pass '
                    '"%%Y-%%m-%%d"; see the Python strptime documentation). '
                    'Must be utilized along with --entry-date',
                    required=False)
# TODO use array/list instead of 3 enclosure arguments [XPATH, TEXT, XPATH]
parser.add_argument('-E', '--entry-enclosure', metavar='XPATH',
                    dest='entry_enclosure', help='Path to entry enclosure',
                    required=False)
parser.add_argument('-m', '--enclosure-mimetype', metavar='MIME-Type',
                    dest='enclosure_type', help='Set enclosure media type '
                    '(e.g. video/x-matroska, audio/x-flac, audio/ogg, '
                    'audio/opus, audio/speex etc.). '
                    'Must be utilized along with --entry-enclosure',
                    required=False)
parser.add_argument('-w', '--enclosure-size', metavar='XPATH',
                    dest='enclosure_size', help='Path to enclosure size. '
                    'Must be utilized along with --entry-enclosure',
                    required=False)
parser.add_argument('-s', '--entry-description', metavar='XPATH',
                    dest='entry_description', help='Path to entry description',
                    required=False)
parser.add_argument('-i', '--entry-id', metavar='XPATH',
                    dest='entry_id', help='Path to entry id',
                    required=False)
parser.add_argument('-l', '--entry-link', metavar='XPATH',
                    dest='entry_link', help='Path to entry link',
                    required=True)
parser.add_argument('-t', '--entry-title', metavar='XPATH',
                    dest='entry_title', help='Path to entry title',
                    required=True)
args = parser.parse_args()
#if args.verbose:
# print "verbosity turned on"
# fetch html page
#response = requests.get(args.feed_url)
#data = response.text
# read html page
#reload(sys).setdefaultencoding('utf-8')
#data = unicode(stdin.read())
# you may require an encoding change
# here latin1 -> unicode
#data = unicode(data, 'utf-8')
# get nodes
#tree = etree.HTML(data)
tree = etree.HTML(stdin.read())
nodes = tree.xpath(args.base_level)
# feed description
if args.feed_description:
    description = args.feed_description
else:
    # retrieve page description
    descriptions = tree.xpath('//html/head//meta[@name="description"]/@content')
    description = '' if len(descriptions) == 0 else descriptions[0]
# feed title
if args.feed_title:
    title = args.feed_title
else:
    # retrieve page title
    titles = tree.xpath('//html/head//title/text()')
    title = '' if len(titles) == 0 else titles[0]
# feed subtitle (honor -S; otherwise generate one)
if args.feed_subtitle:
    subtitle = args.feed_subtitle
elif len(nodes) > 0:
    subtitle = ('Syndication feed created by html2atom \n\n html2atom has '
                'scraped {} headlines \n\n Last update: {}'
                .format(len(nodes), datetime.datetime.now()))
else:
    subtitle = 'FEED APPEARS TO BE EMPTY \n\n Date: {}'.format(datetime.datetime.now())
# create a feed
feed = feedgenerator.Atom1Feed(
    description = description,
    feed_copyright = args.feed_rights,
    language = args.feed_language_code,
    link = args.feed_url,
    #subtitle = '{} has scanned {} headlines'.format(sys.argv[0], len(nodes)),
    #subtitle = '{} headlines were scraped by {}'.format(len(nodes), sys.argv[0]),
    #subtitle = 'Syndication feed created by html2atom\n\n'
    #'html2atom has scraped {} headlines\n\n'
    #'Last update: {}'.format(len(nodes), datetime.datetime.now()),
    subtitle = subtitle,
    title = title
)
# for each instance of given node
for node in nodes:
    # get author
    if args.entry_author:
        authors = node.xpath(args.entry_author)
        author = '' if len(authors) == 0 else authors[0]
    # get date
    if args.entry_date:
        dates = node.xpath(args.entry_date)
        #date = '' if len(dates) == 0 else dates[0]
        # obtain the list in its entirety, so XPath string functions such as
        # normalize-space, substring-before, substring-after, substring etc.
        # remain usable.
        #
        # on the other hand, strptime raises
        # TypeError: must be string, not list
        # when handed a list, hence the join below
        #
        # NOTE1 we might want to utilize the function string-length
        # NOTE2 we might want to hardcode normalize-space, so we would always
        # be able to use the date
        #
        # TODO test with CRYPTOME
        date = '' if len(dates) == 0 else ''.join(dates)
        # adopt given date format
        if args.date_format:
            date_format = args.date_format
            # TODO try & except TypeError
            # http://stackoverflow.com/questions/6615533/datetime-tryparse-in-python
            try:
                date = datetime.datetime.strptime(date, date_format)
            except ValueError:
                # TODO print an error message to the console
                #date = None
                date = datetime.datetime.now()
                #date = datetime.datetime.fromtimestamp(0) # Jan 1st 1970
    # get description
    if args.entry_description:
        descriptions = node.xpath(args.entry_description)
        #description = '' if len(descriptions) == 0 else descriptions[0]
        # python .local/share/liferea/scripts/html2atom.py --url 'https://jolla.com/jobs/' --title 'Jolla' --description 'Jobs' --subtitle '' --language 'en' --root '//div[(@class="et_pb_section et_section_regular") and not(descendant::a[@class="job-post"]) and (position()<last())]' --entry-title './/h2/text()' --entry-link './/@href' --entry-description './/strong/following-sibling::text()' --entry-id '@id'
        # TODO use \n with plaintext summary
        description = '' if len(descriptions) == 0 else '<br>'.join(descriptions)
        # TODO Error when utilizing node()
        # TypeError: sequence item 1: expected string or Unicode, lxml.etree._Element found
        # http://stackoverflow.com/questions/10880813/typeerror-sequence-item-0-expected-string-int-found
        #description = '' if len(descriptions) == 0 else '<br>'.join([str(i) for i in descriptions])
    enclosure = ''
    enclosure_type = ''
    enclosure_size = ''
    # get enclosure
    if args.entry_enclosure:
        enclosures = node.xpath(args.entry_enclosure)
        enclosure = '' if len(enclosures) == 0 else enclosures[0]
        if args.enclosure_type:
            enclosure_type = args.enclosure_type
        if args.enclosure_size:
            sizes = node.xpath(args.enclosure_size)
            # the size must land in enclosure_size, which is passed to
            # feedgenerator.Enclosure below
            enclosure_size = '' if len(sizes) == 0 else sizes[0]
    # get identifier
    if args.entry_id:
        ids = node.xpath(args.entry_id)
        idposte = '' if len(ids) == 0 else ''.join(ids)
    # get link
    links = node.xpath(args.entry_link)
    link = '' if len(links) == 0 else ''.join(links)
    # NOTE using list[0] may be better
    #link = '' if len(links) == 0 else links[0]
    # get title
    titles = node.xpath(args.entry_title)
    title = '' if len(titles) == 0 else ''.join(titles)
    # create entry
    feed.add_item(
        #author_email = email,
        author_name = author if args.entry_author else None,
        #categories = (print [categories]),
        description = description if args.entry_description else title,
        enclosure = feedgenerator.Enclosure(enclosure, enclosure_size, enclosure_type) if args.entry_enclosure else None,
        link = link,
        pubdate = date if args.entry_date else None,
        title = title,
        unique_id = idposte if args.entry_id else link
    )
# output feed
#print feed.writeString('utf-8')     # python 2 print statement
#print(feed.writeString('utf-8'))    # python 3 print function
sys.stdout.write(feed.writeString('utf-8'))
sys.stdout.flush()
Test with:
curl -s 'http://www.slackware.com/' | python html2atom.py --url 'http://www.slackware.com/' --title 'The Slackware Linux Project' --description 'News' --subtitle '' --language 'en' --root '//center[not(parent::body)]/table' --entry-title './tr[1]//b/text()' --entry-link '@href' --entry-description './tr[2]/td[1]//text()' --entry-date 'normalize-space(./tr[2]/td[2]//b/text())' --date-format '%Y-%m-%d'
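The commented-out requests import in the script hints at an alternative to piping the page through stdin. A minimal sketch of that variant, assuming the requests package is installed and --url is always given (the names follow the script above):

import requests
from lxml import etree

# fetch the page directly instead of reading it from stdin;
# response.content is bytes, so lxml can detect the page's charset itself
response = requests.get(args.feed_url)
tree = etree.HTML(response.content)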
I'd rather have a built-in dialog that provides fields for title, link, description, and date, accepting both XPath and CSS Selectors.
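For the CSS Selectors half of that idea, lxml can already translate CSS to XPath via its cssselect package, so such a dialog could keep a single XPath backend. A minimal sketch (the selector string is only an example):

from lxml.cssselect import CSSSelector

# translate a CSS selector into the equivalent XPath expression,
# which could then be fed to --entry-title, --entry-link etc.
sel = CSSSelector('div.news-item > h2 a')
print(sel.path)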
FYI I've linked html2atom here https://github.com/lwindolf/rss-scraping
Oh no, that version is not up to date; I'm using an improved version. The author gave me two answers about licensing, and I was too shy to ask him about it again. The script above is the current version of html2atom. I think it should work with Python 3; I'll fix it if it doesn't. Thank you, Lars.
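If Python 3 does need a change, the most likely spot is the stdin read: in Python 3, sys.stdin.read() returns text decoded with the locale's charset, which can mangle pages in other encodings. A minimal sketch of the safer input handling, not a confirmed port:

import sys
from lxml import etree

# read raw bytes so lxml can honor the charset the page itself declares
tree = etree.HTML(sys.stdin.buffer.read())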