liferea icon indicating copy to clipboard operation
liferea copied to clipboard

[Feature Request] Scraper

Open sjehuda opened this issue 3 years ago • 3 comments

This is the script I use for scraping content from HTML pages

#!/usr/bin/env python2
# coding: utf-8

from lxml import etree
import argparse
import datetime
import feedgenerator
#import requests
import os
from sys import stdin, stdout
import sys

# command line arguments
parser = argparse.ArgumentParser(
    description='Scrap HTML pages into Atom Syndication Format '
                'feeds (RFC 4287)')

parser.add_argument('-v, --version', action='version', version='%(prog)s 0.1')

# debugging
parser.add_argument('--debug', metavar='TEXT',
                    dest='debug', help='Print raw data to <!-- DATA -->)')

# content metadata
parser.add_argument('-c', '--charset', metavar='TEXT',
                    dest='charset', help='Specify data charset (Use this '
                    'option only when a given page has an exotic charset. '
                    'Default is UTF-8)')


# feed properties
parser.add_argument('-D', '--description', metavar='TEXT',
                    dest='feed_description', help='Set feed description')
parser.add_argument('-L', '--language', metavar='xx-XX',
                    dest='feed_language_code', help='Set feed language code. '
                    'Default language code is "en"')
parser.add_argument('-R', '--rights', metavar='TEXT',
                    dest='feed_rights', help='Set feed rights')
parser.add_argument('-S', '--subtitle', metavar='TEXT',
                    dest='feed_subtitle', help='Set feed subtitle. '
                    'If not specified, %(prog)s would attempt to fetch '
                    'subtitle from given URL with an XPath expression '
                    '//html/head//meta[@name="description"]/@content')
parser.add_argument('-T', '--title', metavar='TEXT',
                    dest='feed_title', help='Set feed title. '
                    'If not specified, %(prog)s would attempt to fetch '
                    'title from given URL with an XPath expression  '
                    '//html/head//title/text()')
parser.add_argument('-U', '--url', metavar='URL',
                    dest='feed_url', help='Set feed link. '
                    'If not specified, %(prog)s would use the given '
                    'URL as feed link')

# start points
parser.add_argument('-r', '--root', metavar='XPATH',
                    dest='base_level', help='Path to node base level from '
                    'each of which entry properties will be retrieved.',
                    required=True)

# entry characterization
# TODO all entry arguments should be array/collection/list
# for joining  several strings to one
parser.add_argument('-a', '--entry-author', metavar='XPATH',
                    dest='entry_author', help='Path to entry author')
parser.add_argument('-e', '--entry-email', metavar='XPATH',
                    dest='entry_email', help='Path to entry author Email')
# TODO use array/list instead of 2 date arguments [XPATH, DATE]
parser.add_argument('-d', '--entry-date', metavar='XPATH',
                    dest='entry_date', help='Path to entry title',
                    required=False)
parser.add_argument('-f', '--date-format', metavar='DATE',
                    dest='date_format', help='Date format of given URL '
                    '(e.g. if date is "YYYY-MM-DD", input would '
                    'correspondingly be "Y-m-d".  See "man date" for more '
                    'information). Must be utilized along with --entry-date',
                    required=False)
# TODO use array/list instead of 3 enclosure arguments [XPATH, TEXT, XPATH]
parser.add_argument('-E', '--entry-enclosure', metavar='XPATH',
                    dest='entry_enclosure', help='Path to entry enclosure',
                    required=False)
parser.add_argument('-m', '--enclosure-mimetype', metavar='MIME-Type',
                    dest='enclosure_type', help='Set enclosure media type. '
                    '(e.g. video/x-matroska, audio/x-flac, audio/ogg, '
                    'audio/opus, audio/speex etc.) '
                    'Must be utilized along with --entry-enclosure',
                    required=False)
parser.add_argument('-w', '--enclosure-size', metavar='XPATH',
                    dest='enclosure_size', help='Path to enclosure size. '
                    'Must be utilized along with --entry-enclosure',
                    required=False)
parser.add_argument('-s', '--entry-description', metavar='XPATH',
                    dest='entry_description', help='Path to entry description',
                    required=False)
parser.add_argument('-i', '--entry-id', metavar='XPATH',
                    dest='entry_id', help='Path to entry id',
                    required=False)
parser.add_argument('-l', '--entry-link', metavar='XPATH',
                    dest='entry_link', help='Path to entry link',
                    required=True)
parser.add_argument('-t', '--entry-title', metavar='XPATH',
                    dest='entry_title', help='Path to entry title',
                    required=True)

args = parser.parse_args()
#if args.verbose:
#    print "verbosity turned on"


# fetch html page
#response = requests.get(args.feed_url)
#data = response.text

# read html page
#reload(sys).setdefaultencoding('utf-8')
#data = unicode(stdin.read())

# you may require an encoding change
# here latin1 -> unicode
#data = unicode(data, 'utf-8')

# get nodes
#tree = etree.HTML(data)
# the HTML document is piped in on stdin; lxml's HTML parser is lenient,
# so malformed real-world markup still yields an element tree
tree = etree.HTML(stdin.read())
# select the per-entry root nodes with the required --root XPath; every
# entry property below is evaluated relative to one of these nodes
nodes = tree.xpath(args.base_level)

# feed description: prefer the explicit --description flag; otherwise fall
# back to the page's <meta name="description"> content (empty if absent)
if args.feed_description:
    description = args.feed_description
else:
    # retrieve page description
    descriptions = tree.xpath('//html/head//meta[@name="description"]/@content')
    description = '' if len(descriptions) == 0 else descriptions[0]

# feed title: prefer the explicit --title flag; otherwise the page <title>
if args.feed_title:
    title = args.feed_title
else:
    # retrieve page title
    titles = tree.xpath('//html/head//title/text()')
    title = '' if len(titles) == 0 else titles[0]

# feed subtitle
# BUGFIX: honor -S/--subtitle when given -- the option was parsed (and
# documented in its help text) but its value was never used; only when it
# is absent/empty build the scrape-status subtitle as before
if args.feed_subtitle:
    subtitle = args.feed_subtitle
elif len(nodes) > 0:
    subtitle = 'Syndication feed created by html2atom \n\n html2atom has scraped {} headlines \n\n Last update: {}'.format(len(nodes), datetime.datetime.now())
else:
    subtitle = 'FEED APPEARS TO BE EMPTY \n\n Date: {}'.format(datetime.datetime.now())

# create a feed
# build the Atom 1.0 feed shell (RFC 4287); entries are appended in the
# loop below and the feed is serialized at the end of the script.
# NOTE(review): link may be None when --url is omitted -- confirm that
# consumers accept the resulting feed without a <link> value
feed = feedgenerator.Atom1Feed(
       description = description,
       feed_copyright = args.feed_rights,
       language = args.feed_language_code,
       link = args.feed_url,
       #subtitle = '{} has scanned {} headlines'.format(sys.argv[0], len(nodes)),
       #subtitle = '{} headlines were scrapped by {}'.format(len(nodes), sys.argv[0]),
       #subtitle = 'Syndication feed created by html2atom\n\n'
       #'html2atom has scraped {} headlines\n\n'
       #'Last update: {}'.format(len(nodes), datetime.datetime.now()),
       subtitle = subtitle,
       title = title
       )

# for each instance of given node, extract the entry fields (all XPath
# expressions are evaluated relative to the node) and append an entry
for node in nodes:
    # get author -- only when --entry-author was given; the same guard is
    # applied below when the entry is emitted, so no stale value is read
    if args.entry_author:
        authors = node.xpath(args.entry_author)
        author = '' if len(authors) == 0 else authors[0]

    # get date
    if args.entry_date:
        dates = node.xpath(args.entry_date)
        # join the whole node-set rather than taking dates[0] so XPath
        # string functions (normalize-space, substring-before/after, ...)
        # remain usable; strptime requires a string, not a list
        date = '' if len(dates) == 0 else ''.join(dates)
        # adopt given date format
        if args.date_format:
            try:
                date = datetime.datetime.strptime(date, args.date_format)
            except ValueError:
                # unparsable date: fall back to "now" instead of dropping
                # the entry
                date = datetime.datetime.now()

    # get description
    if args.entry_description:
        descriptions = node.xpath(args.entry_description)
        # TODO use \n with plaintext summary
        description = '' if len(descriptions) == 0 else '<br>'.join(descriptions)

    enclosure_type = ''
    enclosure_size = ''
    # get enclosure
    if args.entry_enclosure:
        enclosures = node.xpath(args.entry_enclosure)
        enclosure = '' if len(enclosures) == 0 else enclosures[0]
        if args.enclosure_type:
            enclosure_type = args.enclosure_type
        if args.enclosure_size:
            sizes = node.xpath(args.enclosure_size)
            # BUGFIX: the result was previously stored in an unused local
            # ("size"), so --enclosure-size never took effect
            enclosure_size = '' if len(sizes) == 0 else sizes[0]
    else:
        enclosure = ''

    # get identifier
    if args.entry_id:
        ids = node.xpath(args.entry_id)
        idposte = '' if len(ids) == 0 else ''.join(ids)

    # get link
    links = node.xpath(args.entry_link)
    link = '' if len(links) == 0 else ''.join(links)

    # get title
    titles = node.xpath(args.entry_title)
    title = '' if len(titles) == 0 else ''.join(titles)

    # create entry
    # BUGFIX: feedgenerator formats pubdate with strftime, so a raw string
    # date (--entry-date without --date-format) would crash; pass pubdate
    # only when it was actually parsed into a datetime
    feed.add_item(
             author_name = author if args.entry_author else None,
             description = description if args.entry_description else title,
             enclosure = feedgenerator.Enclosure(enclosure, enclosure_size, enclosure_type) if args.entry_enclosure else None,
             link = link,
             pubdate = date if args.entry_date and isinstance(date, datetime.datetime) else None,
             title = title,
             unique_id = idposte if args.entry_id else link
             )

# output feed
#print feed.writeString('utf-8')

# python 3 print function
#print(feed.writeString('utf-8'))

# output feed
# serialize the assembled feed as UTF-8 XML to stdout and flush explicitly
# so the consumer (e.g. Liferea's conversion filter) receives it even when
# stdout is block-buffered
sys.stdout.write(feed.writeString('utf-8'))
sys.stdout.flush()

Test with:

python html2atom.py --url 'http://www.slackware.com/' --title 'The Slackware Linux Project' --description 'News' --subtitle '' --language 'en' --root '//center[not(parent::body)]/table' --entry-title './tr[1]//b/text()' --entry-link '@href' --entry-description './tr[2]/td[1]//text()' --entry-date 'normalize-space(./tr[2]/td[2]//b/text())' --date-format '%Y-%m-%d'

I'd rather have a built-in dialog which provides fields for title, link, description, and date, in both XPath and CSS Selectors.

sjehuda avatar Jul 31 '22 16:07 sjehuda

FYI I've linked html2atom here https://github.com/lwindolf/rss-scraping

lwindolf avatar Sep 15 '22 23:09 lwindolf

Oh, no. That version is not up to date. I'm using an improved version.

The author gave me two answers for licensing and I was too shy to ask him about it again.

Below is the current version of html2atom.

sjehuda avatar Nov 25 '22 12:11 sjehuda

I think this should work with Python 3. I'll fix it, if it doesn't. Thank you, Lars.

#!/usr/bin/env python2
# coding: utf-8

from lxml import etree
import argparse
import datetime
import feedgenerator
#import requests
import os
from sys import stdin, stdout
import sys

# command line arguments
parser = argparse.ArgumentParser(
    description='Scrap HTML pages into Atom Syndication Format '
                'feeds (RFC 4287)')

# BUGFIX: the option strings were previously passed as a single argument
# ('-v, --version'), which registers ONE unusable option literally named
# "-v, --version"; argparse requires each option string as a separate
# positional argument to add_argument()
parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.1')

# debugging
parser.add_argument('--debug', metavar='TEXT',
                    dest='debug', help='Print raw data to <!-- DATA -->)')

# content metadata
parser.add_argument('-c', '--charset', metavar='TEXT',
                    dest='charset', help='Specify data charset (Use this '
                    'option only when a given page has an exotic charset. '
                    'Default is UTF-8)')


# feed properties
parser.add_argument('-D', '--description', metavar='TEXT',
                    dest='feed_description', help='Set feed description')
parser.add_argument('-L', '--language', metavar='xx-XX',
                    dest='feed_language_code', help='Set feed language code. '
                    'Default language code is "en"')
parser.add_argument('-R', '--rights', metavar='TEXT',
                    dest='feed_rights', help='Set feed rights')
parser.add_argument('-S', '--subtitle', metavar='TEXT',
                    dest='feed_subtitle', help='Set feed subtitle. '
                    'If not specified, %(prog)s would attempt to fetch '
                    'subtitle from given URL with an XPath expression '
                    '//html/head//meta[@name="description"]/@content')
parser.add_argument('-T', '--title', metavar='TEXT',
                    dest='feed_title', help='Set feed title. '
                    'If not specified, %(prog)s would attempt to fetch '
                    'title from given URL with an XPath expression  '
                    '//html/head//title/text()')
parser.add_argument('-U', '--url', metavar='URL',
                    dest='feed_url', help='Set feed link. '
                    'If not specified, %(prog)s would use the given '
                    'URL as feed link')

# start points: XPath selecting one node per feed entry; all entry
# expressions below are evaluated relative to each of these nodes
parser.add_argument('-r', '--root', metavar='XPATH',
                    dest='base_level', help='Path to node base level from '
                    'each of which entry properties will be retrieved.',
                    required=True)

# entry characterization
# TODO all entry arguments should be array/collection/list
# for joining  several strings to one
parser.add_argument('-a', '--entry-author', metavar='XPATH',
                    dest='entry_author', help='Path to entry author')
parser.add_argument('-e', '--entry-email', metavar='XPATH',
                    dest='entry_email', help='Path to entry author Email')
# TODO use array/list instead of 2 date arguments [XPATH, DATE]
parser.add_argument('-d', '--entry-date', metavar='XPATH',
                    dest='entry_date', help='Path to entry title',
                    required=False)
parser.add_argument('-f', '--date-format', metavar='DATE',
                    dest='date_format', help='Date format of given URL '
                    '(e.g. if date is "YYYY-MM-DD", input would '
                    'correspondingly be "Y-m-d".  See "man date" for more '
                    'information). Must be utilized along with --entry-date',
                    required=False)
# TODO use array/list instead of 3 enclosure arguments [XPATH, TEXT, XPATH]
parser.add_argument('-E', '--entry-enclosure', metavar='XPATH',
                    dest='entry_enclosure', help='Path to entry enclosure',
                    required=False)
parser.add_argument('-m', '--enclosure-mimetype', metavar='MIME-Type',
                    dest='enclosure_type', help='Set enclosure media type. '
                    '(e.g. video/x-matroska, audio/x-flac, audio/ogg, '
                    'audio/opus, audio/speex etc.) '
                    'Must be utilized along with --entry-enclosure',
                    required=False)
parser.add_argument('-w', '--enclosure-size', metavar='XPATH',
                    dest='enclosure_size', help='Path to enclosure size. '
                    'Must be utilized along with --entry-enclosure',
                    required=False)
parser.add_argument('-s', '--entry-description', metavar='XPATH',
                    dest='entry_description', help='Path to entry description',
                    required=False)
parser.add_argument('-i', '--entry-id', metavar='XPATH',
                    dest='entry_id', help='Path to entry id',
                    required=False)
parser.add_argument('-l', '--entry-link', metavar='XPATH',
                    dest='entry_link', help='Path to entry link',
                    required=True)
parser.add_argument('-t', '--entry-title', metavar='XPATH',
                    dest='entry_title', help='Path to entry title',
                    required=True)

args = parser.parse_args()
#if args.verbose:
#    print "verbosity turned on"


# fetch html page
#response = requests.get(args.feed_url)
#data = response.text

# read html page
#reload(sys).setdefaultencoding('utf-8')
#data = unicode(stdin.read())

# you may require an encoding change
# here latin1 -> unicode
#data = unicode(data, 'utf-8')

# get nodes
#tree = etree.HTML(data)
# the HTML document is piped in on stdin; lxml's HTML parser is lenient,
# so malformed real-world markup still yields an element tree
tree = etree.HTML(stdin.read())
# select the per-entry root nodes with the required --root XPath; every
# entry property below is evaluated relative to one of these nodes
nodes = tree.xpath(args.base_level)

# feed description: prefer the explicit --description flag; otherwise fall
# back to the page's <meta name="description"> content (empty if absent)
if args.feed_description:
    description = args.feed_description
else:
    # retrieve page description
    descriptions = tree.xpath('//html/head//meta[@name="description"]/@content')
    description = '' if len(descriptions) == 0 else descriptions[0]

# feed title: prefer the explicit --title flag; otherwise the page <title>
if args.feed_title:
    title = args.feed_title
else:
    # retrieve page title
    titles = tree.xpath('//html/head//title/text()')
    title = '' if len(titles) == 0 else titles[0]

# feed subtitle
# BUGFIX: honor -S/--subtitle when given -- the option was parsed (and
# documented in its help text) but its value was never used; only when it
# is absent/empty build the scrape-status subtitle as before
if args.feed_subtitle:
    subtitle = args.feed_subtitle
elif len(nodes) > 0:
    subtitle = 'Syndication feed created by html2atom \n\n html2atom has scraped {} headlines \n\n Last update: {}'.format(len(nodes), datetime.datetime.now())
else:
    subtitle = 'FEED APPEARS TO BE EMPTY \n\n Date: {}'.format(datetime.datetime.now())

# create a feed
# build the Atom 1.0 feed shell (RFC 4287); entries are appended in the
# loop below and the feed is serialized at the end of the script.
# NOTE(review): link may be None when --url is omitted -- confirm that
# consumers accept the resulting feed without a <link> value
feed = feedgenerator.Atom1Feed(
       description = description,
       feed_copyright = args.feed_rights,
       language = args.feed_language_code,
       link = args.feed_url,
       #subtitle = '{} has scanned {} headlines'.format(sys.argv[0], len(nodes)),
       #subtitle = '{} headlines were scrapped by {}'.format(len(nodes), sys.argv[0]),
       #subtitle = 'Syndication feed created by html2atom\n\n'
       #'html2atom has scraped {} headlines\n\n'
       #'Last update: {}'.format(len(nodes), datetime.datetime.now()),
       subtitle = subtitle,
       title = title
       )

# for each instance of given node, extract the entry fields (all XPath
# expressions are evaluated relative to the node) and append an entry
for node in nodes:
    # get author -- only when --entry-author was given; the same guard is
    # applied below when the entry is emitted, so no stale value is read
    if args.entry_author:
        authors = node.xpath(args.entry_author)
        author = '' if len(authors) == 0 else authors[0]

    # get date
    if args.entry_date:
        dates = node.xpath(args.entry_date)
        # join the whole node-set rather than taking dates[0] so XPath
        # string functions (normalize-space, substring-before/after, ...)
        # remain usable; strptime requires a string, not a list
        date = '' if len(dates) == 0 else ''.join(dates)
        # adopt given date format
        if args.date_format:
            try:
                date = datetime.datetime.strptime(date, args.date_format)
            except ValueError:
                # unparsable date: fall back to "now" instead of dropping
                # the entry
                date = datetime.datetime.now()

    # get description
    if args.entry_description:
        descriptions = node.xpath(args.entry_description)
        # TODO use \n with plaintext summary
        description = '' if len(descriptions) == 0 else '<br>'.join(descriptions)

    enclosure_type = ''
    enclosure_size = ''
    # get enclosure
    if args.entry_enclosure:
        enclosures = node.xpath(args.entry_enclosure)
        enclosure = '' if len(enclosures) == 0 else enclosures[0]
        if args.enclosure_type:
            enclosure_type = args.enclosure_type
        if args.enclosure_size:
            sizes = node.xpath(args.enclosure_size)
            # BUGFIX: the result was previously stored in an unused local
            # ("size"), so --enclosure-size never took effect
            enclosure_size = '' if len(sizes) == 0 else sizes[0]
    else:
        enclosure = ''

    # get identifier
    if args.entry_id:
        ids = node.xpath(args.entry_id)
        idposte = '' if len(ids) == 0 else ''.join(ids)

    # get link
    links = node.xpath(args.entry_link)
    link = '' if len(links) == 0 else ''.join(links)

    # get title
    titles = node.xpath(args.entry_title)
    title = '' if len(titles) == 0 else ''.join(titles)

    # create entry
    # BUGFIX: feedgenerator formats pubdate with strftime, so a raw string
    # date (--entry-date without --date-format) would crash; pass pubdate
    # only when it was actually parsed into a datetime
    feed.add_item(
             author_name = author if args.entry_author else None,
             description = description if args.entry_description else title,
             enclosure = feedgenerator.Enclosure(enclosure, enclosure_size, enclosure_type) if args.entry_enclosure else None,
             link = link,
             pubdate = date if args.entry_date and isinstance(date, datetime.datetime) else None,
             title = title,
             unique_id = idposte if args.entry_id else link
             )

# output feed
#print feed.writeString('utf-8')

# python 3 print function
#print(feed.writeString('utf-8'))

# output feed
# serialize the assembled feed as UTF-8 XML to stdout and flush explicitly
# so the consumer (e.g. Liferea's conversion filter) receives it even when
# stdout is block-buffered
sys.stdout.write(feed.writeString('utf-8'))
sys.stdout.flush()

sjehuda avatar Nov 25 '22 12:11 sjehuda