newspaper
newspaper copied to clipboard
Skip unparsable urls
I run into errors with some image URLs when using news-please:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/newsplease/crawler/commoncrawl_extractor.py", line 259, in __process_warc_gz_file
filter_pass, article = self.filter_record(record)
File "/opt/coviddash/ingress/covidmarch.py", line 24, in filter_record
passed_filters, article = super().filter_record(warc_record, article)
File "/usr/local/lib/python3.7/dist-packages/newsplease/crawler/commoncrawl_extractor.py", line 123, in filter_record
article = self._from_warc(warc_record)
File "/usr/local/lib/python3.7/dist-packages/newsplease/crawler/commoncrawl_extractor.py", line 235, in _from_warc
return NewsPlease.from_warc(record, decode_errors="replace" if self.__ignore_unicode_errors else "strict", fetch_images=self.__fetch_images)
File "/usr/local/lib/python3.7/dist-packages/newsplease/__init__.py", line 55, in from_warc
article = NewsPlease.from_html(html, url=url, download_date=download_date, fetch_images=fetch_images)
File "/usr/local/lib/python3.7/dist-packages/newsplease/__init__.py", line 95, in from_html
item = extractor.extract(item)
File "/usr/local/lib/python3.7/dist-packages/newsplease/pipeline/extractor/article_extractor.py", line 63, in extract
article_candidate = extractor.extract(item)
File "/usr/local/lib/python3.7/dist-packages/newsplease/pipeline/extractor/extractors/newspaper_extractor.py", line 33, in extract
article.parse()
File "/usr/local/lib/python3.7/dist-packages/newspaper/article.py", line 261, in parse
self.fetch_images()
File "/usr/local/lib/python3.7/dist-packages/newspaper/article.py", line 272, in fetch_images
imgs = self.extractor.get_img_urls(self.url, self.clean_doc)
File "/usr/local/lib/python3.7/dist-packages/newspaper/extractors.py", line 570, in get_img_urls
for url in urls])
File "/usr/local/lib/python3.7/dist-packages/newspaper/extractors.py", line 570, in <listcomp>
for url in urls])
File "/usr/lib/python3.7/urllib/parse.py", line 511, in urljoin
urlparse(url, bscheme, allow_fragments)
File "/usr/lib/python3.7/urllib/parse.py", line 368, in urlparse
splitresult = urlsplit(url, scheme, allow_fragments)
File "/usr/lib/python3.7/urllib/parse.py", line 435, in urlsplit
raise ValueError("Invalid IPv6 URL")
ValueError: Invalid IPv6 URL
ERROR:newsplease.crawler.commoncrawl_extractor:Unexpected error: <class 'ValueError'> (Invalid IPv6 URL)