internal-displacement
Deal with scraping error
Trace
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _get_chunk_left(self)
545 try:
--> 546 chunk_left = self._read_next_chunk_size()
547 except ValueError:
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_next_chunk_size(self)
512 try:
--> 513 return int(line, 16)
514 except ValueError:
ValueError: invalid literal for int() with base 16: b''
During handling of the above exception, another exception occurred:
IncompleteRead Traceback (most recent call last)
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _readall_chunked(self)
562 while True:
--> 563 chunk_left = self._get_chunk_left()
564 if chunk_left is None:
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _get_chunk_left(self)
547 except ValueError:
--> 548 raise IncompleteRead(b'')
549 if chunk_left == 0:
IncompleteRead: IncompleteRead(0 bytes read)
During handling of the above exception, another exception occurred:
IncompleteRead Traceback (most recent call last)
<ipython-input-62-4f8063e7f514> in <module>()
4 try:
5 article = pipeline.create_article(url)
----> 6 pipeline.fetch_article(article)
7 except exc.IntegrityError:
8 session.rollback()
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/pipeline.py in fetch_article(self, article)
196 '''
197 content, publish_date, title, content_type, authors, domain = self.scraper.scrape(
--> 198 article.url)
199 if content == 'retrieval_failed':
200 article.update_status(Status.FETCHING_FAILED)
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in scrape(self, url, scrape_pdfs)
189
190 """
--> 191 pdf_check = is_pdf_consolidated_test(url)
192 if pdf_check and scrape_pdfs:
193 article = self.pdf_article(pdf_check)
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_consolidated_test(url)
63
64 # Carry out additional test based by looking for iframe
---> 65 pdf_attempt_2 = is_pdf_iframe_test(url)
66 if pdf_attempt_2:
67 return pdf_attempt_2
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_iframe_test(url)
38 try:
39 page = request.urlopen(url)
---> 40 soup = BeautifulSoup(page, "html.parser")
41 iframes = soup.find_all('iframe')
42 if len(iframes) > 0:
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
189
190 if hasattr(markup, 'read'): # It's a file-type object.
--> 191 markup = markup.read()
192 elif len(markup) <= 256 and (
193 (isinstance(markup, bytes) and not b'<' in markup)
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in read(self, amt)
454
455 if self.chunked:
--> 456 return self._readall_chunked()
457
458 if self.length is None:
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _readall_chunked(self)
568 return b''.join(value)
569 except IncompleteRead:
--> 570 raise IncompleteRead(b''.join(value))
571
572 def _readinto_chunked(self, b):
IncompleteRead: IncompleteRead(25405 bytes read)
What was the url that you were trying to scrape?
That's a good question...
I came across a similar issue when I tried to scrape https://blogs.savethechildren.org.uk/category/theme/education. I was following the tutorial available here.
Help please
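A possible workaround, just a sketch rather than the project's actual fix: the IncompleteRead happens because the server ends the chunked response early while BeautifulSoup reads the urlopen file object. Catching http.client.IncompleteRead and parsing the partial bytes usually lets the check continue. The snippet below only reconstructs the fragment of is_pdf_iframe_test visible in the traceback; the rest of the iframe logic and the real exception handling in scraper.py are elided and assumed.

import http.client
from urllib import request
from bs4 import BeautifulSoup

def is_pdf_iframe_test(url):
    # Sketch based on the lines shown in the traceback; the original
    # function's remaining iframe-inspection logic is not reproduced here.
    try:
        page = request.urlopen(url)
        try:
            markup = page.read()
        except http.client.IncompleteRead as e:
            # The chunked response was cut off; e.partial holds the bytes
            # that did arrive, which is often enough HTML to parse.
            markup = e.partial
        soup = BeautifulSoup(markup, "html.parser")
        iframes = soup.find_all('iframe')
        # Simplified return; the real function does more with the iframes.
        return len(iframes) > 0
    except Exception:
        # Assumed fallback: treat unreachable pages as "not a PDF iframe".
        return False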
Error trace
TypeError Traceback (most recent call last)
<ipython-input-319-7bc9785125d5> in <module>()
4
5 # pre-process text and store the same
----> 6 news_df['clean_text'] = normalize_corpus(news_df['full_text'])
7 norm_corpus = list(news_df['clean_text'])
8
<ipython-input-318-996e5fdc3099> in normalize_corpus(corpus)
5 for doc in corpus:
6 # strip HTML
----> 7 doc = strip_html_tags(doc)
8
9
<ipython-input-272-891cde963c70> in strip_html_tags(text)
1 def strip_html_tags(text):
----> 2 soup = BeautifulSoup(text, "html.parser")
3 stripped_text = soup.get_text()
4 return stripped_text
5
/home/baiju/DSS_DATA/pyenv/local/lib/python2.7/site-packages/bs4/__init__.pyc in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
244 if hasattr(markup, 'read'): # It's a file-type object.
245 markup = markup.read()
--> 246 elif len(markup) <= 256 and (
247 (isinstance(markup, bytes) and not b'<' in markup)
248 or (isinstance(markup, unicode) and not u'<' in markup)
TypeError: object of type 'float' has no len()
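This TypeError usually means the DataFrame column contains missing values: NaN rows come through as floats, and BeautifulSoup calls len() on the markup it is given. A minimal guard, assuming that is the cause here (it is not the tutorial's own code), is to skip non-string rows or fill them before normalizing:

from bs4 import BeautifulSoup

def strip_html_tags(text):
    # NaN rows arrive as float('nan'), which len() cannot handle.
    # Return an empty string for anything that is not text.
    # (Under Python 2, check basestring instead of str.)
    if not isinstance(text, str):
        return ''
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Alternatively, clean the column before calling normalize_corpus:
# news_df['full_text'] = news_df['full_text'].fillna('')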