internal-displacement
internal-displacement copied to clipboard
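Rare case of a site returning a malformed 404 status line (`HTTP/1.1 404: Not Found`, with a stray colon after the status code), which makes `request.urlopen` raise `BadStatusLine` instead of a normal HTTP 404 error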
Traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_status(self)
282 try:
--> 283 status = int(status)
284 if status < 100 or status > 999:
ValueError: invalid literal for int() with base 10: '404:'
During handling of the above exception, another exception occurred:
BadStatusLine Traceback (most recent call last)
<ipython-input-69-4f8063e7f514> in <module>()
4 try:
5 article = pipeline.create_article(url)
----> 6 pipeline.fetch_article(article)
7 except exc.IntegrityError:
8 session.rollback()
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/pipeline.py in fetch_article(self, article)
196 '''
197 content, publish_date, title, content_type, authors, domain = self.scraper.scrape(
--> 198 article.url)
199 if content == 'retrieval_failed':
200 article.update_status(Status.FETCHING_FAILED)
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in scrape(self, url, scrape_pdfs)
189
190 """
--> 191 pdf_check = is_pdf_consolidated_test(url)
192 if pdf_check and scrape_pdfs:
193 article = self.pdf_article(pdf_check)
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_consolidated_test(url)
58
59 # Carry out simple tests based upon url and content type
---> 60 pdf_attempt_1 = is_pdf_simple_tests(url)
61 if pdf_attempt_1:
62 return pdf_attempt_1
/Users/George/projects/d4d/internal-displacement-gr/internal-displacement/internal_displacement/scraper.py in is_pdf_simple_tests(url)
24 # Test based on headers
25 try:
---> 26 page = request.urlopen(url)
27 content_type = page.getheader('Content-Type')
28 if content_type == 'application/pdf':
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
524 req = meth(req)
525
--> 526 response = self._open(req, data)
527
528 # post-process response
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in _open(self, req, data)
542 protocol = req.type
543 result = self._call_chain(self.handle_open, protocol, protocol +
--> 544 '_open', req)
545 if result:
546 return result
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in http_open(self, req)
1344
1345 def http_open(self, req):
-> 1346 return self.do_open(http.client.HTTPConnection, req)
1347
1348 http_request = AbstractHTTPHandler.do_request_
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1319 except OSError as err: # timeout error
1320 raise URLError(err)
-> 1321 r = h.getresponse()
1322 except:
1323 h.close()
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in getresponse(self)
1329 try:
1330 try:
-> 1331 response.begin()
1332 except ConnectionError:
1333 self.close()
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in begin(self)
295 # read until we get a non-100 response
296 while True:
--> 297 version, status, reason = self._read_status()
298 if status != CONTINUE:
299 break
/Users/George/miniconda3/envs/d4d-internal-displacement/lib/python3.6/http/client.py in _read_status(self)
285 raise BadStatusLine(line)
286 except ValueError:
--> 287 raise BadStatusLine(line)
288 return version, status, reason
289
BadStatusLine: HTTP/1.1 404: Not Found