frontera
frontera copied to clipboard
How to make scrapy-splash work with frontera?
I have added the following lines to my spider.
def start_requests(self):
    """Yield the initial requests, wrapping each start URL in a SplashRequest.

    One ``SplashRequest`` is produced per entry in ``self.start_urls``,
    with ``self.parse`` as its callback.
    """
    for start_url in self.start_urls:
        # optional; parameters passed to Splash HTTP API
        # 'url' is prefilled from request url
        # 'http_method' is set to 'POST' for POST requests
        # 'body' is set to request body for POST requests
        # A new dict is built for every request so no two requests share it.
        splash_args = {'wait': 0.5}
        yield SplashRequest(
            start_url,
            self.parse,
            args=splash_args,
            endpoint='render.json',  # optional; default is render.html
            # splash_url='<url>',  # optional; overrides SPLASH_URL
            slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN,  # optional
        )
def make_requests_from_url(self, url):
    """Build a SplashRequest for ``url`` with ``self.parse`` as callback.

    This method is deprecated.
    """
    # optional; parameters passed to Splash HTTP API
    # 'url' is prefilled from request url
    # 'http_method' is set to 'POST' for POST requests
    # 'body' is set to request body for POST requests
    splash_args = {'wait': 0.5}
    splash_request = SplashRequest(
        url,
        self.parse,
        args=splash_args,
        endpoint='render.json',  # optional; default is render.html
        # splash_url='<url>',  # optional; overrides SPLASH_URL
    )
    return splash_request
From my debugging, these lines are only executed when the seed is run. Frontera creates the requests and responses outside of the spider, specifically in frontera/contrib/requests/converters.py, where I found the following lines (also mentioned in #300).
def from_frontier(self, frontier_request):
    """request: Frontier > Scrapy

    Build a ScrapyRequest from ``frontier_request``. The callback and
    errback stored in the frontier meta are passed through ``_get_method``
    when a spider is attached, and the original frontier request is stored
    in the Scrapy meta under ``b'frontier_request'``.
    """
    def _resolve(meta_key):
        # Read the stored handler; map it through _get_method only when
        # both a handler value and a spider are present.
        handler = frontier_request.meta.get(meta_key, None)
        if handler and self.spider:
            handler = _get_method(self.spider, handler)
        return handler

    callback = _resolve(b'scrapy_callback')
    errback = _resolve(b'scrapy_errback')
    payload = frontier_request.body

    # When b'scrapy_meta' exists, this is the very dict held by the
    # frontier request, so the assignment below mutates it in place.
    scrapy_meta = frontier_request.meta.get(b'scrapy_meta', {})
    scrapy_meta[b'frontier_request'] = frontier_request

    request_kwargs = dict(
        url=frontier_request.url,
        callback=callback,
        errback=errback,
        body=payload,
        method=to_native_str(frontier_request.method),
        headers=frontier_request.headers,
        cookies=frontier_request.cookies,
        meta=scrapy_meta,
        dont_filter=True,
    )
    return ScrapyRequest(**request_kwargs)
def from_frontier(self, response):
    """response: Frontier > Scrapy

    Build a ScrapyResponse from ``response``; the attached request is
    converted with ``self._request_converter.from_frontier``.
    """
    # Keyword values are evaluated top to bottom, same order as a direct call.
    response_kwargs = dict(
        url=response.url,
        status=response.status_code,
        headers=response.headers,
        body=response.body,
        request=self._request_converter.from_frontier(response.request),
    )
    return ScrapyResponse(**response_kwargs)
I've edited those two functions in virtualenv/lib to see if I could change the functionality, simply swapping in SplashRequest and SplashResponse, but the problem is that the parse method in the spider is no longer called. Am I doing this wrong? I want to make sure my crawler is efficient, so I'd like to avoid downloading pages more than once if I can.
OK, I've noticed that the log on my computer now shows what looks like a timeout error.
2018-07-01 11:46:28 [manager] DEBUG: PAGE_REQUEST_ERROR url=http://127.0.0.1:8050/render.html error=TimeoutError
2018-07-01 11:46:28 [manager.components] DEBUG: processing 'request_error' 'Middleware.UrlFingerprintMiddleware' <Request at 0x110da5710 http://127.0.0.1:8050/render.html meta={b'scrapy_callback': None, b'scrapy_errback': None, b'scrapy_meta': {'depth': 6, 'splash': {'endpoint': 'render.html', 'slot_policy': 'per_domain', 'magic_response': True, 'http_status_from_error_code': True, 'args': {'url': 'https://www.amazon.com/b/ref=s9_acss_bw_cg_LH18AES_1a1_w?node=17728828011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-12&pf_rd_r=H90K0CDC94PWSA576F2W&pf_rd_t=101&pf_rd_p=39785ce6-6f12-4c9e-8259-0566bbb7447e&pf_rd_i=17728536011', 'wait': 0.5, 'headers': {'Referer': 'https://www.amazon.com/b/ref=nav_shopall_SWMTVT18?_encoding=UTF8&node=17728536011&pf_rd_p=bd34c6e7-aef4-4799-853a-d3b52e17360a&pf_rd_s=nav-sa-toys-kids-baby&pf_rd_t=4201&pf_rd_i=navbar-4201&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=CSYHPA14129HPJS57ZBA', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1'}}}, 'ajax_crawlable': True, 'proxy': 'https://83.149.70.159:13012', 'download_timeout': 180.0, '_splash_processed': True, 'download_slot': 'www.amazon.com'}, b'origin_is_frontier': True, b'fingerprint': b'de4c43938b44c012e828a5bd3d0f0d856a24c9b7', b'domain': {b'netloc': b'www.amazon.com', b'name': b'www.amazon.com', b'scheme': b'https', b'sld': b'', b'tld': b'', b'subdomain': b'', b'fingerprint': b'47cf2c8c9947e99f6ecfee54a51bd0225f376ebc'}, b'state': 1, b'score': 0.15625, b'jid': 0} body=b'{\n "headers": {\n '... cookies={}, headers={b'Content-Type': [b'application/json'], b'Accept': [b'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'], b'Accept-Language': [b'en'], b'User-Agent': [b'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}>
2018-07-01 11:46:28 [manager.components] DEBUG: processing 'request_error' 'Middleware.DomainMiddleware' <Request at 0x110da5710 http://127.0.0.1:8050/render.html meta={b'scrapy_callback': None, b'scrapy_errback': None, b'scrapy_meta': {'depth': 6, 'splash': {'endpoint': 'render.html', 'slot_policy': 'per_domain', 'magic_response': True, 'http_status_from_error_code': True, 'args': {'url': 'https://www.amazon.com/b/ref=s9_acss_bw_cg_LH18AES_1a1_w?node=17728828011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-12&pf_rd_r=H90K0CDC94PWSA576F2W&pf_rd_t=101&pf_rd_p=39785ce6-6f12-4c9e-8259-0566bbb7447e&pf_rd_i=17728536011', 'wait': 0.5, 'headers': {'Referer': 'https://www.amazon.com/b/ref=nav_shopall_SWMTVT18?_encoding=UTF8&node=17728536011&pf_rd_p=bd34c6e7-aef4-4799-853a-d3b52e17360a&pf_rd_s=nav-sa-toys-kids-baby&pf_rd_t=4201&pf_rd_i=navbar-4201&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=CSYHPA14129HPJS57ZBA', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1'}}}, 'ajax_crawlable': True, 'proxy': 'https://83.149.70.159:13012', 'download_timeout': 180.0, '_splash_processed': True, 'download_slot': 'www.amazon.com'}, b'origin_is_frontier': True, b'fingerprint': b'd7b7e7a88bba0c4966f742b12dfb5dede8353f80', b'domain': {b'netloc': b'www.amazon.com', b'name': b'www.amazon.com', b'scheme': b'https', b'sld': b'', b'tld': b'', b'subdomain': b'', b'fingerprint': b'47cf2c8c9947e99f6ecfee54a51bd0225f376ebc'}, b'state': 1, b'score': 0.15625, b'jid': 0} body=b'{\n "headers": {\n '... cookies={}, headers={b'Content-Type': [b'application/json'], b'Accept': [b'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'], b'Accept-Language': [b'en'], b'User-Agent': [b'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}>
2018-07-01 11:46:28 [manager.components] DEBUG: processing 'request_error' 'Middleware.DomainFingerprintMiddleware' <Request at 0x110da5710 http://127.0.0.1:8050/render.html meta={b'scrapy_callback': None, b'scrapy_errback': None, b'scrapy_meta': {'depth': 6, 'splash': {'endpoint': 'render.html', 'slot_policy': 'per_domain', 'magic_response': True, 'http_status_from_error_code': True, 'args': {'url': 'https://www.amazon.com/b/ref=s9_acss_bw_cg_LH18AES_1a1_w?node=17728828011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-12&pf_rd_r=H90K0CDC94PWSA576F2W&pf_rd_t=101&pf_rd_p=39785ce6-6f12-4c9e-8259-0566bbb7447e&pf_rd_i=17728536011', 'wait': 0.5, 'headers': {'Referer': 'https://www.amazon.com/b/ref=nav_shopall_SWMTVT18?_encoding=UTF8&node=17728536011&pf_rd_p=bd34c6e7-aef4-4799-853a-d3b52e17360a&pf_rd_s=nav-sa-toys-kids-baby&pf_rd_t=4201&pf_rd_i=navbar-4201&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=CSYHPA14129HPJS57ZBA', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1'}}}, 'ajax_crawlable': True, 'proxy': 'https://83.149.70.159:13012', 'download_timeout': 180.0, '_splash_processed': True, 'download_slot': 'www.amazon.com'}, b'origin_is_frontier': True, b'fingerprint': b'd7b7e7a88bba0c4966f742b12dfb5dede8353f80', b'domain': {b'netloc': b'127.0.0.1:8050', b'name': b'127.0.0.1', b'scheme': b'http', b'sld': b'', b'tld': b'', b'subdomain': b''}, b'state': 1, b'score': 0.15625, b'jid': 0} body=b'{\n "headers": {\n '... cookies={}, headers={b'Content-Type': [b'application/json'], b'Accept': [b'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'], b'Accept-Language': [b'en'], b'User-Agent': [b'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}>
2018-07-01 11:46:28 [manager.components] DEBUG: processing 'request_error' 'CanonicalSolver.BasicCanonicalSolver' <Request at 0x110da5710 http://127.0.0.1:8050/render.html meta={b'scrapy_callback': None, b'scrapy_errback': None, b'scrapy_meta': {'depth': 6, 'splash': {'endpoint': 'render.html', 'slot_policy': 'per_domain', 'magic_response': True, 'http_status_from_error_code': True, 'args': {'url': 'https://www.amazon.com/b/ref=s9_acss_bw_cg_LH18AES_1a1_w?node=17728828011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-12&pf_rd_r=H90K0CDC94PWSA576F2W&pf_rd_t=101&pf_rd_p=39785ce6-6f12-4c9e-8259-0566bbb7447e&pf_rd_i=17728536011', 'wait': 0.5, 'headers': {'Referer': 'https://www.amazon.com/b/ref=nav_shopall_SWMTVT18?_encoding=UTF8&node=17728536011&pf_rd_p=bd34c6e7-aef4-4799-853a-d3b52e17360a&pf_rd_s=nav-sa-toys-kids-baby&pf_rd_t=4201&pf_rd_i=navbar-4201&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=CSYHPA14129HPJS57ZBA', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1'}}}, 'ajax_crawlable': True, 'proxy': 'https://83.149.70.159:13012', 'download_timeout': 180.0, '_splash_processed': True, 'download_slot': 'www.amazon.com'}, b'origin_is_frontier': True, b'fingerprint': b'd7b7e7a88bba0c4966f742b12dfb5dede8353f80', b'domain': {b'netloc': b'127.0.0.1:8050', b'name': b'127.0.0.1', b'scheme': b'http', b'sld': b'', b'tld': b'', b'subdomain': b'', b'fingerprint': b'4b84b15bff6ee5796152495a230e45e3d7e947d9'}, b'state': 1, b'score': 0.15625, b'jid': 0} body=b'{\n "headers": {\n '... cookies={}, headers={b'Content-Type': [b'application/json'], b'Accept': [b'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'], b'Accept-Language': [b'en'], b'User-Agent': [b'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}>
2018-07-01 11:46:28 [manager.components] DEBUG: processing 'request_error' 'Backend.MessageBusBackend' <Request at 0x110da5710 http://127.0.0.1:8050/render.html meta={b'scrapy_callback': None, b'scrapy_errback': None, b'scrapy_meta': {'depth': 6, 'splash': {'endpoint': 'render.html', 'slot_policy': 'per_domain', 'magic_response': True, 'http_status_from_error_code': True, 'args': {'url': 'https://www.amazon.com/b/ref=s9_acss_bw_cg_LH18AES_1a1_w?node=17728828011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-12&pf_rd_r=H90K0CDC94PWSA576F2W&pf_rd_t=101&pf_rd_p=39785ce6-6f12-4c9e-8259-0566bbb7447e&pf_rd_i=17728536011', 'wait': 0.5, 'headers': {'Referer': 'https://www.amazon.com/b/ref=nav_shopall_SWMTVT18?_encoding=UTF8&node=17728536011&pf_rd_p=bd34c6e7-aef4-4799-853a-d3b52e17360a&pf_rd_s=nav-sa-toys-kids-baby&pf_rd_t=4201&pf_rd_i=navbar-4201&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=CSYHPA14129HPJS57ZBA', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1'}}}, 'ajax_crawlable': True, 'proxy': 'https://83.149.70.159:13012', 'download_timeout': 180.0, '_splash_processed': True, 'download_slot': 'www.amazon.com'}, b'origin_is_frontier': True, b'fingerprint': b'd7b7e7a88bba0c4966f742b12dfb5dede8353f80', b'domain': {b'netloc': b'127.0.0.1:8050', b'name': b'127.0.0.1', b'scheme': b'http', b'sld': b'', b'tld': b'', b'subdomain': b'', b'fingerprint': b'4b84b15bff6ee5796152495a230e45e3d7e947d9'}, b'state': 1, b'score': 0.15625, b'jid': 0} body=b'{\n "headers": {\n '... cookies={}, headers={b'Content-Type': [b'application/json'], b'Accept': [b'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'], b'Accept-Language': [b'en'], b'User-Agent': [b'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}>
Hi @DiscipleOfOne, the right approach is to follow this guide, https://github.com/scrapy-plugins/scrapy-splash#configuration , and use a plain scrapy.Request with the `splash` meta key.