scrapy-splash
scrapy-splash copied to clipboard
scrapy splash not rendering continous requests.
I created a spider with scrapy_splash, I hardcoded 3 urls in start_requests. When I run with any one url it is working fine for all the urls. when I put all the urls in a list and run one by one, it is not working, and splash not returning complete rendered html in response.body.
kindly help.
code:
import time
import json
import scrapy
import w3lib
from scrapy_splash import SplashRequest
class SpeSpider(scrapy.Spider):
name = 'spe'
# allowed_domains = ['s']
# start_urls = ['http://s/']
without_wait_script = """
function main(splash, args)
splash.private_mode_enabled = false
assert(splash:go(args.url))
assert(splash:wait(2))
return {
html = splash:html(),
}
end
"""
wait_script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(10))
return {
html = splash:html(),
}
end
"""
splash_headers = {
'authority': 'www.avivainvestors.com',
'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'referer': 'https://www.avivainvestors.com/fr-fr/nos-expertises/nos-fonds/',
'accept-language': 'en-US,en;q=0.9,lb;q=0.8',
}
def start_requests(self):
url1="https://www.avivainvestors.com/fr-fr/nos-expertises/equities/uk-listed-equity-high-alpha-fund/lu0160960752-gbp/"
url2 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/japon-isr/fr0013340841-eur/"
url3 = "https://www.avivainvestors.com/fr-fr/nos-expertises/fixed-income/emerging-markets-corporate-bond-fund/lu1550133976-usd/"
urls = [url1, url2, url3]
for url in urls:
time.sleep(10)
yield SplashRequest(
url=url,
endpoint="execute",
callback=self.scrape_document_id,
args={"lua_source":self.wait_script},
splash_headers= self.splash_headers
)
def scrape_document_id(self, response):
value = response.xpath('//div[@class="ec-table__cell-content ng-binding ng-scope" and text() = "Rapport annuel"]/../..//td/ec-button/@mstar-component-id').get()
print("VALUE", value)
v = re.search(r"\[([^]]+)\]", value).group().strip("[]")
yield {
"url": response.url,
"id" : v
}`