scrapy-splash icon indicating copy to clipboard operation
scrapy-splash copied to clipboard

scrapy splash not rendering continuous requests.

Open Pechi77 opened this issue 3 years ago • 0 comments

I created a spider with scrapy_splash and hardcoded 3 URLs in start_requests. When I run it with any single URL, it works fine for each of the URLs. When I put all the URLs in a list and run them one by one, it does not work: Splash does not return the complete rendered HTML in response.body.

kindly help.

code:

import json
import re
import time

import scrapy
import w3lib
from scrapy_splash import SplashRequest

class SpeSpider(scrapy.Spider):
    """Render fund pages through Splash and extract a document component id.

    Each hard-coded fund page is rendered with the Splash ``execute``
    endpoint using ``wait_script``; the callback then looks for the
    "Rapport annuel" row and yields the bracketed id found in its
    ``mstar-component-id`` attribute.
    """

    name = 'spe'
    # allowed_domains = ['s']
    # start_urls = ['http://s/']

    # Space requests out in the downloader instead of calling time.sleep()
    # inside start_requests: a blocking sleep stalls the Twisted reactor
    # thread, which also stalls the handling of in-flight Splash responses.
    custom_settings = {
        "DOWNLOAD_DELAY": 10,
    }

    # Lua script with a short (2 s) wait; not referenced by the requests
    # built in start_requests.
    without_wait_script = """
            function main(splash, args)
            splash.private_mode_enabled = false
            
            assert(splash:go(args.url))
            assert(splash:wait(2))
            
            return {
                html = splash:html(),
                
            }
            end
    
    """

    # Lua script used for every request: load the page, wait 10 s for the
    # client-side rendering, then return the rendered HTML.
    wait_script = """
            function main(splash, args)
            
            assert(splash:go(args.url))
            
            assert(splash:wait(10))
            
            return {
                html = splash:html(),
                
            }
            end
    
    """

    # NOTE(review): scrapy-splash documents ``splash_headers`` as headers sent
    # to the Splash server itself, not to the rendered website -- confirm that
    # passing browser-style headers here is what is intended.
    splash_headers = {
        'authority': 'www.avivainvestors.com',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.avivainvestors.com/fr-fr/nos-expertises/nos-fonds/',
        'accept-language': 'en-US,en;q=0.9,lb;q=0.8',
    }

    def start_requests(self):
        """Yield one Splash ``execute`` request per hard-coded fund page."""
        url1 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/uk-listed-equity-high-alpha-fund/lu0160960752-gbp/"
        url2 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/japon-isr/fr0013340841-eur/"
        url3 = "https://www.avivainvestors.com/fr-fr/nos-expertises/fixed-income/emerging-markets-corporate-bond-fund/lu1550133976-usd/"
        urls = [url1, url2, url3]
        for url in urls:
            # No sleep here: pacing is handled by DOWNLOAD_DELAY above.
            yield SplashRequest(
                url=url,
                endpoint="execute",
                callback=self.scrape_document_id,
                args={"lua_source": self.wait_script},
                splash_headers=self.splash_headers,
            )

    def scrape_document_id(self, response):
        """Yield ``{"url", "id"}`` for the "Rapport annuel" row of a page.

        Nothing is yielded (a warning is logged instead) when the row is
        missing from the rendered HTML or its attribute holds no bracketed
        id, so one incompletely rendered page does not raise in the callback.
        """
        value = response.xpath('//div[@class="ec-table__cell-content ng-binding ng-scope" and text() = "Rapport annuel"]/../..//td/ec-button/@mstar-component-id').get()
        print("VALUE", value)
        if value is None:
            # .get() returns None when the XPath matches nothing, e.g. when
            # the table had not been rendered yet at snapshot time.
            self.logger.warning(
                "No 'Rapport annuel' component id found on %s", response.url
            )
            return

        match = re.search(r"\[([^]]+)\]", value)
        if match is None:
            self.logger.warning(
                "No bracketed id in component id %r on %s", value, response.url
            )
            return

        yield {
            "url": response.url,
            "id": match.group().strip("[]"),
        }

Pechi77 avatar Nov 18 '21 16:11 Pechi77