Scrapy-splash - setting DEPTH_LIMIT not working?
Hello, I'm facing a problem in my spider: the DEPTH_LIMIT setting does not work.
```python
# settings.py
# crawl depth
#--------------------------------------------------
DEPTH_LIMIT = 5
DEPTH_STATS_VERBOSE = True
DEPTH_PRIORITY = 1
#--------------------------------------------------
```
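As an aside on `DEPTH_PRIORITY`: the Scrapy FAQ pairs `DEPTH_PRIORITY = 1` with FIFO queues to get a breadth-first crawl. Below is a sketch of that documented combination; I don't know whether the `scrapy_redis_bloomfilter` scheduler shown in the log further down honours these queue settings:

```python
# Breadth-first crawl settings as documented in the Scrapy FAQ.
# Whether the scrapy_redis_bloomfilter scheduler respects the queue
# classes is an open question.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
```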
```python
# /spider/dynamicspider.py
import json

import scrapy
from scrapy_splash import SplashRequest

# MYLxmlLinkExtractor and DynamicWebMeta are project-specific
# (their imports are omitted here).


class DynamiccrawlerSpider(scrapy.Spider):
    name = 'dynamicCrawler'
    link_extractor = MYLxmlLinkExtractor()
    start_urls = []

    def get_seed_url(self, file_path):
        # read seed URLs (one JSON object per line) into a deduplicated set
        url_list = set()
        with open(file_path) as f:
            for line in f:
                seed = line.strip('\n')
                seedURL = json.loads(seed)['seedURL']
                url_list.add(seedURL)
        print('GET SEED Length: ', len(url_list), '--', url_list)
        return url_list

    def start_requests(self):
        self.start_urls = self.get_seed_url(file_path=self.settings.get('SEED_FILE_PATH'))
        for url in self.start_urls:
            yield SplashRequest(url,
                                callback=self.parse_result,
                                args={
                                    'wait': 30,
                                    'timeout': 90,
                                    'images': 0,
                                    'resource_timeout': 30,
                                },
                                dont_filter=True,
                                dont_process_response=True,
                                endpoint='render.html')

    def parse_result(self, response):
        # log the depth Scrapy tracked for this response
        print("DEPTH+++++++++++++++++++++++", response.request.meta['depth'])
        table_list = response.xpath("//table").extract()
        if len(table_list) > 0:
            item = DynamicWebMeta()
            item['pageurl'] = response.request._original_url
            item['title'] = response.xpath("//title/text()").get()
            item['body'] = response.text
            yield item
        # follow every extracted link through Splash again
        links = self.link_extractor.extract_links(response)
        links_len = len(links)
        for i, link in enumerate(links, start=1):
            print('{0}/{1}--child link *******{2}'.format(i, links_len, link.url))
            yield SplashRequest(link.url,
                                callback=self.parse_result,
                                args={
                                    'wait': 30,
                                    'timeout': 90,
                                    'images': 0,
                                    'resource_timeout': 30,
                                },
                                dont_process_response=True,
                                endpoint='render.html')
```
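To isolate the problem, here is a minimal sanity-check spider (hypothetical, not part of my project) that uses plain Requests with Scrapy's stock scheduler and dupefilter, to confirm whether DEPTH_LIMIT itself behaves once Splash and the Bloom-filter scheduler are out of the loop:

```python
import scrapy
from scrapy.linkextractors import LinkExtractor


class DepthCheckSpider(scrapy.Spider):
    """Follows links with plain Requests and logs the depth of each page."""
    name = 'depthCheck'
    start_urls = ['https://example.com']  # placeholder seed URL

    custom_settings = {
        'DEPTH_LIMIT': 5,
        'DEPTH_STATS_VERBOSE': True,
        # Scrapy defaults, stated explicitly to rule out the custom
        # scheduler and dupefilter used in the real project.
        'SCHEDULER': 'scrapy.core.scheduler.Scheduler',
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.RFPDupeFilter',
    }

    def parse(self, response):
        # DepthMiddleware stores the hop count in response.meta['depth']
        self.logger.info('depth=%s url=%s',
                         response.meta.get('depth', 0), response.url)
        for link in LinkExtractor().extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
```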
Log file:

```
{'BOT_NAME': 'dynamicTableScrapy',
'DEPTH_LIMIT': 5,
'DEPTH_PRIORITY': 1,
'DEPTH_STATS_VERBOSE': True,
'DOWNLOAD_DELAY': 10,
'DUPEFILTER_CLASS': 'dynamicTableScrapy.mydupefilter.MyDupeFilter',
'HTTPCACHE_ENABLED': True,
'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
'NEWSPIDER_MODULE': 'dynamicTableScrapy.spiders',
'SCHEDULER': 'scrapy_redis_bloomfilter.scheduler.Scheduler',
'SPIDER_MODULES': ['dynamicTableScrapy.spiders'],
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}
[scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
[scrapy.extensions.telnet] INFO: Telnet Password: 732cc755693aaef0
[scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2022-05-26 21:07:40 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'dynamicTableScrapy.middlewares.DynamictablescrapyDownloaderMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy_splash.SplashCookiesMiddleware',
'scrapy_splash.SplashMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats',
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware']
[scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy_splash.SplashDeduplicateArgsMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'dynamicTableScrapy.middlewares.DynamictablescrapySpiderMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
[scrapy.middleware] INFO: Enabled item pipelines:
['dynamicTableScrapy.pipelines.MongoDBPipLine']
[scrapy.core.engine] INFO: Spider opened
..........
{'bloomfilter/filtered': 21704,
'downloader/exception_count': 4,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 4,
'downloader/request_bytes': 1143999,
'downloader/request_count': 1482,
'downloader/request_method_count/POST': 1482,
'downloader/response_bytes': 27034526,
'downloader/response_count': 1478,
'downloader/response_status_count/200': 420,
'downloader/response_status_count/502': 129,
'downloader/response_status_count/503': 864,
'downloader/response_status_count/504': 65,
'elapsed_time_seconds': 343.332559,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 5, 26, 13, 13, 24, 317268),
'httpcache/firsthand': 37,
'httpcache/hit': 1441,
'httpcache/miss': 41,
'httpcache/store': 37,
'httperror/response_ignored_count': 353,
'httperror/response_ignored_status_count/502': 43,
'httperror/response_ignored_status_count/503': 288,
'httperror/response_ignored_status_count/504': 22,
'item_scraped_count': 188,
'log_count/DEBUG': 1673,
'log_count/ERROR': 353,
'log_count/INFO': 372,
'log_count/WARNING': 2,
'memusage/max': 123432960,
'memusage/startup': 71020544,
'request_depth_count/0': 420,
'request_depth_count/1': 21706,
'request_depth_max': 1,
'response_received_count': 773,
'retry/count': 709,
'retry/max_reached': 353,
'retry/reason_count/502 Bad Gateway': 86,
'retry/reason_count/503 Service Unavailable': 576,
'retry/reason_count/504 Gateway Time-out': 43,
'retry/reason_count/twisted.internet.error.TimeoutError': 4,
'scheduler/dequeued/redis': 2255,
'scheduler/enqueued/redis': 2255,
'splash/render.html/request_count': 773,
'splash/render.html/response_count/200': 420,
'splash/render.html/response_count/502': 129,
'splash/render.html/response_count/503': 864,
'splash/render.html/response_count/504': 65,
'start_time': datetime.datetime(2022, 5, 26, 13, 7, 40, 984709),
'urllength/request_ignored_count': 2}
[scrapy.core.engine] INFO: Spider closed (finished)
```
In the stats above, `request_depth_max` is only 1 even though `DEPTH_LIMIT` is 5, and almost all of the depth-1 requests were filtered out (`bloomfilter/filtered`: 21704 of the 21706 requests at depth 1). What should I do to make DEPTH_LIMIT work correctly?
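For reference, the scrapy-splash README says to use its Splash-aware dupefilter so that requests routed through the Splash endpoint are fingerprinted by their original URL. My project uses a custom `MyDupeFilter` with the Bloom-filter scheduler instead, so I am not sure the same guarantees hold:

```python
# Settings recommended in the scrapy-splash README. My crawl instead
# uses dynamicTableScrapy.mydupefilter.MyDupeFilter together with the
# scrapy_redis_bloomfilter scheduler.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
```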