scrapy-redis
When does the crawler stop?
This is my code; it is an extension:
```python
from scrapy.exceptions import NotConfigured
from twisted.internet import task
from scrapy import signals


class AutoCloseSpider(object):
    """
    scrapy_redis extension plugin.

    Parameters
    ----------
    CLOSE_SPIDER_INTERVAL : float
    ZERO_THRESHOLD : int
    """

    def __init__(self, crawler, stats, interval=60.0, threshold=3):
        self.crawler = crawler
        self.stats = stats
        self.interval = interval
        self.threshold = threshold
        self.task = None

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getfloat('CLOSE_SPIDER_INTERVAL')
        threshold = crawler.settings.getint('ZERO_THRESHOLD')
        if not interval and not threshold:
            raise NotConfigured
        stats = crawler.stats
        o = cls(crawler, stats, interval, threshold)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        # Request count seen at the previous check.
        self.request_count_prev = 0
        # Number of consecutive checks that saw no new requests.
        self.zero_count = -1
        self.task = task.LoopingCall(self.increment, spider)
        self.task.start(self.interval)

    def increment(self, spider):
        # Check whether the spider has issued any new requests since the last check.
        request_count = self.stats.get_value('downloader/request_count', 0)
        # Requests seen at this check minus requests seen at the previous check.
        inc = request_count - self.request_count_prev
        # Remember the current count for the next check.
        self.request_count_prev = request_count
        if inc == 0:
            self.zero_count += 1
        elif self.zero_count != 0:
            self.zero_count = 0
        # If the increment has been zero for at least `threshold` consecutive
        # checks, close the spider.
        if self.zero_count >= self.threshold:
            self.crawler.engine.close_spider(spider, 'closespider_zerocount')

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:
            self.task.stop()
```
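For context, an extension like this also has to be registered in the project settings together with the two options it reads. A minimal sketch, assuming the class lives in a hypothetical `myproject/extensions.py` module:

```python
# settings.py -- minimal sketch; 'myproject.extensions' is an assumed module path,
# adjust it to wherever AutoCloseSpider actually lives.
EXTENSIONS = {
    'myproject.extensions.AutoCloseSpider': 500,
}

# Check the downloader request count every 60 seconds...
CLOSE_SPIDER_INTERVAL = 60.0
# ...and close the spider after 3 consecutive checks with no new requests.
ZERO_THRESHOLD = 3
```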
How can I restart it after it has stopped, when I want to work on a new target?
@liuyuer could you expand on your use case?
I like to recycle processes so memory doesn't pile up over time. You could make your crawler close after being idle for some time or after reaching a certain threshold (e.g. domains scraped, memory usage, etc.) and have an external process that monitors that you always have at least X crawlers running.
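Not something scrapy-redis provides out of the box, but to illustrate the "external process" idea above, here is a minimal supervisor sketch that keeps a fixed number of `scrapy crawl` subprocesses alive and respawns any that exit (for example, after the extension above closes the spider). The spider name and process count are placeholders:

```python
# monitor.py -- hedged sketch of an external supervisor; not part of scrapy-redis.
import subprocess
import time

SPIDER_NAME = 'myspider'   # placeholder: the spider to run
MAX_CRAWLERS = 2           # placeholder: how many crawler processes to keep alive


def spawn():
    return subprocess.Popen(['scrapy', 'crawl', SPIDER_NAME])


def main():
    procs = [spawn() for _ in range(MAX_CRAWLERS)]
    while True:
        for i, proc in enumerate(procs):
            if proc.poll() is not None:   # the process has exited
                procs[i] = spawn()        # replace it, which also recycles its memory
        time.sleep(5)


if __name__ == '__main__':
    main()
```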
My use case is:
- The crawler runs as a service; when it reaches a threshold, it can stop itself with self.crawler.engine.close_spider.
- The crawler should restart when it receives a new target to work on.
My problem was:
- The crawler could not restart after it was stopped by self.crawler.engine.close_spider.
- I need to clean up the Redis keys so that the results do not get mixed up (see the sketch below).
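For the clean-up part, one option is to delete the keys scrapy-redis leaves behind for a spider before reusing its name. A minimal sketch with redis-py, assuming the default key patterns (`<spider>:requests`, `<spider>:dupefilter`, `<spider>:start_urls`, `<spider>:items`); adjust it if SCHEDULER_QUEUE_KEY or SCHEDULER_DUPEFILTER_KEY were customized:

```python
# cleanup.py -- sketch only; assumes scrapy-redis default key names.
import redis


def clear_spider_keys(spider_name, host='localhost', port=6379, db=0):
    r = redis.StrictRedis(host=host, port=port, db=db)
    # Keys used by the scheduler queue, dupefilter, start URLs and item pipeline.
    r.delete(
        f'{spider_name}:requests',
        f'{spider_name}:dupefilter',
        f'{spider_name}:start_urls',
        f'{spider_name}:items',
    )


clear_spider_keys('myspider')  # placeholder spider name
```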
What I did:
- I am using Scrapydo to take care of the new process, so I can restart Scrapy (not scrapy-redis) in a new process.
@rmax I am not sure if that is the correct way to handle it. I am also worried about the memory issue. If you could share more about how you recycle processes and clean up, that would be very helpful.
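For reference, the scrapydo approach mentioned above looks roughly like this. This is a sketch based on scrapydo's documented `setup()`/`run_spider()` usage; `MySpider` and its import path are placeholders, and it assumes `run_spider()` can simply be called again for each new target after a single `setup()`:

```python
# restart_with_scrapydo.py -- hedged sketch; MySpider is a hypothetical spider class.
import scrapydo

from myproject.spiders import MySpider  # assumed import path

scrapydo.setup()  # install the crochet-managed reactor once per process


def crawl_new_target():
    # Runs a fresh crawl and blocks until it finishes.
    return scrapydo.run_spider(MySpider)
```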