crawlab
crawlab copied to clipboard
关于同一爬虫保存多种item的建议
请描述该需求尝试解决的问题
当尝试用scrapy爬取同一网站的不同网页并保存不同数据时,发现只有第一个被yield的类型能够被保存下来。希望能做些改进能够保存多种Item
import scrapy
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class HospitalItem(scrapy.Item):
    """Item for one hospital detail page on yyk.99.com.cn.

    Every field uses ``TakeFirst`` as its output processor, so
    ``ItemLoader.load_item()`` yields a single scalar value per field
    instead of the list that loaders collect by default.
    """
    page_url = scrapy.Field(output_processor=TakeFirst())        # URL the item was scraped from
    http_status = scrapy.Field(output_processor=TakeFirst())     # HTTP status code of the response
    hospital_name = scrapy.Field(output_processor=TakeFirst())   # main heading (.wrap-mn h1)
    hospital_alias = scrapy.Field(output_processor=TakeFirst())  # alternate name of the hospital
    hospital_type = scrapy.Field(output_processor=TakeFirst())   # hospital category/level
    hospital_phone = scrapy.Field(output_processor=TakeFirst())  # contact phone number
    hospital_address = scrapy.Field(output_processor=TakeFirst())  # street address
class DepartmentItem(scrapy.Item):
    """Item for one hospital-department page (second item type yielded
    by the same spider; see the issue text above — only the first yielded
    item type was being persisted).

    ``TakeFirst`` makes each loaded field a scalar rather than a list.
    """
    page_url = scrapy.Field(output_processor=TakeFirst())     # URL the item was scraped from
    http_status = scrapy.Field(output_processor=TakeFirst())  # HTTP status code of the response
    dept_name = scrapy.Field(output_processor=TakeFirst())    # department name
class A99hospitalSpider(scrapy.Spider):
    """Crawl Beijing hospital listings on yyk.99.com.cn.

    Yields two item types from the same crawl:
      * ``HospitalItem`` — one per hospital detail page;
      * ``DepartmentItem`` — one per department page linked from a hospital.

    Note: the original sample referenced ``ItemLoader`` without importing it
    (``NameError`` at runtime); ``from scrapy.loader import ItemLoader`` is
    required at module level.
    """
    name = '99hospital'
    allowed_domains = ['yyk.99.com.cn', 'ksk.99.com.cn']
    start_urls = ['http://yyk.99.com.cn/beijing']
    # How many hospital links to follow from the listing page. The original
    # hard-coded the slice ``[:10]``; exposed as a class attribute so it can
    # be overridden (e.g. via a subclass or -a/custom_settings tweak).
    max_hospitals = 10

    def parse(self, response):
        """Follow up to ``max_hospitals`` hospital links from the listing table."""
        # Single selector replaces the original redundant two-step selection
        # (css('.m-table-2 a') followed by css('a::attr(href)')); both resolve
        # to the href of every <a> under .m-table-2.
        for link in response.css('.m-table-2 a::attr(href)').extract()[:self.max_hospitals]:
            yield scrapy.Request(response.urljoin(link),
                                 callback=self.parse_hospital_info)

    def parse_hospital_info(self, response):
        """Yield a HospitalItem for this page, then follow department links."""
        loader = ItemLoader(item=HospitalItem(), response=response)
        loader.add_value('page_url', response.url)
        loader.add_value('http_status', response.status)
        loader.add_css('hospital_name', '.wrap-mn h1::text')
        loader.add_css('hospital_alias', '.wrap-info dd p:first-child::text')
        loader.add_css('hospital_type', '.wrap-info dd p:nth-child(2)::text')
        loader.add_css('hospital_phone',
                       '.wrap-info dd p:nth-child(3) em::text')
        loader.add_css('hospital_address',
                       '.wrap-info dd p:nth-child(4) em::text')
        yield loader.load_item()
        # Second item type: crawl every department page of this hospital.
        for link in response.css('.hospital-dpt a::attr(href)').extract():
            yield scrapy.Request(url=response.urljoin(link),
                                 callback=self.parse_dept_info)

    def parse_dept_info(self, response):
        """Yield a DepartmentItem for one department page."""
        loader = ItemLoader(item=DepartmentItem(), response=response)
        loader.add_value('page_url', response.url)
        loader.add_value('http_status', response.status)
        loader.add_css('dept_name', '.ksinfo-left dd:nth-child(1) span::text')
        yield loader.load_item()
结果展示
