
Suggestion: saving multiple item types from the same spider

Open zemelLeong opened this issue 2 years ago • 0 comments

Please describe the problem this feature request is trying to solve

When using Scrapy to crawl different pages of the same site and save different kinds of data, I found that only the first item type to be yielded gets saved. I hope some improvement can be made so that multiple Item types can be saved.

import scrapy
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader


class HospitalItem(scrapy.Item):
    page_url = scrapy.Field(output_processor=TakeFirst())
    http_status = scrapy.Field(output_processor=TakeFirst())
    hospital_name = scrapy.Field(output_processor=TakeFirst())
    hospital_alias = scrapy.Field(output_processor=TakeFirst())
    hospital_type = scrapy.Field(output_processor=TakeFirst())
    hospital_phone = scrapy.Field(output_processor=TakeFirst())
    hospital_address = scrapy.Field(output_processor=TakeFirst())


class DepartmentItem(scrapy.Item):
    page_url = scrapy.Field(output_processor=TakeFirst())
    http_status = scrapy.Field(output_processor=TakeFirst())
    dept_name = scrapy.Field(output_processor=TakeFirst())


class A99hospitalSpider(scrapy.Spider):
    name = '99hospital'
    allowed_domains = ['yyk.99.com.cn', 'ksk.99.com.cn']
    start_urls = ['http://yyk.99.com.cn/beijing']

    def parse(self, response):
        # Follow the first 10 hospital links on the listing page
        for link in response.css('.m-table-2 a::attr(href)').extract()[:10]:
            url = response.urljoin(link)
            yield scrapy.Request(url, callback=self.parse_hospital_info)

    def parse_hospital_info(self, response):
        loader = ItemLoader(item=HospitalItem(), response=response)
        loader.add_value('page_url', response.url)
        loader.add_value('http_status', response.status)
        loader.add_css('hospital_name', '.wrap-mn h1::text')
        loader.add_css('hospital_alias', '.wrap-info dd p:first-child::text')
        loader.add_css('hospital_type', '.wrap-info dd p:nth-child(2)::text')
        loader.add_css('hospital_phone',
                       '.wrap-info dd p:nth-child(3) em::text')
        loader.add_css('hospital_address',
                       '.wrap-info dd p:nth-child(4) em::text')

        # Yield the hospital item, then follow the department links,
        # which produce a second item type (DepartmentItem)
        yield loader.load_item()

        for link in response.css('.hospital-dpt a::attr(href)').extract():
            url = response.urljoin(link)
            yield scrapy.Request(url=url, callback=self.parse_dept_info)

    def parse_dept_info(self, response):
        loader = ItemLoader(item=DepartmentItem(), response=response)

        loader.add_value('page_url', response.url)
        loader.add_value('http_status', response.status)
        loader.add_css('dept_name', '.ksinfo-left dd:nth-child(1) span::text')

        yield loader.load_item()
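
As a possible workaround until multiple result types are supported natively, a custom Scrapy item pipeline could route each item class to its own MongoDB collection instead of relying on a single result collection. The sketch below is only an illustration, not Crawlab's actual integration: the MultiCollectionPipeline name, the MONGO_URI / MONGO_DATABASE settings, the myproject module path, and the collection names are all assumptions.

# settings.py (hypothetical values)
# ITEM_PIPELINES = {"myproject.pipelines.MultiCollectionPipeline": 300}
# MONGO_URI = "mongodb://localhost:27017"
# MONGO_DATABASE = "results"

import pymongo

# Assumed project module path; adjust to the real project layout
from myproject.items import DepartmentItem, HospitalItem


class MultiCollectionPipeline:
    """Write each item class to its own collection (assumed names)."""

    COLLECTIONS = {
        HospitalItem: "hospitals",
        DepartmentItem: "departments",
    }

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "results"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Pick the collection based on the concrete item class;
        # unknown classes fall back to a generic collection
        collection = self.COLLECTIONS.get(type(item), "items")
        self.db[collection].insert_one(dict(item))
        return item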

Result

(screenshot of the saved results, showing only the first yielded item type)

zemelLeong · Apr 23 '23