feapder icon indicating copy to clipboard operation
feapder copied to clipboard

如果因为代理失效导致任务重试,我这边把代理标记为失效代理,但是重试的request代理仍为失效的代理

Open lscool66 opened this issue 2 years ago • 1 comment

`# -*- coding: utf-8 -*-
"""
Created on 2022-04-13 14:17:32

@summary:

@author: lscoo
"""

import feapder
from feapder.utils.log import log
from ratelimit import sleep_and_retry, limits

from items.permit_mee_gov_cn_item import Permit_mee_gov_cnItem

class PermitMeeGovCn(feapder.AirSpider):
    """Spider for the register-info listing on permit.mee.gov.cn.

    ``start_requests`` posts the paginated list query, ``parse`` reads one
    list page and schedules a detail request per row, and ``parser_detail``
    completes the item from the detail page and emits it.
    """

    def start_requests(self):
        """Yield one POST request per list page, for pages 1 through 999."""
        # "\u0021" is the escape for "!"; kept exactly as the author wrote it.
        url = "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg\u0021getRegisterInfo.action"

        for i in range(1, 1000):
            data = {
                "page.pageNo": str(i),
                "page.orderBy": "",
                "page.order": "",
                "province": "500000000000",
                "city": "",
                "registerentername": "",
                "xkznum": "",
                "treadname": "",
                "treadcode": "",
                "publishtime": "",
            }
            yield feapder.Request(url, data=data, verify=False, method="POST")

    # Disabled by the author: a download middleware limited to 2 calls per
    # 1-second period that sets browser-like headers on every request.
    # @sleep_and_retry
    # @limits(calls=2, period=1)
    # def download_midware(self, request):
    #     request.headers = {
    #         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    #         "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    #         "Cache-Control": "no-cache",
    #         "Connection": "keep-alive",
    #         "Content-Type": "application/x-www-form-urlencoded",
    #         "DNT": "1",
    #         "Origin": "http://permit.mee.gov.cn",
    #         "Pragma": "no-cache",
    #         "Referer": "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg\\u0021getRegisterInfo.action",
    #         "Upgrade-Insecure-Requests": "1",
    #         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36"
    #     }
    #     # request.cookies = {
    #     #     "JSESSIONID": "FD08D886501E9D055EC2BBE28CB35CD7",
    #     #     "paiwu80_cookie": "37836164"
    #     # }
    #     return request

    def parse(self, request, response):
        """Parse one list page and yield a detail request for every data row.

        The first row of the ``tabtd`` table is skipped (``position() > 1``).
        Columns 1-7 are copied into the item; the three detail-only fields
        start empty and are filled in by ``parser_detail``. Column 8 holds
        the link to the detail page.
        """
        tr_list = response.xpath('//table[@class="tabtd"]/tr[position()> 1]')
        for tr in tr_list:
            item = {
                "省直辖市": tr.xpath('./td[1]/text()').get(),
                "地市": tr.xpath('./td[2]/text()').get(),
                "登记编号": tr.xpath('./td[3]/text()').get(),
                "单位名称": tr.xpath('./td[4]/text()').get(),
                "行业类别": tr.xpath('./td[5]/text()').get(),
                "有效期限": tr.xpath('./td[6]/text()').get(),
                "登记时间": tr.xpath('./td[7]/text()').get(),
                "生产经营场所地址": '',
                "所在地区": '',
                "登记回执": '',
            }
            url = tr.xpath('./td[8]/a/@href').get()
            if not url:
                # Without a link there is nothing to request; a Request built
                # from None would only fail later and burn retries.
                log.warning("row without detail link skipped: %s", item)
                continue
            yield feapder.Request(url, callback=self.parser_detail, item=item)

    # Disabled by the author: fail the request when the response is empty.
    # def validate(self, request, response):
    #     if not response:
    #         raise Exception("代理失效了")

    def exception_request(self, request, response):
        """Discard the proxy of a failed request before the request is retried.

        Tagging the proxy in the pool is not enough on its own: the failed
        request still carries the same ``proxies`` entry in
        ``requests_kwargs``, so the retry would be sent through the dead
        proxy again. The entry is therefore removed from the request as well.

        NOTE(review): this relies on feapder assigning a fresh pool proxy only
        when ``requests_kwargs`` has no ``proxies`` key — confirm against the
        installed feapder version.

        :param request: the request that raised; mutated in place
        :param response: the response received, if any (unused)
        :return: None, so the (mutated) request itself is retried
        """
        proxies = request.requests_kwargs.pop("proxies", None)
        pool = request.proxies_pool
        if proxies and pool is not None:
            pool.tag_proxy(proxies, -1)  # -1: discard this proxy
        # Alternatives the author tried and left disabled:
        # request.proxies_pool.reset_proxy_pool(force=True)
        # yield request

    @staticmethod
    def _split_label(token):
        """Split ``label:value`` at its first colon; return None without one.

        The author's sample uses an ASCII colon; a full-width colon is
        accepted as well as a precaution.
        """
        positions = [p for p in (token.find(':'), token.find(':')) if p != -1]
        if not positions:
            return None
        cut = min(positions)
        return token[:cut].strip(), token[cut + 1:].strip()

    def parser_detail(self, request, response):
        """Fill the detail-only fields of the item and yield the finished item.

        Sample of the text block captured by the author (whitespace-separated
        ``label:value`` entries)::

            生产经营场所地址:重庆市璧山区璧泉街道金剑路279号2号厂房底楼东面部份
            行业类别:模具制造
            所在地区:重庆市-市辖区-璧山区

        Values are looked up by label rather than by position: in the sample
        the second entry is 行业类别, so a positional lookup would store the
        industry category under 所在地区.

        :raises ValueError: when no ``label:value`` entry is found, so that an
            unexpected page still goes through the retry path
        """
        item = request.item
        # getall() instead of get(): the entries may be separate text nodes.
        text_nodes = response.xpath('//table/tr[2]/td[1]/p/text()').getall()

        fields = {}
        for token in " ".join(text_nodes).split():
            pair = self._split_label(token)
            if pair is not None:
                # Keep the first occurrence of a label.
                fields.setdefault(pair[0], pair[1])
        if not fields:
            raise ValueError("detail text block not found or empty: %s" % request.url)

        item['生产经营场所地址'] = fields.get('生产经营场所地址', '')
        item['所在地区'] = fields.get('所在地区', '')
        item['登记回执'] = response.xpath('//table/tr[2]/td[2]/a/@href').get()
        log.info(item)
        yield Permit_mee_gov_cnItem(**item)

if name == "main": PermitMeeGovCn().start() `

lscool66 avatar Apr 13 '22 08:04 lscool66

同样的问题

ddzyx avatar Sep 10 '22 13:09 ddzyx