feapder
feapder copied to clipboard
如果因为代理失效导致任务重试,我这边把代理标记为失效代理,但是重试的request代理仍为失效的代理
`# -*- coding: utf-8 -*-
"""
Created on 2022-04-13 14:17:32
@summary:
@author: lscoo
"""
import feapder
from feapder.utils.log import log
from ratelimit import sleep_and_retry, limits

from items.permit_mee_gov_cn_item import Permit_mee_gov_cnItem
class PermitMeeGovCn(feapder.AirSpider):
    """Spider for the registration list published on permit.mee.gov.cn.

    ``start_requests`` posts the paginated list query, ``parse`` reads one
    table row per registration and follows its detail link, and
    ``parser_detail`` completes the item from the detail page.
    """

    def start_requests(self):
        """Yield one POST request per list page, for pages 1 through 999."""
        url = "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg\u0021getRegisterInfo.action"
        for i in range(1, 1000):
            data = {
                "page.pageNo": str(i),
                "page.orderBy": "",
                "page.order": "",
                # NOTE(review): presumably the province code for Chongqing
                # (the sample detail page below is from Chongqing) - confirm.
                "province": "500000000000",
                "city": "",
                "registerentername": "",
                "xkznum": "",
                "treadname": "",
                "treadcode": "",
                "publishtime": "",
            }
            yield feapder.Request(url, data=data, verify=False, method="POST")

    # Disabled rate-limited download middleware, kept for reference.
    # @sleep_and_retry
    # @limits(calls=2, period=1)
    # def download_midware(self, request):
    #     request.headers = {
    #         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    #         "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    #         "Cache-Control": "no-cache",
    #         "Connection": "keep-alive",
    #         "Content-Type": "application/x-www-form-urlencoded",
    #         "DNT": "1",
    #         "Origin": "http://permit.mee.gov.cn",
    #         "Pragma": "no-cache",
    #         "Referer": "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg\\u0021getRegisterInfo.action",
    #         "Upgrade-Insecure-Requests": "1",
    #         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.36"
    #     }
    #     # request.cookies = {
    #     #     "JSESSIONID": "FD08D886501E9D055EC2BBE28CB35CD7",
    #     #     "paiwu80_cookie": "37836164"
    #     # }
    #     return request

    def parse(self, request, response):
        """Parse one list page and yield a detail request per table row.

        Every row after the header row of the ``tabtd`` table becomes a
        partially filled item; the three detail-only fields start as empty
        strings and are filled in by ``parser_detail``.
        """
        tr_list = response.xpath('//table[@class="tabtd"]/tr[position()> 1]')
        for tr in tr_list:
            url = tr.xpath('./td[8]/a/@href').get()
            if not url:
                # A row without a detail link cannot be followed; building a
                # Request with url=None would only fail later in the downloader.
                log.warning("list row has no detail link, skipped")
                continue
            item = {
                "省直辖市": tr.xpath('./td[1]/text()').get(),
                "地市": tr.xpath('./td[2]/text()').get(),
                "登记编号": tr.xpath('./td[3]/text()').get(),
                "单位名称": tr.xpath('./td[4]/text()').get(),
                "行业类别": tr.xpath('./td[5]/text()').get(),
                "有效期限": tr.xpath('./td[6]/text()').get(),
                "登记时间": tr.xpath('./td[7]/text()').get(),
                "生产经营场所地址": '',
                "所在地区": '',
                "登记回执": ''
            }
            yield feapder.Request(url, callback=self.parser_detail, item=item)
            # log.info(url)
            # log.info(item)

    # def validate(self, request, response):
    #     if not response:
    #         raise Exception("代理失效了")

    def exception_request(self, request, response):
        """Discard the proxy used by a failed request before it is retried.

        The proxy is removed from ``request.requests_kwargs`` as well as being
        tagged as dead in the pool. Tagging alone leaves the dead proxy
        attached to the request, so the retry would go out through it again.
        NOTE(review): this assumes feapder only picks a new proxy when the
        request carries none - confirm against the installed feapder version.

        Returns None, as the original did.
        """
        # request.proxies_pool.reset_proxy_pool(force=True)
        proxies = request.requests_kwargs.pop("proxies", None)
        if proxies:
            request.proxies_pool.tag_proxy(proxies, -1)  # discard this proxy
        # yield request

    def parser_detail(self, request, response):
        """Complete the item from the detail page and yield it for storage.

        Fields that cannot be found on the page keep the empty-string
        defaults assigned in ``parse``.
        """
        item = request.item
        # Sample of the detail text, as recorded by the original author:
        #   生产经营场所地址:重庆市璧山区璧泉街道金剑路279号2号厂房底楼东面部份
        #   行业类别:模具制造
        #   所在地区:重庆市-市辖区-璧山区
        text = response.xpath('//table/tr[2]/td[1]/p/text()').get()
        # The node may be missing (e.g. an error page served through a bad
        # proxy); treat that as "no fields" instead of raising AttributeError.
        fields = text.split() if text else []
        if len(fields) > 0:
            item['生产经营场所地址'] = fields[0].split(':', 1)[-1].strip()
        if len(fields) > 1:
            # NOTE(review): in the sample above the second whitespace-separated
            # token is the 行业类别 line, not 所在地区 - confirm which token the
            # real page puts at index 1.
            item['所在地区'] = fields[1].split(':', 1)[-1].strip()
        item['登记回执'] = response.xpath('//table/tr[2]/td[2]/a/@href').get()
        log.info(item)
        yield Permit_mee_gov_cnItem(**item)
if name == "main": PermitMeeGovCn().start() `
同样的问题