caipanwenshu
caipanwenshu copied to clipboard
vl5x.py中算法可能有问题,有的cookie值算不对。
有时执行正常,有时执行错误。报错如下
2018-12-18 17:46:21 [scrapy.core.scraper] ERROR: Spider error processing <GET http://wenshu.court.gov.cn/list/list/?sorttype=1> (referer: None)
Traceback (most recent call last):
File "c:\users\qiqing\appdata\local\programs\python\python37\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "c:\users\qiqing\appdata\local\programs\python\python37\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
for x in result:
File "c:\users\qiqing\appdata\local\programs\python\python37\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in
测试代码如下: `# -*- coding: utf-8 -*- import scrapy import datetime
from wenshu.utils.vl5x import getvjkl5
class wenshu(scrapy.Spider): name = "wenshu" start_urls = ['http://wenshu.court.gov.cn/list/list/?sorttype=1']
# def start_requests(self):
# url = 'http://wenshu.court.gov.cn/list/list/?sorttype=1'
# yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
cookie = response.headers['Set-Cookie'].decode().split(';')[0][6:]
print(cookie)
# print(response.body)
vjkl5 = getvjkl5(cookie)
print(vjkl5)
# # 案例筛选参数
data = {'Param': u'裁判日期:2017-12-06 TO 2017-12-06', 'Index': '1', 'Page': '20', 'Order': u'法院层级', 'Direction': 'asc', 'vl5x': vjkl5}
yield scrapy.FormRequest('http://wenshu.court.gov.cn/List/ListContent', headers={'Cookie': cookie},callback=self.testlist,
formdata=data,
meta={'cookie': cookie, 'vjkl5': vjkl5})
def testlist(self,response):
# print(response.body)
filename = 'mingyan.html'
with open(filename, 'wb') as f: #python文件操作,不多说了;
f.write(response.body) #刚才下载的页面去哪里了?response.body就代表了刚才下载的页面!
self.log('保存文件: %s' % filename)`
测试代码如下: `# -*- coding: utf-8 -*- import scrapy import datetime
from wenshu.utils.vl5x import getvjkl5
class wenshu(scrapy.Spider): name = "wenshu" start_urls = ['http://wenshu.court.gov.cn/list/list/?sorttype=1']
# def start_requests(self): # url = 'http://wenshu.court.gov.cn/list/list/?sorttype=1' # yield scrapy.Request(url, callback=self.parse) def parse(self, response): cookie = response.headers['Set-Cookie'].decode().split(';')[0][6:] print(cookie) # print(response.body) vjkl5 = getvjkl5(cookie) print(vjkl5) # # 案例筛选参数 data = {'Param': u'裁判日期:2017-12-06 TO 2017-12-06', 'Index': '1', 'Page': '20', 'Order': u'法院层级', 'Direction': 'asc', 'vl5x': vjkl5} yield scrapy.FormRequest('http://wenshu.court.gov.cn/List/ListContent', headers={'Cookie': cookie},callback=self.testlist, formdata=data, meta={'cookie': cookie, 'vjkl5': vjkl5}) def testlist(self,response): # print(response.body) filename = 'mingyan.html' with open(filename, 'wb') as f: #python文件操作,不多说了; f.write(response.body) #刚才下载的页面去哪里了?response.body就代表了刚才下载的页面! self.log('保存文件: %s' % filename)`
你发的代码我测试没有问题。如果有时执行错误,可以降低频率
还可以正常运行么
# 计算页码(Count 能被 10 整除时不需要额外加一页,判断条件应为 % 10 == 0) page_count = int(result[0]['Count']) // 10 if int(result[0]['Count']) % 10 == 0 else int(result[0]['Count']) // 10 + 1 if int(Index) < page_count: data = {'Param': Param, 'Index': str(Index+1), 'Page': '10', 'Order': u'法院层级', 'Direction': 'asc', 'vl5x': vjkl5, 'number': number, 'guid': guid} yield scrapy.FormRequest('http://wenshu.court.gov.cn/List/ListContent', headers=headers, callback=self.get_doc_list, formdata=data, meta={'cookie': cookie, 'vjkl5': vjkl5, 'Param': Param, 'number': number, 'Index': str(Index+1), 'guid': guid})
你好,测试了一下你的代码,有些确实写得很好,不过翻页有问题,我建议用改过的这个。另外,递归调用本身已经是循环了,如果再加上 for 翻页的话,会把网站搞死的