weibo-search
Error when running: AttributeError: 'NoneType' object has no attribute 'split'
My environment is all set up and the spider used to run fine, but when I ran it again today I got this error:
Traceback (most recent call last):
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\defer.py", line 132, in iter_errback
    yield next(it)
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
    return next(self.data)
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
    return next(self.data)
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\python_code\weibo\weibo-search\weibo\spiders\search.py", line 107, in parse
    for weibo in self.parse_weibo(response):
  File "F:\python_code\weibo\weibo-search\weibo\spiders\search.py", line 356, in parse_weibo
    weibo['bid'] = sel.xpath(
AttributeError: 'NoneType' object has no attribute 'split'
It worked for me before as well; this error started showing up yesterday.
Same here.
I'm seeing the same problem. When I searched other sites for AttributeError: 'NoneType' object has no attribute 'split', other people say it is only an error message and does not affect the crawl, but it still feels like part of my data is missing… I don't know how to fix it either; also waiting for an answer.
I haven't been able to debug this recently, so I'm not sure whether the site layout changed or whether we are being restricted. Please keep looking into it in the meantime.
I just saw feedback from others: replacing every p[@class="from" in search.py with div[@class="from" gets it running.
That does work, but what is the reason behind it?
Also, changing all of them to div raises an error when a retweeted weibo is encountered: the retweet elements are still p rather than div, so the p selectors in the retweet-related code must not be changed to div.
Yes, I tried that: leaving the retweet part unchanged makes it run, but I can't figure out the reason either.
Same question here.
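For anyone else applying this workaround, here is a minimal sketch of what it amounts to in parse_weibo, based only on the selectors quoted in this thread and in the paste below (line numbers and surrounding code may differ in your copy). Only the main weibo's "from" selectors switch to div; the retweet block keeps p:

# main weibo: on the new search page the "from" node is a <div>, not a <p>
weibo['bid'] = sel.xpath(
    './/div[@class="from"]/a[1]/@href').extract_first(
    ).split('/')[-1].split('?')[0]
created_at = sel.xpath(
    './/div[@class="from"]/a[1]/text()').extract_first(
    ).replace(' ', '').replace('\n', '').split('前')[0]

# retweet: the markup still uses <p class="from">, so leave these unchanged
retweet['bid'] = retweet_sel[0].xpath(
    './/p[@class="from"]/a/@href').extract_first().split(
    '/')[-1].split('?')[0]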
File "/opt/conda/envs/python35-paddle120-env/bin/scrapy", line 8, in <module>
sys.exit(execute())
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/cmdline.py", line 123, in execute
settings = get_project_settings()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/utils/project.py", line 68, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 290, in setmodule
self.set(key, getattr(module, key), priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 265, in set
self.attributes[name].set(value, priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 50, in set
value = BaseSettings(value, priority=priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 86, in __init__
self.update(values, priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 322, in update
for name, value in values.items():
AttributeError: 'set' object has no attribute 'items'
This is a new problem that appeared after making the change.
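That second traceback is raised while Scrapy is loading settings.py, before the spider even starts: BaseSettings.update() expects a dict, so the most likely cause is that a dict-valued setting was written as a set, i.e. its key: value pairs were turned into bare values during editing. A minimal sketch of the difference, assuming ITEM_PIPELINES is the setting that was touched (check whichever dict setting you edited, e.g. ITEM_PIPELINES or DEFAULT_REQUEST_HEADERS; the pipeline path below is just an example):

# wrong: a set literal, which makes Scrapy fail with "'set' object has no attribute 'items'"
ITEM_PIPELINES = {'weibo.pipelines.CsvPipeline'}

# right: a dict mapping each pipeline path to its priority
ITEM_PIPELINES = {'weibo.pipelines.CsvPipeline': 300}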
I made the changes the two posters above described and it now runs successfully, thanks everyone!
# -*- coding: utf-8 -*-
import os
import re
import sys
from datetime import datetime, timedelta
from urllib.parse import unquote

import scrapy
import weibo.utils.util as util
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings
from weibo.items import WeiboItem


class SearchSpider(scrapy.Spider):
    name = 'search'
    allowed_domains = ['weibo.com']
    settings = get_project_settings()
    keyword_list = settings.get('KEYWORD_LIST')
    if not isinstance(keyword_list, list):
        if not os.path.isabs(keyword_list):
            keyword_list = os.getcwd() + os.sep + keyword_list
        if not os.path.isfile(keyword_list):
            sys.exit('不存在%s文件' % keyword_list)
        keyword_list = util.get_keyword_list(keyword_list)
for i, keyword in enumerate(keyword_list):
if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
keyword_list[i] = '%23' + keyword[1:-1] + '%23'
weibo_type = util.convert_weibo_type(settings.get('WEIBO_TYPE'))
contain_type = util.convert_contain_type(settings.get('CONTAIN_TYPE'))
regions = util.get_regions(settings.get('REGION'))
base_url = 'https://s.weibo.com'
start_date = settings.get('START_DATE',
datetime.now().strftime('%Y-%m-%d'))
end_date = settings.get('END_DATE', datetime.now().strftime('%Y-%m-%d'))
if util.str_to_time(start_date) > util.str_to_time(end_date):
sys.exit('settings.py配置错误,START_DATE值应早于或等于END_DATE值,请重新配置settings.py')
further_threshold = settings.get('FURTHER_THRESHOLD', 46)
mongo_error = False
pymongo_error = False
mysql_error = False
pymysql_error = False
def start_requests(self):
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date,
'%Y-%m-%d') + timedelta(days=1)
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
for keyword in self.keyword_list:
if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION'):
base_url = 'https://s.weibo.com/weibo?q=%s' % keyword
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}'.format(start_str, end_str)
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword
})
else:
for region in self.regions.values():
base_url = (
'https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}'.format(start_str, end_str)
# 获取一个省的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'province': region
})
def check_environment(self):
"""判断配置要求的软件是否已安装"""
if self.pymongo_error:
print('系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
raise CloseSpider()
if self.mongo_error:
print('系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
raise CloseSpider()
if self.pymysql_error:
print('系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
raise CloseSpider()
if self.mysql_error:
print('系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
raise CloseSpider()
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date, '%Y-%m-%d')
while start_date <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一天的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
def parse_by_day(self, response):
"""以天为单位筛选"""
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
date = response.meta.get('date')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
start_date_str = date + '-0'
start_date = datetime.strptime(start_date_str, '%Y-%m-%d-%H')
for i in range(1, 25):
start_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
start_date = start_date + timedelta(hours=1)
end_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一小时的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province
if province else self.parse_by_hour,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'start_time': start_str,
'end_time': end_str
})
def parse_by_hour(self, response):
"""以小时为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
for region in self.regions.values():
url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url += self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
# 获取一小时一个省的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': region
})
def parse_by_hour_province(self, response):
"""以小时和直辖市/省为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
province = response.meta.get('province')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
for city in province['city'].values():
url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:{}'
).format(keyword, province['code'], city)
url += self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
# 获取一小时一个城市的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_page,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': province,
'city': city
})
def parse_page(self, response):
"""解析一页搜索结果的信息"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
if is_empty:
print('当前页面搜索结果为空')
else:
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
def get_article_url(self, selector):
"""获取微博头条文章url"""
article_url = ''
text = selector.xpath('string(.)').extract_first().replace(
'\u200b', '').replace('\ue627', '').replace('\n',
'').replace(' ', '')
if text.startswith('发布了头条文章'):
urls = selector.xpath('.//a')
for url in urls:
if url.xpath(
'i[@class="wbicon"]/text()').extract_first() == 'O':
if url.xpath('@href').extract_first() and url.xpath(
'@href').extract_first().startswith('http://t.cn'):
article_url = url.xpath('@href').extract_first()
break
return article_url
def get_location(self, selector):
"""获取微博发布位置"""
a_list = selector.xpath('.//a')
location = ''
for a in a_list:
if a.xpath('./i[@class="wbicon"]') and a.xpath(
'./i[@class="wbicon"]/text()').extract_first() == '2':
location = a.xpath('string(.)').extract_first()[1:]
break
return location
def get_at_users(self, selector):
"""获取微博中@的用户昵称"""
a_list = selector.xpath('.//a')
at_users = ''
at_list = []
for a in a_list:
if len(unquote(a.xpath('@href').extract_first())) > 14 and len(
a.xpath('string(.)').extract_first()) > 1:
if unquote(a.xpath('@href').extract_first())[14:] == a.xpath(
'string(.)').extract_first()[1:]:
at_user = a.xpath('string(.)').extract_first()[1:]
if at_user not in at_list:
at_list.append(at_user)
if at_list:
at_users = ','.join(at_list)
return at_users
def get_topics(self, selector):
"""获取参与的微博话题"""
a_list = selector.xpath('.//a')
topics = ''
topic_list = []
for a in a_list:
text = a.xpath('string(.)').extract_first()
if len(text) > 2 and text[0] == '#' and text[-1] == '#':
if text[1:-1] not in topic_list:
topic_list.append(text[1:-1])
if topic_list:
topics = ','.join(topic_list)
return topics
def parse_weibo(self, response):
"""解析网页中的微博信息"""
keyword = response.meta.get('keyword')
for sel in response.xpath("//div[@class='card-wrap']"):
info = sel.xpath(
"div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
)
if info:
weibo = WeiboItem()
weibo['id'] = sel.xpath('@mid').extract_first()
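# per the workaround above: on the new search page the main weibo's "from" node is a <div>, not a <p>, so the selector below uses div[@class="from"]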
weibo['bid'] = sel.xpath(
'.//div[@class="from"]/a[1]/@href').extract_first(
).split('/')[-1].split('?')[0]
weibo['user_id'] = info[0].xpath(
'div[2]/a/@href').extract_first().split('?')[0].split(
'/')[-1]
weibo['screen_name'] = info[0].xpath(
'div[2]/a/@nick-name').extract_first()
txt_sel = sel.xpath('.//p[@class="txt"]')[0]
retweet_sel = sel.xpath('.//div[@class="card-comment"]')
retweet_txt_sel = ''
if retweet_sel and retweet_sel[0].xpath('.//div[@class="txt"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//div[@class="txt"]')[0]
content_full = sel.xpath(
'.//p[@node-type="feed_list_content_full"]')
is_long_weibo = False
is_long_retweet = False
if content_full:
if not retweet_sel:
txt_sel = content_full[0]
is_long_weibo = True
elif len(content_full) == 2:
txt_sel = content_full[0]
retweet_txt_sel = content_full[1]
is_long_weibo = True
is_long_retweet = True
elif retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]')[0]
is_long_retweet = True
else:
txt_sel = content_full[0]
is_long_weibo = True
weibo['text'] = txt_sel.xpath(
'string(.)').extract_first().replace('\u200b', '').replace(
'\ue627', '')
weibo['article_url'] = self.get_article_url(txt_sel)
weibo['location'] = self.get_location(txt_sel)
if weibo['location']:
weibo['text'] = weibo['text'].replace(
'2' + weibo['location'], '')
weibo['text'] = weibo['text'][2:].replace(' ', '')
if is_long_weibo:
weibo['text'] = weibo['text'][:-4]
weibo['at_users'] = self.get_at_users(txt_sel)
weibo['topics'] = self.get_topics(txt_sel)
reposts_count = sel.xpath(
'.//a[@action-type="feed_list_forward"]/text()').extract()
reposts_count = "".join(reposts_count)
try:
reposts_count = re.findall(r'\d+.*', reposts_count)
except TypeError:
print(
"无法解析转发按钮,可能是 1) 网页布局有改动 2) cookie无效或已过期。\n"
"请在 https://github.com/dataabc/weibo-search 查看文档,以解决问题,"
)
raise CloseSpider()
weibo['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = sel.xpath(
'.//a[@action-type="feed_list_comment"]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
weibo['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = sel.xpath(
'(.//span[@class="woo-like-count"])[last()]/text()').extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
weibo['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
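# same change here: div[@class="from"] instead of p[@class="from"] for the main weibo's time/source line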
created_at = sel.xpath(
'.//div[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
weibo['created_at'] = util.standardize_date(created_at)
source = sel.xpath('.//div[@class="from"]/a[2]/text()'
).extract_first()
weibo['source'] = source if source else ''
pics = ''
is_exist_pic = sel.xpath(
'.//div[@class="media media-piclist"]')
if is_exist_pic:
pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
pics = [pic[8:] for pic in pics]
pics = [
re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
]
pics = ['https://' + pic for pic in pics]
video_url = ''
is_exist_video = sel.xpath(
'.//div[@class="thumbnail"]//video-player').extract_first()
if is_exist_video:
video_url = re.findall(r'src:\'(.*?)\'', is_exist_video)[0]
video_url = video_url.replace('&amp;', '&')
video_url = 'http:' + video_url
if not retweet_sel:
weibo['pics'] = pics
weibo['video_url'] = video_url
else:
weibo['pics'] = ''
weibo['video_url'] = ''
weibo['retweet_id'] = ''
if retweet_sel and retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'):
retweet = WeiboItem()
retweet['id'] = retweet_sel[0].xpath(
'.//a[@action-type="feed_list_like"]/@action-data'
).extract_first()[4:]
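# retweets still use p[@class="from"], so the retweet selectors below stay unchanged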
retweet['bid'] = retweet_sel[0].xpath(
'.//p[@class="from"]/a/@href').extract_first().split(
'/')[-1].split('?')[0]
info = retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'
)[0]
retweet['user_id'] = info.xpath(
'@href').extract_first().split('/')[-1]
retweet['screen_name'] = info.xpath(
'@nick-name').extract_first()
retweet['text'] = retweet_txt_sel.xpath(
'string(.)').extract_first().replace('\u200b',
'').replace(
'\ue627', '')
retweet['article_url'] = self.get_article_url(
retweet_txt_sel)
retweet['location'] = self.get_location(retweet_txt_sel)
if retweet['location']:
retweet['text'] = retweet['text'].replace(
'2' + retweet['location'], '')
retweet['text'] = retweet['text'][2:].replace(' ', '')
if is_long_retweet:
retweet['text'] = retweet['text'][:-4]
retweet['at_users'] = self.get_at_users(retweet_txt_sel)
retweet['topics'] = self.get_topics(retweet_txt_sel)
reposts_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[1]/a[1]/text()'
).extract_first()
reposts_count = re.findall(r'\d+.*', reposts_count)
retweet['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[2]/a[1]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
retweet['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = retweet_sel[0].xpath(
'.//a[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]//span[@class="woo-like-count"]/text()'
).extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
retweet['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = retweet_sel[0].xpath(
'.//p[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
retweet['created_at'] = util.standardize_date(created_at)
source = retweet_sel[0].xpath(
'.//p[@class="from"]/a[2]/text()').extract_first()
retweet['source'] = source if source else ''
retweet['pics'] = pics
retweet['video_url'] = video_url
retweet['retweet_id'] = ''
yield {'weibo': retweet, 'keyword': keyword}
weibo['retweet_id'] = retweet['id']
print(weibo)
yield {'weibo': weibo, 'keyword': keyword}
I ran into the same problem. After making the change it runs normally, but the data is incomplete: with the same settings, I used to save roughly 2-3 times as many results as I do now. Has anyone else seen something similar?
Mine just keeps looping with the same output as the OP, and when it finally stops there is nothing, no files in the results folder either. No idea why.
Switching to a cookie from the old-version weibo site fixes this problem, no code changes needed. I hit the same issue: it appears when you use a cookie from the new-version web weibo 😂
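If I remember the project layout correctly, the cookie is set in settings.py under DEFAULT_REQUEST_HEADERS; a minimal sketch of switching it (the value below is a placeholder, replace it with a cookie copied from your browser while logged in to the old-version site https://weibo.cn):

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # placeholder value; paste the cookie obtained from https://weibo.cn here
    'cookie': 'your_weibo_cn_cookie_here',
}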