The "search results are empty" check stops the crawl too early
When searching Weibo, the following can happen: a page comes back with very little content, well below the 7 posts I see here, presumably because Weibo has filtered some of it. In the most extreme case a page may contain nothing at all, which trips the "search results are empty" branch, so parse_page appears to terminate early. In particular, if the very first page contains nothing, methods such as parse and parse_by_day stop immediately.
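For context, the early stop comes from a branch shaped roughly like the sketch below (a paraphrase for illustration, not the exact upstream code; the same pattern still appears in parse_by_day further down). When the first page looks empty, the method only prints a message and yields no follow-up requests, so the whole date range is dropped, even when the emptiness is caused by filtering rather than a genuine lack of results.
# Simplified sketch of the upstream branch (paraphrased, for illustration only)
is_empty = response.xpath('//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
    # only prints; no finer-grained requests are yielded, so the range is skipped
    print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
    pass  # parse this page and follow the "next" link
else:
    pass  # split the date range and issue sub-requests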
Here are my current changes:
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
print(f"{datetime.now()}: 搜索{response.meta.get('start_date')}至{response.meta.get('end_date')}")
# if is_empty:
# print(f"{datetime.now()}: 当前页面搜索结果为空")
# 如果第一页全被夹了,也会被认为是空的
if page_count < self.further_threshold and not is_empty:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={
'keyword': keyword,
'page_count': page_count,
'page_index': 1
})
else:
start_date = datetime.strptime(response.meta.get('start_date'), '%Y-%m-%d')
end_date = datetime.strptime(response.meta.get('end_date'), '%Y-%m-%d')
if start_date < end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
meta = {
'base_url': base_url,
'keyword': keyword
} if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION') else {
'base_url': base_url,
'keyword': keyword,
'province': province
}
mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, mid_str)
meta['start_date'] = start_str[:-2]
meta['end_date'] = mid_str[:-2]
# 获取前半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
mid_date = mid_date + timedelta(days=1)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
mid_str, end_str)
meta['start_date'] = mid_str[:-2]
meta['end_date'] = end_str[:-2]
# 获取后半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
elif start_date == end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一天的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
I hope the author can add handling for this situation, and also add an option to adjust the splitting granularity.
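As a rough idea of what such a granularity option could look like (a sketch only; MIN_SPLIT_DAYS is a hypothetical setting name that does not exist in the project): the span below which the spider falls back to day-by-day crawling could be read from settings.py instead of being hard-coded.
# Hypothetical setting, e.g. in settings.py:  MIN_SPLIT_DAYS = 3
min_split_days = self.settings.get('MIN_SPLIT_DAYS', 3)
if start_date + timedelta(days=min_split_days) <= end_date:
    pass  # keep halving the date range and recursing into parse
else:
    pass  # crawl the remaining days one by one via parse_by_day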
Thanks for the suggestion. I have no way to debug this at the moment, but I can pin your approach so that everyone can refer to it. Thanks again.
# -*- coding: utf-8 -*-
import os
import re
import sys
from datetime import datetime, timedelta
from urllib.parse import unquote
import requests
import scrapy
import weibo.utils.util as util
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings
from weibo.items import WeiboItem
class SearchSpider(scrapy.Spider):
name = 'search'
allowed_domains = ['weibo.com']
settings = get_project_settings()
keyword_list = settings.get('KEYWORD_LIST')
if not isinstance(keyword_list, list):
if not os.path.isabs(keyword_list):
keyword_list = os.getcwd() + os.sep + keyword_list
if not os.path.isfile(keyword_list):
sys.exit('不存在%s文件' % keyword_list)
keyword_list = util.get_keyword_list(keyword_list)
for i, keyword in enumerate(keyword_list):
if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
keyword_list[i] = '%23' + keyword[1:-1] + '%23'
weibo_type = util.convert_weibo_type(settings.get('WEIBO_TYPE'))
contain_type = util.convert_contain_type(settings.get('CONTAIN_TYPE'))
regions = util.get_regions(settings.get('REGION'))
base_url = 'https://s.weibo.com'
start_date = settings.get('START_DATE',
datetime.now().strftime('%Y-%m-%d'))
end_date = settings.get('END_DATE', datetime.now().strftime('%Y-%m-%d'))
if util.str_to_time(start_date) > util.str_to_time(end_date):
sys.exit('settings.py配置错误,START_DATE值应早于或等于END_DATE值,请重新配置settings.py')
further_threshold = settings.get('FURTHER_THRESHOLD', 46)
mongo_error = False
pymongo_error = False
mysql_error = False
pymysql_error = False
with open('log.txt', 'w') as f:
f.write('')
def start_requests(self):
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date,
'%Y-%m-%d') + timedelta(days=1)
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
for keyword in self.keyword_list:
if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION'):
base_url = 'https://s.weibo.com/weibo?q=%s' % keyword
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}'.format(start_str, end_str)
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'start_date': self.start_date,
'end_date': self.end_date
})
else:
for region in self.regions.values():
base_url = (
'https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}'.format(start_str, end_str)
# 获取一个省的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'province': region,
'start_date': self.start_date,
'end_date': self.end_date
})
def check_environment(self):
"""判断配置要求的软件是否已安装"""
if self.pymongo_error:
print('系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
raise CloseSpider()
if self.mongo_error:
print('系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
raise CloseSpider()
if self.pymysql_error:
print('系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
raise CloseSpider()
if self.mysql_error:
print('系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
raise CloseSpider()
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
print(f"{datetime.now()}: 搜索{response.meta.get('start_date')}至{response.meta.get('end_date')}")
# if is_empty:
# print(f"{datetime.now()}: 当前页面搜索结果为空")
# 如果第一页全被夹了,也会被认为是空的
if page_count < self.further_threshold and not is_empty:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={
'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
start_date = datetime.strptime(response.meta.get('start_date'), '%Y-%m-%d')
end_date = datetime.strptime(response.meta.get('end_date'), '%Y-%m-%d')
if start_date + timedelta(days=3) <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = (end_date + timedelta(days=1)).strftime('%Y-%m-%d') + '-0'
meta = {
'base_url': base_url,
'keyword': keyword
} if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION') else {
'base_url': base_url,
'keyword': keyword,
'province': province
}
mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, mid_str)
meta['start_date'] = start_date.strftime('%Y-%m-%d')
meta['end_date'] = mid_date.strftime('%Y-%m-%d')
# 获取前半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
mid_date = mid_date + timedelta(days=1)
mid_str = mid_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
mid_str, end_str)
meta['start_date'] = mid_date.strftime('%Y-%m-%d')
meta['end_date'] = end_date.strftime('%Y-%m-%d')
# 获取后半段时间的搜索结果
yield scrapy.Request(url=url,
callback=self.parse,
meta=meta)
else:
while start_date <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一天的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
def parse_by_day(self, response):
"""以天为单位筛选"""
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
date = response.meta.get('date')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print(f"{datetime.now()}: {date} 日搜索结果为空")
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
start_date_str = date + '-0'
start_date = datetime.strptime(start_date_str, '%Y-%m-%d-%H')
for i in range(1, 25):
start_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
start_date = start_date + timedelta(hours=1)
end_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
url = base_url + self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
# 获取一小时的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province
if province else self.parse_by_hour,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'start_time': start_str,
'end_time': end_str
})
def parse_by_hour(self, response):
"""以小时为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print(f"{datetime.now()}: {start_time} 搜索结果为空")
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
for region in self.regions.values():
url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url += self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
# 获取一小时一个省的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': region
})
def parse_by_hour_province(self, response):
"""以小时和直辖市/省为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
province = response.meta.get('province')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print(f"{datetime.now()}: {start_time} {province} 搜索结果为空")
elif page_count < self.further_threshold:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
if page_count > 1:
next_url = response.url.split('&page=')[0] + f'&page=2'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': 2
})
else:
for city in province['city'].values():
url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:{}'
).format(keyword, province['code'], city)
url += self.weibo_type
url += self.contain_type
url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
# 获取一小时一个城市的搜索结果
yield scrapy.Request(url=url,
callback=self.parse_page,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': province,
'city': city,
'page_count': -1,
'page_index': 1
})
def parse_page(self, response):
"""解析一页搜索结果的信息"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = response.meta.get('page_count')
page_index = response.meta.get('page_index')
if is_empty and page_count < 0:
print(f"{datetime.now()}: {page_index}/{page_count} 页搜索结果为空")
with open('log.txt', 'a') as f:
f.write(f"{response.url} 0\n")
else:
if not is_empty:
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
else:
with open('log.txt', 'a') as f:
f.write(f"{response.url} 0\n")
if page_index < page_count:
next_url = response.url.split('&page=')[0] + f'&page={page_index + 1}'
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword,
'page_count': page_count,
'page_index': page_index + 1})
def get_ip(self, bid):
url = f"https://weibo.com/ajax/statuses/show?id={bid}&locale=zh-CN"
response = requests.get(url, headers=self.settings.get('DEFAULT_REQUEST_HEADERS'))
if response.status_code != 200:
return ""
try:
data = response.json()
except requests.exceptions.JSONDecodeError:
return ""
ip_str = data.get("region_name", "")
if ip_str:
ip_str = ip_str.split()[-1]
return ip_str
def get_article_url(self, selector):
"""获取微博头条文章url"""
article_url = ''
text = selector.xpath('string(.)').extract_first().replace(
'\u200b', '').replace('\ue627', '').replace('\n',
'').replace(' ', '')
if text.startswith('发布了头条文章'):
urls = selector.xpath('.//a')
for url in urls:
if url.xpath(
'i[@class="wbicon"]/text()').extract_first() == 'O':
if url.xpath('@href').extract_first() and url.xpath(
'@href').extract_first().startswith('https://t.cn'):
article_url = url.xpath('@href').extract_first()
break
return article_url
def n_get_article_url(self, selector, text_selector):
"""获取微博头条文章url"""
article_url = self.get_article_url(text_selector)
paths = ['div[@class="from"]/a[1]/@href', 'p[@class="from"]/a[1]/@href']
if not article_url or article_url == '':
for path in paths:
try:
article_url = 'https:' + selector.xpath('.//'+path).extract_first().split('?')[0]
break
except:
pass
if not article_url or article_url == '':
print('未找到文章链接')
print(selector.extract())
return article_url
def get_location(self, selector):
"""获取微博发布位置"""
a_list = selector.xpath('.//a')
location = ''
for a in a_list:
if a.xpath('./i[@class="wbicon"]') and a.xpath(
'./i[@class="wbicon"]/text()').extract_first() == '2':
location = a.xpath('string(.)').extract_first()[1:]
break
return location
def get_at_users(self, selector):
"""获取微博中@的用户昵称"""
a_list = selector.xpath('.//a')
at_users = ''
at_list = []
for a in a_list:
if len(unquote(a.xpath('@href').extract_first())) > 14 and len(
a.xpath('string(.)').extract_first()) > 1:
if unquote(a.xpath('@href').extract_first())[14:] == a.xpath(
'string(.)').extract_first()[1:]:
at_user = a.xpath('string(.)').extract_first()[1:]
if at_user not in at_list:
at_list.append(at_user)
if at_list:
at_users = ','.join(at_list)
return at_users
def get_topics(self, selector):
"""获取参与的微博话题"""
a_list = selector.xpath('.//a')
topics = ''
topic_list = []
for a in a_list:
text = a.xpath('string(.)').extract_first()
if len(text) > 2 and text[0] == '#' and text[-1] == '#':
if text[1:-1] not in topic_list:
topic_list.append(text[1:-1])
if topic_list:
topics = ','.join(topic_list)
return topics
def parse_weibo(self, response):
"""解析网页中的微博信息"""
keyword = response.meta.get('keyword')
card_warp_len = len(response.xpath('//div[@class="card-wrap"]'))
with open('log.txt', 'a') as f:
f.write(f"{response.url} {card_warp_len}\n")
for sel in response.xpath("//div[@class='card-wrap']"):
info = sel.xpath(
"div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
)
if info:
weibo = WeiboItem()
weibo['id'] = sel.xpath('@mid').extract_first()
bid = sel.xpath(
'.//div[@class="from"]/a[1]/@href').extract_first(
).split('/')[-1].split('?')[0]
weibo['bid'] = bid
weibo['user_id'] = info[0].xpath(
'div[2]/a/@href').extract_first().split('?')[0].split(
'/')[-1]
weibo['screen_name'] = info[0].xpath(
'div[2]/a/@nick-name').extract_first()
txt_sel = sel.xpath('.//p[@class="txt"]')[0]
retweet_sel = sel.xpath('.//div[@class="card-comment"]')
retweet_txt_sel = ''
if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//p[@class="txt"]')[0]
content_full = sel.xpath(
'.//p[@node-type="feed_list_content_full"]')
is_long_weibo = False
is_long_retweet = False
if content_full:
if not retweet_sel:
txt_sel = content_full[0]
is_long_weibo = True
elif len(content_full) == 2:
txt_sel = content_full[0]
retweet_txt_sel = content_full[1]
is_long_weibo = True
is_long_retweet = True
elif retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]')[0]
is_long_retweet = True
else:
txt_sel = content_full[0]
is_long_weibo = True
weibo['text'] = txt_sel.xpath(
'string(.)').extract_first().replace('\u200b', '').replace(
'\ue627', '')
weibo['article_url'] = self.n_get_article_url(sel, txt_sel)
weibo['location'] = self.get_location(txt_sel)
if weibo['location']:
weibo['text'] = weibo['text'].replace(
'2' + weibo['location'], '')
weibo['text'] = weibo['text'][2:].replace(' ', '')
if is_long_weibo:
weibo['text'] = weibo['text'][:-4]
weibo['at_users'] = self.get_at_users(txt_sel)
weibo['topics'] = self.get_topics(txt_sel)
reposts_count = sel.xpath(
'.//a[@action-type="feed_list_forward"]/text()').extract()
reposts_count = "".join(reposts_count)
try:
reposts_count = re.findall(r'\d+.*', reposts_count)
except TypeError:
print(
"无法解析转发按钮,可能是 1) 网页布局有改动 2) cookie无效或已过期。\n"
"请在 https://github.com/dataabc/weibo-search 查看文档,以解决问题,"
)
raise CloseSpider()
weibo['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = sel.xpath(
'.//a[@action-type="feed_list_comment"]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
weibo['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = sel.xpath(
'.//a[@action-type="feed_list_like"]/button/span[2]/text()').extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
weibo['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = sel.xpath(
'.//div[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
weibo['created_at'] = util.standardize_date(created_at)
source = sel.xpath('.//div[@class="from"]/a[2]/text()'
).extract_first()
weibo['source'] = source if source else ''
pics = ''
is_exist_pic = sel.xpath(
'.//div[@class="media media-piclist"]')
if is_exist_pic:
pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
pics = [pic[8:] for pic in pics]
pics = [
re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
]
pics = ['https://' + pic for pic in pics]
video_url = ''
is_exist_video = sel.xpath(
'.//div[@class="thumbnail"]//video-player').extract_first()
if is_exist_video:
video_url = re.findall(r'src:\'(.*?)\'', is_exist_video)[0]
video_url = video_url.replace('&amp;', '&')
video_url = 'http:' + video_url
if not retweet_sel:
weibo['pics'] = pics
weibo['video_url'] = video_url
else:
weibo['pics'] = ''
weibo['video_url'] = ''
weibo['retweet_id'] = ''
if retweet_sel and retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'):
retweet = WeiboItem()
retweet['id'] = retweet_sel[0].xpath(
'.//a[@action-type="feed_list_like"]/@action-data'
).extract_first()[4:]
retweet['bid'] = retweet_sel[0].xpath(
'.//p[@class="from"]/a/@href').extract_first().split(
'/')[-1].split('?')[0]
info = retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'
)[0]
retweet['user_id'] = info.xpath(
'@href').extract_first().split('/')[-1]
retweet['screen_name'] = info.xpath(
'@nick-name').extract_first()
retweet['text'] = retweet_txt_sel.xpath(
'string(.)').extract_first().replace('\u200b',
'').replace(
'\ue627', '')
retweet['article_url'] = self.n_get_article_url(
retweet_sel[0], retweet_txt_sel)
retweet['location'] = self.get_location(retweet_txt_sel)
if retweet['location']:
retweet['text'] = retweet['text'].replace(
'2' + retweet['location'], '')
retweet['text'] = retweet['text'][2:].replace(' ', '')
if is_long_retweet:
retweet['text'] = retweet['text'][:-4]
retweet['at_users'] = self.get_at_users(retweet_txt_sel)
retweet['topics'] = self.get_topics(retweet_txt_sel)
reposts_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[1]/a[1]/text()'
).extract_first()
reposts_count = re.findall(r'\d+.*', reposts_count)
retweet['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[2]/a[1]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
retweet['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = retweet_sel[0].xpath(
'.//a[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]//span[@class="woo-like-count"]/text()'
).extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
retweet['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = retweet_sel[0].xpath(
'.//p[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
retweet['created_at'] = util.standardize_date(created_at)
source = retweet_sel[0].xpath(
'.//p[@class="from"]/a[2]/text()').extract_first()
retweet['source'] = source if source else ''
retweet['pics'] = pics
retweet['video_url'] = video_url
retweet['retweet_id'] = ''
yield {'weibo': retweet, 'keyword': keyword}
weibo['retweet_id'] = retweet['id']
weibo["ip"] = self.get_ip(bid)
avator = sel.xpath(
"div[@class='card']/div[@class='card-feed']/div[@class='avator']"
)
if avator:
user_auth = avator.xpath('.//svg/@id').extract_first()
# print(user_auth)
if user_auth == 'woo_svg_vblue':
weibo['user_authentication'] = '蓝V'
elif user_auth == 'woo_svg_vyellow':
weibo['user_authentication'] = '黄V'
elif user_auth == 'woo_svg_vorange':
weibo['user_authentication'] = '红V'
elif user_auth == 'woo_svg_vgold':
weibo['user_authentication'] = '金V'
else:
weibo['user_authentication'] = '普通用户'
# print(weibo)
yield {'weibo': weibo, 'keyword': keyword}
I adjusted the search logic so that the date ranges no longer overlap (the output for the filtered case is rather ad hoc; I hope the author can tidy it up). parse_page can now handle the filtered case. get_article_url did not seem to work correctly, so I adjusted it accordingly.
Also, video links of very old weibos cannot be crawled properly (in my case, posts from 2014).
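To make the non-overlapping split concrete, here is a small worked example of the midpoint arithmetic used in the modified parse above (the dates are made up purely for illustration): the first half runs from the start date to the midpoint, and the second half starts the day after the midpoint, so no day is requested twice.
from datetime import datetime, timedelta

# made-up example range, purely for illustration
start_date = datetime(2020, 1, 1)
end_date = datetime(2020, 1, 10)

# midpoint, computed the same way as in the modified parse
mid_date = start_date + timedelta(days=(end_date - start_date).days // 2)

first_half = (start_date, mid_date)                     # 2020-01-01 .. 2020-01-05
second_half = (mid_date + timedelta(days=1), end_date)  # 2020-01-06 .. 2020-01-10

# the two sub-ranges share no day, so the same weibo is not fetched twice
print(first_half, second_half)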
Hello~ May I ask, does re-adjusting it this way avoid the blank-first-page situation? Also, the author has reminded everyone in several places that the parse_by_hour part needs to be modified, but I see your version still uses the original code there. I made the following modification based on the author's suggestion; do you think this works?
def parse_by_hour(self, response):
"""以小时为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
if is_empty:
print(f"{datetime.now()}: {start_time} 搜索结果为空")
else:
# 解析当前页面
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})