weibo-search icon indicating copy to clipboard operation
weibo-search copied to clipboard

运行后报错AttributeError: 'NoneType' object has no attribute 'split'

Open xiaoyinguo22 opened this issue 2 years ago • 17 comments

环境都配置好了,之前运行都有用,今天再使用的时候出现了这个错误:

Traceback (most recent call last):
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\", line 132, in iter_errback
    yield next(it)
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\", line 354, in __next__
    return next(
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\", line 354, in __next__
    return next(
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\", line 29, in process_spider_output
    for x in result:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\", line 342, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\", line 40, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\", line 66, in _evaluate_iterable
    for r in iterable:
  File "F:\python_code\weibo\weibo-search\weibo\spiders\", line 107, in parse
    for weibo in self.parse_weibo(response):
  File "F:\python_code\weibo\weibo-search\weibo\spiders\", line 356, in parse_weibo
    weibo['bid'] = sel.xpath(
AttributeError: 'NoneType' object has no attribute 'split'

xiaoyinguo22 avatar Aug 13 '22 08:08 xiaoyinguo22


ErikChen0001 avatar Aug 13 '22 08:08 ErikChen0001


compaction avatar Aug 13 '22 12:08 compaction

我也同样出现该问题,我在其他网站上搜索出现 AttributeError: 'NoneType' object has no attribute 'split'的情况,发现其他人说只是有报错提示但是不影响数据的爬取。 但我感觉还是少爬了一部分数据…不知道如何解决,同等回复

ShenglinYU avatar Aug 13 '22 15:08 ShenglinYU


dataabc avatar Aug 14 '22 14:08 dataabc

刚刚看了其他人的反馈,发现把 search.py 里面的 p[@class="from"] 全部更换为 div[@class="from"],就可以跑起来

ShenglinYU avatar Aug 14 '22 14:08 ShenglinYU


DeNancy avatar Aug 15 '22 02:08 DeNancy


DeNancy avatar Aug 15 '22 03:08 DeNancy


compaction avatar Aug 15 '22 06:08 compaction


Stubborn-one avatar Aug 16 '22 14:08 Stubborn-one

  File "/opt/conda/envs/python35-paddle120-env/bin/scrapy", line 8, in <module>
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/", line 123, in execute
    settings = get_project_settings()
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/utils/", line 68, in get_project_settings
    settings.setmodule(settings_module_path, priority='project')
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/", line 290, in setmodule
    self.set(key, getattr(module, key), priority)
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/", line 265, in set
    self.attributes[name].set(value, priority)
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/", line 50, in set
    value = BaseSettings(value, priority=priority)
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/", line 86, in __init__
    self.update(values, priority)
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/", line 322, in update
    for name, value in values.items():
AttributeError: 'set' object has no attribute 'items'


Ss-shuang123 avatar Aug 18 '22 07:08 Ss-shuang123


xiaoyinguo22 avatar Aug 19 '22 08:08 xiaoyinguo22

# -*- coding: utf-8 -*-

import os import re import sys from datetime import datetime, timedelta from urllib.parse import unquote

import scrapy import weibo.utils.util as util from scrapy.exceptions import CloseSpider from scrapy.utils.project import get_project_settings from weibo.items import WeiboItem

class SearchSpider(scrapy.Spider):
    """Scrapy spider named 'search' that is driven by the project settings."""

    name = 'search'
    allowed_domains = ['']
    settings = get_project_settings()
    # KEYWORD_LIST is either a list of keywords or the path of a file that
    # util.get_keyword_list() turns into one.
    keyword_list = settings.get('KEYWORD_LIST')
    if not isinstance(keyword_list, list):
        # Resolve a relative path against the current working directory.
        if not os.path.isabs(keyword_list):
            keyword_list = os.getcwd() + os.sep + keyword_list
        # Stop right away when the keyword file does not exist.
        if not os.path.isfile(keyword_list):
            sys.exit('不存在%s文件' % keyword_list)
        keyword_list = util.get_keyword_list(keyword_list)

# Percent-encode the '#' delimiters of hashtag keywords so that
# '#topic#' becomes '%23topic%23'.
for i, keyword in enumerate(keyword_list):
    if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
        keyword_list[i] = '%23' + keyword[1:-1] + '%23'
# Search filters, converted from the project settings by the util helpers.
weibo_type = util.convert_weibo_type(settings.get('WEIBO_TYPE'))
contain_type = util.convert_contain_type(settings.get('CONTAIN_TYPE'))
regions = util.get_regions(settings.get('REGION'))
# Prefix that the parse callbacks prepend to extracted next-page links.
# NOTE(review): empty in this paste — presumably the site URL was stripped; confirm.
base_url = ''
# NOTE(review): the next three statements are truncated in this paste
# (unclosed call, stray ')' and an `if` with no body).
start_date = settings.get('START_DATE',
end_date = settings.get('END_DATE','%Y-%m-%d'))
if util.str_to_time(start_date) > util.str_to_time(end_date):
# Threshold the parse callbacks compare their page count against (default 46).
further_threshold = settings.get('FURTHER_THRESHOLD', 46)
# Storage-backend failure flags; check_environment() stops the spider when
# any of them is truthy.
mongo_error = False
pymongo_error = False
mysql_error = False
pymysql_error = False

def start_requests(self):
    # Builds the initial search requests: per keyword, and per keyword and
    # region inside the `for region` loop below.
    # NOTE(review): this paste has lost lines (the end of the `if`
    # condition, the URL templates, the Request callback/meta wrappers and
    # presumably an `else:`), so the function does not parse as shown.
    start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
    # The end date is pushed one day forward before it is formatted.
    end_date = datetime.strptime(self.end_date,
                                 '%Y-%m-%d') + timedelta(days=1)
    # NOTE(review): the '-0' suffix looks like an hour field of the
    # timescope parameter — confirm.
    start_str = start_date.strftime('%Y-%m-%d') + '-0'
    end_str = end_date.strftime('%Y-%m-%d') + '-0'
    for keyword in self.keyword_list:
        if not self.settings.get('REGION') or '全部' in self.settings.get(
            base_url = '' % keyword
            url = base_url + self.weibo_type
            url += self.contain_type
            url += '&timescope=custom:{}:{}'.format(start_str, end_str)
            yield scrapy.Request(url=url,
                                     'base_url': base_url,
                                     'keyword': keyword
            for region in self.regions.values():
                base_url = (
                ).format(keyword, region['code'])
                url = base_url + self.weibo_type
                url += self.contain_type
                url += '&timescope=custom:{}:{}'.format(start_str, end_str)
                # Fetch the search results for one province
                yield scrapy.Request(url=url,
                                         'base_url': base_url,
                                         'keyword': keyword,
                                         'province': region

def check_environment(self):
    """Stop the spider when a storage backend was flagged as unavailable.

    The four ``*_error`` flags are inspected in a fixed order (pymongo,
    mongo, pymysql, mysql). The first truthy flag ends the check; for the
    two driver flags an installation hint is printed first.

    Raises:
        CloseSpider: if any of the four flags is truthy.
    """
    checks = (
        ('pymongo_error',
         '系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序'),
        ('mongo_error', None),
        ('pymysql_error',
         '系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序'),
        ('mysql_error', None),
    )
    for flag_name, hint in checks:
        # Flags are read lazily, so later ones are untouched once one fires.
        if not getattr(self, flag_name):
            continue
        if hint is not None:
            print(hint)
        raise CloseSpider()

def parse(self, response):
    # Callback for a keyword-level result page; reads base_url, keyword and
    # province from response.meta.
    # NOTE(review): lines are missing from this paste (the body of
    # `if is_empty:`, the next-page XPath, presumably an `else:` before the
    # per-day split, and the Request callback/meta wrappers).
    base_url = response.meta.get('base_url')
    keyword = response.meta.get('keyword')
    province = response.meta.get('province')
    # Non-empty when the page contains the "card-no-result" block.
    is_empty = response.xpath(
        '//div[@class="card card-no-result s-pt20b40"]')
    # Number of <li> entries under the "s-scroll" list — presumably the
    # pager; confirm.
    page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
    if is_empty:
    elif page_count < self.further_threshold:
        # Parse the current page
        for weibo in self.parse_weibo(response):
            yield weibo
        next_url = response.xpath(
        if next_url:
            # The extracted link is joined onto self.base_url before it is requested.
            next_url = self.base_url + next_url
            yield scrapy.Request(url=next_url,
                                 meta={'keyword': keyword})
        # Walk from start_date to end_date in consecutive one-day windows.
        start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
        end_date = datetime.strptime(self.end_date, '%Y-%m-%d')
        while start_date <= end_date:
            start_str = start_date.strftime('%Y-%m-%d') + '-0'
            start_date = start_date + timedelta(days=1)
            end_str = start_date.strftime('%Y-%m-%d') + '-0'
            url = base_url + self.weibo_type
            url += self.contain_type
            url += '&timescope=custom:{}:{}&page=1'.format(
                start_str, end_str)
            # Fetch one day of search results
            yield scrapy.Request(url=url,
                                     'base_url': base_url,
                                     'keyword': keyword,
                                     'province': province,
                                     'date': start_str[:-2]

def parse_by_day(self, response):
    # Callback for a one-day result page; the day comes from
    # response.meta['date'].
    # NOTE(review): lines are missing from this paste (the body of
    # `if is_empty:`, the next-page XPath, presumably an `else:` before the
    # per-hour split, and most of the Request callback/meta wrapper).
    base_url = response.meta.get('base_url')
    keyword = response.meta.get('keyword')
    province = response.meta.get('province')
    # Non-empty when the page contains the "card-no-result" block.
    is_empty = response.xpath(
        '//div[@class="card card-no-result s-pt20b40"]')
    date = response.meta.get('date')
    page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
    if is_empty:
    elif page_count < self.further_threshold:
        # Parse the current page
        for weibo in self.parse_weibo(response):
            yield weibo
        next_url = response.xpath(
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(url=next_url,
                                 meta={'keyword': keyword})
        # Walk the day in 24 consecutive one-hour windows, starting at hour 0.
        start_date_str = date + '-0'
        start_date = datetime.strptime(start_date_str, '%Y-%m-%d-%H')
        for i in range(1, 25):
            # The 'X' marker trick drops the leading zero of the hour
            # ('-X05' -> '-5', '-X00' -> '-0').
            start_str = start_date.strftime('%Y-%m-%d-X%H').replace(
                'X0', 'X').replace('X', '')
            start_date = start_date + timedelta(hours=1)
            end_str = start_date.strftime('%Y-%m-%d-X%H').replace(
                'X0', 'X').replace('X', '')
            url = base_url + self.weibo_type
            url += self.contain_type
            url += '&timescope=custom:{}:{}&page=1'.format(
                start_str, end_str)
            # Fetch one hour of search results
            yield scrapy.Request(url=url,
                                 if province else self.parse_by_hour,
                                     'base_url': base_url,
                                     'keyword': keyword,
                                     'province': province,
                                     'start_time': start_str,
                                     'end_time': end_str

def parse_by_hour(self, response):
    # Callback for a one-hour result page; the window comes from
    # response.meta['start_time'] / ['end_time'].
    # NOTE(review): lines are missing from this paste (the body of
    # `if is_empty:`, the next-page XPath, presumably an `else:` before the
    # per-region split, the start of the URL template and the Request
    # callback/meta wrapper).
    keyword = response.meta.get('keyword')
    # Non-empty when the page contains the "card-no-result" block.
    is_empty = response.xpath(
        '//div[@class="card card-no-result s-pt20b40"]')
    start_time = response.meta.get('start_time')
    end_time = response.meta.get('end_time')
    page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
    if is_empty:
    elif page_count < self.further_threshold:
        # Parse the current page
        for weibo in self.parse_weibo(response):
            yield weibo
        next_url = response.xpath(
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(url=next_url,
                                 meta={'keyword': keyword})
        # Repeat the same hour window once per configured region.
        for region in self.regions.values():
            url = ('{}&region=custom:{}:1000'
                   ).format(keyword, region['code'])
            url += self.weibo_type
            url += self.contain_type
            url += '&timescope=custom:{}:{}&page=1'.format(
                start_time, end_time)
            # Fetch one hour of search results for one province
            yield scrapy.Request(url=url,
                                     'keyword': keyword,
                                     'start_time': start_time,
                                     'end_time': end_time,
                                     'province': region

def parse_by_hour_province(self, response):
    # Callback for a one-hour, one-province result page; the window and
    # the province come from response.meta.
    # NOTE(review): lines are missing from this paste (the body of
    # `if is_empty:`, the next-page XPath, presumably an `else:` before the
    # per-city split, the start of the URL template and the Request
    # callback/meta wrapper).
    keyword = response.meta.get('keyword')
    # Non-empty when the page contains the "card-no-result" block.
    is_empty = response.xpath(
        '//div[@class="card card-no-result s-pt20b40"]')
    start_time = response.meta.get('start_time')
    end_time = response.meta.get('end_time')
    province = response.meta.get('province')
    page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
    if is_empty:
    elif page_count < self.further_threshold:
        # Parse the current page
        for weibo in self.parse_weibo(response):
            yield weibo
        next_url = response.xpath(
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(url=next_url,
                                 meta={'keyword': keyword})
        # Repeat the same hour window once per value in province['city'].
        for city in province['city'].values():
            url = ('{}&region=custom:{}:{}'
                   ).format(keyword, province['code'], city)
            url += self.weibo_type
            url += self.contain_type
            url += '&timescope=custom:{}:{}&page=1'.format(
                start_time, end_time)
            # Fetch one hour of search results for one city
            yield scrapy.Request(url=url,
                                     'keyword': keyword,
                                     'start_time': start_time,
                                     'end_time': end_time,
                                     'province': province,
                                     'city': city

def parse_page(self, response):
    # Callback that yields the posts of one result page and then requests
    # the next page when a next-page link is found.
    # NOTE(review): as pasted, the parsing loop sits directly under
    # `if is_empty:`; the sibling callbacks suggest the real body of that
    # branch and an `else:` were lost, as was the next-page XPath — confirm
    # against the real file.
    keyword = response.meta.get('keyword')
    # Non-empty when the page contains the "card-no-result" block.
    is_empty = response.xpath(
        '//div[@class="card card-no-result s-pt20b40"]')
    if is_empty:
        for weibo in self.parse_weibo(response):
            yield weibo
        next_url = response.xpath(
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(url=next_url,
                                 meta={'keyword': keyword})

def get_article_url(self, selector):
    # Returns an href taken from an <a> inside `selector`, or '' when none
    # is picked.
    # NOTE(review): the second half of the innermost `if` condition is
    # missing from this paste, so the exact filter on the href is unknown.
    article_url = ''
    # Normalise the node text: drop '\u200b', '\ue627', newlines and spaces.
    text = selector.xpath('string(.)').extract_first().replace(
        '\u200b', '').replace('\ue627', '').replace('\n',
                                                    '').replace(' ', '')
    # Links are only scanned when the text starts with this marker phrase.
    if text.startswith('发布了头条文章'):
        urls = selector.xpath('.//a')
        for url in urls:
            # Only anchors whose "wbicon" <i> child has the text 'O' qualify.
            if url.xpath(
                    'i[@class="wbicon"]/text()').extract_first() == 'O':
                if url.xpath('@href').extract_first() and url.xpath(
                    article_url = url.xpath('@href').extract_first()
    return article_url

def get_location(self, selector):
    """Return the location text attached to a post, or '' when there is none.

    An ``<a>`` under ``selector`` counts as a location link when it has a
    ``wbicon`` ``<i>`` child whose text is ``'2'``. The result is that link's
    string value without its first character. When several links qualify,
    the last one wins.
    """
    found = ''
    for anchor in selector.xpath('.//a'):
        # Skip anchors that have no icon child at all.
        if not anchor.xpath('./i[@class="wbicon"]'):
            continue
        marker = anchor.xpath('./i[@class="wbicon"]/text()').extract_first()
        if marker != '2':
            continue
        # The string value starts with the icon character; slice it off.
        found = anchor.xpath('string(.)').extract_first()[1:]
    return found

def get_at_users(self, selector):
    # Returns the collected names joined with ',' or '' when at_list is empty.
    # NOTE(review): this paste is truncated — the right-hand side of the
    # `==` comparison and the body of `if at_user not in at_list:` are
    # missing, so nothing visible here ever fills at_list.
    a_list = selector.xpath('.//a')
    at_users = ''
    at_list = []
    for a in a_list:
        # Only anchors whose unquoted href is longer than 14 characters and
        # whose text is longer than 1 character are considered.
        if len(unquote(a.xpath('@href').extract_first())) > 14 and len(
                a.xpath('string(.)').extract_first()) > 1:
            if unquote(a.xpath('@href').extract_first())[14:] == a.xpath(
                # The candidate name is the link text without its first character.
                at_user = a.xpath('string(.)').extract_first()[1:]
                if at_user not in at_list:
    if at_list:
        at_users = ','.join(at_list)
    return at_users

def get_topics(self, selector):
    # Returns the collected topics joined with ',' or '' when topic_list is empty.
    # NOTE(review): the body of `if text[1:-1] not in topic_list:` is
    # missing from this paste, so nothing visible here ever fills topic_list.
    a_list = selector.xpath('.//a')
    topics = ''
    topic_list = []
    for a in a_list:
        text = a.xpath('string(.)').extract_first()
        # A topic link has text of the form '#...#' with something in between.
        if len(text) > 2 and text[0] == '#' and text[-1] == '#':
            if text[1:-1] not in topic_list:
    if topic_list:
        topics = ','.join(topic_list)
    return topics

def parse_weibo(self, response):
    # Walks every div[@class='card-wrap'] node of the response and yields
    # {'weibo': item, 'keyword': keyword} dicts. When a retweeted post is
    # found it is yielded first and its id is stored in weibo['retweet_id'].
    # NOTE(review): many statements in this paste are cut off
    # mid-expression (the XPath arguments of most `.xpath(` calls, and
    # presumably `try:` / `else:` / `print(` lines), so the function does
    # not parse as shown; the comments below only describe what is visible.
    keyword = response.meta.get('keyword')
    for sel in response.xpath("//div[@class='card-wrap']"):
        info = sel.xpath(
        if info:
            weibo = WeiboItem()
            weibo['id'] = sel.xpath('@mid').extract_first()
            weibo['bid'] = sel.xpath(
            weibo['user_id'] = info[0].xpath(
            weibo['screen_name'] = info[0].xpath(
            txt_sel = sel.xpath('.//p[@class="txt"]')[0]
            # A "card-comment" block, when present, holds the retweeted post.
            retweet_sel = sel.xpath('.//div[@class="card-comment"]')
            retweet_txt_sel = ''
            if retweet_sel and retweet_sel[0].xpath('.//div[@class="txt"]'):
                retweet_txt_sel = retweet_sel[0].xpath(
            content_full = sel.xpath(
            is_long_weibo = False
            is_long_retweet = False
            # Decide which text node(s) to use when content_full is non-empty.
            if content_full:
                if not retweet_sel:
                    txt_sel = content_full[0]
                    is_long_weibo = True
                elif len(content_full) == 2:
                    txt_sel = content_full[0]
                    retweet_txt_sel = content_full[1]
                    is_long_weibo = True
                    is_long_retweet = True
                elif retweet_sel[0].xpath(
                    retweet_txt_sel = retweet_sel[0].xpath(
                    is_long_retweet = True
                    txt_sel = content_full[0]
                    is_long_weibo = True
            # Post text with the '\u200b' and '\ue627' characters removed.
            weibo['text'] = txt_sel.xpath(
                'string(.)').extract_first().replace('\u200b', '').replace(
                    '\ue627', '')
            weibo['article_url'] = self.get_article_url(txt_sel)
            weibo['location'] = self.get_location(txt_sel)
            # Remove the location link's text ('2' + location) from the post text.
            if weibo['location']:
                weibo['text'] = weibo['text'].replace(
                    '2' + weibo['location'], '')
            # Drop the first two characters and every space.
            weibo['text'] = weibo['text'][2:].replace(' ', '')
            # Long posts additionally lose their last four characters.
            if is_long_weibo:
                weibo['text'] = weibo['text'][:-4]
            weibo['at_users'] = self.get_at_users(txt_sel)
            weibo['topics'] = self.get_topics(txt_sel)
            reposts_count = sel.xpath(
            reposts_count = "".join(reposts_count)
                reposts_count = re.findall(r'\d+.*', reposts_count)
            except TypeError:
                    "无法解析转发按钮,可能是 1) 网页布局有改动 2) cookie无效或已过期。\n"
                    "请在 查看文档,以解决问题,"
                raise CloseSpider()
            # Each counter falls back to '0' when no digits were found.
            weibo['reposts_count'] = reposts_count[
                0] if reposts_count else '0'
            comments_count = sel.xpath(
            comments_count = re.findall(r'\d+.*', comments_count)
            weibo['comments_count'] = comments_count[
                0] if comments_count else '0'
            attitudes_count = sel.xpath(
            attitudes_count = re.findall(r'\d+.*', attitudes_count)
            weibo['attitudes_count'] = attitudes_count[
                0] if attitudes_count else '0'
            created_at = sel.xpath(
                ).replace(' ', '').replace('\n', '').split('前')[0]
            weibo['created_at'] = util.standardize_date(created_at)
            source = sel.xpath('.//div[@class="from"]/a[2]/text()'
            weibo['source'] = source if source else ''
            pics = ''
            is_exist_pic = sel.xpath(
                './/div[@class="media media-piclist"]')
            if is_exist_pic:
                # Rewrite each image URL: drop its first 8 characters, swap
                # the first '/.../' path segment for '/large/', then prepend
                # 'https://'.
                pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
                pics = [pic[8:] for pic in pics]
                pics = [
                    re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
                pics = ['https://' + pic for pic in pics]
            video_url = ''
            is_exist_video = sel.xpath(
            if is_exist_video:
                # Take the first src:'...' value, unescape '&amp;' and
                # prefix the scheme.
                video_url = re.findall(r'src:\'(.*?)\'', is_exist_video)[0]
                video_url = video_url.replace('&amp;', '&')
                video_url = 'http:' + video_url
            # NOTE(review): an `else:` seems to be missing between the two
            # pairs of assignments below — confirm against the real file.
            if not retweet_sel:
                weibo['pics'] = pics
                weibo['video_url'] = video_url
                weibo['pics'] = ''
                weibo['video_url'] = ''
            weibo['retweet_id'] = ''
            if retweet_sel and retweet_sel[0].xpath(
                # Build a second item for the retweeted post, using the same
                # cleaning steps as above.
                retweet = WeiboItem()
                retweet['id'] = retweet_sel[0].xpath(
                retweet['bid'] = retweet_sel[0].xpath(
                info = retweet_sel[0].xpath(
                retweet['user_id'] = info.xpath(
                retweet['screen_name'] = info.xpath(
                retweet['text'] = retweet_txt_sel.xpath(
                                                             '\ue627', '')
                retweet['article_url'] = self.get_article_url(
                retweet['location'] = self.get_location(retweet_txt_sel)
                if retweet['location']:
                    retweet['text'] = retweet['text'].replace(
                        '2' + retweet['location'], '')
                retweet['text'] = retweet['text'][2:].replace(' ', '')
                if is_long_retweet:
                    retweet['text'] = retweet['text'][:-4]
                retweet['at_users'] = self.get_at_users(retweet_txt_sel)
                retweet['topics'] = self.get_topics(retweet_txt_sel)
                reposts_count = retweet_sel[0].xpath(
                    './/ul[@class="act s-fr"]/li[1]/a[1]/text()'
                reposts_count = re.findall(r'\d+.*', reposts_count)
                retweet['reposts_count'] = reposts_count[
                    0] if reposts_count else '0'
                comments_count = retweet_sel[0].xpath(
                    './/ul[@class="act s-fr"]/li[2]/a[1]/text()'
                comments_count = re.findall(r'\d+.*', comments_count)
                retweet['comments_count'] = comments_count[
                    0] if comments_count else '0'
                attitudes_count = retweet_sel[0].xpath(
                    './/a[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]//span[@class="woo-like-count"]/text()'
                attitudes_count = re.findall(r'\d+.*', attitudes_count)
                retweet['attitudes_count'] = attitudes_count[
                    0] if attitudes_count else '0'
                created_at = retweet_sel[0].xpath(
                    ).replace(' ', '').replace('\n', '').split('前')[0]
                retweet['created_at'] = util.standardize_date(created_at)
                source = retweet_sel[0].xpath(
                retweet['source'] = source if source else ''
                # The media found on the card is attached to the retweeted post.
                retweet['pics'] = pics
                retweet['video_url'] = video_url
                retweet['retweet_id'] = ''
                yield {'weibo': retweet, 'keyword': keyword}
                # Link the outer post to the retweeted one.
                weibo['retweet_id'] = retweet['id']
            yield {'weibo': weibo, 'keyword': keyword}

Stubborn-one avatar Aug 19 '22 08:08 Stubborn-one


emilygong-zhuying avatar Aug 20 '22 20:08 emilygong-zhuying


yiweiyi121 avatar Aug 23 '22 03:08 yiweiyi121

切换成旧版微博的cookie就可以解决这个问题,不需要修改代码。 我也遇到同样的问题,如果使用新版web微博的cookie就会出现这个问题😂

JasonZ023 avatar Sep 13 '22 16:09 JasonZ023