newspaper
newspaper copied to clipboard
can't start new thread
Sorry, my English is not good, I will try to be as clear as possible
I run my program on 3 servers, but I still get errors such as `error: can't start new thread`.
Here is my code
import hashlib
import datetime
import time, re
import newspaper
from newspaper import Article, Config, ArticleException
from lxml import etree
# User-Agent string assigned to the shared configuration below.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) '
    'Gecko/20100101 Firefox/78.0'
)

# Shared newspaper configuration; spider_newspaper_information passes it to
# every Article it creates.
config = Config()
config.browser_user_agent = USER_AGENT
# Needed because spider_newspaper_information reads article.article_html.
config.keep_article_html = True
config.request_timeout = 25
config.thread_timeout_seconds = 5
# NOTE(review): deliberately lowered while chasing "can't start new thread";
# confirm this value is still wanted once the root cause is found.
config.number_threads = 1
def _fallback_publish_time(article):
    """Return the text of the first <time> element of the page, or ''."""
    try:
        # Best-effort scrape: lxml may reject the markup or return None for
        # it, and the page may simply contain no <time> element.
        return etree.HTML(article.html).xpath('//time/text()')[0]
    except (IndexError, AttributeError, TypeError, ValueError,
            etree.LxmlError):
        return ''


def _collect_og_media(meta_data, website_name):
    """Read the Open Graph entry of *meta_data*.

    Returns a ``(videos, images, website_name)`` tuple where *website_name*
    is replaced by the ``site_name`` value only when the caller passed ''.
    """
    videos = []
    images = []
    og = meta_data.get('og')
    # Skip anything that is not a mapping so a malformed 'og' value does not
    # abort the crawl.
    if isinstance(og, dict):
        if 'video' in og:
            videos.append(og['video'])
        if website_name == '' and 'site_name' in og:
            website_name = og['site_name']
        if 'image' in og:
            images.append(og['image'])
    return videos, images, website_name


def spider_newspaper_information(build_url, website_name):
    """Crawl the news source at *build_url* and collect one record per article.

    Args:
        build_url: URL of the news site handed to ``newspaper.build``.
        website_name: Name stored in every record. When it is '', the first
            Open Graph ``site_name`` found is used from then on.

    Returns:
        A dict with ``web_size`` (value of ``web_paper.size()``),
        ``data_list`` (the list of article records) and ``use_time``
        (elapsed seconds, rounded to 3 decimals).
    """
    start_time = time.time()
    # Pass the shared config so the source build honours the same user agent,
    # timeouts and thread count as the per-article requests.
    web_paper = newspaper.build(url=build_url, config=config)
    print(web_paper.size())

    data_list = []
    for url in web_paper.article_urls():
        article = Article(url, config=config)
        try:
            article.download()
            time.sleep(1)
            article.parse()
        except ArticleException:
            continue
        if article.title == '':
            continue

        # Only scrape the page for a date when newspaper found none.
        if article.publish_date is None:
            create_time = _fallback_publish_time(article)
        else:
            create_time = article.publish_date.strftime("%Y-%m-%d %H:%M:%S")

        video_url, img_list, website_name = _collect_og_media(
            article.meta_data, website_name)

        # Keep only the images that actually appear in the article body.
        article_html = article.article_html
        img_list.extend(
            {'identifier': img, 'width': 960, 'height': 540}
            for img in article.imgs
            if img in article_html
        )

        data = {
            'url': url,
            'source_url': article.source_url,
            'title': article.title,
            'top_img': article.top_img,
            'imgs': str(img_list),
            # Fall back to the Open Graph video when newspaper found no movies.
            'movies': str(article.movies) if article.movies else str(video_url),
            'text': article.text,
            'keywords': article.meta_keywords,
            'tags': list(article.tags),
            'authors': article.authors,
            'lang': article.meta_lang,
            'description': article.meta_description,
            'create_time': create_time,
            'collection_time': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'website_name': website_name,
        }
        print(data)
        data_list.append(data)

    return {
        'web_size': web_paper.size(),
        'data_list': data_list,
        'use_time': round(time.time() - start_time, 3),
    }
Is there any room for improvement? help...
Why are you setting `config.number_threads` to 1? The default is 10.
Also take a look at the threading section in my Newspaper3k Overview Document.
你为什么将 `config.number_threads` 设置为 1?默认值为 10。另请参阅我的 Newspaper3k 概述文档中的线程部分。
Thank you for your reply!
I set it to 1 because I thought this setting was the cause of the error, but in fact the same error still occurs even after lowering it. T^T