[Bug]: arun_many Unable to get results at runtime
crawl4ai version
0.6.3
Expected Behavior
arun_many() should return the same successful results that arun() returns for the same URL and configuration.
Current Behavior
With the same configuration, arun() returns the correct result, but arun_many() returns an empty result with results[0].error_message = "'list' object has no attribute 'status_code'".
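A stripped-down sketch of the asymmetry, with the project-specific browser config, filters and proxy removed (so this is not my exact setup, just the minimal shape of it):

import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

async def compare():
    config = CrawlerRunConfig(deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=0))
    async with AsyncWebCrawler() as crawler:
        # arun(): succeeds with this config
        single = await crawler.arun("https://docs.crawl4ai.com/", config=config)
        first = single[0] if isinstance(single, list) else single  # deep crawl may return a list
        print("arun:", first.success, first.status_code)
        # arun_many(): same config, same URL, but the result comes back failed
        many = await crawler.arun_many(urls=["https://docs.crawl4ai.com/"], config=config)
        print("arun_many:", many[0].success, many[0].error_message)

asyncio.run(compare())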
Is this reproducible?
Yes
Inputs Causing the Bug
async with AsyncWebCrawler(config=browser_config) as crawler:
    # crawler.crawler_strategy.set_hook("before_goto", before_goto)
    results = await crawler.arun_many(
        urls=["https://docs.crawl4ai.com/"],
        config=run_config,
        # dispatcher=dispatcher
    )
Steps to Reproduce
Code snippets
import asyncio
import os
import logging
from datetime import datetime
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
from crawl4ai import RateLimiter
from crawl4ai.async_dispatcher import SemaphoreDispatcher
from playwright.async_api import Page, BrowserContext
import aiofiles

# Global counter and lock
page_counter = 0
write_lock = asyncio.Lock()

async def write_page_to_file(result, page_num):
    """Write a single page's data to the output file in real time."""
    async with write_lock:
        output_file = "/home/duduzhang/webcraw/streamlit_realtime.md"
        # If this is the first page, write the file header
        if page_num == 1:
            async with aiofiles.open(output_file, 'w', encoding='utf-8') as f:
                await f.write("# Streamlit Documentation\n")
                await f.write(f"\nCrawled on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                await f.write("---\n\n")
        # Append the page content
        async with aiofiles.open(output_file, 'a', encoding='utf-8') as f:
            await f.write(f"## Page {page_num}: {result.url}\n")
            await f.write(f"**Depth:** {result.metadata.get('depth', 0)}\n")
            await f.write(f"**Status:** {result.status_code}\n")
            if hasattr(result, 'title') and result.title:
                await f.write(f"**Title:** {result.title}\n")
            await f.write("\n### Content\n\n")
            # Add the content
            if result.markdown:
                content = result.markdown[:10000]
                if len(result.markdown) > 10000:
                    content += "\n\n*[Content truncated due to length]*"
                await f.write(content)
            elif result.cleaned_html:
                content = result.cleaned_html[:5000]
                if len(result.cleaned_html) > 5000:
                    content += "\n\n*[Content truncated due to length]*"
                await f.write(f"\n{content}\n")
            else:
                await f.write("*No content available*")
            await f.write("\n\n---\n\n")
        print(f"✅ Page {page_num} written to file: {result.url}")

dispatcher = SemaphoreDispatcher(
    max_session_permit=5,
    rate_limiter=RateLimiter(
        base_delay=(1.5, 3.0),
        max_delay=30.0
    )
)

async def main():
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=False,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        },
        viewport_width=1920,
        viewport_height=1080,
        ignore_https_errors=True,
        java_script_enabled=True,
        verbose=True,
        proxy="http://localhost:7890"  # proxy; comment out temporarily if not needed
    )
    # Create the filter chain
    filter_chain = FilterChain([
        URLPatternFilter(
            patterns=["*discuss*", "*privacy-policy*"],
            reverse=True
        )
    ])
    # Configure crawl run parameters
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=0,
            include_external=False,
            filter_chain=filter_chain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        mean_delay=1.5,
        max_range=2,
        exclude_external_images=True,
        override_navigator=True,
        page_timeout=60000,
        wait_until="networkidle",
    )

    async def before_goto(
        page: Page, context: BrowserContext, url: str, **kwargs
    ):
        if not url.startswith("https://docs.streamlit.io/"):
            print(f"[SKIP] Skipping link outside the target subdomain: {url}")
            await page.close()
        return page

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # crawler.crawler_strategy.set_hook("before_goto", before_goto)
        results = await crawler.arun_many(
            urls=["https://docs.crawl4ai.com/"],
            config=run_config,
            # dispatcher=dispatcher
        )
        print(f"Crawled {len(results)} pages in total")
        print(f"Results saved to /home/duduzhang/webcraw/streamlit_realtime.md")

if __name__ == "__main__":
    asyncio.run(main())
OS
WSL2->Ubuntu-22.04
Python version
cpython-3.10.12-linux-x86_64-gnu
Browser
chromium
Browser version
No response
Error logs & Screenshots (if applicable)
CrawlResult(url='https://docs.crawl4ai.com/', html='', fit_html=<property object at 0x7f392ab87920>, success=False, cleaned_html=None, media={}, links={}, downloaded_files=None, js_execution_result=None, screenshot=None, pdf=None, mhtml=None, extracted_content=None, metadata={}, error_message="'list' object has no attribute 'status_code'", session_id=None, response_headers=None, status_code=None, ssl_certificate=None, dispatch_result=DispatchResult(task_id='1b57d969-2b11-4dbb-85e5-a8c694be798e', memory_usage=2.25, peak_memory=2.25, start_time=1751879416.1899254, end_time=1751879422.5776045, error_message="'list' object has no attribute 'status_code'"), redirected_url=None, network_requests=None, console_messages=None, tables=[])
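For reference, looping over the URLs with arun() avoids the failure, since arun() works with the same config. A rough sketch of that workaround (assuming arun() keeps behaving as described above; the list-vs-single handling is an assumption, not a confirmed API contract):

async def crawl_each(crawler, urls, run_config):
    # Fall back to arun() per URL because arun_many() fails as described above
    all_results = []
    for url in urls:
        res = await crawler.arun(url, config=run_config)
        # With a deep-crawl strategy arun() may return a list of results;
        # normalize to a flat list either way
        all_results.extend(res if isinstance(res, list) else [res])
    return all_results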