
[Bug]: arun_many Unable to get results at runtime

Open ztp559 opened this issue 5 months ago • 2 comments

crawl4ai version

0.6.3

Expected Behavior

arun_many() should return results correctly, the same as arun() does with an identical configuration.

Current Behavior

With the same configuration, arun() returns the correct results, but arun_many() returns an empty result with results[0].error_message = "'list' object has no attribute 'status_code'".
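
For comparison, a minimal sketch of that check (assuming the same browser_config and run_config defined in the full snippet under "Code snippets" below):

async with AsyncWebCrawler(config=browser_config) as crawler:
    # arun() with this configuration returns usable content
    single = await crawler.arun(url="https://docs.crawl4ai.com/", config=run_config)
    print(single)

    # arun_many() with the same configuration yields a failed result whose
    # error_message is "'list' object has no attribute 'status_code'"
    many = await crawler.arun_many(urls=["https://docs.crawl4ai.com/"], config=run_config)
    print(many[0].success, many[0].error_message)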

Is this reproducible?

Yes

Inputs Causing the Bug

async with AsyncWebCrawler(config=browser_config) as crawler:
    # crawler.crawler_strategy.set_hook("before_goto", before_goto)
    results = await crawler.arun_many(
        urls=["https://docs.crawl4ai.com/"],
        config=run_config,
        # dispatcher=dispatcher
    )

Steps to Reproduce

Run the code snippet below; arun_many() returns a single failed result with the error message shown under Error logs.

Code snippets

import asyncio
import os
import logging
from datetime import datetime
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
from crawl4ai import RateLimiter
from crawl4ai.async_dispatcher import SemaphoreDispatcher
from playwright.async_api import Page, BrowserContext
import aiofiles

# Global counter and lock
page_counter = 0
write_lock = asyncio.Lock()

async def write_page_to_file(result, page_num):
    """实时写入单个页面数据到文件"""
    async with write_lock:
        output_file = "/home/duduzhang/webcraw/streamlit_realtime.md"
        
        # If this is the first page, write the file header
        if page_num == 1:
            async with aiofiles.open(output_file, 'w', encoding='utf-8') as f:
                await f.write("# Streamlit Documentation\n")
                await f.write(f"\nCrawled on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                await f.write("---\n\n")
        
        # Append the page content
        async with aiofiles.open(output_file, 'a', encoding='utf-8') as f:
            await f.write(f"## Page {page_num}: {result.url}\n")
            await f.write(f"**Depth:** {result.metadata.get('depth', 0)}\n")
            await f.write(f"**Status:** {result.status_code}\n")
            
            if hasattr(result, 'title') and result.title:
                await f.write(f"**Title:** {result.title}\n")
            
            await f.write("\n### Content\n\n")
            
            # Add the content
            if result.markdown:
                content = result.markdown[:10000]
                if len(result.markdown) > 10000:
                    content += "\n\n*[Content truncated due to length]*"
                await f.write(content)
            elif result.cleaned_html:
                content = result.cleaned_html[:5000]
                if len(result.cleaned_html) > 5000:
                    content += "\n\n*[Content truncated due to length]*"
                await f.write(f"\n{content}\n")
            else:
                await f.write("*No content available*")
            
            await f.write("\n\n---\n\n")
        
        print(f"✅ Page {page_num} written to file: {result.url}")

dispatcher = SemaphoreDispatcher(
    max_session_permit=5,
    rate_limiter=RateLimiter(
        base_delay=(1.5, 3.0),
        max_delay=30.0
    )
)

async def main():
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=False,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        },
        viewport_width=1920,
        viewport_height=1080,
        ignore_https_errors=True,
        java_script_enabled=True,
        verbose=True,
        proxy="http://localhost:7890"  # 临时注释代理
    )
    
    # Create the filter chain
    filter_chain = FilterChain([
        URLPatternFilter(
            patterns=["*discuss*", "*privacy-policy*"],
            reverse=True
        )
    ])
    
    # Configure crawl run parameters
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=0, 
            include_external=False,
            filter_chain=filter_chain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        mean_delay=1.5,
        max_range=2,
        exclude_external_images=True,
        override_navigator=True,
        page_timeout=60000,
        wait_until="networkidle",
    )

    async def before_goto(
        page: Page, context: BrowserContext, url: str, **kwargs
    ):
        if not url.startswith("https://docs.streamlit.io/"):
            print(f"[SKIP] 跳过非目标子域名链接: {url}")
            page.close()
        return page

    async with AsyncWebCrawler(config=browser_config) as crawler:
        #crawler.crawler_strategy.set_hook("before_goto", before_goto)
        results = await crawler.arun_many(
            urls=["https://docs.crawl4ai.com/"], 
            config=run_config, 
            #dispatcher=dispatcher
        )
        
        print(f"Crawled {len(results)} pages in total")
        print(f"Results saved to /home/duduzhang/webcraw/streamlit_realtime.md")

if __name__ == "__main__":
    asyncio.run(main())

OS

WSL2->Ubuntu-22.04

Python version

cpython-3.10.12-linux-x86_64-gnu

Browser

chromium

Browser version

No response

Error logs & Screenshots (if applicable)

CrawlResult(url='https://docs.crawl4ai.com/', html='', fit_html=<property object at 0x7f392ab87920>, success=False, cleaned_html=None, media={}, links={}, downloaded_files=None, js_execution_result=None, screenshot=None, pdf=None, mhtml=None, extracted_content=None, metadata={}, error_message="'list' object has no attribute 'status_code'", session_id=None, response_headers=None, status_code=None, ssl_certificate=None, dispatch_result=DispatchResult(task_id='1b57d969-2b11-4dbb-85e5-a8c694be798e', memory_usage=2.25, peak_memory=2.25, start_time=1751879416.1899254, end_time=1751879422.5776045, error_message="'list' object has no attribute 'status_code'"), redirected_url=None, network_requests=None, console_messages=None, tables=[])

ztp559 · Jul 07 '25 09:07