[Bug]: arun_many Unable to get results at runtime
crawl4ai version
0.6.3
Expected Behavior
arun_many() should return the same successful results that arun() returns for the same URL and configuration.
Current Behavior
With the same configuration, arun() returns the correct result, but arun_many() returns an empty result with results[0].error_message = "'list' object has no attribute 'status_code'".
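A stripped-down sketch of the asymmetry, with the project-specific browser config, filters and proxy removed (so this is not my exact setup, just the minimal shape of it):

import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

async def compare():
    config = CrawlerRunConfig(deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=0))
    async with AsyncWebCrawler() as crawler:
        # arun(): succeeds with this config
        single = await crawler.arun("https://docs.crawl4ai.com/", config=config)
        first = single[0] if isinstance(single, list) else single  # deep crawl may return a list
        print("arun:", first.success, first.status_code)
        # arun_many(): same config, same URL, but the result comes back failed
        many = await crawler.arun_many(urls=["https://docs.crawl4ai.com/"], config=config)
        print("arun_many:", many[0].success, many[0].error_message)

asyncio.run(compare())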
Is this reproducible?
Yes
Inputs Causing the Bug
async with AsyncWebCrawler(config=browser_config) as crawler:
    # crawler.crawler_strategy.set_hook("before_goto", before_goto)
    results = await crawler.arun_many(
        urls=["https://docs.crawl4ai.com/"],
        config=run_config,
        # dispatcher=dispatcher
    )
Steps to Reproduce
Code snippets
import asyncio
import os
import logging
from datetime import datetime
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
from crawl4ai import RateLimiter
from crawl4ai.async_dispatcher import SemaphoreDispatcher
from playwright.async_api import Page, BrowserContext
import aiofiles

# Global counter and lock
page_counter = 0
write_lock = asyncio.Lock()

async def write_page_to_file(result, page_num):
    """Write a single page's data to the output file in real time."""
    async with write_lock:
        output_file = "/home/duduzhang/webcraw/streamlit_realtime.md"
        # If this is the first page, write the file header
        if page_num == 1:
            async with aiofiles.open(output_file, 'w', encoding='utf-8') as f:
                await f.write("# Streamlit Documentation\n")
                await f.write(f"\nCrawled on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
                await f.write("---\n\n")
        # Append the page content
        async with aiofiles.open(output_file, 'a', encoding='utf-8') as f:
            await f.write(f"## Page {page_num}: {result.url}\n")
            await f.write(f"**Depth:** {result.metadata.get('depth', 0)}\n")
            await f.write(f"**Status:** {result.status_code}\n")
            if hasattr(result, 'title') and result.title:
                await f.write(f"**Title:** {result.title}\n")
            await f.write("\n### Content\n\n")
            # Add the content
            if result.markdown:
                content = result.markdown[:10000]
                if len(result.markdown) > 10000:
                    content += "\n\n*[Content truncated due to length]*"
                await f.write(content)
            elif result.cleaned_html:
                content = result.cleaned_html[:5000]
                if len(result.cleaned_html) > 5000:
                    content += "\n\n*[Content truncated due to length]*"
                await f.write(f"\n{content}\n")
            else:
                await f.write("*No content available*")
            await f.write("\n\n---\n\n")
        print(f"✅ Page {page_num} written to file: {result.url}")

dispatcher = SemaphoreDispatcher(
    max_session_permit=5,
    rate_limiter=RateLimiter(
        base_delay=(1.5, 3.0),
        max_delay=30.0
    )
)

async def main():
    # Configure browser settings
    browser_config = BrowserConfig(
        headless=False,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        },
        viewport_width=1920,
        viewport_height=1080,
        ignore_https_errors=True,
        java_script_enabled=True,
        verbose=True,
        proxy="http://localhost:7890"  # proxy; comment out temporarily if not needed
    )
    # Create the filter chain
    filter_chain = FilterChain([
        URLPatternFilter(
            patterns=["*discuss*", "*privacy-policy*"],
            reverse=True
        )
    ])
    # Configure crawl run parameters
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=0,
            include_external=False,
            filter_chain=filter_chain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        mean_delay=1.5,
        max_range=2,
        exclude_external_images=True,
        override_navigator=True,
        page_timeout=60000,
        wait_until="networkidle",
    )

    async def before_goto(
        page: Page, context: BrowserContext, url: str, **kwargs
    ):
        if not url.startswith("https://docs.streamlit.io/"):
            print(f"[SKIP] Skipping link outside the target subdomain: {url}")
            await page.close()
        return page

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # crawler.crawler_strategy.set_hook("before_goto", before_goto)
        results = await crawler.arun_many(
            urls=["https://docs.crawl4ai.com/"],
            config=run_config,
            # dispatcher=dispatcher
        )
        print(f"Crawled {len(results)} pages in total")
        print(f"Results saved to /home/duduzhang/webcraw/streamlit_realtime.md")

if __name__ == "__main__":
    asyncio.run(main())
OS
WSL2->Ubuntu-22.04
Python version
cpython-3.10.12-linux-x86_64-gnu
Browser
chromium
Browser version
No response
Error logs & Screenshots (if applicable)
CrawlResult(url='https://docs.crawl4ai.com/', html='', fit_html=<property object at 0x7f392ab87920>, success=False, cleaned_html=None, media={}, links={}, downloaded_files=None, js_execution_result=None, screenshot=None, pdf=None, mhtml=None, extracted_content=None, metadata={}, error_message="'list' object has no attribute 'status_code'", session_id=None, response_headers=None, status_code=None, ssl_certificate=None, dispatch_result=DispatchResult(task_id='1b57d969-2b11-4dbb-85e5-a8c694be798e', memory_usage=2.25, peak_memory=2.25, start_time=1751879416.1899254, end_time=1751879422.5776045, error_message="'list' object has no attribute 'status_code'"), redirected_url=None, network_requests=None, console_messages=None, tables=[])
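For reference, looping over the URLs with arun() avoids the failure, since arun() works with the same config. A rough sketch of that workaround (assuming arun() keeps behaving as described above; the list-vs-single handling is an assumption, not a confirmed API contract):

async def crawl_each(crawler, urls, run_config):
    # Fall back to arun() per URL because arun_many() fails as described above
    all_results = []
    for url in urls:
        res = await crawler.arun(url, config=run_config)
        # With a deep-crawl strategy arun() may return a list of results;
        # normalize to a flat list either way
        all_results.extend(res if isinstance(res, list) else [res])
    return all_results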