
[Bug]: The website cannot be crawled

Open LinkerM opened this issue 7 months ago • 3 comments

crawl4ai version

0.5.0

Expected Behavior

The list entries on this website do not carry href attributes, but the deep crawl should still be able to reach the detail pages at the next depth level.

Current Behavior

The crawler currently returns no data from the site.

Is this reproducible?

Yes

Inputs Causing the Bug

import asyncio
import json

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=FilterChain([
                URLPatternFilter(patterns=["*/detail"]),
                DomainFilter(allowed_domains=["www2.ahtvu.ah.cn"]),
                ContentTypeFilter(allowed_types=["text/html"]),
            ]),
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
        exclude_social_media_links=True,
        exclude_external_images=True,
        # word_count_threshold=10,
        # excluded_tags=['form', 'header'],
        # exclude_external_links=True,
        # Content processing
        # process_iframes=True,
        # remove_overlay_elements=True,
        # wait_for="css:.list",
        # delay_before_return_html=2.0,
        extraction_strategy=JsonCssExtractionStrategy(schema={
            "name": "News Item",
            "baseSelector": "#container > div > div.content",
            "fields": [
                {"name": "title", "selector": "div.title", "type": "text"},
                {"name": "time", "selector": "#content > div.infoWrap > h1", "type": "text"},
                {"name": "content", "selector": "#gallery", "type": "text"},
            ],
        }),
    )

    # Browser configuration
    browser_config = BrowserConfig(
        headless=True,
        user_agent=(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/123.0 Safari/537.36"
        ),
        extra_args=["--disable-web-security"],
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun(
            "https://www2.ahtvu.ah.cn/news/list?id=16&title=%E5%AD%A6%E6%A0%A1%E6%96%B0%E9%97%BB",
            config=config,
        )

        print(f"Crawled {len(results)} pages in total")
        for result in results:
            if result.success:
                print("=" * 30)
                print(result.url)
                print(json.loads(result.extracted_content))
            else:
                print("Crawl failed:", result.error_message)


if __name__ == "__main__":
    asyncio.run(main())
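
Since the list entries expose no href attributes, the detail links are presumably injected by JavaScript, so BFSDeepCrawlStrategy finds nothing to follow. A quick way to check is to fetch only the list page and print the internal links crawl4ai discovers; an empty list would point to client-side rendering. This is only a diagnostic sketch, and the css:.list selector is an assumption taken from the commented-out wait_for above.

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

LIST_URL = "https://www2.ahtvu.ah.cn/news/list?id=16&title=%E5%AD%A6%E6%A0%A1%E6%96%B0%E9%97%BB"


async def inspect_links():
    # Fetch only the list page and report which links crawl4ai can actually see.
    config = CrawlerRunConfig(
        wait_for="css:.list",          # assumed selector for the news list container
        delay_before_return_html=2.0,  # give client-side rendering some extra time
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(LIST_URL, config=config)
        internal = result.links.get("internal", [])
        print(f"Internal links discovered: {len(internal)}")
        for link in internal[:10]:
            print(link.get("href"))


if __name__ == "__main__":
    asyncio.run(inspect_links())

If the detail links only appear after JavaScript runs, the deep crawl will probably need a suitable wait_for or js_code in CrawlerRunConfig before it can see any URLs matching */detail.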

Steps to Reproduce

Run the script shown under "Inputs Causing the Bug" above.

Code snippets


OS

macOS

Python version

Python 3.13.2

Browser

Chrome

Browser version

No response

Error logs & Screenshots (if applicable)

No response

LinkerM · Apr 24 '25 02:04