crawl4ai
[Bug]: The website cannot be crawled
crawl4ai version
0.5.0
Expected Behavior
The list items on this site do not expose plain href links, but the detail pages should still be discovered and crawled in the next step of the deep crawl.
Current Behavior
The crawler currently returns no data from this site.
Is this reproducible?
Yes
Inputs Causing the Bug
import asyncio
import json
import datetime

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig  # type: ignore
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy  # type: ignore
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy  # type: ignore
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter  # type: ignore
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator  # type: ignore
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

year = datetime.datetime.now().year


async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=FilterChain([
                URLPatternFilter(patterns=["*/detail"]),
                DomainFilter(allowed_domains=["www2.ahtvu.ah.cn"]),
                ContentTypeFilter(allowed_types=["text/html"]),
            ]),
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
        exclude_social_media_links=True,
        exclude_external_images=True,
        # word_count_threshold=10,
        # excluded_tags=['form', 'header'],
        # exclude_external_links=True,
        # # Content processing
        # process_iframes=True,
        # remove_overlay_elements=True,
        # wait_for="css:.list",
        # delay_before_return_html=2.0,
        extraction_strategy=JsonCssExtractionStrategy(schema={
            "name": "News Item",
            "baseSelector": "#container > div > div.content",
            "fields": [
                {"name": "title", "selector": "div.title", "type": "text"},
                {"name": "time", "selector": "#content > div.infoWrap > h1", "type": "text"},
                {"name": "content", "selector": "#gallery", "type": "text"},
            ],
        }),
    )
    # Browser configuration
    browser_config = BrowserConfig(
        headless=True,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36",
        extra_args=["--disable-web-security"],
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun(
            "https://www2.ahtvu.ah.cn/news/list?id=16&title=%E5%AD%A6%E6%A0%A1%E6%96%B0%E9%97%BB",
            config=config,
        )
        print(f"Crawled {len(results)} pages in total")
        for result in results:
            if result.success:
                print("=" * 30)
                print(result.url)
                print(json.loads(result.extracted_content))
            else:
                print("Crawl failed:", result.error_message)


if __name__ == "__main__":
    asyncio.run(main())
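For context: BFSDeepCrawlStrategy discovers new pages from the href attributes of links in the rendered HTML, so if this site's list items navigate purely via JavaScript, the deep crawl has nothing to follow. Note also that the pattern "*/detail" would not match detail URLs that carry a query string (e.g. "*/detail?id=..."); "*/detail*" may be needed. Below is a minimal workaround sketch, not a verified fix: it renders the list page once, scrapes candidate detail ids out of the HTML, and batch-crawls them with arun_many. The "detail?id=" URL pattern and the ".list" selector are assumptions about this site and must be checked against the real page source.

import asyncio
import re

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def workaround():
    async with AsyncWebCrawler() as crawler:
        # Render the list page once so any JS-inserted items are present.
        list_page = await crawler.arun(
            "https://www2.ahtvu.ah.cn/news/list?id=16&title=%E5%AD%A6%E6%A0%A1%E6%96%B0%E9%97%BB",
            config=CrawlerRunConfig(wait_for="css:.list"),  # assumed selector
        )
        # The id pattern below is a guess; inspect the page source to see how
        # the list items actually reference their detail pages.
        ids = sorted(set(re.findall(r"detail\?id=(\d+)", list_page.html)))
        detail_urls = [f"https://www2.ahtvu.ah.cn/news/detail?id={i}" for i in ids]
        # Crawl all discovered detail pages in one batch.
        results = await crawler.arun_many(detail_urls, config=CrawlerRunConfig())
        for r in results:
            print(r.url, r.success)


asyncio.run(workaround())

If the ids are only reachable through onclick handlers rather than anywhere in the markup, CrawlerRunConfig's js_code option can be used to click through or to inject the target URLs into the page before scraping.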
Steps to Reproduce
Run the script shown above under "Inputs Causing the Bug" and observe that no detail pages are crawled.
Code snippets
OS
macOS
Python version
Python 3.13.2
Browser
Chrome
Browser version
No response
Error logs & Screenshots (if applicable)
No response