crawl4ai
[Bug]: Calling arun() after executing page actions in the on_page_context_created hook causes the page to refresh
crawl4ai version
0.5
Expected Behavior
Use the on_page_context_created hook to perform dynamic page actions, replace the layout tables with <table> tags, then crawl the page and save it as a markdown file.
Current Behavior
After the on_page_context_created hook performs the dynamic page actions and replaces the layout tables with <table> tags, the page refreshes automatically when arun() runs.
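For context, the flow in question boils down to registering the hook on the crawler strategy, preparing the page inside the hook, and then calling arun(). A minimal sketch (placeholder URL, hook body trimmed; the full reproduction script is under "Code snippets" below):

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from playwright.async_api import Page, BrowserContext

async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
    # Dynamic page actions and DOM rewrites happen here;
    # the expectation is that arun() keeps this prepared page state.
    return page

async def main():
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=False))
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    await crawler.start()
    # Observed problem: the page refreshes here instead of reusing
    # the DOM prepared by the hook.
    result = await crawler.arun("https://example.com", config=CrawlerRunConfig())
    await crawler.close()

if __name__ == "__main__":
    asyncio.run(main())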
Is this reproducible?
Yes
Inputs Causing the Bug
https://platform.worldquantbrain.com/learn/operators
Steps to Reproduce
Code snippets
import asyncio
import json
import time
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy
from playwright.async_api import Page, BrowserContext
from os.path import expanduser
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, ContentTypeFilter
from bs4 import BeautifulSoup
import re
from unidecode import unidecode
import os
directory = './md/'  # output directory

def generate_markdown_filename(title: str) -> str:
    # Transliterate to ASCII and remove special characters
    sanitized = unidecode(title)
    # Replace all special characters with underscores, including *
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', sanitized)
    # Collapse extra whitespace into underscores
    sanitized = '_'.join(sanitized.split())
    # Collapse consecutive underscores
    sanitized = re.sub(r'_+', '_', sanitized)
    return sanitized[:100] + ".md"

def write_markdown(content, directory, filename):
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"File saved to: {file_path}")
async def main():
    print("🔗 Starting crawl of the WorldQuant BRAIN documentation")
    # Browser configuration
    browser_config = BrowserConfig(
        headless=False,
        use_persistent_context=True,
        use_managed_browser=True,
        browser_type="chromium",
        user_data_dir='D:/pythonProject/Crawler/crawl4ai/data',
        verbose=False
    )
    # URL filters
    filter_chain = FilterChain([
        URLPatternFilter(patterns=["https://platform.worldquantbrain.com/learn/operators*"]),
        ContentTypeFilter(allowed_types=["text/html", "application/json"])
    ])
    url = "https://platform.worldquantbrain.com/learn/operators"
    # Custom wait condition
    WAIT_FOR_READY = """js:() => {
        // Check whether the document has fully loaded
        if (document.readyState !== 'complete') {
            console.log("Document not fully loaded");
            return false;
        }
        // Check whether the target element exists
        const targetElement = document.querySelector('div.operator-tables');
        if (!targetElement) {
            console.log("Target element not found");
            return false;
        }
        // Check whether the tables exist
        // const tables = document.querySelectorAll('.rt-table.block-table');
        // if (tables.length > 0) {
        //     console.log(`Found ${tables.length} tables`);
        // }
        // // Check whether the Accept button exists
        // const acceptButton = document.querySelector('button.button.button--md.button--primary');
        // if (acceptButton && acceptButton.textContent.trim() === 'Accept') {
        //     console.log("Found Accept button");
        //     acceptButton.click();
        // }
        // Make sure the content has loaded
        const content = targetElement.textContent.trim();
        if (!content) {
            console.log("Content is empty");
            return false;
        }
        return new Promise(resolve => {
            setTimeout(() => {
                console.log("Page loaded and all conditions met, waiting an extra 10 seconds");
                resolve(true);
            }, 10000);
        });
    }"""
    # Use the custom wait condition in the crawler config
    crawler_run_config = CrawlerRunConfig(
        url=url,
        wait_for=WAIT_FOR_READY,  # custom wait condition
        session_id="wq_session",
        cache_mode=CacheMode.READ_ONLY,
        page_timeout=40000,  # increased timeout
        scroll_delay=0.5,
        delay_before_return_html=5,
        css_selector=".panel--learn",
        stream=False,
    )
    crawler = AsyncWebCrawler(config=browser_config)
    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        print("[HOOK] Page created - configuring page and context")
        if page.url == 'about:blank':
            # Set the login cookie
            await context.add_cookies([{
                'name': 't',
                'value': '*****************************************',
                'domain': '.api.worldquantbrain.com',
                'path': '/',
                'secure': True,
                'httpOnly': True
            }])
            # Go straight to the documentation page
            await page.goto(url, wait_until='networkidle', timeout=30000)
            # Check for and click the Accept button
            accept_button = await page.query_selector('button.button.button--md.button--primary:has-text("Accept")')
            if accept_button:
                await accept_button.click()
                await page.wait_for_timeout(1000)
            # Find and click every operator-explorer__expanded-toggle button
            expand_buttons = await page.query_selector_all('.operator-explorer__expanded-toggle')
            if expand_buttons:
                print(f"Found {len(expand_buttons)} expand buttons")
                for button in expand_buttons:
                    try:
                        await button.click()
                        await page.wait_for_timeout(500)
                    except Exception as e:
                        print(f"Error clicking expand button: {str(e)}")
                        continue
            # Wait for all expanded content to load
            await page.wait_for_timeout(2000)
            # Disable automatic page refresh
            await page.evaluate('''() => {
                // Clear all timers
                const highestId = window.setTimeout(() => {}, 0);
                for (let i = 0; i <= highestId; i++) {
                    clearTimeout(i);
                    clearInterval(i);
                }
                // Block page unload
                window.onbeforeunload = function() {
                    return false;
                };
            }''')
            # Convert the tables and grab the content immediately
            content = await page.evaluate('''() => {
                const operatorTables = document.querySelector('div.operator-tables');
                if (!operatorTables) {
                    console.log('operator-tables element not found');
                    return '';
                }
                const tables = operatorTables.querySelectorAll('.rt-table');
                let allContent = operatorTables.innerHTML; // keep the original content
                tables.forEach(table => {
                    try {
                        const headers = Array.from(table.querySelectorAll('.rt-th')).map(th => th.textContent.trim());
                        const rows = Array.from(table.querySelectorAll('.rt-tr-group')).map(group => {
                            return Array.from(group.querySelectorAll('.rt-td')).map(td => td.textContent.trim());
                        });
                        let htmlTable = '<table>';
                        htmlTable += '<thead><tr>';
                        headers.forEach(header => {
                            htmlTable += `<th>${header}</th>`;
                        });
                        htmlTable += '</tr></thead><tbody>';
                        rows.forEach(row => {
                            htmlTable += '<tr>';
                            row.forEach(cell => {
                                htmlTable += `<td>${cell}</td>`;
                            });
                            htmlTable += '</tr>';
                        });
                        htmlTable += '</tbody></table>';
                        // Replace the original table
                        table.outerHTML = htmlTable;
                    } catch (error) {
                        console.error('Error while processing table:', error);
                    }
                });
                return operatorTables.innerHTML; // return the converted content
            }''')
            # Inject the converted content back into the page
            await page.evaluate('''(content) => {
                const operatorTables = document.querySelector('div.operator-tables');
                if (operatorTables) {
                    operatorTables.innerHTML = content;
                }
            }''', content)
            print("Table conversion complete, content captured")
        return page
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    await crawler.start()
    results = await crawler.arun(url, config=crawler_run_config)
    for result in results:
        if result.success:
            print("\nCrawled URL:", result.url)
            soup = BeautifulSoup(result.html, 'html.parser')
            # Use the first h2 heading as the filename
            h2_tags = soup.find_all('h2')
            if h2_tags:
                filename = generate_markdown_filename(h2_tags[0].get_text(strip=True))
            else:
                # If there is no h2 tag, fall back to the last URL segment
                filename = generate_markdown_filename(result.url.split('/')[-1])
            # Validate the filename before saving
            if not os.path.basename(filename):
                filename = 'untitled.md'
            # Save the content
            write_markdown(result.markdown.raw_markdown, directory, filename)
        else:
            print("Error:", result.error_message)
    await crawler.close()

if __name__ == "__main__":
    asyncio.run(main())
OS
Windows
Python version
3.11
Browser
chrome
Browser version
No response
Error logs & Screenshots (if applicable)
No response