
[Bug]: After running page actions in the on_page_context_created hook, calling arun() causes the page to refresh

Open bthuntergg opened this issue 7 months ago • 0 comments

crawl4ai version

0.5

Expected Behavior

Use the on_page_context_created hook to run dynamic page actions and replace the layout tables with real table tags, then crawl the page and save it as a markdown file.

Current Behavior

After the on_page_context_created hook runs the dynamic page actions and replaces the layout tables with table tags, the page refreshes automatically.

Is this reproducible?

Yes

Inputs Causing the Bug

https://platform.worldquantbrain.com/learn/operators

Steps to Reproduce


Code snippets

import asyncio
import json
import time
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy
from playwright.async_api import Page, BrowserContext
from os.path import expanduser
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, ContentTypeFilter
from bs4 import BeautifulSoup
import re
from unidecode import unidecode
import os

directory = './md/'  # output directory for markdown files

def generate_markdown_filename(title: str) -> str:
    # Transliterate the title to ASCII
    sanitized = unidecode(title)
    # Replace special characters (including *) with underscores
    sanitized = re.sub(r'[<>:"/\\|?*]', '_', sanitized)
    # Collapse whitespace into underscores
    sanitized = '_'.join(sanitized.split())
    # Collapse consecutive underscores into one
    sanitized = re.sub(r'_+', '_', sanitized)
    return sanitized[:100] + ".md"
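
# For illustration with a hypothetical input, the sanitizer behaves like:
#   generate_markdown_filename('Vector Operators: ts_rank()*')
#   -> 'Vector_Operators_ts_rank()_.md'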

def write_markdown(content, directory, filename):
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"文件已保存到: {file_path}")

async def main():
    print("🔗 开始爬取WorldQuant BRAIN文档")
    
    # Browser configuration
    browser_config = BrowserConfig(
        headless=False,
        use_persistent_context=True,
        use_managed_browser=True,
        browser_type="chromium",
        user_data_dir='D:/pythonProject/Crawler/crawl4ai/data',
        verbose=False
    )

    # URL filters
    filter_chain = FilterChain([
        URLPatternFilter(patterns=["https://platform.worldquantbrain.com/learn/operators*"]),
        ContentTypeFilter(allowed_types=["text/html", "application/json"])
    ])
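    # NOTE: filter_chain is built here but never passed to the run config
    # below, so it has no effect in this snippet.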

    url = "https://platform.worldquantbrain.com/learn/operators"

    # Custom wait condition
    WAIT_FOR_READY = """js:() => {
        // Check that the document has fully loaded
        if (document.readyState !== 'complete') {
            console.log("Document not fully loaded");
            return false;
        }

        // Check that the target element exists
        const targetElement = document.querySelector('div.operator-tables');
        if (!targetElement) {
            console.log("Target element not found");
            return false;
        }

        // Check whether the tables exist
        // const tables = document.querySelectorAll('.rt-table.block-table');
        // if (tables.length > 0) {
        //     console.log(`Found ${tables.length} tables`);
        // }

        // Check for the Accept button and click it
        // const acceptButton = document.querySelector('button.button.button--md.button--primary');
        // if (acceptButton && acceptButton.textContent.trim() === 'Accept') {
        //     console.log("Found the Accept button");
        //     acceptButton.click();
        // }

        // Make sure the content has loaded
        const content = targetElement.textContent.trim();
        if (!content) {
            console.log("Content is empty");
            return false;
        }
        return new Promise(resolve => {
            setTimeout(() => {
                console.log("Page loaded and all conditions met; waited an extra 10 seconds");
                resolve(true);
            }, 10000);
        });
    }"""

    # Use the custom wait condition in the crawler config
    crawler_run_config = CrawlerRunConfig(
        url=url,
        wait_for=WAIT_FOR_READY,  # use the custom wait condition
        session_id="wq_session",
        cache_mode=CacheMode.READ_ONLY,
        page_timeout=40000,  # increased timeout
        scroll_delay=0.5,
        delay_before_return_html=5,
        css_selector=".panel--learn",
        stream=False,
    )
    crawler = AsyncWebCrawler(config=browser_config)

    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        print("[HOOK] 页面创建 - 设置页面和上下文")
        
        if page.url == 'about:blank':
            # Set the logged-in session cookie
            await context.add_cookies([{
                'name': 't',
                'value': '*****************************************',
                'domain': '.api.worldquantbrain.com',
                'path': '/',
                'secure': True,
                'httpOnly': True
            }])
            
            # Go straight to the docs page
            await page.goto(url, wait_until='networkidle', timeout=30000)
        
        # Check for the Accept button and click it
        accept_button = await page.query_selector('button.button.button--md.button--primary:has-text("Accept")')
        if accept_button:
            await accept_button.click()
            await page.wait_for_timeout(1000)
    
        # Find and click every operator-explorer__expanded-toggle button
        expand_buttons = await page.query_selector_all('.operator-explorer__expanded-toggle')
        if expand_buttons:
            print(f"找到 {len(expand_buttons)} 个展开按钮")
            for button in expand_buttons:
                try:
                    await button.click()
                    await page.wait_for_timeout(500)
                except Exception as e:
                    print(f"点击展开按钮时出错: {str(e)}")
                    continue
        
            # Wait for all expanded content to finish loading
            await page.wait_for_timeout(2000)
        
            # Disable automatic page refreshes
            await page.evaluate('''() => {
                // Clear all pending timers
                const highestId = window.setTimeout(() => {}, 0);
                for (let i = 0; i <= highestId; i++) {
                    clearTimeout(i);
                    clearInterval(i);
                }
                // Block page unload
                window.onbeforeunload = function() {
                    return false;
                };
            }''')
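            # NOTE: clearing timer IDs up to the current highest is best
            # effort; timers registered afterwards, or refreshes driven by
            # navigation or websocket events, are not stopped by it.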

            # Convert the tables and grab the content immediately
            content = await page.evaluate('''() => {
                const operatorTables = document.querySelector('div.operator-tables');
                if (!operatorTables) {
                    console.log('operator-tables element not found');
                    return '';
                }
                
                const tables = operatorTables.querySelectorAll('.rt-table');
                let allContent = operatorTables.innerHTML;  // keep a copy of the original content
                
                tables.forEach(table => {
                    try {
                        const headers = Array.from(table.querySelectorAll('.rt-th')).map(th => th.textContent.trim());
                        const rows = Array.from(table.querySelectorAll('.rt-tr-group')).map(group => {
                            return Array.from(group.querySelectorAll('.rt-td')).map(td => td.textContent.trim());
                        });
                        
                        let htmlTable = '<table>';
                        htmlTable += '<thead><tr>';
                        headers.forEach(header => {
                            htmlTable += `<th>${header}</th>`;
                        });
                        htmlTable += '</tr></thead><tbody>';
                        
                        rows.forEach(row => {
                            htmlTable += '<tr>';
                            row.forEach(cell => {
                                htmlTable += `<td>${cell}</td>`;
                            });
                            htmlTable += '</tr>';
                        });
                        htmlTable += '</tbody></table>';
                        
                        // Replace the original table in place
                        table.outerHTML = htmlTable;
                    } catch (error) {
                        console.error('Error while processing a table:', error);
                    }
                });
                
                return operatorTables.innerHTML;  // return the converted content
            }''')
            
            # Inject the converted content back into the page
            await page.evaluate(f'''(content) => {{
                const operatorTables = document.querySelector('div.operator-tables');
                if (operatorTables) {{
                    operatorTables.innerHTML = content;
                }}
            }}''', content)
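            # NOTE: if the page is rendered by an SPA framework, a later
            # client-side re-render may overwrite this injected markup.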
            
            print("表格转换完成并保存内容")
        
        return page

    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
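    # on_page_context_created is invoked by the crawler strategy for each new
    # page/context it creates, before the target URL for arun() is loaded.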
    await crawler.start()

    results = await crawler.arun(url, config=crawler_run_config)
    
    for result in results:
        if result.success:
            print("\n爬取地址:", result.url)
            soup = BeautifulSoup(result.html, 'html.parser')
            
            # Use the first h2 heading as the filename
            h2_tags = soup.find_all('h2')
            if h2_tags:
                filename = generate_markdown_filename(h2_tags[0].get_text(strip=True))
            else:
                # No h2 tag: fall back to the last URL segment as the filename
                filename = generate_markdown_filename(result.url.split('/')[-1])
            
            # Validate the filename before saving
            if not os.path.basename(filename):
                filename = 'untitled.md'
            
            # Save the content
            write_markdown(result.markdown.raw_markdown, directory, filename)
        else:
            print("错误:", result.error_message)

    await crawler.close()

if __name__ == "__main__":
    asyncio.run(main())

OS

Windows

Python version

3.11

Browser

chrome

Browser version

No response

Error logs & Screenshots (if applicable)

No response

bthuntergg · May 10 '25, 10:05