SeleniumBase icon indicating copy to clipboard operation
SeleniumBase copied to clipboard

"Hacking websites with CDP" is now on YouTube

Open mdmintz opened this issue 1 year ago • 10 comments

"Hacking websites with CDP" is now on YouTube:

https://www.youtube.com/watch?v=vt2zsdiNh3U

mdmintz avatar Jan 01 '25 01:01 mdmintz

The code for taking control of existing Chrome browsers via the remote-debugging-port is here:

https://github.com/seleniumbase/SeleniumBase/issues/3354#issuecomment-2557709036

mdmintz avatar Jan 06 '25 22:01 mdmintz

The code for taking control of existing Chrome browsers via the remote-debugging-port is here:

#3354 (comment)

So we don't need chromedriver anymore? Like nodriver or Selenium driverless?

boludoz avatar Feb 06 '25 04:02 boludoz

With pure CDP Mode, chromedriver isn’t necessary.

mdmintz avatar Feb 06 '25 05:02 mdmintz

If stealth is important, you may have to use one of the CDP formats here: https://github.com/seleniumbase/SeleniumBase/issues/3354#issuecomment-2557709036

Otherwise, you can try passing in the remote-debugging-port via chromium_arg, but that might not get you the results you’re looking for.

mdmintz avatar Feb 08 '25 23:02 mdmintz

The code for taking control of existing Chrome browsers via the remote-debugging-port is here:

#3354 (comment)

Is there a way to take control of CDP browsers from outside of SeleniumBase? For example, I want to use phpwebdriver to control the browser. I was thinking of using debugger_address in the CDP browser and connecting through a remote web driver, but I suspect it won't carry over the stealth functionality...

dongdestroyer avatar Feb 26 '25 06:02 dongdestroyer

The BaseCase class with SB() is synchronous. Is there any way that I can use the asynchronous driver from cdp_util instead of the BaseCase driver? For example: `from seleniumbase.undetected.cdp_driver import cdp_util` followed by `driver = await cdp_util.start_async()`

guocity avatar Feb 27 '25 01:02 guocity

How do I call sb.uc_gui_click_captcha() from async code?

guocity avatar Feb 27 '25 04:02 guocity

The async code can only call direct CDP methods. (See https://chromedevtools.github.io/devtools-protocol/ for those methods)

mdmintz avatar Feb 27 '25 05:02 mdmintz

The async code can only call direct CDP methods. (See https://chromedevtools.github.io/devtools-protocol/ for those methods)

Hi! Please help me; I've been racking my brains trying to find a solution. Why doesn't CDP get the response body, even though it's 100% there? If I uncomment the line `xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)` in the while loop, then everything works fine, but only if I also break everything that follows it in the while loop.

`
# Listens for CDP.network.ResponseReceived events with CDP.network.ResourceType.XHR.
import asyncio
import json
import random

import mycdp
import time
from seleniumbase import cdp_driver

import config
from utils import distribute_scrolls, is_have_products, get_xhr_products, scroll_down, need_click_load_more

# Pending XHR records per browser index; each record is [response_url, request_id].
xhr_requests: dict = {}

# time.time() of the most recent matching XHR response, per browser index.
last_xhr_request: dict = {}

# Products collected by every crawl() call.
products: list = []


def listenXHR(page, index_):
    """Register a handler on *page* that records matching XHR responses.

    For every ``ResponseReceived`` event whose resource type is XHR and whose
    response URL contains ``api/catalog/products``, a ``[url, request_id]``
    pair is stored in ``xhr_requests[index_]`` and the current time is
    written to ``last_xhr_request[index_]``.
    """
    async def on_response(evt: mycdp.network.ResponseReceived):
        # Only product-catalog AJAX responses are of interest.
        if evt.type_ is not mycdp.network.ResourceType.XHR:
            return
        if 'api/catalog/products' not in evt.response.url:
            return
        record = [evt.response.url, evt.request_id]
        pending = xhr_requests.get(index_)
        if pending:
            pending.append(record)
        else:
            # Key is missing or its list is empty: start a fresh list.
            xhr_requests[index_] = [record]
        last_xhr_request[index_] = time.time()

    page.add_handler(mycdp.network.ResponseReceived, on_response)


async def receiveXHR(page, requests, index_):
    """Fetch the response bodies for previously recorded XHR requests.

    Waits until no matching XHR response has been recorded for ``index_``
    during the last 3 seconds (polling every 2 seconds, giving up after
    ``max_retries`` polls), then asks the browser for the body of every
    request in ``requests``.

    Args:
        page: Tab used to send the CDP ``get_response_body`` commands.
        requests: Iterable of ``[url, request_id]`` pairs.
        index_: Key into ``last_xhr_request`` / ``xhr_requests``.

    Returns:
        A list of dicts with the keys ``"url"``, ``"body"`` (the JSON-decoded
        body) and ``"is_base64"``. Requests whose body could not be fetched
        or decoded are reported on stdout and skipped.
    """
    responses = []
    retries = 0
    max_retries = 5
    # Wait until the last XHR response is more than 3 seconds old.
    while retries <= max_retries:
        # .get() instead of [] so that an index with no recorded XHR yet
        # does not raise KeyError; there is simply nothing to wait for.
        last_seen = last_xhr_request.get(index_)
        if last_seen is None or time.time() - last_seen > 3:
            break
        retries += 1
        await asyncio.sleep(2)
    await page
    # Loop through gathered requests and get response body
    for url, request_id in requests:
        try:
            res = await page.send(mycdp.network.get_response_body(request_id))
            if res is None:
                continue
            # NOTE(review): res[0] is parsed as JSON even when res[1]
            # (is_base64) is true; such bodies presumably fail here and are
            # skipped by the except below — confirm whether decoding is wanted.
            responses.append({
                "url": url,
                "body": json.loads(res[0]),
                "is_base64": res[1],
            })
        except Exception as e:
            print("Error getting response:", e)
    if responses:
        # Only reset the pending list once at least one body was retrieved.
        xhr_requests[index_] = []
    return responses


async def request_paused_handler(event, tab):
    """Continue or fail a paused request depending on its URL.

    Requests whose URL contains one of the blocked markers (image/video
    extensions and a couple of host/path fragments) are failed with
    ``TIMED_OUT``; every other request is allowed to continue.
    """
    blocked_markers = (
        ".png",
        ".jpg",
        ".gif",
        ".webp",
        "pcdn.goldapple.ru",
        "/front/api/apm/events",
        '.mp4',
    )
    url = event.request.url
    if any(marker in url for marker in blocked_markers):
        # Block the data (images and other unwanted resources)
        reason = mycdp.network.ErrorReason.TIMED_OUT
        tab.feed_cdp(mycdp.fetch.fail_request(event.request_id, reason))
        return
    # Let the data through
    tab.feed_cdp(mycdp.fetch.continue_request(request_id=event.request_id))


async def check_hxr(index_, tab):
    """Debug helper: poll the recorded XHR requests for ``index_`` forever.

    Once per second it fetches the bodies of the pending requests through
    ``receiveXHR``, passes them to ``get_xhr_products`` and accumulates any
    result locally. The loop never exits and nothing is returned.
    """
    print("Starting check_hxr for index:", index_)  # Debug print
    gathered = []
    while True:
        print("Waiting for XHR responses...")  # Debug print
        pending = xhr_requests.get(index_, [])
        xhr_responses = await receiveXHR(tab, pending, index_)
        print("Received XHR responses:", xhr_responses)  # Debug print
        parsed = await get_xhr_products(xhr_responses)
        if parsed:
            gathered.extend(parsed)
        print("request: ", xhr_requests.get(index_, []))  # Original print
        await asyncio.sleep(1)


async def crawl(browser_index, link, scrols, max_pages=None, start_pages=0):
    """Scroll through one catalog listing and collect products from its XHRs.

    Starts a new browser with ``cdp_driver.start_async()``, registers the XHR
    listener and the request-paused handler on a blank tab, opens
    ``{link}?p={start_pages}&storestocks=1`` and then repeatedly calls
    ``scroll_down`` / ``need_click_load_more`` (at most 5 iterations, or until
    ``is_have_products`` is falsy). Afterwards the recorded XHR responses are
    fetched and parsed, and the resulting products are appended to the
    module-level ``products`` list.

    Args:
        browser_index: Key under which this crawler's XHR records are stored.
        link: Base URL of the listing to crawl.
        scrols: Not used by the current implementation.
        max_pages: Not used by the current implementation.
        start_pages: Value of the ``p`` query parameter for the first page.
    """
    products_parsed = []
    retries = 5  # only consumed by the commented-out polling below
    page_count = 0
    # NOTE(review): the browser started here is never shut down in this
    # function — confirm whether it should be stopped when crawling ends.
    driver = await cdp_driver.start_async()
    tab = await driver.get("about:blank")
    listenXHR(tab, browser_index)
    tab.add_handler(mycdp.fetch.RequestPaused, request_paused_handler)
    url = f"{link}?p={start_pages}&storestocks=1"
    await tab.get(url)
    await asyncio.sleep(5)
    # check_hxr_task = asyncio.create_task(check_hxr(browser_index, tab))
    # print("hello")
    while True:
        # xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
        # data = await get_xhr_products(xhr_responses)
        # if not data:
        #     retries -= 1
        # else:
        #     products_parsed.extend(data)
        if not await is_have_products(tab):
            break
        await scroll_down(tab)
        await need_click_load_more(tab)
        print(f"Браузер {browser_index} - Страница {page_count}; Всего продуктов: {len(products_parsed)}")
        # if retries == 0:
        #     break
        page_count += 1
        if page_count == 5:
            break
    xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
    data = await get_xhr_products(xhr_responses)
    print(data)
    if data:
        # Previously the parsed products were only printed, so the summary
        # below always reported 0 and nothing reached the shared list.
        products_parsed.extend(data)
    print(f"Браузер {browser_index} собрал {len(products_parsed)} продуктов на странице {url}")
    products.extend(products_parsed)


async def main():
    """Crawl every listing in ``config.links``.

    For each link the category page is opened to read the
    ``data-category-products-count`` attribute, the number of pages
    (count / 24, truncated) is split across the crawlers with
    ``distribute_scrolls``, and one ``crawl`` coroutine per crawler is run
    through ``asyncio.gather``.
    """
    browser_total = 1  # number of parallel crawlers per link
    count_selector = "span[data-category-products-count]"
    driver = await cdp_driver.start_async()
    for link in config.links:
        tab = await driver.get("https://goldapple.ru/" + link)
        await tab.wait_for(count_selector)
        count_element = await tab.select(count_selector)
        raw_count = count_element.attrs.get("data-category-products-count")
        product_pages = int(int(raw_count) / 24)
        scrolls_per_browser = distribute_scrolls(product_pages, browser_total)
        # Each crawler starts where the previous crawler's share ends.
        start_pages = []
        offset = 0
        for share in scrolls_per_browser:
            start_pages.append(offset)
            offset += share
        tasks = [
            crawl(i, "https://goldapple.ru" + link, scrolls_per_browser[i], 100, start_pages[i])
            for i in range(browser_total)
        ]
        await asyncio.gather(*tasks)


# Script entry point: run the main() coroutine when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
`

dima23113 avatar Mar 01 '25 20:03 dima23113