"Hacking websites with CDP" is now on YouTube
The code for taking control of existing Chrome browsers via the remote-debugging-port is here:
https://github.com/seleniumbase/SeleniumBase/issues/3354#issuecomment-2557709036
The code for taking control of existing Chrome browsers via the
remote-debugging-port is here:
So we don't need chromedriver anymore? Like nodriver or Selenium driverless?
With pure CDP Mode, chromedriver isn’t necessary.
If stealth is important, you may have to use one of the CDP formats here: https://github.com/seleniumbase/SeleniumBase/issues/3354#issuecomment-2557709036
Otherwise, you can try passing in the remote-debugging-port via chromium_arg, but that might not get you the results you’re looking for.
The code for taking control of existing Chrome browsers via the
remote-debugging-port is here:
Is there a way to take control of cdp browsers from outside of seleniumbase? For example I want to use phpwebdriver to control the browser. I was thinking to use debugger_address in the cdp browser and connect through a remote web driver but I'm thinking it won't carry over the stealth functionality...
The BaseCase class with SB() is synchronous. Is there any way that I can use the asynchronous driver from cdp_util instead of the BaseCase driver? from seleniumbase.undetected.cdp_driver import cdp_util; driver = await cdp_util.start_async()
There are several async examples such as:
How do I call sb.uc_gui_click_captcha() from async code?
The async code can only call direct CDP methods. (See https://chromedevtools.github.io/devtools-protocol/ for those methods)
The async code can only call direct CDP methods. (See https://chromedevtools.github.io/devtools-protocol/ for those methods)
Hi! Please tell me — I've already racked my brains trying to find a solution. Why doesn't CDP get the response body, even though it's 100% there? If I uncomment the line xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index) in the while loop, then everything works fine, but only if I remove everything else from the while loop.
`
CDP.network.ResponseReceived with CDP.network.ResourceType.XHR.
import asyncio
import json
import random
import mycdp
import time
from seleniumbase import cdp_driver
import config
from utils import distribute_scrolls, is_have_products, get_xhr_products, scroll_down, need_click_load_more
products = []
last_xhr_request = {}
xhr_requests = {}
def listenXHR(page, index_):
async def handler(evt: mycdp.network.ResponseReceived):
# Get AJAX requests
if evt.type_ is mycdp.network.ResourceType.XHR and 'api/catalog/products' in evt.response.url:
if xhr_requests.get(index_):
xhr_requests[index_].append([evt.response.url, evt.request_id])
last_xhr_request[index_] = time.time()
else:
xhr_requests[index_] = [[evt.response.url, evt.request_id], ]
last_xhr_request[index_] = time.time()
page.add_handler(mycdp.network.ResponseReceived, handler)
async def receiveXHR(page, requests, index_):
responses = []
retries = 0
max_retries = 5
# Wait at least 2 seconds after last XHR request for more
while True:
if last_xhr_request[index_] is None or retries > max_retries:
break
if time.time() - last_xhr_request[index_] <= 3:
retries = retries + 1
await asyncio.sleep(2)
continue
else:
break
await page
# Loop through gathered requests and get response body
for request in requests:
try:
res = await page.send(mycdp.network.get_response_body(request[1]))
if res is None:
continue
responses.append({
"url": request[0],
"body": json.loads(res[0]),
"is_base64": res[1],
})
except Exception as e:
print("Error getting response:", e)
if responses:
xhr_requests[index_] = []
return responses
async def request_paused_handler(event, tab):
r = event.request
is_image = ".png" in r.url or ".jpg" in r.url or ".gif" in r.url or ".webp" in r.url or "pcdn.goldapple.ru" in r.url or "/front/api/apm/events" in r.url or '.mp4' in r.url
if not is_image: # Let the data through
tab.feed_cdp(
mycdp.fetch.continue_request(request_id=event.request_id)
)
else: # Block the data (images)
TIMED_OUT = mycdp.network.ErrorReason.TIMED_OUT
tab.feed_cdp(
mycdp.fetch.fail_request(event.request_id, TIMED_OUT)
)
async def check_hxr(index_, tab):
print("Starting check_hxr for index:", index_) # Debug print
products_ = []
while True:
print("Waiting for XHR responses...") # Debug print
xhr_responses = await receiveXHR(tab, xhr_requests.get(index_, []), index_)
print("Received XHR responses:", xhr_responses) # Debug print
data = await get_xhr_products(xhr_responses)
if data:
products_.extend(data)
print("request: ", xhr_requests.get(index_, [])) # Original print
await asyncio.sleep(1)
async def crawl(browser_index, link, scrols, max_pages=None, start_pages=0):
products_parsed = []
retries = 5
page_count = 0
driver = await cdp_driver.start_async()
tab = await driver.get("about:blank")
listenXHR(tab, browser_index)
tab.add_handler(mycdp.fetch.RequestPaused, request_paused_handler)
url = f"{link}?p={start_pages}&storestocks=1"
await tab.get(url)
await asyncio.sleep(5)
# check_hxr_task = asyncio.create_task(check_hxr(browser_index, tab))
# print("hello")
while True:
# xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
# data = await get_xhr_products(xhr_responses)
# if not data:
# retries -= 1
# else:
# products_parsed.extend(data)
if not await is_have_products(tab):
break
await scroll_down(tab)
await need_click_load_more(tab)
print(f"Браузер {browser_index} - Страница {page_count}; Всего продуктов: {len(products_parsed)}")
# if retries == 0:
# break
page_count += 1
if page_count == 5:
break
xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
data = await get_xhr_products(xhr_responses)
print(data)
print(f"Браузер {browser_index} собрал {len(products_parsed)} продуктов на странице {url}")
products.extend(products_parsed)
async def main():
driver = await cdp_driver.start_async()
for n, link in enumerate(config.links):
tasks = []
tab = await driver.get("https://goldapple.ru/" + link)
await tab.wait_for("span[data-category-products-count]")
product_count = await tab.select("span[data-category-products-count]")
product_count = product_count.attrs.get("data-category-products-count")
product_pages = int(int(product_count) / 24)
scrolls_per_browser = distribute_scrolls(product_pages, 1)
start_pages = []
start = 0
for s in scrolls_per_browser:
start_pages.append(start)
start += s
for i in range(1):
tasks.append(crawl(i, "https://goldapple.ru" + link, scrolls_per_browser[i], 100, start_pages[i]))
await asyncio.gather(*tasks)
if __name__ == "__main__":
asyncio.run(main())
`