botasaurus icon indicating copy to clipboard operation
botasaurus copied to clipboard

Bypass Cloudflare error

Open Operating3336 opened this issue 1 year ago • 5 comments

Hi everyone, I have a problem when I try to bypass cloudflare verification. Here is the beginning of my script:

from botasaurus import *
from botasaurus.browser import browser, Driver, Wait
from botasaurus import bt

# Function to read URLs from a file
def read_urls_from_file(file_path):
    """Return the non-blank lines of a text file, stripped of whitespace.

    Args:
        file_path: Path of the file to read, one URL per line.

    Returns:
        A list with one stripped string per non-blank line, in file order.
    """
    collected = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Lines that are empty after stripping carry no URL; skip them.
            if entry:
                collected.append(entry)
    return collected

# Read URLs from the file
url_file_path = 'file.txt'
urls = read_urls_from_file(url_file_path)

@browser(
    block_images_and_css=True,
    wait_for_complete_page_load=True,
    parallel=10,
    reuse_driver=True,
    cache=True,
    tiny_profile=True,
    profile="pikachu",
    #close_on_crash=True,
    data=urls
)
def scrape_links_task(driver: Driver, urls):
    """Load a page via ``driver.google_get`` and collect its ``a.pj-link`` hrefs.

    Args:
        driver: The browser driver handed in by the ``@browser`` decorator.
        urls: The value passed to ``driver.google_get`` — presumably a single
            URL taken from the decorator's ``data`` list (TODO confirm).

    Returns:
        A list of non-empty ``href`` values found inside the
        ``.liste2colonnes`` container, or an empty list when the error page
        is detected or the container is missing.
    """
    error_file = 'errors.log'

    def _report(message):
        # Record the problem in the log file and echo it to stdout.
        with open(error_file, 'a') as log:
            log.write(f"{message}\n")
        print(message)

    driver.google_get(urls, bypass_cloudflare=True)

    # Bail out early when the site's error page was reached.
    if driver.is_in_page("https://URL/error"):
        _report(f"Error URL encountered: {urls}")
        return []

    # The links of interest live inside this container.
    list_container = driver.select(".liste2colonnes")
    if not list_container:
        _report(f"No links found: {urls}")
        return []

    # Read every anchor's href and keep only the truthy ones.
    hrefs = (
        anchor.get_attribute('href')
        for anchor in list_container.select_all("a.pj-link")
    )
    return [href for href in hrefs if href]

Here is the error:

Traceback (most recent call last):
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus/browser_decorator.py", line 171, in run_task
    result = func(driver, data)
             ^^^^^^^^^^^^^^^^^^
  File "$PATH2scrape-links.py", line 54, in scrape_links_task
    driver.google_get(current_url, bypass_cloudflare=True)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 536, in google_get
    self.get_via(link, "https://www.google.com/", bypass_cloudflare=bypass_cloudflare, wait=wait)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 522, in get_via
    self.detect_and_bypass_cloudflare()
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 878, in detect_and_bypass_cloudflare
    bypass_if_detected(self)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 122, in bypass_if_detected
    wait_till_cloudflare_leaves(driver, previous_ray_id, raise_exception)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 41, in wait_till_cloudflare_leaves
    current_ray_id = get_rayid(driver)
                     ^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 25, in get_rayid
    ray = driver.get_text(".ray-id code")
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 743, in get_text
    elem = self.wait_for_element(selector, wait)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 749, in wait_for_element
    return make_element(self, self._tab, self._run(self._tab.wait_for(selector, timeout=wait)))
                                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 872, in wait_for
    item = self.query_selector(selector)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 403, in query_selector
    node_id = self.send(cdp.dom.query_selector(doc.node_id, selector))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 249, in send
    result = parse_response(result, cdp_obj)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 45, in parse_response
    raise ChromeException(response["error"])
botasaurus_driver.exceptions.ChromeException: Could not find node with given id [code: -32000]
Task failed for input: https://$URL/
Pagination found and pipe elements found
Traceback (most recent call last):
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus/browser_decorator.py", line 171, in run_task
    result = func(driver, data)
             ^^^^^^^^^^^^^^^^^^
  File "$PATH2scrape-links.py", line 37, in scrape_links_task
    driver.google_get(urls, bypass_cloudflare=True)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 536, in google_get
    self.get_via(link, "https://www.google.com/", bypass_cloudflare=bypass_cloudflare, wait=wait)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 522, in get_via
    self.detect_and_bypass_cloudflare()
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 878, in detect_and_bypass_cloudflare
    bypass_if_detected(self)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 122, in bypass_if_detected
    wait_till_cloudflare_leaves(driver, previous_ray_id, raise_exception)
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 41, in wait_till_cloudflare_leaves
    current_ray_id = get_rayid(driver)
                     ^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 25, in get_rayid
    ray = driver.get_text(".ray-id code")
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 743, in get_text
    elem = self.wait_for_element(selector, wait)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 749, in wait_for_element
    return make_element(self, self._tab, self._run(self._tab.wait_for(selector, timeout=wait)))
                                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 872, in wait_for
    item = self.query_selector(selector)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 403, in query_selector
    node_id = self.send(cdp.dom.query_selector(doc.node_id, selector))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 249, in send
    result = parse_response(result, cdp_obj)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 45, in parse_response
    raise ChromeException(response["error"])
botasaurus_driver.exceptions.ChromeException: Could not find node with given id [code: -32000]

Can anyone help me solve this issue? Thanks in advance

Operating3336 avatar Jun 17 '24 08:06 Operating3336

This error is because it can't open a chrome instance.

1. Try this basic code to check whether Chrome can open an instance correctly:

from botasaurus.browser import browser, Driver

@browser
def scrape_heading_task(driver: Driver, data):
    """Open the Omkar Cloud website and return the text of its ``h1`` element.

    Args:
        driver: The browser driver handed in by the ``@browser`` decorator.
        data: Input item supplied by the decorator; unused here.

    Returns:
        A dict with a single ``"heading"`` key holding the heading text.
    """
    # Visit the Omkar Cloud website
    driver.get("https://www.omkar.cloud/")

    # Retrieve the heading element's text
    heading = driver.get_text("h1")

    # Save the data as a JSON file in output/scrape_heading_task.json
    return {
        "heading": heading
    }

# Initiate the web scraping task
scrape_heading_task()

  2. If the test above runs correctly and opens a Chrome instance, then try running the code from your original post with Python 3.10.

stevenhubhub avatar Jun 17 '24 22:06 stevenhubhub

Is there a way to specify that I have to wait for the google-chrome driver to open completely ?

Operating3336 avatar Jun 21 '24 09:06 Operating3336

The Cloudflare bypass is not working: the page kept loading for about 44m and I still did not get any error or result.

yuke2002 avatar Jul 11 '24 08:07 yuke2002

Thank you! And do you have any tips on how I can improve my code to not get this error?

Operating3336 avatar Jul 11 '24 09:07 Operating3336

from botasaurus.browser import browser, Driver

@browser(headless=True)
def scrape_heading_task(driver: Driver, data):
    """Open a ZoomInfo company page with Cloudflare bypass and return its ``h1`` text.

    Args:
        driver: The browser driver handed in by the ``@browser`` decorator.
        data: Input item supplied by the decorator; unused here.

    Returns:
        A dict with a single ``"heading"`` key holding the heading text.
    """
    # Visit the ZoomInfo company page, asking the driver to bypass Cloudflare
    driver.get("https://www.zoominfo.com/c/thermon-group-holdings-inc/38126213", bypass_cloudflare=True)
    # Capture what the browser shows, for debugging the bypass
    driver.save_screenshot('pic.png')
    driver.prompt()

    # Retrieve the heading element's text
    heading = driver.get_text("h1")

    # Save the data as a JSON file in output/scrape_heading_task.json
    return {
        "heading": heading
    }

# Initiate the web scraping task
data = scrape_heading_task()
datas = []
datas.append(data)

Cloudflare is not being bypassed. Could anyone help me with this? Thanks in advance.

yuke2002 avatar Jul 11 '24 11:07 yuke2002

This bug has been resolved. Upgrade the packages by running:

python -m pip install bota botasaurus botasaurus-api botasaurus-requests botasaurus-driver botasaurus-proxy-authentication botasaurus-server botasaurus-humancursor --upgrade

then try

from botasaurus.browser import browser, Driver

@browser
def scrape_heading_task(driver: Driver, data):
    """Open the NopeCHA Cloudflare demo page with bypass enabled, then prompt.

    Args:
        driver: The browser driver handed in by the ``@browser`` decorator.
        data: Input item supplied by the decorator; unused here.
    """
    demo_url = "https://nopecha.com/demo/cloudflare"
    driver.google_get(demo_url, bypass_cloudflare=True)
    # NOTE(review): presumably pauses so the open browser can be inspected — confirm.
    driver.prompt()

scrape_heading_task()

Chetan11-dev avatar Apr 11 '25 09:04 Chetan11-dev