Bypass Cloudflare error
Hi everyone, I have a problem when I try to bypass Cloudflare verification. Here is the beginning of my script:
from botasaurus import *
from botasaurus.browser import browser, Driver, Wait
from botasaurus import bt
# Function to read URLs from a file
def read_urls_from_file(file_path):
    """Return the non-blank lines of a text file as a list of URLs.

    Args:
        file_path: Path of a text file holding one URL per line.

    Returns:
        The lines in file order, with surrounding whitespace stripped and
        blank (or whitespace-only) lines dropped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark, which would otherwise stay attached to the first URL
    # (str.strip() does not treat U+FEFF as whitespace). Being explicit also
    # keeps the result independent of the platform's locale default.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]
# Read the URLs from the file when the script is loaded; the resulting list
# is handed to the @browser decorator further down as `data=urls`.
url_file_path = 'file.txt'
urls = read_urls_from_file(url_file_path)
@browser(
    block_images_and_css=True,
    wait_for_complete_page_load=True,
    parallel=10,
    reuse_driver=True,
    cache=True,
    tiny_profile=True,
    profile="pikachu",
    #close_on_crash=True,
    data=urls
)
def scrape_links_task(driver: Driver, urls):
    """Open a page through Google and collect the hrefs of its ``a.pj-link`` anchors.

    NOTE(review): despite the plural name, ``urls`` is used as a single
    address here (it is passed straight to ``driver.google_get``); presumably
    the decorator calls the task once per entry of ``data`` - confirm.

    Returns:
        The non-empty ``href`` values found inside the ``.liste2colonnes``
        container, or an empty list when the error page is detected or the
        container is missing. Both failure cases are appended to
        ``errors.log`` and printed.
    """
    log_path = 'errors.log'

    driver.google_get(urls, bypass_cloudflare=True)

    # Guard 1: the site's error page was reached instead of the listing.
    on_error_page = driver.is_in_page("https://URL/error")
    if on_error_page:
        with open(log_path, 'a') as log:
            log.write(f"Error URL encountered: {urls}\n")
        print(f"Error URL encountered: {urls}")
        return []

    # Guard 2: the element that should wrap the links is not present.
    container = driver.select(".liste2colonnes")
    if not container:
        with open(log_path, 'a') as log:
            log.write(f"No links found: {urls}\n")
        print(f"No links found: {urls}")
        return []

    # Collect the href of every matching anchor, skipping empty values.
    anchors = container.select_all("a.pj-link")
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    return [href for href in hrefs if href]
Here is the error:
Traceback (most recent call last):
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus/browser_decorator.py", line 171, in run_task
result = func(driver, data)
^^^^^^^^^^^^^^^^^^
File "$PATH2scrape-links.py", line 54, in scrape_links_task
driver.google_get(current_url, bypass_cloudflare=True)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 536, in google_get
self.get_via(link, "https://www.google.com/", bypass_cloudflare=bypass_cloudflare, wait=wait)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 522, in get_via
self.detect_and_bypass_cloudflare()
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 878, in detect_and_bypass_cloudflare
bypass_if_detected(self)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 122, in bypass_if_detected
wait_till_cloudflare_leaves(driver, previous_ray_id, raise_exception)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 41, in wait_till_cloudflare_leaves
current_ray_id = get_rayid(driver)
^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 25, in get_rayid
ray = driver.get_text(".ray-id code")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 743, in get_text
elem = self.wait_for_element(selector, wait)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 749, in wait_for_element
return make_element(self, self._tab, self._run(self._tab.wait_for(selector, timeout=wait)))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 872, in wait_for
item = self.query_selector(selector)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 403, in query_selector
node_id = self.send(cdp.dom.query_selector(doc.node_id, selector))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 249, in send
result = parse_response(result, cdp_obj)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 45, in parse_response
raise ChromeException(response["error"])
botasaurus_driver.exceptions.ChromeException: Could not find node with given id [code: -32000]
Task failed for input: https://$URL/
Pagination found and pipe elements found
Traceback (most recent call last):
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus/browser_decorator.py", line 171, in run_task
result = func(driver, data)
^^^^^^^^^^^^^^^^^^
File "$PATH2scrape-links.py", line 37, in scrape_links_task
driver.google_get(urls, bypass_cloudflare=True)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 536, in google_get
self.get_via(link, "https://www.google.com/", bypass_cloudflare=bypass_cloudflare, wait=wait)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 522, in get_via
self.detect_and_bypass_cloudflare()
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 878, in detect_and_bypass_cloudflare
bypass_if_detected(self)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 122, in bypass_if_detected
wait_till_cloudflare_leaves(driver, previous_ray_id, raise_exception)
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 41, in wait_till_cloudflare_leaves
current_ray_id = get_rayid(driver)
^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/solve_cloudflare_captcha.py", line 25, in get_rayid
ray = driver.get_text(".ray-id code")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 743, in get_text
elem = self.wait_for_element(selector, wait)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/driver.py", line 749, in wait_for_element
return make_element(self, self._tab, self._run(self._tab.wait_for(selector, timeout=wait)))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 872, in wait_for
item = self.query_selector(selector)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/tab.py", line 403, in query_selector
node_id = self.send(cdp.dom.query_selector(doc.node_id, selector))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 249, in send
result = parse_response(result, cdp_obj)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "$PATH/my-python-env/lib/python3.11/site-packages/botasaurus_driver/core/connection.py", line 45, in parse_response
raise ChromeException(response["error"])
botasaurus_driver.exceptions.ChromeException: Could not find node with given id [code: -32000]
Can anyone help me solve this issue? Thanks in advance
This error occurs because it can't open a Chrome instance.
1. Try this basic code first, to check that Chrome can open an instance correctly:
from botasaurus.browser import browser, Driver
@browser
def scrape_heading_task(driver: Driver, data):
    """Load the Omkar Cloud home page and return its ``h1`` text in a dict.

    ``data`` is accepted but not used by this task.
    """
    # Visit the Omkar Cloud website
    driver.get("https://www.omkar.cloud/")

    # Read the text of the heading element
    heading_text = driver.get_text("h1")

    # Per the original snippet, the returned data is saved as a JSON file in
    # output/scrape_heading_task.json (presumably by the decorator - confirm).
    return {"heading": heading_text}


# Initiate the web scraping task
scrape_heading_task()
- If the test above runs correctly and opens a Chrome instance, then try running the code from your original post with Python 3.10.
Is there a way to make the script wait until the Google Chrome driver has opened completely?
The Cloudflare bypass is not working: the page kept loading for about 44 minutes and I still got neither an error nor a result.
Thank you! And do you have any tips on how I can improve my code to not get this error?
from botasaurus.browser import browser, Driver
@browser(headless=True)
def scrape_heading_task(driver: Driver, data):
    """Open a ZoomInfo company page with the Cloudflare bypass and return its ``h1`` text.

    A screenshot is written to ``pic.png`` and ``driver.prompt()`` is called
    before the heading is read. ``data`` is accepted but not used.
    """
    # Open the ZoomInfo page, asking the driver to bypass Cloudflare
    driver.get("https://www.zoominfo.com/c/thermon-group-holdings-inc/38126213", bypass_cloudflare=True)
    driver.save_screenshot('pic.png')
    driver.prompt()

    # Read the text of the heading element
    heading_text = driver.get_text("h1")

    # Per the original snippet, the returned data is saved as a JSON file in
    # output/scrape_heading_task.json (presumably by the decorator - confirm).
    return {"heading": heading_text}


# Initiate the web scraping task and keep its result in a one-element list
data = scrape_heading_task()
datas = [data]
Cloudflare is not being bypassed. Could anyone help me with this? Thanks in advance.
This bug has been resolved. Run:
python -m pip install bota botasaurus botasaurus-api botasaurus-requests botasaurus-driver botasaurus-proxy-authentication botasaurus-server botasaurus-humancursor --upgrade
then try
from botasaurus.browser import browser, Driver
@browser
def scrape_heading_task(driver: Driver, data):
    """Open the NopeCHA Cloudflare demo page with the bypass enabled, then call ``driver.prompt()``.

    ``data`` is accepted but not used by this task.
    """
    demo_url = "https://nopecha.com/demo/cloudflare"
    driver.google_get(demo_url, bypass_cloudflare=True)
    # presumably pauses so the opened page can be inspected - confirm
    driver.prompt()


# Run the task once
scrape_heading_task()