requests-html
requests-html copied to clipboard
A Browser closed issue
Unhandled error: Browser closed unexpectedly
My code:
# Standard-library imports.
from asyncio import events
import argparse
import asyncio
import hashlib
import os
import re
import sys
import threading
import time

# Third-party imports.
import requests
import urllib3
import uvloop
from pyppeteer import launch
from requests_html import AsyncHTMLSession, HTMLSession

# Suppress the warnings emitted because requests are sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Default headers attached to every scan request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
    'Content-Encoding': 'gzip',
}

# Use uvloop as the asyncio event-loop implementation.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
def get_url(url_txt):
    """Read a text file of URLs and return them as a list of stripped lines.

    Args:
        url_txt: Path to a text file containing one URL per line.

    Returns:
        list[str]: Each line of the file with surrounding whitespace removed.
    """
    # Explicit encoding avoids platform-dependent defaults; output_data()
    # already writes utf-8, so reads should match.
    with open(url_txt, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
def output_data(i, out_name):
    """Append *i* as one line to the file *out_name* (UTF-8, append mode)."""
    with open(out_name, "a", encoding='utf-8') as handle:
        handle.write(f"{i}\n")
def get_md5_value(src):
    """Return the hexadecimal MD5 digest of *src*, encoded as UTF-8."""
    return hashlib.md5(src.encode("utf8")).hexdigest()
async def process_data(sem, s, i, None_data_list):
    """Fetch one URL, render it in the headless browser, and record metadata.

    Args:
        sem: asyncio.Semaphore bounding concurrent renders.
        s: AsyncHTMLSession used for the request and JS render.
        i: URL to fetch.
        None_data_list: Shared list; a [status, title, length, md5, url]
            row is appended to it on success (side effect).
    """
    async with sem:
        try:
            r = await s.get(url=i, timeout=30, headers=headers, verify=False)
            # Render JavaScript via the session's Chromium (pyppeteer).
            # NOTE(review): the reported "Browser closed unexpectedly"
            # errors originate here — presumably the shared browser dies
            # under load; confirm against pyppeteer logs.
            await r.html.arender(wait=30, sleep=30, timeout=30, retries=1)
            content_length = len(r.content)
            code = r.status_code
            # Strip CR/LF/spaces so the body MD5 is stable across formatting.
            content = r.html.html.replace('\r', '').replace('\n', '').replace(' ', '')
            body_md5 = get_md5_value(str(content))
            if '<title>' in content:
                title = re.findall('(?<=<title>)(.+?)(?=</title>)', content)[0]
            elif r.html.find('title', first=True):
                title = r.html.find('title', first=True).text
            else:
                title = 'None'
                output_data(i, 'real_None.txt')
            print(f'{i} {r.status_code}, {title}')
            data = [str(code), str(title), str(content_length), body_md5, str(i)]
            None_data_list.append(data)
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
        except Exception as e:
            # Do NOT catch BaseException here: that would swallow
            # asyncio.CancelledError and break the wait_for() timeout in
            # start_up(), leaving tasks (and the browser) in a bad state.
            print(f"Unhandled error: {e}")
async def start_up(urls, None_data_list, timeout_duration=3000):
    """Scan *urls* concurrently (at most 3 at a time) and close the session.

    Args:
        urls: Iterable of URLs to scan.
        None_data_list: Shared list that process_data() appends rows to.
        timeout_duration: Overall timeout in seconds for the whole batch.

    Raises:
        asyncio.TimeoutError: If the batch exceeds timeout_duration.
    """
    s = AsyncHTMLSession(verify=False)
    sem = asyncio.Semaphore(3)
    tasks = (process_data(sem, s, url, None_data_list) for url in urls)
    try:
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout_duration)
    finally:
        # Always close the session: the original skipped close() whenever
        # wait_for timed out or a task raised, leaking the session's
        # Chromium browser ("Browser closed unexpectedly" on later runs).
        await s.close()
def main(urls):
    """Run a scan over *urls* and return the collected metadata rows.

    Args:
        urls: List of URL strings to scan.

    Returns:
        list: One [status, title, length, md5, url] row per successful scan.
    """
    None_data_list = []
    try:
        start = time.perf_counter()
        print(urls)
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(start_up(urls, None_data_list))
        finally:
            # Close the loop we created; leaving it open leaks the loop's
            # resources across repeated main() calls.
            loop.close()
        end = time.perf_counter()
        print(f'None_Scan : {end - start} ')
        output_data(str(end - start), 'debug_time.txt')
        print('')
    except asyncio.TimeoutError:
        print("Timeout occurred")
    except Exception as e:
        print(e)
    finally:
        print(len(None_data_list))
        # HACK: this kills EVERY chrome process on the host, not just the
        # browsers this scan launched — consider closing the session's
        # browser explicitly instead.
        os.system('pkill -f -9 chrome')
    return None_data_list
test.py:
new_request_None_url = ['http://bi-mokadisplay.tcl.com:83','http://tmsa.cmp.tcl.com:88']
update_data_list = nonetitle_info.main(new_request_None_url)
print(update_data_list)
#data_info.none_update(False, update_data_list)
Can anyone help me figure out what is going wrong?
This project uses pyppeteer, which relies on a very old version of Chromium. This is easily fixable — you can check my comment on another issue here. Let me know if this helps.
I forked this project and updated it to use playwright. see: #573