crawl4ai
[Bug]: - api - ERROR - Crawl error: cannot access local variable 'sig' where it is not associated with a value
crawl4ai version
0.7.6
Expected Behavior
crawler_pool.py (new file)
```python
import asyncio, json, hashlib, time, psutil
from contextlib import suppress
from typing import Dict

from crawl4ai import AsyncWebCrawler, BrowserConfig
from utils import load_config

CONFIG = load_config()

POOL: Dict[str, AsyncWebCrawler] = {}
LAST_USED: Dict[str, float] = {}
LOCK = asyncio.Lock()

MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0)  # % RAM – refuse new browsers above this
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800)  # close if unused for 30 min


def _sig(cfg: BrowserConfig) -> str:
    """Generate a signature for the browser config, handling non-serializable objects like ProxyConfig."""

    def make_serializable(obj):
        """Recursively convert objects to a JSON-serializable format."""
        if isinstance(obj, dict):
            return {k: make_serializable(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple)):
            return [make_serializable(item) for item in obj]
        elif hasattr(obj, '__dict__'):
            # Handle objects (like ProxyConfig): extract public attributes
            result = {}
            for key, value in obj.__dict__.items():
                if not key.startswith('_'):
                    result[key] = make_serializable(value)
            return result
        elif hasattr(obj, '__class__'):
            # Other object types: fall back to their string representation
            return str(obj)
        else:
            return obj

    try:
        # Convert the config to a dict
        if hasattr(cfg, 'to_dict'):
            cfg_dict = cfg.to_dict()
        elif hasattr(cfg, '__dict__'):
            cfg_dict = cfg.__dict__.copy()
        else:
            cfg_dict = dict(cfg) if isinstance(cfg, dict) else str(cfg)
        # Serialize the config, handling ProxyConfig and other non-serializable objects
        serialized = make_serializable(cfg_dict)
        payload = json.dumps(serialized, sort_keys=True, separators=(",", ":"))
        return hashlib.sha1(payload.encode()).hexdigest()
    except Exception:
        # Fallback: use the object ID and type name if serialization fails
        fallback = f"{type(cfg).__name__}_{id(cfg)}"
        return hashlib.sha1(fallback.encode()).hexdigest()


async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
    sig = None  # Initialize sig before the try block to avoid UnboundLocalError
    try:
        sig = _sig(cfg)
        async with LOCK:
            if sig in POOL:
                LAST_USED[sig] = time.time()
                return POOL[sig]
            if psutil.virtual_memory().percent >= MEM_LIMIT:
                raise MemoryError("RAM pressure – new browser denied")
            crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
            await crawler.start()
            POOL[sig] = crawler
            LAST_USED[sig] = time.time()
            return crawler
    except MemoryError as e:
        raise MemoryError(f"RAM pressure – new browser denied: {e}")
    except Exception as e:
        # Ensure sig has a value even if _sig() failed
        if sig is None:
            try:
                sig = _sig(cfg)
            except Exception:
                sig = "error_fallback"
        raise RuntimeError(f"Failed to start browser: {e}")
    finally:
        # Only touch the pool if sig was successfully initialized
        if sig is not None:
            if sig in POOL:
                LAST_USED[sig] = time.time()
            else:
                # If the browser failed to start, make sure it is not left in the pool
                POOL.pop(sig, None)
                LAST_USED.pop(sig, None)


async def close_all():
    async with LOCK:
        await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
        POOL.clear()
        LAST_USED.clear()


async def janitor():
    while True:
        await asyncio.sleep(60)
        now = time.time()
        async with LOCK:
            for sig, crawler in list(POOL.items()):
                if now - LAST_USED[sig] > IDLE_TTL:
                    with suppress(Exception):
                        await crawler.close()
                    POOL.pop(sig, None)
                    LAST_USED.pop(sig, None)
```
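For a quick sanity check of the patched `_sig`, a minimal sketch (the top-level `ProxyConfig` import and the proxy address are assumptions for illustration):

```python
from crawl4ai import BrowserConfig, ProxyConfig  # top-level ProxyConfig export assumed

from crawler_pool import _sig

cfg = BrowserConfig(
    headless=True,
    proxy_config=ProxyConfig(server="http://127.0.0.1:8080"),  # placeholder proxy
)

# With the patched _sig(), a proxy-bearing config hashes instead of raising:
digest = _sig(cfg)
print(digest)               # stable sha1 hex digest
assert digest == _sig(cfg)  # deterministic for the same config object
```

`get_crawler()` then pools one started browser per distinct signature, so repeated requests with the same proxy settings reuse a single browser.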
Current Behavior
The error occurs when a proxy is configured in a local deployment.
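The root cause is visible in the shipped `_sig`: `BrowserConfig.to_dict()` leaves the nested `ProxyConfig` instance in place, and `json.dumps` cannot encode it. A minimal sketch of that first failure (top-level `ProxyConfig` import assumed):

```python
import json
from crawl4ai import BrowserConfig, ProxyConfig  # top-level export assumed

cfg = BrowserConfig(proxy_config=ProxyConfig(server="http://127.0.0.1:8080"))

# to_dict() keeps the ProxyConfig instance nested, so the encoder raises:
# TypeError: Object of type ProxyConfig is not JSON serializable
json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
```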
Is this reproducible?
Yes
Inputs Causing the Bug
```
Traceback (most recent call last):
  File "/app/crawler_pool.py", line 24, in get_crawler
    sig = _sig(cfg)
          ^^^^^^^^^
  File "/app/crawler_pool.py", line 19, in _sig
    payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/json/__init__.py", line 238, in dumps
    **kw).encode(obj)
    ^^^^^^^^^^^
  File "/usr/local/lib/python3.12/json/encoder.py", line 200, in encode
    chunks = self.iterencode(o, _one_shot=True)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/json/encoder.py", line 258, in iterencode
    return _iterencode(o, 0)
           ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/json/encoder.py", line 180, in default
    raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type ProxyConfig is not JSON serializable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/app/crawler_pool.py", line 38, in get_crawler
    raise RuntimeError(f"Failed to start browser: {e}")
RuntimeError: Failed to start browser: Object of type ProxyConfig is not JSON serializable
```
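The error in the issue title is the secondary failure: in the shipped `get_crawler`, `sig` is first assigned inside the `try` block, so when `_sig()` raises, the `finally` clause reads `sig` before it was ever bound. A dependency-free sketch of the same Python behavior:

```python
def get_thing():
    try:
        sig = compute_sig()  # raises before sig is bound
        return sig
    finally:
        print(sig)  # UnboundLocalError here masks the original TypeError

def compute_sig():
    raise TypeError("Object of type ProxyConfig is not JSON serializable")

try:
    get_thing()
except UnboundLocalError as e:
    print(e)  # cannot access local variable 'sig' where it is not associated with a value
```

Initializing `sig = None` before the `try` block, as the patched file above does, removes this second failure mode.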
Steps to Reproduce
Code snippets
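A sketch of the kind of request that should trigger the error against a local deployment (the default port 11235, the `/crawl` endpoint, and the `type`/`params` payload convention are assumptions based on the Docker docs):

```python
import requests

# Assumed payload shape for the Docker server's /crawl endpoint; the nested
# proxy_config is what reaches crawler_pool._sig() on the server side.
payload = {
    "urls": ["https://example.com"],
    "browser_config": {
        "type": "BrowserConfig",
        "params": {
            "headless": True,
            "proxy_config": {
                "type": "ProxyConfig",
                "params": {"server": "http://127.0.0.1:8080"},  # placeholder proxy
            },
        },
    },
}

resp = requests.post("http://localhost:11235/crawl", json=payload)
print(resp.status_code, resp.text)
```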
OS
windows
Python version
3.9.0
Browser
No response
Browser version
No response
Error logs & Screenshots (if applicable)
No response