[Bug]: Docker server does not decode ContentRelevanceFilter
crawl4ai version
0.7.7
Expected Behavior
The crawl4ai server can decode ContentRelevanceFilter and execute deep crawls accordingly.
Current Behavior
The REST API returns status code 500, and I see the following in my terminal:
    if depth != 0 and not await self.filter_chain.apply(url):
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/crawl4ai/deep_crawling/filters.py", line 97, in apply
    result = f.apply(url)
             ^^^^^^^
AttributeError: 'dict' object has no attribute 'apply'
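The AttributeError suggests that when the server rebuilds the FilterChain, the nested filter is left as its raw params dict instead of being instantiated, so the chain ends up calling .apply() on a dict. A stand-alone sketch of that failure mode (class name borrowed from crawl4ai, but this is an illustration, not the library code):

class FilterChain:
    """Simplified stand-in for crawl4ai's FilterChain; illustration only."""
    def __init__(self, filters):
        self.filters = filters

    def apply(self, url):
        # Each entry is expected to be a filter object exposing .apply().
        return all(f.apply(url) for f in self.filters)

# If deserialization leaves the inner filter as a plain dict...
chain = FilterChain([{"query": ["about", "faq"], "threshold": 0.2}])
chain.apply("https://docs.crawl4ai.com/faq")
# AttributeError: 'dict' object has no attribute 'apply'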
Note: This bug is similar to https://github.com/unclecode/crawl4ai/issues/1419. I based the code snippet below on the PR that fixed that older issue, https://github.com/unclecode/crawl4ai/pull/1436. Following that PR, the fix here may be to store the original query in the __slots__ of ContentRelevanceFilter for serialization purposes (the class lives in crawl4ai/deep_crawling/filters.py, per the traceback above).
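To make that concrete, here is a minimal sketch of the PR #1436 pattern: keep the raw constructor arguments on the instance (and list them in __slots__) so a dump/load round-trip can rebuild the filter from its params dict. All names below are illustrative stand-ins, not the actual crawl4ai implementation:

class ContentRelevanceFilterSketch:
    # Keeping the raw constructor args in __slots__ lets a serializer
    # round-trip the {"type": ..., "params": {...}} payload shape used
    # by the REST API.
    __slots__ = ("query", "threshold")

    def __init__(self, query, threshold=0.2):
        self.query = query          # original query kept verbatim
        self.threshold = threshold

    def to_params(self):
        return {"type": "ContentRelevanceFilter",
                "params": {"query": self.query, "threshold": self.threshold}}

    @classmethod
    def from_params(cls, d):
        return cls(**d["params"])

f = ContentRelevanceFilterSketch(["about", "faq"], threshold=0.2)
assert ContentRelevanceFilterSketch.from_params(f.to_params()).query == ["about", "faq"]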
Is this reproducible?
Yes
Inputs Causing the Bug
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "ContentRelevanceFilter",
"params": {
"query": ["about", "faq"],
"threshold": 0.2
}
}
]
}
}
Steps to Reproduce
1. Start the Docker server on port 11235.
2. Test the REST API directly with a BFSDeepCrawlStrategy containing a ContentRelevanceFilter.
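For a quick check without running the full script under "Code snippets", the same request can be sent with curl (assuming the default /crawl endpoint on port 11235, matching the script below):

curl -s -X POST http://localhost:11235/crawl \
  -H "Content-Type: application/json" \
  -d '{
        "urls": ["https://docs.crawl4ai.com"],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": true}},
        "crawler_config": {
          "type": "CrawlerRunConfig",
          "params": {
            "deep_crawl_strategy": {
              "type": "BFSDeepCrawlStrategy",
              "params": {
                "max_depth": 2,
                "filter_chain": {
                  "type": "FilterChain",
                  "params": {
                    "filters": [
                      {
                        "type": "ContentRelevanceFilter",
                        "params": {"query": ["about", "faq"], "threshold": 0.2}
                      }
                    ]
                  }
                }
              }
            },
            "cache_mode": "bypass"
          }
        }
      }'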
Code snippets
import asyncio
import httpx

BASE_URL = "http://localhost:11235/"  # Adjust port as needed


async def test_with_rest_api():
    """Test using REST API directly."""
    print("\n" + "=" * 60)
    print("Testing with REST API")
    print("=" * 60)

    # Create filter configuration
    deep_crawl_strategy_payload = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": 2,
            "filter_chain": {
                "type": "FilterChain",
                "params": {
                    "filters": [
                        {
                            "type": "ContentRelevanceFilter",
                            "params": {
                                "query": ["about", "faq"],
                                "threshold": 0.2
                            }
                        }
                    ]
                }
            }
        }
    }

    crawl_payload = {
        "urls": ["https://docs.crawl4ai.com"],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "deep_crawl_strategy": deep_crawl_strategy_payload,
                "cache_mode": "bypass"
            }
        }
    }

    try:
        async with httpx.AsyncClient() as client:
            print("\n1. Sending crawl request to REST API...")
            response = await client.post(
                f"{BASE_URL}crawl",
                json=crawl_payload,
                timeout=30
            )
            if response.status_code == 200:
                print("✅ REST API returned 200 OK")
                data = response.json()
                if data.get("success"):
                    results = data.get("results", [])
                    print(f"✅ Got {len(results)} results")
                    for i, result in enumerate(results[:3]):
                        print(f"  Result {i}: {result.get('url', 'unknown')[:50]}...")
                else:
                    print(f"❌ Crawl not successful: {data}")
                    return False
            else:
                print(f"❌ REST API returned {response.status_code}")
                print(f"  Response: {response.text[:500]}")
                return False

        print("\n✅ REST API test completed successfully!")
        return True
    except Exception as e:
        print(f"❌ REST API test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run all tests."""
    # Test: REST API
    rest_passed = await test_with_rest_api()
    return 0 if rest_passed else 1


if __name__ == "__main__":
    import sys
    sys.exit(asyncio.run(main()))
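On the 0.7.7 Docker image, this script prints "❌ REST API returned 500" while the server terminal shows the AttributeError traceback above; once ContentRelevanceFilter deserializes correctly, it should print the ✅ lines instead.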
OS
macOS Sequoia 15.7.1
Python version
3.10.18
Browser
No response
Browser version
No response
Error logs & Screenshots (if applicable)
No response