Crawling very slow and timeout error
Hello, I'm experiencing performance issues with my web crawler after approximately 1.5 to 2 hours of runtime. The crawling speed significantly decreases to about one site per minute or less, and I'm encountering numerous timeout errors.
Questions:
- What could be causing the performance degradation over time (maybe related to the queue size)?
- Is this behavior expected?
Here is the code I use:
import asyncio

from crawlee.beautifulsoup_crawler import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        url = context.request.url
        context.log.info(f"Processing page: {url}")
        await context.enqueue_links(strategy="all")

    # Start the crawler with the provided URLs
    await crawler.run(["https://crawlee.dev/"])


if __name__ == "__main__":
    asyncio.run(main())
The logs and errors:
[crawlee.beautifulsoup_crawler.beautifulsoup_crawler] ERROR An exception occurred during handling of a request. This places the crawler and its underlying storages into an unknown state and crawling will be terminated.
Traceback (most recent call last):
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/basic_crawler/basic_crawler.py", line 872, in __run_request_handler
await self._context_pipeline(crawling_context, self.router)
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/basic_crawler/context_pipeline.py", line 62, in __call__
result = await middleware_instance.__anext__()
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py", line 69, in _make_http_request
result = await self._http_client.crawl(
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/http_clients/httpx_client.py", line 98, in crawl
response = await client.send(http_request, follow_redirects=True)
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_client.py", line 1675, in send
raise exc
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_client.py", line 1669, in send
await response.aread()
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 911, in aread
self._content = b"".join([part async for part in self.aiter_bytes()])
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 911, in <listcomp>
self._content = b"".join([part async for part in self.aiter_bytes()])
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 929, in aiter_bytes
async for raw_bytes in self.aiter_raw():
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 987, in aiter_raw
async for raw_stream_bytes in self.stream:
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_client.py", line 149, in __aiter__
async for chunk in self._stream:
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_transports/default.py", line 254, in __aiter__
async for part in self._httpcore_stream:
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 367, in __aiter__
raise exc from None
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 363, in __aiter__
async for part in self._stream:
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 349, in __aiter__
raise exc
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 341, in __aiter__
async for chunk in self._connection._receive_response_body(**kwargs):
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 210, in _receive_response_body
event = await self._receive_event(timeout=timeout)
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 224, in _receive_event
data = await self._network_stream.read(
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_backends/anyio.py", line 35, in read
return await self._stream.receive(max_bytes=max_bytes)
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/streams/tls.py", line 205, in receive
data = await self._call_sslobject_method(self._ssl_object.read, max_bytes)
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/streams/tls.py", line 147, in _call_sslobject_method
data = await self.transport_stream.receive()
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 1145, in receive
await AsyncIOBackend.checkpoint()
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2050, in checkpoint
await sleep(0)
File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 596, in sleep
await __sleep0()
File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 590, in __sleep0
yield
asyncio.exceptions.CancelledError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 456, in wait_for
return fut.result()
asyncio.exceptions.CancelledError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/_utils/wait.py", line 37, in wait_for
return await asyncio.wait_for(operation(), timeout.total_seconds())
File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 458, in wait_for
raise exceptions.TimeoutError() from exc
asyncio.exceptions.TimeoutError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/basic_crawler/basic_crawler.py", line 782, in __run_task_function
await wait_for(
File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/_utils/wait.py", line 39, in wait_for
raise asyncio.TimeoutError(timeout_message) from ex
asyncio.exceptions.TimeoutError: Request handler timed out after 60.0 seconds
[crawlee.autoscaling.autoscaled_pool] WARN Task timed out after *not set* seconds
[crawlee.statistics.statistics] INFO crawlee.beautifulsoup_crawler.beautifulsoup_crawler request statistics {
"requests_finished": 5622,
"requests_failed": 1,
"retry_histogram": [
5622,
0,
1
],
"request_avg_failed_duration": 0.859078,
"request_avg_finished_duration": 50.458161,
"requests_finished_per_minute": 127,
"requests_failed_per_minute": 0,
"request_total_duration": 283676.640522,
"requests_total": 5623,
"crawler_runtime": 2646.37724
}
[crawlee.autoscaling.autoscaled_pool] INFO current_concurrency = 197; desired_concurrency = 161; cpu = 0.0; mem = 0.0; ev
It also uses a lot of RAM (after 4 to 6 hours of crawling):
[crawlee.autoscaling.autoscaled_pool] INFO current_concurrency = 183; desired_concurrency = 173; cpu = 0.0; mem = 0.0; event_loop = 0.252; client_info = 0.0
[crawlee.autoscaling.snapshotter] WARN Memory is critically overloaded. Using 7.04 GB of 7.81 GB (90%). Consider increasing available memory.
[crawlee.statistics.statistics] INFO crawlee.beautifulsoup_crawler.beautifulsoup_crawler request statistics {
"requests_finished": 30381,
"requests_failed": 7,
"retry_histogram": [
30374,
7,
7
],
"request_avg_failed_duration": 1.340926,
"request_avg_finished_duration": 120.59418,
"requests_finished_per_minute": 87,
"requests_failed_per_minute": 0,
"request_total_duration": 3663781.171706,
"requests_total": 30388,
"crawler_runtime": 20939.883378
}
Is it up to the user to limit the number of URLs added to the queue, or does the library manage that (hard limit, etc.)?
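For reference, a minimal sketch of capping the crawl yourself, assuming the crawler constructor accepts a max_requests_per_crawl option (the value 5000 is arbitrary):

import asyncio
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

async def main() -> None:
    # Assumed hard cap: once roughly 5000 requests have been processed, the crawler
    # stops handling new ones, so the queue cannot grow without bound.
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=5000)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f"Processing page: {context.request.url}")
        await context.enqueue_links(strategy="all")

    await crawler.run(["https://crawlee.dev/"])

if __name__ == "__main__":
    asyncio.run(main())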
Interesting. What is your total available memory?
32 GB is available on my system.
I exported my storage to Google Drive so you can test with it: https://drive.google.com/file/d/1P8AgbgbVLmujiceYRtMIKK91zn9GVjen/view?usp=sharing
CRAWLEE_PURGE_ON_START=0 python test.py
When there are a lot of pending requests, Crawlee is very slow.
I'm seeing slow scraping too, about 200 requests per minute, even though I self-host the webpage being scraped. There are numerous stretches where the scraper does literally nothing and just waits for something.
@marisancans Would you mind sharing your scraper code as well? It might help us debug.
I also get the warning "Consider increasing available memory." Is there any way for the user to control the memory allocation?
Unless you're limiting the memory usage knowingly, no, there isn't, at least without digging deep in Crawlee's internals. Of course, if you're working with a cloud platform such as Apify, you can configure the available memory there.
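For anyone who does want to limit memory knowingly, a minimal sketch based on the configuration fields used in a snippet later in this thread (whether these are stable, public knobs is not confirmed here):

from crawlee import service_locator

# Assumed knob: treat 80% of system memory as available to the autoscaled pool,
# mirroring configuration.available_memory_ratio = 0.8 from the snippet below.
configuration = service_locator.get_configuration()
configuration.available_memory_ratio = 0.8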
Thank you for your response :D
> I'm seeing slow scraping too, about 200 requests per minute, even though I self-host the webpage being scraped. There are numerous stretches where the scraper does literally nothing and just waits for something.
What is your PC setup (RAM, CPU)? I'm trying to increase speed and can't get over 25 req/min. Maybe you can advise some parameters to increase it; I use these concurrency settings: ConcurrencySettings(min_concurrency=10, max_concurrency=200, max_tasks_per_minute=200, desired_concurrency=110)
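For context, a minimal sketch of wiring those settings into the crawler (import paths and parameter names taken from messages in this thread; the values are illustrative, not tuned):

from crawlee import ConcurrencySettings
from crawlee.crawlers import BeautifulSoupCrawler

# Illustrative values from the message above; actual throughput also depends on the
# target site, the network, and available memory.
concurrency_settings = ConcurrencySettings(
    min_concurrency=10,
    max_concurrency=200,
    max_tasks_per_minute=200,
    desired_concurrency=110,
)

crawler = BeautifulSoupCrawler(concurrency_settings=concurrency_settings)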
@janbuchar
With the code below, the crawler collects data slowly and memory usage keeps increasing. I would like to know how to optimize it.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import redis
import asyncio
import time  # Add this import
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee import ConcurrencySettings, service_locator
from crawlee.http_clients import HttpxHttpClient
from universal_crawler.routes import crawler_router
from crawlee.proxy_configuration import ProxyConfiguration
import aioredis

from conf.settings import settings, apify_logger
from conf.db_config import redis_config

# Disable clearing the `RequestQueue`, `KeyValueStore` and `Dataset` on each run.
# This makes the scraper continue from where it left off in the previous run.
# The recommended way to achieve this behavior is setting the environment variable
# `CRAWLEE_PURGE_ON_START=0`
configuration = service_locator.get_configuration()
# configuration.purge_on_start = False
configuration.write_metadata = False
configuration.persist_storage = False
configuration.available_memory_ratio = 0.8


class HostCrawler:
    def __init__(self):
        pool = aioredis.ConnectionPool(
            host=redis_config.host,
            port=redis_config.port,
            username=redis_config.username,
            password=redis_config.password,
            db=redis_config.db,
            max_connections=50,
        )
        self.redis_client = aioredis.Redis(connection_pool=pool)
        self.concurrency_settings = settings.CONCURRENCY_SETTINGS

    async def get_host_from_redis(self):
        task_info = await self.redis_client.spop(redis_config.batch_host_queue)
        if task_info:
            if isinstance(task_info, bytes):
                task_info = task_info.decode('utf-8').strip()
            task_info = json.loads(task_info)
            host = task_info['host']
            proxy_urls = task_info['proxy_urls']
            return host.strip(), proxy_urls
        return None, None

    async def set_finished_host(self, host):
        try:
            await self.redis_client.sadd(redis_config.batch_host_finished, host)
        except Exception as e:
            apify_logger.error(e)

    @staticmethod
    async def check_crawl_timeout(crawler):
        while True:
            await asyncio.sleep(10)
            if time.time() - crawler_router.get_last_crawl_time() > 3 * 60:
                apify_logger.info("No data crawled for 3 minutes. Stopping crawler.")
                crawler.stop()
                return True
            if not crawler._running:
                return False

    async def crawl_host(self, host, proxy_urls=None):
        http_client = HttpxHttpClient(
            persist_cookies_per_session=False,
            timeout=30,
            follow_redirects=True,
        )
        proxy_configuration = None
        if proxy_urls:
            proxy_configuration = ProxyConfiguration(
                proxy_urls=proxy_urls
            )
        crawler = BeautifulSoupCrawler(
            request_handler=crawler_router.router,
            concurrency_settings=self.concurrency_settings,
            http_client=http_client,
            proxy_configuration=proxy_configuration,
            max_request_retries=3,
        )
        crawler_task = asyncio.create_task(crawler.run([host]))
        timeout_task = asyncio.create_task(self.check_crawl_timeout(crawler))
        done, pending = await asyncio.wait(
            [crawler_task, timeout_task],
            return_when=asyncio.FIRST_COMPLETED
        )
        for task in pending:
            task.cancel()
        apify_logger.info(f"host {host} finished")

    async def run(self):
        while True:
            host, proxy_urls = await self.get_host_from_redis()
            if not host:
                apify_logger.info("task empty")
                await asyncio.sleep(10)
                continue
            try:
                await self.crawl_host(host, proxy_urls=proxy_urls)
                await self.set_finished_host(host)
            except Exception as e:
                apify_logger.error(f"run host {host} failed: {e}")


async def main():
    crawler = HostCrawler()
    await crawler.run()
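For completeness, the snippet above defines main() but never invokes it; a minimal entry point, assuming the script is run directly:

if __name__ == "__main__":
    asyncio.run(main())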
@ycq0125 could you please post this as a Discussion instead? I'd like to avoid derailing this thread.
https://github.com/apify/crawlee-python/discussions/1190
This might be related to https://github.com/apify/crawlee-python/issues/1224
Hi, we completely reworked the request queue and the local storage clients. @Jourdelune could you please tell us whether this is still happening in Crawlee v1.0?