
Add support for additional DBMS for `SqlStorageClient`

Mantisus opened this issue 3 months ago · 0 comments

Currently, `SqlStorageClient` supports only SQLite and PostgreSQL. Add support for other DBMSs, for example:

  • MySQL
  • MariaDB
  • SQL Server
  • Oracle Database (oracledb)

Supporting certain databases may require additional configuration and optimization on the database side.
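Assuming `SqlStorageClient` keeps accepting SQLAlchemy-style connection strings, each new backend would likely be reached through its async driver. The dialect/driver pairs below are illustrative, not a confirmed or tested list:

# Illustrative SQLAlchemy-style async connection strings (assumption:
# SqlStorageClient continues to accept SQLAlchemy URLs; the driver
# choices here are examples only).
CONNECTION_EXAMPLES = {
    # MySQL via the asyncmy driver
    'mysql': 'mysql+asyncmy://user:password@localhost:3306/crawlee',
    # MariaDB reuses the MySQL dialect
    'mariadb': 'mysql+asyncmy://user:password@localhost:3306/crawlee',
    # SQL Server via aioodbc (requires an ODBC driver on the host)
    'mssql': 'mssql+aioodbc://user:password@localhost:1433/crawlee?driver=ODBC+Driver+18+for+SQL+Server',
    # Oracle via python-oracledb in async mode
    'oracle': 'oracle+oracledb_async://user:password@localhost:1521/?service_name=FREEPDB1',
}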

When adding support for a new database, test it with several concurrent clients under an intensive workload.

For example, using:

import asyncio
from concurrent.futures import ProcessPoolExecutor

from crawlee import ConcurrencySettings, service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import RequestQueue

# Placeholder: set this to the connection string of the DBMS under test.
CONNECTION = None

async def run(queue_name: str) -> None:
    async with SqlStorageClient(connection_string=CONNECTION) as storage_client:
        # Register the SQL storage client globally so all storages use it.
        service_locator.set_storage_client(storage_client)
        # All worker processes share the same queue by opening it by name.
        queue = await RequestQueue.open(name=queue_name)

        http_client = HttpxHttpClient()

        crawler = ParselCrawler(
            http_client=http_client,
            request_manager=queue,
            concurrency_settings=ConcurrencySettings(desired_concurrency=20),
        )

        @crawler.router.default_handler
        async def request_handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing URL: {context.request.url}...')
            data = {
                'url': context.request.url,
                'title': context.selector.css('title::text').get(),
            }
            await context.push_data(data)
            await context.enqueue_links()

        await crawler.run(['https://crawlee.dev'])

def process_run(queue_name: str) -> None:
    # Entry point for each worker process; runs its own event loop.
    asyncio.run(run(queue_name))

def multi_run(queue_name: str = 'multi') -> None:
    # Run several processes against the same named queue to simulate
    # multiple clients hitting the database concurrently.
    workers = 3
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # Consume the iterator so exceptions raised in workers propagate.
        list(executor.map(process_run, [queue_name] * workers))

if __name__ == '__main__':
    multi_run()
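
After the processes finish, it can be useful to confirm that the shared storages ended up in a consistent state. A minimal sketch, assuming the same connection string and that the crawlers pushed into the default dataset (the SQLite URL passed at the bottom is hypothetical):

import asyncio

from crawlee import service_locator
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import Dataset

async def verify(connection: str) -> None:
    async with SqlStorageClient(connection_string=connection) as storage_client:
        service_locator.set_storage_client(storage_client)
        # The crawlers above pushed their data into the default dataset.
        dataset = await Dataset.open()
        page = await dataset.get_data()
        print(f'Total items stored: {page.total}')

if __name__ == '__main__':
    asyncio.run(verify('sqlite+aiosqlite:///crawlee.db'))  # hypothetical connection string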

Mantisus · Sep 11, 2025