Add support for additional DBMS for `SqlStorageClient`
Currently, `SqlStorageClient` only supports SQLite and PostgreSQL. Add support for other DBMSs, for example (candidate connection strings are sketched after the list):
- MySQL
- MariaDB
- SQL Server
- Oracle Database
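Assuming `SqlStorageClient` keeps passing `connection_string` straight to an async SQLAlchemy engine (as it does for SQLite and PostgreSQL today), the standard SQLAlchemy async dialect/driver pairs would look roughly like this. The URLs below are illustrative placeholders, not tested values:

```python
# Candidate async connection strings per DBMS (illustrative only; driver
# availability and exact URL formats should be verified for each database):
MYSQL = 'mysql+asyncmy://user:password@localhost:3306/crawlee'  # or mysql+aiomysql
MARIADB = 'mariadb+asyncmy://user:password@localhost:3306/crawlee'  # MariaDB reuses the MySQL dialect family
MSSQL = 'mssql+aioodbc://user:password@localhost:1433/crawlee?driver=ODBC+Driver+18+for+SQL+Server'
ORACLE = 'oracle+oracledb_async://user:password@localhost:1521/?service_name=FREEPDB1'
```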
Some databases may also require additional server-side configuration and optimization (for example, connection limits or transaction isolation settings) before they can handle the workload reliably.
When adding support for a new database, test it with several concurrent clients under an intensive workload, for example with the following script:
```python
import asyncio
from concurrent.futures import ProcessPoolExecutor

from crawlee import ConcurrencySettings, service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import HttpxHttpClient
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import RequestQueue

# Connection string for the DBMS under test; fill in before running.
CONNECTION = None


async def run(queue_name: str) -> None:
    async with SqlStorageClient(connection_string=CONNECTION) as storage_client:
        # Make every storage opened in this process use the SQL backend.
        service_locator.set_storage_client(storage_client)

        # All worker processes open the queue by the same name, so they
        # share a single request queue in the database.
        queue = await RequestQueue.open(name=queue_name)

        http_client = HttpxHttpClient()

        crawler = ParselCrawler(
            http_client=http_client,
            request_manager=queue,
            concurrency_settings=ConcurrencySettings(desired_concurrency=20),
        )

        @crawler.router.default_handler
        async def request_handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Processing URL: {context.request.url}...')
            data = {
                'url': context.request.url,
                'title': context.selector.css('title::text').get(),
            }
            await context.push_data(data)
            await context.enqueue_links()

        await crawler.run(['https://crawlee.dev'])


def process_run(queue_name: str) -> None:
    asyncio.run(run(queue_name))


def multi_run(queue_name: str = 'multi') -> None:
    # Spawn several processes that hammer the same queue concurrently.
    workers = 3
    with ProcessPoolExecutor(max_workers=workers) as executor:
        executor.map(process_run, [queue_name] * workers)


if __name__ == '__main__':
    multi_run()
```
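Because every worker process opens the queue under the same name, this run exercises concurrent claiming and completion of requests against one shared queue. A backend can be considered working when the crawl finishes without deadlocks and no URL is handled by more than one worker.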