ApeRAG icon indicating copy to clipboard operation
ApeRAG copied to clipboard

[Improvement] Celery worker OOMKilled

Open iziang opened this issue 1 month ago • 0 comments

🧠 Summary

celery-worker pods are being OOMKilled repeatedly due to excessive memory usage when loading data from very large PostgreSQL tables (lightrag_vdb_relation and lightrag_vdb_entity).


πŸ› Problem Description

The celeryworker pod in the aperag deployment frequently restarts because of Out of Memory (OOMKilled) events. After investigation, it appears that the worker is loading entire tables from PostgreSQL into memory during the execution of the query_lightrag_vdb_relation_all function.

These two tables have grown significantly and are now comparable to, or larger than, the pod's memory limit (4 GiB):

postgres=# SELECT pg_size_pretty(pg_total_relation_size('lightrag_vdb_relation'));
 pg_size_pretty 
----------------
 4228 MB
(1 row)

postgres=# SELECT pg_size_pretty(pg_total_relation_size('lightrag_vdb_entity'));
 pg_size_pretty 
----------------
 3533 MB
(1 row)

postgres=# 

📋 Environment Details

Pod:

Name:        celeryworker-799fc9f787-65jmf
Namespace:   default
Node:        cn-hongkong.10.231.119.38
Status:      Running (frequently OOMKilled)
RestartCount: 412
Memory Limit: 4Gi

Image:

apecloud-registry.cn-zhangjiakou.cr.aliyuncs.com/apecloud/aperag:v0.7.0-alpha.5

Command:

/opt/venv/bin/celery -A config.celery worker -l INFO --concurrency=16 -Q 10.231.119.38,celery --pool=threads

πŸ” Investigation

Using py-spy to inspect the running worker process:

root@:~# py-spy dump -p 1956455
Process 1956455: /opt/venv/bin/python3 /opt/venv/bin/celery -A config.celery worker -l INFO --concurrency=16 -Q 10.231.119.38,celery --pool=threads
Python v3.11.13 (/usr/local/bin/python3.11)

Thread 11 (active): "MainThread"
    poll (kombu/utils/eventio.py:83)
    create_loop (kombu/asynchronous/hub.py:317)
    asynloop (celery/worker/loops.py:97)
    start (celery/worker/consumer/consumer.py:772)
    start (celery/bootsteps.py:116)
    start (celery/worker/consumer/consumer.py:341)
    start (celery/bootsteps.py:365)
    start (celery/bootsteps.py:116)
    start (celery/worker/worker.py:203)
    worker (celery/bin/worker.py:356)
    caller (celery/bin/base.py:135)
    new_func (click/decorators.py:33)
    invoke (click/core.py:788)
    invoke (click/core.py:1443)
    invoke (click/core.py:1697)
    main (click/core.py:1082)
    __call__ (click/core.py:1161)
    main (celery/bin/celery.py:231)
    main (celery/__main__.py:15)
    <module> (celery:10)
Thread 30 (active): "ThreadPoolExecutor-3_0"
    _read_ready__get_buffer (asyncio/selector_events.py:974)
    _read_ready (asyncio/selector_events.py:956)
    _run (asyncio/events.py:84)
    _run_once (asyncio/base_events.py:1936)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    process_document_for_celery (lightrag_manager.py:118)
    create_index (aperag/tasks/document.py:114)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 31 (active): "ThreadPoolExecutor-3_1"
    read (ssl.py:1168)
    recv (ssl.py:1295)
    read (httpcore/_backends/sync.py:128)
    _receive_event (httpcore/_sync/http11.py:217)
    _receive_response_headers (httpcore/_sync/http11.py:177)
    handle_request (httpcore/_sync/http11.py:106)
    handle_request (httpcore/_sync/connection.py:103)
    handle_request (httpcore/_sync/connection_pool.py:236)
    handle_request (httpx/_transports/default.py:250)
    _send_single_request (httpx/_client.py:1014)
    _send_handling_redirects (httpx/_client.py:979)
    _send_handling_auth (httpx/_client.py:942)
    send (httpx/_client.py:914)
    post (http_handler.py:761)
    _make_common_sync_call (llm_http_handler.py:175)
    completion (llm_http_handler.py:471)
    completion (litellm/main.py:2626)
    wrapper (litellm/utils.py:1219)
    _completion_core (aperag/llm/completion/completion_service.py:173)
    generate (aperag/llm/completion/completion_service.py:211)
    _summarize_text (aperag/index/summary_index.py:368)
    _generate_document_summary (aperag/index/summary_index.py:305)
    create_index (aperag/index/summary_index.py:75)
    update_index (aperag/index/summary_index.py:196)
    update_index (aperag/tasks/document.py:325)
    update_index_task (config/celery_tasks.py:393)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 32 (idle): "ThreadPoolExecutor-3_2"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    delete_document_for_celery (lightrag_manager.py:126)
    delete_index (aperag/tasks/document.py:216)
    delete_index_task (config/celery_tasks.py:334)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 33 (active): "ThreadPoolExecutor-3_3"
    acquire (logging/__init__.py:927)
    handle (logging/__init__.py:976)
    callHandlers (logging/__init__.py:1706)
    handle (logging/__init__.py:1644)
    _log (logging/__init__.py:1634)
    info (logging/__init__.py:1489)
    cleanup_expired_documents (aperag/tasks/collection.py:211)
    reconcile_all (aperag/tasks/reconciler.py:649)
    cleanup_expired_documents_task (config/celery_tasks.py:841)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 34 (idle): "ThreadPoolExecutor-3_4"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    delete_document_for_celery (lightrag_manager.py:126)
    delete_index (aperag/tasks/document.py:216)
    delete_index_task (config/celery_tasks.py:334)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 35 (active): "ThreadPoolExecutor-3_5"
    _read_ready__get_buffer (asyncio/selector_events.py:974)
    _read_ready (asyncio/selector_events.py:956)
    _run (asyncio/events.py:84)
    _run_once (asyncio/base_events.py:1936)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    process_document_for_celery (lightrag_manager.py:118)
    create_index (aperag/tasks/document.py:114)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 36 (active): "ThreadPoolExecutor-3_6"
    read (ssl.py:1168)
    recv (ssl.py:1295)
    read (httpcore/_backends/sync.py:128)
    _receive_event (httpcore/_sync/http11.py:217)
    _receive_response_headers (httpcore/_sync/http11.py:177)
    handle_request (httpcore/_sync/http11.py:106)
    handle_request (httpcore/_sync/connection.py:103)
    handle_request (httpcore/_sync/connection_pool.py:236)
    handle_request (httpx/_transports/default.py:250)
    _send_single_request (httpx/_client.py:1014)
    _send_handling_redirects (httpx/_client.py:979)
    _send_handling_auth (httpx/_client.py:942)
    send (httpx/_client.py:914)
    post (http_handler.py:761)
    _make_common_sync_call (llm_http_handler.py:175)
    completion (llm_http_handler.py:471)
    completion (litellm/main.py:2626)
    wrapper (litellm/utils.py:1219)
    _completion_core (aperag/llm/completion/completion_service.py:173)
    generate (aperag/llm/completion/completion_service.py:211)
    _summarize_text (aperag/index/summary_index.py:368)
    _generate_document_summary (aperag/index/summary_index.py:305)
    create_index (aperag/index/summary_index.py:75)
    create_index (aperag/tasks/document.py:135)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 37 (idle): "ThreadPoolExecutor-3_7"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    process_document_for_celery (lightrag_manager.py:118)
    create_index (aperag/tasks/document.py:114)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 38 (idle): "ThreadPoolExecutor-3_8"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    delete_document_for_celery (lightrag_manager.py:126)
    delete_index (aperag/tasks/document.py:216)
    delete_index_task (config/celery_tasks.py:334)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 39 (active): "ThreadPoolExecutor-3_9"
    read (ssl.py:1168)
    recv (ssl.py:1295)
    read (httpcore/_backends/sync.py:128)
    _receive_event (httpcore/_sync/http11.py:217)
    _receive_response_headers (httpcore/_sync/http11.py:177)
    handle_request (httpcore/_sync/http11.py:106)
    handle_request (httpcore/_sync/connection.py:103)
    handle_request (httpcore/_sync/connection_pool.py:236)
    handle_request (httpx/_transports/default.py:250)
    _send_single_request (httpx/_client.py:1014)
    _send_handling_redirects (httpx/_client.py:979)
    _send_handling_auth (httpx/_client.py:942)
    send (httpx/_client.py:914)
    post (http_handler.py:761)
    _make_common_sync_call (llm_http_handler.py:175)
    completion (llm_http_handler.py:471)
    completion (litellm/main.py:2626)
    wrapper (litellm/utils.py:1219)
    _completion_core (aperag/llm/completion/completion_service.py:173)
    generate (aperag/llm/completion/completion_service.py:211)
    _summarize_text (aperag/index/summary_index.py:368)
    _generate_document_summary (aperag/index/summary_index.py:305)
    create_index (aperag/index/summary_index.py:75)
    create_index (aperag/tasks/document.py:135)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 40 (idle): "ThreadPoolExecutor-3_10"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    delete_document_for_celery (lightrag_manager.py:126)
    delete_index (aperag/tasks/document.py:216)
    delete_index_task (config/celery_tasks.py:334)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 41 (active): "ThreadPoolExecutor-3_11"
    read (ssl.py:1168)
    recv (ssl.py:1295)
    read (httpcore/_backends/sync.py:128)
    _receive_event (httpcore/_sync/http11.py:217)
    _receive_response_headers (httpcore/_sync/http11.py:177)
    handle_request (httpcore/_sync/http11.py:106)
    handle_request (httpcore/_sync/connection.py:103)
    handle_request (httpcore/_sync/connection_pool.py:236)
    handle_request (httpx/_transports/default.py:250)
    _send_single_request (httpx/_client.py:1014)
    _send_handling_redirects (httpx/_client.py:979)
    _send_handling_auth (httpx/_client.py:942)
    send (httpx/_client.py:914)
    post (http_handler.py:761)
    _make_common_sync_call (llm_http_handler.py:175)
    completion (llm_http_handler.py:471)
    completion (litellm/main.py:2626)
    wrapper (litellm/utils.py:1219)
    _completion_core (aperag/llm/completion/completion_service.py:173)
    generate (aperag/llm/completion/completion_service.py:211)
    _summarize_text (aperag/index/summary_index.py:368)
    _generate_document_summary (aperag/index/summary_index.py:305)
    create_index (aperag/index/summary_index.py:75)
    create_index (aperag/tasks/document.py:135)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 42 (idle): "ThreadPoolExecutor-3_12"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    delete_document_for_celery (lightrag_manager.py:126)
    delete_index (aperag/tasks/document.py:216)
    delete_index_task (config/celery_tasks.py:334)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 43 (active): "ThreadPoolExecutor-3_13"
    emit (logging/__init__.py:1113)
    handle (logging/__init__.py:978)
    callHandlers (logging/__init__.py:1706)
    handle (logging/__init__.py:1644)
    _log (logging/__init__.py:1634)
    info (logging/__init__.py:1489)
    cleanup_expired_documents (aperag/tasks/collection.py:211)
    reconcile_all (aperag/tasks/reconciler.py:649)
    cleanup_expired_documents_task (config/celery_tasks.py:841)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 44 (idle): "ThreadPoolExecutor-3_14"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    process_document_for_celery (lightrag_manager.py:118)
    create_index (aperag/tasks/document.py:114)
    create_index_task (config/celery_tasks.py:267)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 45 (idle): "ThreadPoolExecutor-3_15"
    select (selectors.py:468)
    _run_once (asyncio/base_events.py:1898)
    run_forever (asyncio/base_events.py:608)
    run_until_complete (asyncio/base_events.py:641)
    _run_in_new_loop (lightrag_manager.py:216)
    delete_document_for_celery (lightrag_manager.py:126)
    delete_index (aperag/tasks/document.py:216)
    delete_index_task (config/celery_tasks.py:334)
    run (celery/app/autoretry.py:38)
    __protected_call__ (celery/app/trace.py:736)
    trace_task (celery/app/trace.py:453)
    fast_trace_task (celery/app/trace.py:651)
    apply_target (celery/concurrency/base.py:30)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 46 (active): "ThreadPoolExecutor-1_0"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3476 (active): "asyncio_0"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3478 (active): "asyncio_0"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3480 (active): "asyncio_0"
    do_execute (sqlalchemy/engine/default.py:945)
    _exec_single_context (sqlalchemy/engine/base.py:1964)
    _execute_context (sqlalchemy/engine/base.py:1843)
    _execute_clauseelement (sqlalchemy/engine/base.py:1638)
    _execute_on_connection (sqlalchemy/sql/elements.py:523)
    execute (sqlalchemy/engine/base.py:1416)
    orm_execute_statement (sqlalchemy/orm/bulk_persistence.py:1294)
    _execute_internal (sqlalchemy/orm/session.py:2251)
    execute (sqlalchemy/orm/session.py:2365)
    _upsert_node (aperag/db/repositories/graph.py:65)
    _execute_transaction (aperag/db/repositories/base.py:65)
    upsert_graph_node (aperag/db/repositories/graph.py:69)
    _sync_upsert_node (lightrag/kg/pg_ops_sync_graph_storage.py:68)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3482 (active): "asyncio_0"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3484 (active): "asyncio_0"
    do_execute (sqlalchemy/engine/default.py:945)
    _exec_single_context (sqlalchemy/engine/base.py:1964)
    _execute_context (sqlalchemy/engine/base.py:1843)
    _execute_clauseelement (sqlalchemy/engine/base.py:1638)
    _execute_on_connection (sqlalchemy/sql/elements.py:523)
    execute (sqlalchemy/engine/base.py:1416)
    orm_execute_statement (sqlalchemy/orm/context.py:306)
    _execute_internal (sqlalchemy/orm/session.py:2251)
    execute (sqlalchemy/orm/session.py:2365)
    _query (aperag/db/repositories/lightrag.py:576)
    _execute_query (aperag/db/repositories/base.py:55)
    query_lightrag_vdb_relation_all (aperag/db/repositories/lightrag.py:579)
    _sync_get_all (lightrag/kg/pg_ops_sync_vector_storage.py:100)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3485 (active): "asyncio_0"
    do_execute (sqlalchemy/engine/default.py:945)
    _exec_single_context (sqlalchemy/engine/base.py:1964)
    _execute_context (sqlalchemy/engine/base.py:1843)
    _execute_clauseelement (sqlalchemy/engine/base.py:1638)
    _execute_on_connection (sqlalchemy/sql/elements.py:523)
    execute (sqlalchemy/engine/base.py:1416)
    orm_execute_statement (sqlalchemy/orm/context.py:306)
    _execute_internal (sqlalchemy/orm/session.py:2251)
    execute (sqlalchemy/orm/session.py:2365)
    _query (aperag/db/repositories/lightrag.py:576)
    _execute_query (aperag/db/repositories/base.py:55)
    query_lightrag_vdb_relation_all (aperag/db/repositories/lightrag.py:579)
    _sync_get_all (lightrag/kg/pg_ops_sync_vector_storage.py:100)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3486 (active): "asyncio_0"
    <listcomp> (sqlalchemy/orm/loading.py:223)
    chunks (sqlalchemy/orm/loading.py:223)
    _fetchall_impl (sqlalchemy/engine/result.py:2268)
    _fetchall_impl (sqlalchemy/engine/result.py:1674)
    _allrows (sqlalchemy/engine/result.py:548)
    all (sqlalchemy/engine/result.py:1767)
    _query (aperag/db/repositories/lightrag.py:577)
    _execute_query (aperag/db/repositories/base.py:55)
    query_lightrag_vdb_relation_all (aperag/db/repositories/lightrag.py:579)
    _sync_get_all (lightrag/kg/pg_ops_sync_vector_storage.py:100)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3487 (active+gil): "asyncio_0"
    fetchall (sqlalchemy/engine/cursor.py:1137)
    _fetchall_impl (sqlalchemy/engine/cursor.py:2135)
    _raw_all_rows (sqlalchemy/engine/result.py:540)
    chunks (sqlalchemy/orm/loading.py:219)
    _fetchall_impl (sqlalchemy/engine/result.py:2268)
    _fetchall_impl (sqlalchemy/engine/result.py:1674)
    _allrows (sqlalchemy/engine/result.py:548)
    all (sqlalchemy/engine/result.py:1767)
    _query (aperag/db/repositories/lightrag.py:577)
    _execute_query (aperag/db/repositories/base.py:55)
    query_lightrag_vdb_relation_all (aperag/db/repositories/lightrag.py:579)
    _sync_get_all (lightrag/kg/pg_ops_sync_vector_storage.py:100)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3488 (active): "asyncio_0"
    do_execute (sqlalchemy/engine/default.py:945)
    _exec_single_context (sqlalchemy/engine/base.py:1964)
    _execute_context (sqlalchemy/engine/base.py:1843)
    _execute_clauseelement (sqlalchemy/engine/base.py:1638)
    _execute_on_connection (sqlalchemy/sql/elements.py:523)
    execute (sqlalchemy/engine/base.py:1416)
    orm_execute_statement (sqlalchemy/orm/context.py:306)
    _execute_internal (sqlalchemy/orm/session.py:2251)
    execute (sqlalchemy/orm/session.py:2365)
    _query (aperag/db/repositories/lightrag.py:576)
    _execute_query (aperag/db/repositories/base.py:55)
    query_lightrag_vdb_relation_all (aperag/db/repositories/lightrag.py:579)
    _sync_get_all (lightrag/kg/pg_ops_sync_vector_storage.py:100)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3489 (active): "asyncio_0"
    do_execute (sqlalchemy/engine/default.py:945)
    _exec_single_context (sqlalchemy/engine/base.py:1964)
    _execute_context (sqlalchemy/engine/base.py:1843)
    _execute_clauseelement (sqlalchemy/engine/base.py:1638)
    _execute_on_connection (sqlalchemy/sql/elements.py:523)
    execute (sqlalchemy/engine/base.py:1416)
    orm_execute_statement (sqlalchemy/orm/context.py:306)
    _execute_internal (sqlalchemy/orm/session.py:2251)
    execute (sqlalchemy/orm/session.py:2365)
    _query (aperag/db/repositories/lightrag.py:576)
    _execute_query (aperag/db/repositories/base.py:55)
    query_lightrag_vdb_relation_all (aperag/db/repositories/lightrag.py:579)
    _sync_get_all (lightrag/kg/pg_ops_sync_vector_storage.py:100)
    run (concurrent/futures/thread.py:58)
    _worker (concurrent/futures/thread.py:83)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3490 (active): "asyncio_1"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3491 (active): "asyncio_1"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3492 (active): "asyncio_1"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3493 (active): "asyncio_1"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3497 (active): "ThreadPoolExecutor-1_1"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3498 (active): "ThreadPoolExecutor-1_2"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)
Thread 3499 (active): "ThreadPoolExecutor-1_3"
    _worker (concurrent/futures/thread.py:81)
    run (threading.py:982)
    _bootstrap_inner (threading.py:1045)
    _bootstrap (threading.py:1002)

Stack traces show multiple active calls to:

query_lightrag_vdb_relation_all

This function attempts to load all records from lightrag_vdb_relation, causing high memory pressure.


💥 Root Cause

  • The tables lightrag_vdb_relation and lightrag_vdb_entity are very large (4 GB and 3.5 GB respectively).
  • The query_lightrag_vdb_relation_all function does not paginate or stream results.
  • As a result, the entire dataset is loaded into memory, exceeding the Celery worker's 4 GiB memory limit and triggering OOMKilled events.

✅ Suggested Fixes / Improvements

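  1. Implement pagination or streaming queries when fetching relation/entity data. Example: use keyset pagination (or LIMIT/OFFSET), or a server-side cursor: with psycopg2, a named cursor such as conn.cursor(name='cursor_name'); with SQLAlchemy, execution_options(stream_results=True) together with yield_per()/partitions() instead of calling .all() on the result.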
  2. Avoid loading entire tables into memory for processing — process items in batches.
  3. Consider offloading large queries to background jobs with higher memory limits or move data aggregation logic into SQL.
  4. Optionally, increase the worker memory limit temporarily as a mitigation (from 4Gi → 8Gi), but the core issue is unbounded memory usage.

📈 Impact

  • Frequent Celery worker restarts.
  • Tasks interrupted or retried repeatedly.
  • Reduced system reliability and performance.

iziang avatar Nov 05 '25 09:11 iziang