PaddleOCR-VL: memory keeps growing and is not released when calling predict in a loop
Checklist:
- [x] Searched related historical issues for an answer
- [x] Read the FAQ
- [x] Read the PaddleX documentation
- [x] Confirmed the bug is still unfixed in the latest version
Describe the problem
After deploying PaddleOCR-VL with the vLLM backend, looping predict requests over 300+ images makes client memory grow from about 2 GB to 10 GB without being released. Is this a memory leak? How can it be improved?
The client and server run as two containers in one Kubernetes pod; each container uses one A100 GPU.
It is the client's memory that grows without being released. Inside the client container, nvidia-smi shows GPU resources in use, so GPU memory is being used normally rather than being substituted with host memory.
In testing, adding gc.collect() after each predict call reduced the growth (2 GB to 6 GB instead of 2 GB to 10 GB), but memory still accumulates request by request.
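For reference, a minimal sketch of that client-side loop with the gc.collect() mitigation, plus RSS logging to quantify the growth (encoded_images and the use of psutil are assumptions, not part of the original report):

import gc

import psutil

proc = psutil.Process()
for file_base64 in encoded_images:  # hypothetical iterable of base64-encoded images
    result = pipeline.predict(file_base64)
    result = [res.json for res in result]
    del result
    gc.collect()  # reported above to cut peak growth from ~10 GB to ~6 GB
    print(f"RSS: {proc.memory_info().rss / 2**30:.2f} GiB")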
Reproduction
from paddleocr import PaddleOCRVL

pipeline = PaddleOCRVL(
    layout_detection_model_name="PP-DocLayoutV2",
    layout_detection_model_dir="/models/PP-DocLayoutV2",
    vl_rec_backend="vllm-server",
    vl_rec_server_url="http://127.0.0.1:8000/v1",
)

async def parse_file(file_base64):
    result = pipeline.predict(file_base64)
    result = [res.json for res in result]
    return result
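The snippet above only defines the parse function; a hypothetical driver reproducing the 300+ image loop from the report could look like this (the image directory and base64 encoding step are assumptions):

import asyncio
import base64
from pathlib import Path

async def main():
    # Assumed image location; the report loops over 300+ images.
    for path in sorted(Path("/data/images").glob("*.jpg")):
        file_base64 = base64.b64encode(path.read_bytes()).decode("ascii")
        await parse_file(file_base64)  # client RSS keeps growing across iterations

asyncio.run(main())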
Environment
- PaddlePaddle and PaddleX versions: paddlepaddle-gpu==3.2.1
- Operating system: Linux
- Python version: 3.12
- CUDA/cuDNN version: CUDA 12.2
Could you please try the following, replacing the entire _genai_client_process method:
def _genai_client_process(
    self,
    data,
    max_new_tokens,
    skip_special_tokens,
    repetition_penalty,
    temperature,
    top_p,
    min_pixels,
    max_pixels,
):
    # Relies on the module-level imports of the original file:
    # io, base64, numpy as np, warnings, logging, threading.Lock.
    import ctypes
    import gc
    import platform

    def _force_memory_compact():
        # Free Python objects, then ask glibc to return cached heap pages to the OS.
        gc.collect()
        if platform.system() == "Linux":
            try:
                libc = ctypes.CDLL("libc.so.6")
                libc.malloc_trim(0)
            except Exception:
                pass

    lock = Lock()

    def _process(item):
        image = item["image"]
        image_url = None
        try:
            # 2. Handle string inputs: URLs pass through; local paths are re-encoded
            if isinstance(image, str):
                if image.startswith("http://") or image.startswith("https://"):
                    image_url = image
                else:
                    from PIL import Image

                    with Image.open(image) as img:
                        img = img.convert("RGB")
                        with io.BytesIO() as buf:
                            img.save(buf, format="JPEG")
                            image_url = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("ascii")
            # 3. Handle NumPy images (OpenCV optimization)
            elif isinstance(image, np.ndarray):
                import cv2

                # Encode BGR directly with imencode, avoiding the extra copy from cvtColor
                success, buffer = cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), 95])
                if not success:
                    raise ValueError("Encode failed")
                b64_str = base64.b64encode(buffer).decode("ascii")
                image_url = f"data:image/jpeg;base64,{b64_str}"
                # Explicitly drop the encoded buffer
                del buffer
                del b64_str
            else:
                raise TypeError(f"Not supported image type: {type(image)}")

            # 4. Key step: drop the input image reference before the network request
            del image
            item["image"] = None

            # Prepare request parameters
            if self._genai_client.backend == "fastdeploy-server":
                kwargs = {
                    "temperature": 1 if temperature is None else temperature,
                    "top_p": 0 if top_p is None else top_p,
                }
            else:
                kwargs = {
                    "temperature": 0 if temperature is None else temperature,
                }
                if top_p is not None:
                    kwargs["top_p"] = top_p
            if max_new_tokens is not None:
                kwargs["max_completion_tokens"] = max_new_tokens
            elif self.model_name in self.model_group["PaddleOCR-VL"]:
                kwargs["max_completion_tokens"] = 8192
            kwargs["extra_body"] = {}
            if skip_special_tokens is not None:
                if self._genai_client.backend in (
                    "fastdeploy-server",
                    "vllm-server",
                    "sglang-server",
                ):
                    kwargs["extra_body"]["skip_special_tokens"] = skip_special_tokens
                else:
                    raise ValueError("Not supported")
            if repetition_penalty is not None:
                kwargs["extra_body"]["repetition_penalty"] = repetition_penalty
            if min_pixels is not None:
                if self._genai_client.backend == "vllm-server":
                    kwargs["extra_body"].setdefault("mm_processor_kwargs", {})["min_pixels"] = min_pixels
                else:
                    warnings.warn(f"{repr(self._genai_client.backend)} does not support `min_pixels`.")
            if max_pixels is not None:
                if self._genai_client.backend == "vllm-server":
                    kwargs["extra_body"].setdefault("mm_processor_kwargs", {})["max_pixels"] = max_pixels
                else:
                    warnings.warn(f"{repr(self._genai_client.backend)} does not support `max_pixels`.")

            with lock:
                future = self._genai_client.create_chat_completion(
                    [
                        {
                            "role": "user",
                            "content": [
                                {"type": "image_url", "image_url": {"url": image_url}},
                                {"type": "text", "text": item["query"]},
                            ],
                        }
                    ],
                    return_future=True,
                    timeout=600,
                    **kwargs,
                )
            return future
        except Exception as e:
            logging.error(f"Processing error: {e}")
            raise e
        finally:
            # 5. Fallback cleanup of locals in case an earlier branch raised
            if "image" in locals():
                del image
            if "buffer" in locals():
                del buffer
            if "b64_str" in locals():
                del b64_str

    if len(data) > 1:
        futures = list(self._thread_pool.map(_process, data))
    else:
        futures = [_process(data[0])]
    results = []
    for future in futures:
        result = future.result()
        results.append(result.choices[0].message.content)
    # 6. After each batch, force gc + malloc_trim to compact memory
    _force_memory_compact()
    return results
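Note on why malloc_trim is added: gc.collect() only frees Python objects, while glibc's allocator tends to keep freed pages cached in its arenas rather than returning them to the OS, which matches the observation that gc.collect() slowed the growth but did not stop it. malloc_trim(0) asks glibc to hand that cached memory back. A standalone sketch to check this effect in isolation (assumptions: glibc-based Linux, psutil installed):

import ctypes
import gc

import psutil

proc = psutil.Process()
# ~1 GiB of 1 KiB allocations; objects this size bypass pymalloc and hit malloc directly
blob = [bytes(1024) for _ in range(10**6)]
del blob
gc.collect()
print("RSS after gc:         ", proc.memory_info().rss // 2**20, "MiB")
ctypes.CDLL("libc.so.6").malloc_trim(0)  # return cached heap pages to the OS
print("RSS after malloc_trim:", proc.memory_info().rss // 2**20, "MiB")

In typical glibc configurations the second reading is noticeably lower than the first.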