Is non-streaming faster than streaming?
Deployed CosyVoice3 on a 4090: non-streaming RTF is around 0.1, streaming around 0.2.
(Screenshots: non-streaming and streaming RTF logs)
Hi, which interface are you calling, and how is your latency so low? On H800 and L20 my streaming RTF exceeds 0.4. How was your test environment set up (e.g., inference-acceleration configuration and versions)? Looking forward to your reply~
```python
import sys
import io
import logging

import markdown
import torch
import torchaudio
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field

sys.path.append('/mnt/sda/hlx/CosyVoice2/third_party/Matcha-TTS')

from cosyvoice.cli.cosyvoice import CosyVoice3
from cosyvoice.utils.file_utils import load_wav
from cosyvoice.utils.common import set_all_random_seed


def markdown_to_text(markdown_string):
    # Convert Markdown to HTML, then extract plain text with BeautifulSoup
    html = markdown.markdown(markdown_string)
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()


# Configure logging (usually done once at application startup)
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = FastAPI()

cosyvoice = CosyVoice3(
    '/mnt/sda/hlx/CosyVoice3/pretrained_models/FunAudioLLM/Fun-CosyVoice3-0___5B-2512',
    # load_jit=True,
    load_trt=True,
    load_vllm=True,
    fp16=True
)


# Request body (mirrors the OpenAI TTS API format)
class TTSRequest(BaseModel):
    # Aligned with OpenAI's "input" parameter (replaces the original "text")
    input: str = Field(..., description="Text to convert to speech")
    # OpenAI requires a "model" parameter (set to whatever model names you support)
    model: str = Field(..., description="TTS model to use, e.g. 'cosyvoice-0.5b'")
    # Optional: voice (keeps OpenAI-compatible naming)
    voice: str = Field("echo", description="Voice, e.g. 'alloy'/'echo'/'fable'")
    # Optional: output format (default mp3, as OpenAI recommends)
    response_format: str = Field("mp3", description="Audio format, 'mp3' or 'wav'")
    # Optional: speed (0.25-4.0)
    speed: float = Field(1.0, description="Speech speed, range 0.25-4.0")
    # Optional: stream the audio back
    stream: bool = Field(True, description="Whether to stream the audio response")


@app.post("/v1/audio/speech")
async def generate_speech(request: TTSRequest):
    try:
        # Validate the model name (optional; adjust to your actual model names)
        supported_models = ["CosyVoice2-0.5B"]
        if request.model not in supported_models:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported model: {request.model}, supported models: {supported_models}"
            )
        voice_prompts = {
            "alloy": "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
            "hlx": "You are a helpful assistant.<|endofprompt|>按病因和发病机制分为以下三类,红细胞生成减少性贫血",
            "doubao": "You are a helpful assistant.<|endofprompt|>下班回家看到门口的快递盒,拆开的瞬间像拆盲盒,哪怕是刚需用品,也能收获满满的小开心。"
        }

        # Validate the speed range
        if not (0.25 <= request.speed <= 4.0):
            raise HTTPException(
                status_code=400,
                detail="Speed must be between 0.25 and 4.0"
            )
        prompt_text = voice_prompts.get(request.voice, "请用正常的语气朗读下面的内容。")
        # Fix the random seed for reproducible output
        set_all_random_seed(42)
        # inference_zero_shot expects a 16 kHz waveform tensor, not a file path,
        # so load the prompt audio with load_wav (passing a raw path string here
        # is a common cause of silent/broken output)
        prompt_speech_16k = load_wav(f'./asset/{request.voice}.wav', 16000)
        # Generate speech (streamed back to the client in batches of chunks)
        text = markdown_to_text(request.input)

        def generate():
            # Accumulate BATCH_CHUNKS model chunks before encoding and yielding a segment
            BATCH_CHUNKS = 3
            audio_chunks = []
            for chunk in cosyvoice.inference_zero_shot(
                    text,              # OpenAI's "input" parameter
                    prompt_text,       # prompt text (selects the voice/timbre)
                    prompt_speech_16k,
                    stream=False,
                    speed=request.speed):  # apply the validated speed
                audio_chunks.append(chunk["tts_speech"])
                # Encode to the requested format (mp3/wav) and flush a segment
                if len(audio_chunks) >= BATCH_CHUNKS:
                    concatenated_tensor = torch.cat(audio_chunks, dim=1)
                    with io.BytesIO() as buffer:
                        torchaudio.save(
                            buffer,
                            concatenated_tensor,
                            sample_rate=cosyvoice.sample_rate,
                            format=request.response_format,
                        )
                        buffer.seek(0)
                        yield buffer.read()
                    audio_chunks.clear()
            # Flush whatever is left after the loop
            if audio_chunks:
                concatenated_tensor = torch.cat(audio_chunks, dim=1)
                with io.BytesIO() as buffer:
                    torchaudio.save(
                        buffer,
                        concatenated_tensor,
                        sample_rate=cosyvoice.sample_rate,
                        format=request.response_format,
                    )
                    buffer.seek(0)
                    yield buffer.read()

        # Media type for the streamed response: mp3 -> audio/mpeg, wav -> audio/wav
        media_type = "audio/mpeg" if request.response_format == "mp3" else "audio/wav"
        return StreamingResponse(generate(), media_type=media_type)
    except HTTPException:
        # Known errors (e.g. invalid parameters): re-raise as-is
        raise
    except Exception as e:
        # Unknown errors: return an OpenAI-style error payload
        logger.exception("Error while generating speech")
        raise HTTPException(
            status_code=500,
            detail={
                "error": {
                    "message": str(e),
                    "code": "internal_error",
                    "param": None
                }
            }
        )


if __name__ == "__main__":
    uvicorn.run(app, host="192.168.2.115", port=9997)
```
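For anyone who wants to try the endpoint, here's a minimal, untested client sketch (the host/port follow the `uvicorn.run` call above; the model and voice names are the ones hard-coded in the server). Each streamed segment is an independently encoded MP3 chunk, so simply concatenating them yields a playable file:

```python
import requests

resp = requests.post(
    "http://192.168.2.115:9997/v1/audio/speech",
    json={
        "input": "你好,欢迎使用语音合成服务。",
        "model": "CosyVoice2-0.5B",  # must match supported_models on the server
        "voice": "alloy",
        "response_format": "mp3",
    },
    stream=True,
)
resp.raise_for_status()
with open("output.mp3", "wb") as f:
    # Write each MP3 segment as it arrives from the StreamingResponse
    for segment in resp.iter_content(chunk_size=None):
        f.write(segment)
```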
About the environment: it's just a conda environment with requirements.txt installed, nothing special.
Non-streaming is inherently faster than streaming (lower RTF); streaming's advantage is only in first-packet latency, which makes it the better fit for latency-sensitive inference scenarios.
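To make the trade-off concrete, here's a rough benchmarking sketch (assuming the same `cosyvoice`, `text`, `prompt_text`, and `prompt_speech_16k` objects as in the server code above) that measures first-packet latency and RTF for both modes:

```python
import time

def benchmark(stream: bool):
    """Measure time-to-first-chunk and RTF for one inference pass."""
    start = time.time()
    first_packet = None
    total_samples = 0
    for chunk in cosyvoice.inference_zero_shot(
            text, prompt_text, prompt_speech_16k, stream=stream):
        if first_packet is None:
            first_packet = time.time() - start  # latency to first audio chunk
        total_samples += chunk["tts_speech"].shape[1]
    wall = time.time() - start
    audio_seconds = total_samples / cosyvoice.sample_rate
    print(f"stream={stream}: first packet {first_packet:.3f}s, "
          f"RTF {wall / audio_seconds:.3f}")

benchmark(stream=False)  # lower overall RTF
benchmark(stream=True)   # faster first packet
```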
@houliangxue Could you share which vllm version you're using? I'm hitting environment conflicts.
@houliangxue Why is it that with your code, the generated audio files are all silent?
What's your CUDA version, and which vllm version are you using? By default it seems vllm can't be enabled.