Using vLLM to accelerate the LLM, but the generated output is meaningless noise
Steps to reproduce:
git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git && cd CosyVoice
&& git submodule update --init --recursive
Model download:
from modelscope import snapshot_download
snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
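A minimal invocation sketch for reproducing the noise (based on the repository's standard zero-shot example; the prompt wav and the texts are placeholders, not necessarily the exact inputs used):
# Reproduction sketch; texts and prompt wav follow the repo's zero-shot example and are placeholders.
import sys
sys.path.append('third_party/Matcha-TTS')
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, out in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('vllm_zero_shot_{}.wav'.format(i), out['tts_speech'], cosyvoice.sample_rate)
With the vLLM-backed model described below, the saved wav contains only the noise mentioned in the title.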
/CosyVoice/cosyvoice/cli/cosyvoice.py
import os
import time
from typing import Generator
from tqdm import tqdm
from hyperpyyaml import load_hyperpyyaml
from modelscope import snapshot_download
import torch
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
### modified
from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, VllmCosyVoice2Model
### modified
from cosyvoice.utils.file_utils import logging
from cosyvoice.utils.class_utils import get_model_type
class CosyVoice:
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
self.instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
self.fp16 = fp16
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
hyper_yaml_path = '{}/cosyvoice.yaml'.format(model_dir)
if not os.path.exists(hyper_yaml_path):
raise ValueError('{} not found!'.format(hyper_yaml_path))
with open(hyper_yaml_path, 'r') as f:
configs = load_hyperpyyaml(f)
assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
configs['feat_extractor'],
'{}/campplus.onnx'.format(model_dir),
'{}/speech_tokenizer_v1.onnx'.format(model_dir),
'{}/spk2info.pt'.format(model_dir),
configs['allowed_special'])
self.sample_rate = configs['sample_rate']
if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
load_jit, load_trt, fp16 = False, False, False
logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
if load_jit:
self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
if load_trt:
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
self.fp16)
del configs
def list_available_spks(self):
spks = list(self.frontend.spk2info.keys())
return spks
def add_zero_shot_spk(self, prompt_text, prompt_speech_16k, zero_shot_spk_id):
assert zero_shot_spk_id != '', 'do not use empty zero_shot_spk_id'
model_input = self.frontend.frontend_zero_shot('', prompt_text, prompt_speech_16k, self.sample_rate, '')
del model_input['text']
del model_input['text_len']
self.frontend.spk2info[zero_shot_spk_id] = model_input
return True
def save_spkinfo(self):
torch.save(self.frontend.spk2info, '{}/spk2info.pt'.format(self.model_dir))
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True):
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_sft(i, spk_id)
start_time = time.time()
logging.info('synthesis sft text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
# measure synthesis time
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
if (not isinstance(i, Generator)) and len(i) < 0.5 * len(prompt_text):
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_cross_lingual(self, tts_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0, text_frontend=True):
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
if self.instruct is False:
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
instruct_text = self.frontend.text_normalize(instruct_text, split=False, text_frontend=text_frontend)
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
start_time = time.time()
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
class CosyVoice2(CosyVoice):
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, use_flow_cache=False):
self.instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
self.fp16 = fp16
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
hyper_yaml_path = '{}/cosyvoice2.yaml'.format(model_dir)
if not os.path.exists(hyper_yaml_path):
raise ValueError('{} not found!'.format(hyper_yaml_path))
with open(hyper_yaml_path, 'r') as f:
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
assert get_model_type(configs) == CosyVoice2Model, 'do not use {} for CosyVoice2 initialization!'.format(model_dir)
# converts the raw speech signal into a format that downstream modules (such as the LLM) can process
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
configs['feat_extractor'],
'{}/campplus.onnx'.format(model_dir),
'{}/speech_tokenizer_v2.onnx'.format(model_dir),
'{}/spk2info.pt'.format(model_dir),
configs['allowed_special'])
self.sample_rate = configs['sample_rate']
if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
load_jit, load_trt, fp16 = False, False, False
logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
# self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16, use_flow_cache)
### modified
self.model = VllmCosyVoice2Model(model_dir, configs['flow'], configs['hift'], fp16)
### modified
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir) if use_flow_cache is False else '{}/flow.cache.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
if load_jit:
self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
if load_trt:
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
self.fp16)
del configs
def inference_instruct(self, *args, **kwargs):
raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!'
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
/CosyVoice/cosyvoice/cli/model.py
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Generator
import torch
import numpy as np
import threading
import time
from torch.nn import functional as F
from contextlib import nullcontext
import uuid
from cosyvoice.utils.common import fade_in_out
from cosyvoice.utils.file_utils import convert_onnx_to_trt
class CosyVoiceModel:
def __init__(self,
llm: torch.nn.Module,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool = False):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.llm = llm
self.flow = flow
self.hift = hift
self.fp16 = fp16
if self.fp16 is True:
self.llm.half()
self.flow.half()
self.token_min_hop_len = 2 * self.flow.input_frame_rate
self.token_max_hop_len = 4 * self.flow.input_frame_rate
self.token_overlap_len = 20
# mel fade in out
self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
self.mel_window = np.hamming(2 * self.mel_overlap_len)
# hift cache
self.mel_cache_len = 20
self.source_cache_len = int(self.mel_cache_len * 256)
# speech fade in out
self.speech_window = np.hamming(2 * self.source_cache_len)
# rtf and decoding related
self.stream_scale_factor = 1
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
self.lock = threading.Lock()
# dict used to store session related variable
self.tts_speech_token_dict = {}
self.llm_end_dict = {}
self.mel_overlap_dict = {}
self.flow_cache_dict = {}
self.hift_cache_dict = {}
def load(self, llm_model, flow_model, hift_model):
self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
self.llm.to(self.device).eval()
self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
self.flow.to(self.device).eval()
# in case hift_model is a hifigan model
hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
self.hift.load_state_dict(hift_state_dict, strict=True)
self.hift.to(self.device).eval()
def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
self.llm.text_encoder = llm_text_encoder
llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
self.llm.llm = llm_llm
flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
self.flow.encoder = flow_encoder
def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
if not os.path.exists(flow_decoder_estimator_model):
convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
if os.path.getsize(flow_decoder_estimator_model) == 0:
raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
del self.flow.decoder.estimator
import tensorrt as trt
with open(flow_decoder_estimator_model, 'rb') as f:
self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
assert self.flow.decoder.estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
def get_trt_kwargs(self):
min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200)]
max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
input_names = ["x", "mask", "mu", "cond"]
return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
# use the LLM to generate target speech tokens from the text and the speech prompt, to be consumed by the downstream TTS modules
def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
with self.llm_context, torch.cuda.amp.autocast(self.fp16):
if isinstance(text, Generator):
assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
for i in self.llm.inference_bistream(text=text,
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device)):
self.tts_speech_token_dict[uuid].append(i)
else:
for i in self.llm.inference(text=text.to(self.device),
text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device)):
self.tts_speech_token_dict[uuid].append(i)
print(self.tts_speech_token_dict[uuid])
self.llm_end_dict[uuid] = True
def vc_job(self, source_speech_token, uuid):
self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist()
self.llm_end_dict[uuid] = True
def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
with torch.cuda.amp.autocast(self.fp16):
tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
flow_cache=self.flow_cache_dict[uuid])
# mel overlap fade in out
if self.mel_overlap_dict[uuid].shape[2] != 0:
tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
# append hift cache
if self.hift_cache_dict[uuid] is not None:
hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
else:
hift_cache_source = torch.zeros(1, 1, 0)
# keep overlap mel and hift cache
if finalize is False:
self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
'source': tts_source[:, :, -self.source_cache_len:],
'speech': tts_speech[:, -self.source_cache_len:]}
tts_speech = tts_speech[:, :-self.source_cache_len]
else:
if speed != 1.0:
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech
def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
prompt_text=torch.zeros(1, 0, dtype=torch.int32),
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
with self.lock:
self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
self.hift_cache_dict[this_uuid] = None
self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
if source_speech_token.shape[1] == 0:
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
else:
p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
p.start()
if stream is True:
token_hop_len = self.token_min_hop_len
while True:
time.sleep(0.1)
if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
.unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=False)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
# increase token_hop_len for better speech quality
token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
break
p.join()
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True)
yield {'tts_speech': this_tts_speech.cpu()}
else:
# deal with all tokens
p.join()
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True,
speed=speed)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict.pop(this_uuid)
self.llm_end_dict.pop(this_uuid)
self.mel_overlap_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
self.flow_cache_dict.pop(this_uuid)
torch.cuda.empty_cache()
class CosyVoice2Model(CosyVoiceModel):
def __init__(self,
llm: torch.nn.Module,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool = False,
use_flow_cache: bool = False):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.llm = llm
self.flow = flow
self.hift = hift
self.fp16 = fp16
self.use_flow_cache = use_flow_cache
if self.fp16 is True:
self.llm.half()
self.flow.half()
# stream related params, check examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
self.token_hop_len = 25
self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
# hift cache
self.mel_cache_len = 8
self.source_cache_len = int(self.mel_cache_len * 480)
# speech fade in out
self.speech_window = np.hamming(2 * self.source_cache_len)
# rtf and decoding related
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
self.lock = threading.Lock()
# dict used to store session related variable
self.tts_speech_token_dict = {}
self.llm_end_dict = {}
self.flow_cache_dict = {}
self.hift_cache_dict = {}
def init_flow_cache(self):
encoder_cache = {'offset': 0,
'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
'upsample_offset': 0,
'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
decoder_cache = {'offset': 0,
'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
if self.fp16 is True:
for cache in [encoder_cache, decoder_cache]:
for k, v in cache.items():
if isinstance(v, torch.Tensor):
cache[k] = v.half()
cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
return cache
def load_jit(self, flow_encoder_model):
flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
self.flow.encoder = flow_encoder
def get_trt_kwargs(self):
min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4), (1, 4, 2, 0, 512, 2), (12, 4, 2, 0, 512, 2), (1, 4, 2, 0, 512, 2)]
opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200), (1, 4, 2, 100, 512, 2), (12, 4, 2, 100, 512, 2), (1, 4, 2, 100, 512, 2)]
max_shape = [(2, 80, 1500), (2, 1, 1500), (2, 80, 1500), (2, 80, 1500), (1, 4, 2, 200, 512, 2), (12, 4, 2, 200, 512, 2), (1, 4, 2, 200, 512, 2)]
input_names = ["x", "mask", "mu", "cond", 'down_blocks_kv_cache', 'mid_blocks_kv_cache', 'up_blocks_kv_cache']
assert self.use_flow_cache is True, "get_trt_kwargs is set for flow cache mode. If you want to use trt with use_flow_cache=False, please set higher max_shape"
return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
with torch.cuda.amp.autocast(self.fp16):
tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
cache=self.flow_cache_dict[uuid],
finalize=finalize)
# append hift cache
if self.hift_cache_dict[uuid] is not None:
hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
else:
hift_cache_source = torch.zeros(1, 1, 0)
# keep overlap mel and hift cache
if finalize is False:
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
'source': tts_source[:, :, -self.source_cache_len:],
'speech': tts_speech[:, -self.source_cache_len:]}
tts_speech = tts_speech[:, :-self.source_cache_len]
else:
if speed != 1.0:
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech
def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
prompt_text=torch.zeros(1, 0, dtype=torch.int32),
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
sta_time = time.time()  # start time referenced by the timing print at the end of tts()
with self.lock:
self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
self.hift_cache_dict[this_uuid] = None
self.flow_cache_dict[this_uuid] = self.init_flow_cache()
if source_speech_token.shape[1] == 0:
print("*******************************2221******************************")
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
else:
p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
sta_time1 = time.time()
p.start()
if stream is True:
assert self.use_flow_cache is True, "set use_flow_cache=True if you want to use stream inference to avoid OOM"
# NOTE in cache mode, trim flow_prompt to same size as flow_decoder_required_cache_size
flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
while True:
time.sleep(0.1)
if len(self.tts_speech_token_dict[this_uuid]) >= self.token_hop_len + self.flow.pre_lookahead_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=False)
# NOTE in cache inference mode, we only use flow_prompt_speech_token/prompt_speech_feat in first chunk
flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
break
p.join()
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True)
yield {'tts_speech': this_tts_speech.cpu()}
else:
print("*******************************2224******************************")
# deal with all tokens
assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
sta_time2 = time.time()
p.join()
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
sta_time3 = time.time()
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True,
speed=speed)
yield {'tts_speech': this_tts_speech.cpu()}
sta_time4 = time.time()
with self.lock:
self.tts_speech_token_dict.pop(this_uuid)
self.llm_end_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
self.flow_cache_dict.pop(this_uuid)
torch.cuda.empty_cache()
sta_time5 = time.time()
print("threading time:",sta_time1 - sta_time,"this_tts_speech_token time:",sta_time3 - sta_time2,"token2wav:",sta_time4 - sta_time3,"other ",sta_time5 - sta_time4,"all time:",time.time()-sta_time)
### modified
class VllmCosyVoice2Model(CosyVoice2Model):
def __init__(self,
model_dir: str,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool):
try:
from cosyvoice.llm.llm_vllm import VllmQwen2LM
except Exception as e:
raise e
llm = VllmQwen2LM(model_dir)
super().__init__(llm, flow, hift, fp16)
def load(self, llm_model, flow_model, hift_model):
self.flow.load_state_dict(torch.load(flow_model, weights_only=True, map_location=self.device), strict=True)
self.flow.to(self.device).eval()
# in case hift_model is a hifigan model
hift_state_dict = {k.replace('generator.', ''): v for k, v in
torch.load(hift_model, weights_only=True, map_location=self.device).items()}
self.hift.load_state_dict(hift_state_dict, strict=True)
self.hift.to(self.device).eval()
### modified
/CosyVoice/runtime/python/fastapi/server.py (without the modification below you get: huggingface_hub.errors.HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: '')
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from fastapi import FastAPI, UploadFile, Form, File
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import numpy as np
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/../../..'.format(ROOT_DIR))
sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
app = FastAPI()
# set cross region allowance
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"])
def generate_data(model_output):
for i in model_output:
tts_audio = (i['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes()
yield tts_audio
@app.get("/inference_sft")
@app.post("/inference_sft")
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
model_output = cosyvoice.inference_sft(tts_text, spk_id)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_zero_shot")
@app.post("/inference_zero_shot")
async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()):
prompt_speech_16k = load_wav(prompt_wav.file, 16000)
model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_cross_lingual")
@app.post("/inference_cross_lingual")
async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()):
prompt_speech_16k = load_wav(prompt_wav.file, 16000)
model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_instruct")
@app.post("/inference_instruct")
async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()):
model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_instruct2")
@app.post("/inference_instruct2")
async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(), prompt_wav: UploadFile = File()):
prompt_speech_16k = load_wav(prompt_wav.file, 16000)
model_output = cosyvoice.inference_instruct2(tts_text, instruct_text, prompt_speech_16k)
return StreamingResponse(generate_data(model_output))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--port',
type=int,
default=50002)
parser.add_argument('--model_dir',
type=str,
default='iic/CosyVoice-300M',
help='local path or modelscope repo id')
args = parser.parse_args()
### modified
# try:
# cosyvoice = CosyVoice(args.model_dir)
# except Exception:
try:
cosyvoice = CosyVoice2(args.model_dir)
except Exception:
raise TypeError('no valid model_type!')
### modified
uvicorn.run(app, host="0.0.0.0", port=args.port)
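To exercise the server above, here is a minimal client sketch (my own sketch, not part of the original report): the endpoint name and the raw int16 PCM stream come from generate_data above; the prompt wav path, the texts and the 24000 Hz sample rate are assumptions, adjust them to your setup.
# Hedged client sketch for the FastAPI server above; writes the streamed int16 PCM to a wav file.
import requests
import numpy as np
import soundfile as sf

url = 'http://127.0.0.1:50002/inference_zero_shot'
payload = {'tts_text': '收到好友从远方寄来的生日礼物。', 'prompt_text': '希望你以后能够做的比我还好呦。'}
with open('./asset/zero_shot_prompt.wav', 'rb') as f:
    resp = requests.post(url, data=payload, files={'prompt_wav': ('prompt.wav', f, 'audio/wav')}, stream=True)
    pcm = b''.join(resp.iter_content(chunk_size=16000))
sf.write('client_output.wav', np.frombuffer(pcm, dtype=np.int16), 24000)  # 24000 Hz assumed for CosyVoice2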
Added /CosyVoice/cosyvoice/llm/llm_vllm.py
import time
import queue
import asyncio
import threading
from typing import List, Generator, AsyncGenerator
import torch
from cosyvoice.utils.file_utils import logging
from cosyvoice.llm.llm import Qwen2LM
# # enable the vllm V1 engine
# import os
# os.environ["VLLM_USE_V1"] = '1'
from vllm import ModelRegistry
from vllm import LLMEngine, AsyncLLMEngine, CompletionOutput
from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs
from vllm.sampling_params import SamplingParams
from cosyvoice.llm.vllm_use_cosyvoice2_model import CosyVoice2Model as CosyVoice2LLM
ModelRegistry.register_model("CosyVoice2Model", CosyVoice2LLM)
# EngineArgs
ENGINE_ARGS = {
"block_size": 16,
"swap_space": 0,
# "enforce_eager": True,
"gpu_memory_utilization": 0.4,
"max_num_batched_tokens": 1024,
"max_model_len": 1024,
"max_num_seqs": 256,
"disable_log_requests": True,
"disable_log_stats": True,
"dtype": "float16"
}
from vllm.sampling_params import RequestOutputKind
# SamplingParams
SAMPLING_PARAMS = {
"temperature": 1, # 不能低于0.8, 否则会生成非常多的空音频,或者无法正常生成语音Token
"top_p": 1, # 不能低于0.8, 否则会生成非常多的空音频,或者无法正常生成语音Token
"top_k": 25,
# "min_tokens": 80, # 不支持设置最小的tokens数量设置,开启后vllm直接崩溃,无法启动
# "presence_penalty": 1.0, # 不支持设置
# "frequency_penalty": 0.0, # 不支持设置
"max_tokens": 1024,
"detokenize": False, # 目前 vllm 0.7.3 v1版本中设置无效,待后续版本更新后减少计算
"ignore_eos": False,
"output_kind": RequestOutputKind.DELTA # 设置为DELTA,如调整该参数,请同时调整llm_inference的处理代码
}
def tensor_to_list(tensor: torch.tensor):
return tensor.view(-1).cpu().numpy().tolist()
class VllmQwen2LM(Qwen2LM):
def __init__(
self,
model_dir,
mix_ratio: List[int] = [5, 15],
):
self.fp16 = False
self.half = lambda: None
self.mix_ratio = mix_ratio
# ---------------------------------------------
# vllm engine argument configuration
engine_args = AsyncEngineArgs(
model=model_dir,
**ENGINE_ARGS,
)
self.llm_engine: AsyncLLMEngine = AsyncLLMEngine.from_engine_args(engine_args)
self.speech_token_size = 6564 # 6561 + 3
self.llm_token_size = 151936 # llm vocab_size
self.sos_eos_token_id = self.speech_token_size + self.llm_token_size + 1
self.task_token_id = self.sos_eos_token_id + 1
self.zero_token_id = self.task_token_id + 1
# vllm inference must run inside a single fixed event loop, so start a dedicated background thread to host that loop for inference tasks
self.loop = asyncio.new_event_loop()
self.loop_thread = threading.Thread(target=self._run_event_loop, daemon=True)
self.loop_thread.start()
def _run_event_loop(self):
asyncio.set_event_loop(self.loop)
self.loop.run_forever()
async def async_llm_inference(self, out_queue, prompt_token_ids, request_id, stop_token_ids, max_tokens):
sampling_params = SamplingParams(**SAMPLING_PARAMS)
sampling_params.stop_token_ids = stop_token_ids or [6561]
if max_tokens:
sampling_params.max_tokens = max_tokens
async for output in self.llm_engine.generate(
{
"prompt_token_ids": prompt_token_ids,
},
sampling_params=sampling_params,
request_id=request_id or f"{time.time()}",
):
out_queue.put((output.outputs[0], output.finished))
def llm_inference(self, prompt_token_ids: List[int], request_id: str=None, stop_token_ids=None, max_tokens=None):
out_queue = queue.Queue()
asyncio.run_coroutine_threadsafe(
self.async_llm_inference(out_queue, prompt_token_ids, request_id, stop_token_ids, max_tokens), self.loop
)
# collect the results pushed into out_queue
finished = False
while not finished:
(output, finished) = out_queue.get_nowait() if not out_queue.empty() else out_queue.get()
yield output
def inference(
self,
text: torch.Tensor,
text_len: torch.Tensor,
prompt_text: torch.Tensor,
prompt_text_len: torch.Tensor,
prompt_speech_token: torch.Tensor,
prompt_speech_token_len: torch.Tensor,
embedding: torch.Tensor,
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
) -> Generator[torch.Tensor|int, None, None]:
prompt_text = tensor_to_list(prompt_text + torch.tensor(6564))
prompt_speech_token = tensor_to_list(prompt_speech_token)
text = tensor_to_list(text + torch.tensor(6564))
prompt_token_ids = [self.sos_eos_token_id] + prompt_text + text + \
[self.task_token_id] + prompt_speech_token
max_tokens = len(text) * 20
for output in self.llm_inference(
prompt_token_ids,
stop_token_ids=[6561],
max_tokens=max_tokens,
):
if output.token_ids[-1] == 6561:
need_add_tokens = output.token_ids[:-1]
else:
need_add_tokens = output.token_ids
for token in need_add_tokens:
yield token
def inference_bistream(
self,
text: Generator,
prompt_text: torch.Tensor,
prompt_text_len: torch.Tensor,
prompt_speech_token: torch.Tensor,
prompt_speech_token_len: torch.Tensor,
embedding: torch.Tensor,
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
) -> Generator[torch.Tensor, None, None]:
prompt_text = tensor_to_list(prompt_text + torch.tensor(6564))
prompt_speech_token = tensor_to_list(prompt_speech_token)
last_tokens = []
prompt_token_ids = [self.sos_eos_token_id]
text_tokens_cache = prompt_text
for this_text in text:
this_text = tensor_to_list(this_text + torch.tensor(6564))
# text must already be token ids
assert isinstance(this_text, list), "text must be a List[int] of token ids."
text_tokens_cache += this_text
while len(prompt_speech_token) != 0:
if len(text_tokens_cache) >= self.mix_ratio[0]:
text_input_token = text_tokens_cache[:self.mix_ratio[0]]
speech_input_token = prompt_speech_token[:self.mix_ratio[1]]
prompt_token_ids += text_input_token + speech_input_token
# reset the last cache
text_tokens_cache = text_tokens_cache[self.mix_ratio[0]:]
prompt_speech_token = prompt_speech_token[self.mix_ratio[1]:]
else:
break
if len(prompt_speech_token) == 0:
if (len(last_tokens) > 0 and last_tokens[-1] == 6563) or len(prompt_token_ids) == 1:
if len(text_tokens_cache) >= self.mix_ratio[0]:
text_tokens_temp = text_tokens_cache[:self.mix_ratio[0]]
prompt_token_ids += text_tokens_temp
text_tokens_cache = text_tokens_cache[self.mix_ratio[0]:]
else:
continue
for output in self.llm_inference(prompt_token_ids, stop_token_ids=[6563]):
last_tokens = output.token_ids
if last_tokens[-1] == 6563:
need_add_tokens = last_tokens[:-1]
else:
need_add_tokens = last_tokens
for token in need_add_tokens:
yield token
prompt_token_ids.extend(need_add_tokens)
prompt_token_ids += text_tokens_cache + [self.task_token_id]
for output in self.llm_inference(prompt_token_ids, stop_token_ids=[6561]):
if output.token_ids[-1] == 6561:
need_add_tokens = output.token_ids[:-1]
else:
need_add_tokens = output.token_ids
for token in need_add_tokens:
yield token
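Since the symptom is wrong speech tokens, a quick sanity check is to verify the special-token-id layout that VllmQwen2LM assumes; the constants below are copied from the class above, the check itself is my own sketch:
# Sanity check of the token-id layout assumed by VllmQwen2LM above (constants copied from that class).
speech_token_size = 6564   # 6561 speech tokens + 3 special ids
llm_token_size = 151936    # hard-coded Qwen2 vocab size; must equal config.vocab_size on the vLLM model side
sos_eos_token_id = speech_token_size + llm_token_size + 1
task_token_id = sos_eos_token_id + 1
zero_token_id = task_token_id + 1
print(sos_eos_token_id, task_token_id, zero_token_id)  # 158501 158502 158503
# Text ids are shifted by +6564 before being sent to the engine, so they must land in
# [6564, 6564 + llm_token_size); speech ids stay in [0, 6564). If the vLLM-side model class
# derives these offsets from a different vocab_size, the embedding lookup in
# get_input_embeddings no longer matches the prompt ids and wrong speech tokens are to be expected.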
Added /CosyVoice/cosyvoice/llm/vllm_use_cosyvoice2_model.py
"""Inference-only Qwen2 model compatible with HuggingFace weights."""
from typing import Iterable, List, Optional, Set, Tuple, Union, Iterator, overload, TypedDict, Mapping, Any
from typing_extensions import TypeVar
import torch
from torch import nn
from vllm.attention import AttentionMetadata
# from vllm.config import VllmConfig
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.model_executor.layers.quantization import QuantizationConfig
from transformers import Qwen2Config
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
# from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.sampler import SamplerOutput, Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
# from vllm.model_executor.models.interfaces import T
from vllm.model_executor.models.qwen2 import Qwen2Model
# from vllm.model_executor.models.utils import AutoWeightsLoader, maybe_prefix, merge_multimodal_embeddings
from vllm.model_executor.models.utils import AutoWeightsLoader
logger = init_logger(__name__)
IGNORE_ID = -1
class CosyVoice2Model(nn.Module):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
"k_proj",
"v_proj",
],
"gate_up_proj": [
"gate_proj",
"up_proj",
],
}
# def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
def __init__(self, config: Qwen2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, prefix: str = ""):
super().__init__()
# config = vllm_config.model_config.hf_config
# quant_config = vllm_config.quant_config
# lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config
self.quant_config = quant_config
self.llm_input_size = 896
self.llm_output_size = 896
self.speech_token_size = 6561+3
self.llm_token_size = config.vocab_size
# 2. build speech token language model related modules
self.sos_eos = 0
self.task_id = 1
self.fill_token = 2
self.allow_patterns_overrides = ["llm.*"]
self.llm_embedding = torch.nn.Embedding(2, self.llm_input_size)
# self.model = Qwen2Model(vllm_config=vllm_config,
# prefix=maybe_prefix(prefix, "model"))
print("-----------------------------")
print("config",config," cache_config",cache_config," quant_config",quant_config)
self.model = Qwen2Model(config, cache_config, quant_config)
# self.llm_decoder = nn.Linear(self.llm_output_size, self.speech_token_size)
self.llm_decoder = ParallelLMHead(self.speech_token_size,
self.llm_output_size,
bias=True,
quant_config=quant_config,
# prefix=maybe_prefix(
# prefix, "llm_decoder")
)
print("-----------------------------")
print("llm_decoder weight norm:", self.llm_decoder.weight.norm())
self.logits_processor = LogitsProcessor(self.speech_token_size)
# length_normalized_loss: bool = True,
# lsm_weight: float = 0.0,
# self.criterion_ce = LabelSmoothingLoss(
# size=self.speech_token_size,
# padding_idx=IGNORE_ID,
# smoothing=lsm_weight,
# normalize_length=length_normalized_loss,
# )
# 3. [Optional] build speech token related modules
self.speech_embedding = torch.nn.Embedding(self.speech_token_size, self.llm_input_size)
# 4. sampling method
## use vllm sampling method
self.sampler = Sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
self.mix_ratio: List[int] = [5, 15]
# define the special-token constants
self.llm_token_id_delta = torch.tensor(self.speech_token_size, dtype=torch.int32)
self.sos_eos_token_id = torch.tensor((self.llm_token_id_delta + self.llm_token_size + 1), dtype=torch.int32)  # llm_token_id_delta + vocab_size + 1; the hard-coded 151936 in llm_vllm.py must match config.vocab_size here
self.task_token_id = self.sos_eos_token_id + torch.tensor(1, dtype=torch.int32) # 170405
self.zero_token_id = self.task_token_id + torch.tensor(1, dtype=torch.int32)
self.zero_embed_buffer = torch.zeros(
# (vllm_config.scheduler_config.max_num_seqs, self.llm_input_size),
(128, self.llm_input_size),
dtype=self.llm_embedding.weight.dtype,
device=self.llm_embedding.weight.device
)
self.inputs_embed_buffer = torch.zeros(
# (vllm_config.scheduler_config.max_num_batched_tokens, self.llm_input_size),
(2048, self.llm_input_size),
dtype=self.llm_embedding.weight.dtype,
device=self.llm_embedding.weight.device,
)
def get_sos_eos_emb(self):
return self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
def get_task_id_emb(self):
return self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
def get_input_embeddings(
self,
input_ids: torch.Tensor,
# multimodal_embeddings: Optional[T] = None,
attn_metadata: Optional["AttentionMetadata"] = None,
) -> torch.Tensor:
"""
Returns the input embeddings merged from the text embeddings from
input_ids and the multimodal embeddings generated from multimodal
kwargs.
"""
# build a mask marking which token_ids are speech tokens
mask = input_ids < self.speech_token_size
# keep the original shape of input_ids
input_shape = input_ids.shape
# flatten input_ids and the mask so they can be processed uniformly
flat_input_ids = input_ids.view(-1)
flat_mask = mask.view(-1)
inputs_embeds = self.inputs_embed_buffer[:flat_input_ids.shape[0]]
inputs_embeds.zero_()
# Process speech tokens
if flat_mask.any():
speech_token_ids = flat_input_ids[flat_mask]
inputs_embeds[flat_mask] = self.speech_embedding(speech_token_ids)
# handle token_ids outside the speech-token range (LLM and special tokens)
if (~flat_mask).any():
llm_token_ids = flat_input_ids[~flat_mask]
llm_embeds = torch.zeros_like(inputs_embeds[~flat_mask])
sos_eos_mask = llm_token_ids == self.sos_eos_token_id
task_mask = llm_token_ids == self.task_token_id
zero_mask = llm_token_ids == self.zero_token_id
normal_mask = ~(sos_eos_mask | task_mask | zero_mask)
# handle the remaining special / LLM tokens by priority
# first priority: SOS/EOS token
if sos_eos_mask.any():
llm_embeds[sos_eos_mask] = self.llm_embedding.weight[self.sos_eos].unsqueeze(0)
# second priority: task token
if task_mask.any():
llm_embeds[task_mask] = self.llm_embedding.weight[self.task_id].unsqueeze(0)
# next priority: zero (empty-audio) token
if zero_mask.any():
llm_embeds[zero_mask] = self.zero_embed_buffer[:len(llm_embeds[zero_mask])]
# regular LLM tokens
if normal_mask.any():
original_ids = llm_token_ids[normal_mask] - self.llm_token_id_delta
# print('original_ids: ',original_ids)
llm_embeds[normal_mask] = self.model.get_input_embeddings(original_ids)
inputs_embeds[~flat_mask] = llm_embeds
inputs_embeds = inputs_embeds.view(*input_shape, self.llm_input_size)
# # merge multimodal embeddings (if any)
# if multimodal_embeddings is not None:
# inputs_embeds = merge_multimodal_embeddings(
# input_ids, inputs_embeds, multimodal_embeddings,
# self.config.audio_token_index
# )
return inputs_embeds
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings(
input_ids,
attn_metadata=attn_metadata,
)
return self.model(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
inputs_embeds)
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.llm_decoder, hidden_states,
sampling_metadata)
return logits
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
@staticmethod
def convert_weights(weights: Iterable[Tuple[str, torch.Tensor]]) -> Iterable[Tuple[str, torch.Tensor]]:
for name, param in weights:
# handle the core Qwen2Model parameters
if name.startswith("llm."):
if name.startswith("llm.model.model."):
name = name.replace("llm.model.model.", "model.")
else:
continue
# print('weights name: ', name)
yield name, param
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
weights = self.convert_weights(weights)
loader = AutoWeightsLoader(self)
loader.load_weights(weights)
I ran into exactly the same problem! With vLLM acceleration, the speech tokens CosyVoice2 outputs are wrong.
Take a look at the https://github.com/qi-hua/async_cosyvoice project, vLLM works there.
I've already tried that one; it no longer runs with the updated model.
The latest 0.5B model does work with it. But that project keeps its own copy of the inference code instead of using the latest base repo's inference code; it runs, anyway. Waiting for the official merge.
May I ask whether the inference results you get are normal? I adapted that project to the latest inference code, and the 0.5B output is abnormal, with very loud noise in the audio.
I also traced it to incorrect token generation.
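A hedged way to confirm that (my own sketch, not from the thread): dump the speech-token list each backend produces for the same input, for example the list that llm_job appends to self.tts_speech_token_dict, and compare them:
# Hypothetical helper for comparing speech tokens from the original LLM and from the vLLM path.
def compare_speech_tokens(ref_tokens, vllm_tokens, speech_vocab=6561):
    # ids outside [0, speech_vocab) are special tokens and should never reach the flow model
    out_of_range = [t for t in vllm_tokens if not (0 <= t < speech_vocab)]
    prefix = 0
    for a, b in zip(ref_tokens, vllm_tokens):
        if a != b:
            break
        prefix += 1
    print('ref len {}, vllm len {}, matching prefix {}, out-of-range {}'.format(
        len(ref_tokens), len(vllm_tokens), prefix, len(out_of_range)))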
The official vLLM code is not finished yet; once it is done, the README will be updated with usage instructions.
Thanks for the reply. May I also ask roughly when this is expected to be finished?
We are also waiting for a feature release in the upstream vLLM library, probably next month.
It is normal now. Result of vLLM inference: https://jim-aibuy-site.oss-cn-guangzhou.aliyuncs.com/baba.m4a
Hello, could I ask why I keep getting this error:
Traceback (most recent call last):
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 206, in <module>
main(args)
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 190, in main
cosyvoice = AsyncCosyVoice2(args.model_dir, load_jit=args.load_jit, load_trt=args.load_trt, fp16=args.fp16)
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/async_cosyvoice.py", line 60, in __init__
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/model.py", line 142, in load_trt
convert_onnx_to_trt(flow_decoder_estimator_model, flow_decoder_onnx_model, fp16)
TypeError: convert_onnx_to_trt() missing 1 required positional argument: 'fp16'
With jit and trt enabled I also get errors, so I turned both off. The project author did not provide the mygpu.plan file.
Hi, could you share vLLM code that runs inference correctly?
I just ran the https://github.com/qi-hua/async_cosyvoice project, with jit and trt acceleration turned off.
@foxmale007 Hi, are you using the latest CosyVoice2 model?
That project's vLLM inference code differs from the latest official code; the model can be the latest one, just overwrite the files as its documentation describes.
@foxmale007 @sjq19960802 @aluminumbox 您们好,请问 vllm 推理代码可以跑通了吗? 我用了 https://github.com/qi-hua/async_cosyvoice 也跑不通。。报错: Traceback (most recent call last): File "/root/miniconda3/envs/cosyvoice2/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/root/miniconda3/envs/cosyvoice2/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/main.py", line 71, in cli.main() File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 501, in main run() File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 351, in run_file runpy.run_path(target, run_name="main") File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 310, in run_path return _run_module_code(code, init_globals, run_name, pkg_name=pkg_name, script_name=fname) File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 127, in _run_module_code _run_code(code, mod_globals, init_globals, mod_name, mod_spec, pkg_name, script_name) File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 118, in _run_code exec(code, run_globals) File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 206, in main(args) File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 190, in main cosyvoice = AsyncCosyVoice2(args.model_dir, load_jit=args.load_jit, load_trt=args.load_trt, fp16=args.fp16) File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/async_cosyvoice.py", line 53, in init self.model.load( File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/model.py", line 128, in load self.flow.load_state_dict(torch.load(flow_model, weights_only=True, map_location=self.device), strict=True) File "/root/miniconda3/envs/cosyvoice2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2584, in load_state_dict raise RuntimeError( RuntimeError: Error(s) in loading state_dict for CausalMaskedDiffWithXvec: Missing key(s) in state_dict: "decoder.estimator.down_blocks.0.0.block1.block.1.weight", "decoder.estimator.down_blocks.0.0.block1.block.1.bias", "decoder.estimator.down_blocks.0.0.block2.block.1.weight", "decoder.estimator.down_blocks.0.0.block2.block.1.bias", "decoder.estimator.mid_blocks.0.0.block1.block.1.weight", "decoder.estimator.mid_blocks.0.0.block1.block.1.bias", "decoder.estimator.mid_blocks.0.0.block2.block.1.weight", "decoder.estimator.mid_blocks.0.0.block2.block.1.bias", "decoder.estimator.mid_blocks.1.0.block1.block.1.weight", "decoder.estimator.mid_blocks.1.0.block1.block.1.bias", "decoder.estimator.mid_blocks.1.0.block2.block.1.weight", "decoder.estimator.mid_blocks.1.0.block2.block.1.bias", "decoder.estimator.mid_blocks.2.0.block1.block.1.weight", "decoder.estimator.mid_blocks.2.0.block1.block.1.bias", "decoder.estimator.mid_blocks.2.0.block2.block.1.weight", "decoder.estimator.mid_blocks.2.0.block2.block.1.bias", 
"decoder.estimator.mid_blocks.3.0.block1.block.1.weight", "decoder.estimator.mid_blocks.3.0.block1.block.1.bias", "decoder.estimator.mid_blocks.3.0.block2.block.1.weight", "decoder.estimator.mid_blocks.3.0.block2.block.1.bias", "decoder.estimator.mid_blocks.4.0.block1.block.1.weight", "decoder.estimator.mid_blocks.4.0.block1.block.1.bias", "decoder.estimator.mid_blocks.4.0.block2.block.1.weight", "decoder.estimator.mid_blocks.4.0.block2.block.1.bias", "decoder.estimator.mid_blocks.5.0.block1.block.1.weight", "decoder.estimator.mid_blocks.5.0.block1.block.1.bias", "decoder.estimator.mid_blocks.5.0.block2.block.1.weight", "decoder.estimator.mid_blocks.5.0.block2.block.1.bias", "decoder.estimator.mid_blocks.6.0.block1.block.1.weight", "decoder.estimator.mid_blocks.6.0.block1.block.1.bias", "decoder.estimator.mid_blocks.6.0.block2.block.1.weight", "decoder.estimator.mid_blocks.6.0.block2.block.1.bias", "decoder.estimator.mid_blocks.7.0.block1.block.1.weight", "decoder.estimator.mid_blocks.7.0.block1.block.1.bias", "decoder.estimator.mid_blocks.7.0.block2.block.1.weight", "decoder.estimator.mid_blocks.7.0.block2.block.1.bias", "decoder.estimator.mid_blocks.8.0.block1.block.1.weight", "decoder.estimator.mid_blocks.8.0.block1.block.1.bias", "decoder.estimator.mid_blocks.8.0.block2.block.1.weight", "decoder.estimator.mid_blocks.8.0.block2.block.1.bias", "decoder.estimator.mid_blocks.9.0.block1.block.1.weight", "decoder.estimator.mid_blocks.9.0.block1.block.1.bias", "decoder.estimator.mid_blocks.9.0.block2.block.1.weight", "decoder.estimator.mid_blocks.9.0.block2.block.1.bias", "decoder.estimator.mid_blocks.10.0.block1.block.1.weight", "decoder.estimator.mid_blocks.10.0.block1.block.1.bias", "decoder.estimator.mid_blocks.10.0.block2.block.1.weight", "decoder.estimator.mid_blocks.10.0.block2.block.1.bias", "decoder.estimator.mid_blocks.11.0.block1.block.1.weight", "decoder.estimator.mid_blocks.11.0.block1.block.1.bias", "decoder.estimator.mid_blocks.11.0.block2.block.1.weight", "decoder.estimator.mid_blocks.11.0.block2.block.1.bias", "decoder.estimator.up_blocks.0.0.block1.block.1.weight", "decoder.estimator.up_blocks.0.0.block1.block.1.bias", "decoder.estimator.up_blocks.0.0.block2.block.1.weight", "decoder.estimator.up_blocks.0.0.block2.block.1.bias", "decoder.estimator.final_block.block.1.weight", "decoder.estimator.final_block.block.1.bias". 
Unexpected key(s) in state_dict: "decoder.estimator.down_blocks.0.0.block1.block.2.weight", "decoder.estimator.down_blocks.0.0.block1.block.2.bias", "decoder.estimator.down_blocks.0.0.block2.block.2.weight", "decoder.estimator.down_blocks.0.0.block2.block.2.bias", "decoder.estimator.mid_blocks.0.0.block1.block.2.weight", "decoder.estimator.mid_blocks.0.0.block1.block.2.bias", "decoder.estimator.mid_blocks.0.0.block2.block.2.weight", "decoder.estimator.mid_blocks.0.0.block2.block.2.bias", "decoder.estimator.mid_blocks.1.0.block1.block.2.weight", "decoder.estimator.mid_blocks.1.0.block1.block.2.bias", "decoder.estimator.mid_blocks.1.0.block2.block.2.weight", "decoder.estimator.mid_blocks.1.0.block2.block.2.bias", "decoder.estimator.mid_blocks.2.0.block1.block.2.weight", "decoder.estimator.mid_blocks.2.0.block1.block.2.bias", "decoder.estimator.mid_blocks.2.0.block2.block.2.weight", "decoder.estimator.mid_blocks.2.0.block2.block.2.bias", "decoder.estimator.mid_blocks.3.0.block1.block.2.weight", "decoder.estimator.mid_blocks.3.0.block1.block.2.bias", "decoder.estimator.mid_blocks.3.0.block2.block.2.weight", "decoder.estimator.mid_blocks.3.0.block2.block.2.bias", "decoder.estimator.mid_blocks.4.0.block1.block.2.weight", "decoder.estimator.mid_blocks.4.0.block1.block.2.bias", "decoder.estimator.mid_blocks.4.0.block2.block.2.weight", "decoder.estimator.mid_blocks.4.0.block2.block.2.bias", "decoder.estimator.mid_blocks.5.0.block1.block.2.weight", "decoder.estimator.mid_blocks.5.0.block1.block.2.bias", "decoder.estimator.mid_blocks.5.0.block2.block.2.weight", "decoder.estimator.mid_blocks.5.0.block2.block.2.bias", "decoder.estimator.mid_blocks.6.0.block1.block.2.weight", "decoder.estimator.mid_blocks.6.0.block1.block.2.bias", "decoder.estimator.mid_blocks.6.0.block2.block.2.weight", "decoder.estimator.mid_blocks.6.0.block2.block.2.bias", "decoder.estimator.mid_blocks.7.0.block1.block.2.weight", "decoder.estimator.mid_blocks.7.0.block1.block.2.bias", "decoder.estimator.mid_blocks.7.0.block2.block.2.weight", "decoder.estimator.mid_blocks.7.0.block2.block.2.bias", "decoder.estimator.mid_blocks.8.0.block1.block.2.weight", "decoder.estimator.mid_blocks.8.0.block1.block.2.bias", "decoder.estimator.mid_blocks.8.0.block2.block.2.weight", "decoder.estimator.mid_blocks.8.0.block2.block.2.bias", "decoder.estimator.mid_blocks.9.0.block1.block.2.weight", "decoder.estimator.mid_blocks.9.0.block1.block.2.bias", "decoder.estimator.mid_blocks.9.0.block2.block.2.weight", "decoder.estimator.mid_blocks.9.0.block2.block.2.bias", "decoder.estimator.mid_blocks.10.0.block1.block.2.weight", "decoder.estimator.mid_blocks.10.0.block1.block.2.bias", "decoder.estimator.mid_blocks.10.0.block2.block.2.weight", "decoder.estimator.mid_blocks.10.0.block2.block.2.bias", "decoder.estimator.mid_blocks.11.0.block1.block.2.weight", "decoder.estimator.mid_blocks.11.0.block1.block.2.bias", "decoder.estimator.mid_blocks.11.0.block2.block.2.weight", "decoder.estimator.mid_blocks.11.0.block2.block.2.bias", "decoder.estimator.up_blocks.0.0.block1.block.2.weight", "decoder.estimator.up_blocks.0.0.block1.block.2.bias", "decoder.estimator.up_blocks.0.0.block2.block.2.weight", "decoder.estimator.up_blocks.0.0.block2.block.2.bias", "decoder.estimator.final_block.block.2.weight", "decoder.estimator.final_block.block.2.bias". [rank0]:[W619 08:17:38.670858366 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. 
On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
Did you solve this problem? I'm getting the same error.