Using vLLM to accelerate the LLM, but the generated output is meaningless noise
Steps to reproduce:
git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git && cd CosyVoice
&& git submodule update --init --recursive
Model download:
from modelscope import snapshot_download
snapshot_download('iic/CosyVoice2-0.5B', local_dir='pretrained_models/CosyVoice2-0.5B')
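A minimal invocation sketch for reproducing the noise (based on the repository's standard zero-shot example; the prompt wav and the texts are placeholders, not necessarily the exact inputs used):
# Reproduction sketch; texts and prompt wav follow the repo's zero-shot example and are placeholders.
import sys
sys.path.append('third_party/Matcha-TTS')
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=False, load_trt=False, fp16=False)
prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)
for i, out in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
    torchaudio.save('vllm_zero_shot_{}.wav'.format(i), out['tts_speech'], cosyvoice.sample_rate)
With the vLLM-backed model described below, the saved wav contains only the noise mentioned in the title.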
/CosyVoice/cosyvoice/cli/cosyvoice.py
import os
import time
from typing import Generator
from tqdm import tqdm
from hyperpyyaml import load_hyperpyyaml
from modelscope import snapshot_download
import torch
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
### modified
from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, VllmCosyVoice2Model
### modified
from cosyvoice.utils.file_utils import logging
from cosyvoice.utils.class_utils import get_model_type
class CosyVoice:
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
self.instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
self.fp16 = fp16
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
hyper_yaml_path = '{}/cosyvoice.yaml'.format(model_dir)
if not os.path.exists(hyper_yaml_path):
raise ValueError('{} not found!'.format(hyper_yaml_path))
with open(hyper_yaml_path, 'r') as f:
configs = load_hyperpyyaml(f)
assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
configs['feat_extractor'],
'{}/campplus.onnx'.format(model_dir),
'{}/speech_tokenizer_v1.onnx'.format(model_dir),
'{}/spk2info.pt'.format(model_dir),
configs['allowed_special'])
self.sample_rate = configs['sample_rate']
if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
load_jit, load_trt, fp16 = False, False, False
logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16)
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
if load_jit:
self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
if load_trt:
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
self.fp16)
del configs
def list_available_spks(self):
spks = list(self.frontend.spk2info.keys())
return spks
def add_zero_shot_spk(self, prompt_text, prompt_speech_16k, zero_shot_spk_id):
assert zero_shot_spk_id != '', 'do not use empty zero_shot_spk_id'
model_input = self.frontend.frontend_zero_shot('', prompt_text, prompt_speech_16k, self.sample_rate, '')
del model_input['text']
del model_input['text_len']
self.frontend.spk2info[zero_shot_spk_id] = model_input
return True
def save_spkinfo(self):
torch.save(self.frontend.spk2info, '{}/spk2info.pt'.format(self.model_dir))
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True):
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_sft(i, spk_id)
start_time = time.time()
logging.info('synthesis sft text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
# measure synthesis time
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
if (not isinstance(i, Generator)) and len(i) < 0.5 * len(prompt_text):
logging.warning('synthesis text {} too short than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_cross_lingual(self, tts_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0, text_frontend=True):
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
if self.instruct is False:
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
instruct_text = self.frontend.text_normalize(instruct_text, split=False, text_frontend=text_frontend)
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
start_time = time.time()
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
class CosyVoice2(CosyVoice):
def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, use_flow_cache=False):
self.instruct = True if '-Instruct' in model_dir else False
self.model_dir = model_dir
self.fp16 = fp16
if not os.path.exists(model_dir):
model_dir = snapshot_download(model_dir)
hyper_yaml_path = '{}/cosyvoice2.yaml'.format(model_dir)
if not os.path.exists(hyper_yaml_path):
raise ValueError('{} not found!'.format(hyper_yaml_path))
with open(hyper_yaml_path, 'r') as f:
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
assert get_model_type(configs) == CosyVoice2Model, 'do not use {} for CosyVoice2 initialization!'.format(model_dir)
# converts the raw speech signal into a format that downstream modules (such as the LLM) can process
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
configs['feat_extractor'],
'{}/campplus.onnx'.format(model_dir),
'{}/speech_tokenizer_v2.onnx'.format(model_dir),
'{}/spk2info.pt'.format(model_dir),
configs['allowed_special'])
self.sample_rate = configs['sample_rate']
if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
load_jit, load_trt, fp16 = False, False, False
logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
# self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16, use_flow_cache)
### modified
self.model = VllmCosyVoice2Model(model_dir, configs['flow'], configs['hift'], fp16)
### modified
self.model.load('{}/llm.pt'.format(model_dir),
'{}/flow.pt'.format(model_dir) if use_flow_cache is False else '{}/flow.cache.pt'.format(model_dir),
'{}/hift.pt'.format(model_dir))
if load_jit:
self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
if load_trt:
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
'{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
self.fp16)
del configs
def inference_instruct(self, *args, **kwargs):
raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, zero_shot_spk_id='', stream=False, speed=1.0, text_frontend=True):
assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!'
for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate, zero_shot_spk_id)
start_time = time.time()
logging.info('synthesis text {}'.format(i))
for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
yield model_output
start_time = time.time()
/CosyVoice/cosyvoice/cli/model.py
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Generator
import torch
import numpy as np
import threading
import time
from torch.nn import functional as F
from contextlib import nullcontext
import uuid
from cosyvoice.utils.common import fade_in_out
from cosyvoice.utils.file_utils import convert_onnx_to_trt
class CosyVoiceModel:
def __init__(self,
llm: torch.nn.Module,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool = False):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.llm = llm
self.flow = flow
self.hift = hift
self.fp16 = fp16
if self.fp16 is True:
self.llm.half()
self.flow.half()
self.token_min_hop_len = 2 * self.flow.input_frame_rate
self.token_max_hop_len = 4 * self.flow.input_frame_rate
self.token_overlap_len = 20
# mel fade in out
self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
self.mel_window = np.hamming(2 * self.mel_overlap_len)
# hift cache
self.mel_cache_len = 20
self.source_cache_len = int(self.mel_cache_len * 256)
# speech fade in out
self.speech_window = np.hamming(2 * self.source_cache_len)
# rtf and decoding related
self.stream_scale_factor = 1
assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
self.lock = threading.Lock()
# dict used to store session related variable
self.tts_speech_token_dict = {}
self.llm_end_dict = {}
self.mel_overlap_dict = {}
self.flow_cache_dict = {}
self.hift_cache_dict = {}
def load(self, llm_model, flow_model, hift_model):
self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
self.llm.to(self.device).eval()
self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
self.flow.to(self.device).eval()
# in case hift_model is a hifigan model
hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
self.hift.load_state_dict(hift_state_dict, strict=True)
self.hift.to(self.device).eval()
def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
self.llm.text_encoder = llm_text_encoder
llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
self.llm.llm = llm_llm
flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
self.flow.encoder = flow_encoder
def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
if not os.path.exists(flow_decoder_estimator_model):
convert_onnx_to_trt(flow_decoder_estimator_model, self.get_trt_kwargs(), flow_decoder_onnx_model, fp16)
if os.path.getsize(flow_decoder_estimator_model) == 0:
raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
del self.flow.decoder.estimator
import tensorrt as trt
with open(flow_decoder_estimator_model, 'rb') as f:
self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
assert self.flow.decoder.estimator_engine is not None, 'failed to load trt {}'.format(flow_decoder_estimator_model)
self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
def get_trt_kwargs(self):
min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4)]
opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200)]
max_shape = [(2, 80, 3000), (2, 1, 3000), (2, 80, 3000), (2, 80, 3000)]
input_names = ["x", "mask", "mu", "cond"]
return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
# use the LLM to generate target speech tokens from the text and the speech prompt, to be consumed by the downstream TTS modules
def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
with self.llm_context, torch.cuda.amp.autocast(self.fp16):
if isinstance(text, Generator):
assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
for i in self.llm.inference_bistream(text=text,
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device)):
self.tts_speech_token_dict[uuid].append(i)
else:
for i in self.llm.inference(text=text.to(self.device),
text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
prompt_text=prompt_text.to(self.device),
prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
prompt_speech_token=llm_prompt_speech_token.to(self.device),
prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
embedding=llm_embedding.to(self.device)):
self.tts_speech_token_dict[uuid].append(i)
print(self.tts_speech_token_dict[uuid])
self.llm_end_dict[uuid] = True
def vc_job(self, source_speech_token, uuid):
self.tts_speech_token_dict[uuid] = source_speech_token.flatten().tolist()
self.llm_end_dict[uuid] = True
def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
with torch.cuda.amp.autocast(self.fp16):
tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
flow_cache=self.flow_cache_dict[uuid])
# mel overlap fade in out
if self.mel_overlap_dict[uuid].shape[2] != 0:
tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
# append hift cache
if self.hift_cache_dict[uuid] is not None:
hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
else:
hift_cache_source = torch.zeros(1, 1, 0)
# keep overlap mel and hift cache
if finalize is False:
self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
'source': tts_source[:, :, -self.source_cache_len:],
'speech': tts_speech[:, -self.source_cache_len:]}
tts_speech = tts_speech[:, :-self.source_cache_len]
else:
if speed != 1.0:
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech
def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
prompt_text=torch.zeros(1, 0, dtype=torch.int32),
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
with self.lock:
self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
self.hift_cache_dict[this_uuid] = None
self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
if source_speech_token.shape[1] == 0:
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
else:
p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
p.start()
if stream is True:
token_hop_len = self.token_min_hop_len
while True:
time.sleep(0.1)
if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
.unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=False)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
# increase token_hop_len for better speech quality
token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
break
p.join()
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True)
yield {'tts_speech': this_tts_speech.cpu()}
else:
# deal with all tokens
p.join()
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True,
speed=speed)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict.pop(this_uuid)
self.llm_end_dict.pop(this_uuid)
self.mel_overlap_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
self.flow_cache_dict.pop(this_uuid)
torch.cuda.empty_cache()
class CosyVoice2Model(CosyVoiceModel):
def __init__(self,
llm: torch.nn.Module,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool = False,
use_flow_cache: bool = False):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.llm = llm
self.flow = flow
self.hift = hift
self.fp16 = fp16
self.use_flow_cache = use_flow_cache
if self.fp16 is True:
self.llm.half()
self.flow.half()
# stream related params, check examples/libritts/cosyvoice2/conf/cosyvoice2.yaml
self.token_hop_len = 25
self.flow_decoder_required_cache_size = 0 if use_flow_cache is False else 1 * self.token_hop_len * self.flow.token_mel_ratio
# hift cache
self.mel_cache_len = 8
self.source_cache_len = int(self.mel_cache_len * 480)
# speech fade in out
self.speech_window = np.hamming(2 * self.source_cache_len)
# rtf and decoding related
self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
self.lock = threading.Lock()
# dict used to store session related variable
self.tts_speech_token_dict = {}
self.llm_end_dict = {}
self.flow_cache_dict = {}
self.hift_cache_dict = {}
def init_flow_cache(self):
encoder_cache = {'offset': 0,
'pre_lookahead_layer_conv2_cache': torch.zeros(1, 512, 2).to(self.device),
'encoders_kv_cache': torch.zeros(6, 1, 8, 0, 64 * 2).to(self.device),
'upsample_offset': 0,
'upsample_conv_cache': torch.zeros(1, 512, 4).to(self.device),
'upsample_kv_cache': torch.zeros(4, 1, 8, 0, 64 * 2).to(self.device)}
decoder_cache = {'offset': 0,
'down_blocks_conv_cache': torch.zeros(10, 1, 2, 832, 2).to(self.device),
'down_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
'mid_blocks_conv_cache': torch.zeros(10, 12, 2, 512, 2).to(self.device),
'mid_blocks_kv_cache': torch.zeros(10, 12, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
'up_blocks_conv_cache': torch.zeros(10, 1, 2, 1024, 2).to(self.device),
'up_blocks_kv_cache': torch.zeros(10, 1, 4, 2, self.flow_decoder_required_cache_size, 512, 2).to(self.device),
'final_blocks_conv_cache': torch.zeros(10, 2, 256, 2).to(self.device)}
if self.fp16 is True:
for cache in [encoder_cache, decoder_cache]:
for k, v in cache.items():
if isinstance(v, torch.Tensor):
cache[k] = v.half()
cache = {'encoder_cache': encoder_cache, 'decoder_cache': decoder_cache}
return cache
def load_jit(self, flow_encoder_model):
flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
self.flow.encoder = flow_encoder
def get_trt_kwargs(self):
min_shape = [(2, 80, 4), (2, 1, 4), (2, 80, 4), (2, 80, 4), (1, 4, 2, 0, 512, 2), (12, 4, 2, 0, 512, 2), (1, 4, 2, 0, 512, 2)]
opt_shape = [(2, 80, 200), (2, 1, 200), (2, 80, 200), (2, 80, 200), (1, 4, 2, 100, 512, 2), (12, 4, 2, 100, 512, 2), (1, 4, 2, 100, 512, 2)]
max_shape = [(2, 80, 1500), (2, 1, 1500), (2, 80, 1500), (2, 80, 1500), (1, 4, 2, 200, 512, 2), (12, 4, 2, 200, 512, 2), (1, 4, 2, 200, 512, 2)]
input_names = ["x", "mask", "mu", "cond", 'down_blocks_kv_cache', 'mid_blocks_kv_cache', 'up_blocks_kv_cache']
assert self.use_flow_cache is True, "get_trt_kwargs is set for flow cache mode. If you want to use trt with use_flow_cache=False, please set higher max_shape"
return {'min_shape': min_shape, 'opt_shape': opt_shape, 'max_shape': max_shape, 'input_names': input_names}
def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
with torch.cuda.amp.autocast(self.fp16):
tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device),
token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
prompt_token=prompt_token.to(self.device),
prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
prompt_feat=prompt_feat.to(self.device),
prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
embedding=embedding.to(self.device),
cache=self.flow_cache_dict[uuid],
finalize=finalize)
# append hift cache
if self.hift_cache_dict[uuid] is not None:
hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
else:
hift_cache_source = torch.zeros(1, 1, 0)
# keep overlap mel and hift cache
if finalize is False:
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
'source': tts_source[:, :, -self.source_cache_len:],
'speech': tts_speech[:, -self.source_cache_len:]}
tts_speech = tts_speech[:, :-self.source_cache_len]
else:
if speed != 1.0:
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech
def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.zeros(0, 192), llm_embedding=torch.zeros(0, 192),
prompt_text=torch.zeros(1, 0, dtype=torch.int32),
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
prompt_speech_feat=torch.zeros(1, 0, 80), source_speech_token=torch.zeros(1, 0, dtype=torch.int32), stream=False, speed=1.0, **kwargs):
# this_uuid is used to track variables related to this inference thread
this_uuid = str(uuid.uuid1())
sta_time = time.time()  # start time referenced by the timing print at the end of tts()
with self.lock:
self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
self.hift_cache_dict[this_uuid] = None
self.flow_cache_dict[this_uuid] = self.init_flow_cache()
if source_speech_token.shape[1] == 0:
print("*******************************2221******************************")
p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
else:
p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
sta_time1 = time.time()
p.start()
if stream is True:
assert self.use_flow_cache is True, "set use_flow_cache=True if you want to use stream inference to avoid OOM"
# NOTE in cache mode, trim flow_prompt to same size as flow_decoder_required_cache_size
flow_prompt_speech_token = flow_prompt_speech_token[:, -int(self.flow_decoder_required_cache_size / self.flow.token_mel_ratio):]
prompt_speech_feat = prompt_speech_feat[:, -self.flow_decoder_required_cache_size:]
while True:
time.sleep(0.1)
if len(self.tts_speech_token_dict[this_uuid]) >= self.token_hop_len + self.flow.pre_lookahead_len:
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=False)
# NOTE in cache inference mode, we only use flow_prompt_speech_token/prompt_speech_feat in first chunk
flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32).to(self.device)
prompt_speech_feat = torch.zeros(1, 0, 80).to(self.device)
yield {'tts_speech': this_tts_speech.cpu()}
with self.lock:
self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][self.token_hop_len:]
if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < self.token_hop_len + self.flow.pre_lookahead_len:
break
p.join()
# deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True)
yield {'tts_speech': this_tts_speech.cpu()}
else:
print("*******************************2224******************************")
# deal with all tokens
assert self.use_flow_cache is False, "set use_flow_cache=False for nonstream inference"
sta_time2 = time.time()
p.join()
this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
sta_time3 = time.time()
this_tts_speech = self.token2wav(token=this_tts_speech_token,
prompt_token=flow_prompt_speech_token,
prompt_feat=prompt_speech_feat,
embedding=flow_embedding,
uuid=this_uuid,
finalize=True,
speed=speed)
yield {'tts_speech': this_tts_speech.cpu()}
sta_time4 = time.time()
with self.lock:
self.tts_speech_token_dict.pop(this_uuid)
self.llm_end_dict.pop(this_uuid)
self.hift_cache_dict.pop(this_uuid)
self.flow_cache_dict.pop(this_uuid)
torch.cuda.empty_cache()
sta_time5 = time.time()
print("threading time:",sta_time1 - sta_time,"this_tts_speech_token time:",sta_time3 - sta_time2,"token2wav:",sta_time4 - sta_time3,"other ",sta_time5 - sta_time4,"all time:",time.time()-sta_time)
### modified
class VllmCosyVoice2Model(CosyVoice2Model):
def __init__(self,
model_dir: str,
flow: torch.nn.Module,
hift: torch.nn.Module,
fp16: bool):
try:
from cosyvoice.llm.llm_vllm import VllmQwen2LM
except Exception as e:
raise e
llm = VllmQwen2LM(model_dir)
super().__init__(llm, flow, hift, fp16)
def load(self, llm_model, flow_model, hift_model):
self.flow.load_state_dict(torch.load(flow_model, weights_only=True, map_location=self.device), strict=True)
self.flow.to(self.device).eval()
# in case hift_model is a hifigan model
hift_state_dict = {k.replace('generator.', ''): v for k, v in
torch.load(hift_model, weights_only=True, map_location=self.device).items()}
self.hift.load_state_dict(hift_state_dict, strict=True)
self.hift.to(self.device).eval()
### modified
/CosyVoice/runtime/python/fastapi/server.py (without the modification below you get: huggingface_hub.errors.HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: '')
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import argparse
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from fastapi import FastAPI, UploadFile, Form, File
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import numpy as np
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/../../..'.format(ROOT_DIR))
sys.path.append('{}/../../../third_party/Matcha-TTS'.format(ROOT_DIR))
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
app = FastAPI()
# set cross region allowance
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"])
def generate_data(model_output):
for i in model_output:
tts_audio = (i['tts_speech'].numpy() * (2 ** 15)).astype(np.int16).tobytes()
yield tts_audio
@app.get("/inference_sft")
@app.post("/inference_sft")
async def inference_sft(tts_text: str = Form(), spk_id: str = Form()):
model_output = cosyvoice.inference_sft(tts_text, spk_id)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_zero_shot")
@app.post("/inference_zero_shot")
async def inference_zero_shot(tts_text: str = Form(), prompt_text: str = Form(), prompt_wav: UploadFile = File()):
prompt_speech_16k = load_wav(prompt_wav.file, 16000)
model_output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_cross_lingual")
@app.post("/inference_cross_lingual")
async def inference_cross_lingual(tts_text: str = Form(), prompt_wav: UploadFile = File()):
prompt_speech_16k = load_wav(prompt_wav.file, 16000)
model_output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_instruct")
@app.post("/inference_instruct")
async def inference_instruct(tts_text: str = Form(), spk_id: str = Form(), instruct_text: str = Form()):
model_output = cosyvoice.inference_instruct(tts_text, spk_id, instruct_text)
return StreamingResponse(generate_data(model_output))
@app.get("/inference_instruct2")
@app.post("/inference_instruct2")
async def inference_instruct2(tts_text: str = Form(), instruct_text: str = Form(), prompt_wav: UploadFile = File()):
prompt_speech_16k = load_wav(prompt_wav.file, 16000)
model_output = cosyvoice.inference_instruct2(tts_text, instruct_text, prompt_speech_16k)
return StreamingResponse(generate_data(model_output))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--port',
type=int,
default=50002)
parser.add_argument('--model_dir',
type=str,
default='iic/CosyVoice-300M',
help='local path or modelscope repo id')
args = parser.parse_args()
### modified
# try:
# cosyvoice = CosyVoice(args.model_dir)
# except Exception:
try:
cosyvoice = CosyVoice2(args.model_dir)
except Exception:
raise TypeError('no valid model_type!')
### modified
uvicorn.run(app, host="0.0.0.0", port=args.port)
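To exercise the server above, here is a minimal client sketch (my own sketch, not part of the original report): the endpoint name and the raw int16 PCM stream come from generate_data above; the prompt wav path, the texts and the 24000 Hz sample rate are assumptions, adjust them to your setup.
# Hedged client sketch for the FastAPI server above; writes the streamed int16 PCM to a wav file.
import requests
import numpy as np
import soundfile as sf

url = 'http://127.0.0.1:50002/inference_zero_shot'
payload = {'tts_text': '收到好友从远方寄来的生日礼物。', 'prompt_text': '希望你以后能够做的比我还好呦。'}
with open('./asset/zero_shot_prompt.wav', 'rb') as f:
    resp = requests.post(url, data=payload, files={'prompt_wav': ('prompt.wav', f, 'audio/wav')}, stream=True)
    pcm = b''.join(resp.iter_content(chunk_size=16000))
sf.write('client_output.wav', np.frombuffer(pcm, dtype=np.int16), 24000)  # 24000 Hz assumed for CosyVoice2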
Added /CosyVoice/cosyvoice/llm/llm_vllm.py
import time
import queue
import asyncio
import threading
from typing import List, Generator, AsyncGenerator
import torch
from cosyvoice.utils.file_utils import logging
from cosyvoice.llm.llm import Qwen2LM
# # enable the vllm V1 engine
# import os
# os.environ["VLLM_USE_V1"] = '1'
from vllm import ModelRegistry
from vllm import LLMEngine, AsyncLLMEngine, CompletionOutput
from vllm.engine.arg_utils import EngineArgs, AsyncEngineArgs
from vllm.sampling_params import SamplingParams
from cosyvoice.llm.vllm_use_cosyvoice2_model import CosyVoice2Model as CosyVoice2LLM
ModelRegistry.register_model("CosyVoice2Model", CosyVoice2LLM)
# EngineArgs
ENGINE_ARGS = {
"block_size": 16,
"swap_space": 0,
# "enforce_eager": True,
"gpu_memory_utilization": 0.4,
"max_num_batched_tokens": 1024,
"max_model_len": 1024,
"max_num_seqs": 256,
"disable_log_requests": True,
"disable_log_stats": True,
"dtype": "float16"
}
from vllm.sampling_params import RequestOutputKind
# SamplingParams
SAMPLING_PARAMS = {
"temperature": 1, # 不能低于0.8, 否则会生成非常多的空音频,或者无法正常生成语音Token
"top_p": 1, # 不能低于0.8, 否则会生成非常多的空音频,或者无法正常生成语音Token
"top_k": 25,
# "min_tokens": 80, # 不支持设置最小的tokens数量设置,开启后vllm直接崩溃,无法启动
# "presence_penalty": 1.0, # 不支持设置
# "frequency_penalty": 0.0, # 不支持设置
"max_tokens": 1024,
"detokenize": False, # 目前 vllm 0.7.3 v1版本中设置无效,待后续版本更新后减少计算
"ignore_eos": False,
"output_kind": RequestOutputKind.DELTA # 设置为DELTA,如调整该参数,请同时调整llm_inference的处理代码
}
def tensor_to_list(tensor: torch.tensor):
return tensor.view(-1).cpu().numpy().tolist()
class VllmQwen2LM(Qwen2LM):
def __init__(
self,
model_dir,
mix_ratio: List[int] = [5, 15],
):
self.fp16 = False
self.half = lambda: None
self.mix_ratio = mix_ratio
# ---------------------------------------------
# vllm engine argument configuration
engine_args = AsyncEngineArgs(
model=model_dir,
**ENGINE_ARGS,
)
self.llm_engine: AsyncLLMEngine = AsyncLLMEngine.from_engine_args(engine_args)
self.speech_token_size = 6564 # 6561 + 3
self.llm_token_size = 151936 # llm vocab_size
self.sos_eos_token_id = self.speech_token_size + self.llm_token_size + 1
self.task_token_id = self.sos_eos_token_id + 1
self.zero_token_id = self.task_token_id + 1
# vllm inference must run inside a single fixed event loop, so start a dedicated background thread to host that loop for inference tasks
self.loop = asyncio.new_event_loop()
self.loop_thread = threading.Thread(target=self._run_event_loop, daemon=True)
self.loop_thread.start()
def _run_event_loop(self):
asyncio.set_event_loop(self.loop)
self.loop.run_forever()
async def async_llm_inference(self, out_queue, prompt_token_ids, request_id, stop_token_ids, max_tokens):
sampling_params = SamplingParams(**SAMPLING_PARAMS)
sampling_params.stop_token_ids = stop_token_ids or [6561]
if max_tokens:
sampling_params.max_tokens = max_tokens
async for output in self.llm_engine.generate(
{
"prompt_token_ids": prompt_token_ids,
},
sampling_params=sampling_params,
request_id=request_id or f"{time.time()}",
):
out_queue.put((output.outputs[0], output.finished))
def llm_inference(self, prompt_token_ids: List[int], request_id: str=None, stop_token_ids=None, max_tokens=None):
out_queue = queue.Queue()
asyncio.run_coroutine_threadsafe(
self.async_llm_inference(out_queue, prompt_token_ids, request_id, stop_token_ids, max_tokens), self.loop
)
# collect the results pushed into out_queue
finished = False
while not finished:
(output, finished) = out_queue.get_nowait() if not out_queue.empty() else out_queue.get()
yield output
def inference(
self,
text: torch.Tensor,
text_len: torch.Tensor,
prompt_text: torch.Tensor,
prompt_text_len: torch.Tensor,
prompt_speech_token: torch.Tensor,
prompt_speech_token_len: torch.Tensor,
embedding: torch.Tensor,
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
) -> Generator[torch.Tensor|int, None, None]:
prompt_text = tensor_to_list(prompt_text + torch.tensor(6564))
prompt_speech_token = tensor_to_list(prompt_speech_token)
text = tensor_to_list(text + torch.tensor(6564))
prompt_token_ids = [self.sos_eos_token_id] + prompt_text + text + \
[self.task_token_id] + prompt_speech_token
max_tokens = len(text) * 20
for output in self.llm_inference(
prompt_token_ids,
stop_token_ids=[6561],
max_tokens=max_tokens,
):
if output.token_ids[-1] == 6561:
need_add_tokens = output.token_ids[:-1]
else:
need_add_tokens = output.token_ids
for token in need_add_tokens:
yield token
def inference_bistream(
self,
text: Generator,
prompt_text: torch.Tensor,
prompt_text_len: torch.Tensor,
prompt_speech_token: torch.Tensor,
prompt_speech_token_len: torch.Tensor,
embedding: torch.Tensor,
sampling: int = 25,
max_token_text_ratio: float = 20,
min_token_text_ratio: float = 2,
) -> Generator[torch.Tensor, None, None]:
prompt_text = tensor_to_list(prompt_text + torch.tensor(6564))
prompt_speech_token = tensor_to_list(prompt_speech_token)
last_tokens = []
prompt_token_ids = [self.sos_eos_token_id]
text_tokens_cache = prompt_text
for this_text in text:
this_text = tensor_to_list(this_text + torch.tensor(6564))
# text must already be token ids
assert isinstance(this_text, list), "text must be a List[int] of token ids."
text_tokens_cache += this_text
while len(prompt_speech_token) != 0:
if len(text_tokens_cache) >= self.mix_ratio[0]:
text_input_token = text_tokens_cache[:self.mix_ratio[0]]
speech_input_token = prompt_speech_token[:self.mix_ratio[1]]
prompt_token_ids += text_input_token + speech_input_token
# reset the last cache
text_tokens_cache = text_tokens_cache[self.mix_ratio[0]:]
prompt_speech_token = prompt_speech_token[self.mix_ratio[1]:]
else:
break
if len(prompt_speech_token) == 0:
if (len(last_tokens) > 0 and last_tokens[-1] == 6563) or len(prompt_token_ids) == 1:
if len(text_tokens_cache) >= self.mix_ratio[0]:
text_tokens_temp = text_tokens_cache[:self.mix_ratio[0]]
prompt_token_ids += text_tokens_temp
text_tokens_cache = text_tokens_cache[self.mix_ratio[0]:]
else:
continue
for output in self.llm_inference(prompt_token_ids, stop_token_ids=[6563]):
last_tokens = output.token_ids
if last_tokens[-1] == 6563:
need_add_tokens = last_tokens[:-1]
else:
need_add_tokens = last_tokens
for token in need_add_tokens:
yield token
prompt_token_ids.extend(need_add_tokens)
prompt_token_ids += text_tokens_cache + [self.task_token_id]
for output in self.llm_inference(prompt_token_ids, stop_token_ids=[6561]):
if output.token_ids[-1] == 6561:
need_add_tokens = output.token_ids[:-1]
else:
need_add_tokens = output.token_ids
for token in need_add_tokens:
yield token
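Since the symptom is wrong speech tokens, a quick sanity check is to verify the special-token-id layout that VllmQwen2LM assumes; the constants below are copied from the class above, the check itself is my own sketch:
# Sanity check of the token-id layout assumed by VllmQwen2LM above (constants copied from that class).
speech_token_size = 6564   # 6561 speech tokens + 3 special ids
llm_token_size = 151936    # hard-coded Qwen2 vocab size; must equal config.vocab_size on the vLLM model side
sos_eos_token_id = speech_token_size + llm_token_size + 1
task_token_id = sos_eos_token_id + 1
zero_token_id = task_token_id + 1
print(sos_eos_token_id, task_token_id, zero_token_id)  # 158501 158502 158503
# Text ids are shifted by +6564 before being sent to the engine, so they must land in
# [6564, 6564 + llm_token_size); speech ids stay in [0, 6564). If the vLLM-side model class
# derives these offsets from a different vocab_size, the embedding lookup in
# get_input_embeddings no longer matches the prompt ids and wrong speech tokens are to be expected.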
Added /CosyVoice/cosyvoice/llm/vllm_use_cosyvoice2_model.py
"""Inference-only Qwen2 model compatible with HuggingFace weights."""
from typing import Iterable, List, Optional, Set, Tuple, Union, Iterator, overload, TypedDict, Mapping, Any
from typing_extensions import TypeVar
import torch
from torch import nn
from vllm.attention import AttentionMetadata
# from vllm.config import VllmConfig
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.model_executor.layers.quantization import QuantizationConfig
from transformers import Qwen2Config
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
# from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.sampler import SamplerOutput, Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
# from vllm.model_executor.models.interfaces import T
from vllm.model_executor.models.qwen2 import Qwen2Model
# from vllm.model_executor.models.utils import AutoWeightsLoader, maybe_prefix, merge_multimodal_embeddings
from vllm.model_executor.models.utils import AutoWeightsLoader
logger = init_logger(__name__)
IGNORE_ID = -1
class CosyVoice2Model(nn.Module):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
"k_proj",
"v_proj",
],
"gate_up_proj": [
"gate_proj",
"up_proj",
],
}
# def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
def __init__(self, config: Qwen2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, prefix: str = ""):
super().__init__()
# config = vllm_config.model_config.hf_config
# quant_config = vllm_config.quant_config
# lora_config = vllm_config.lora_config
self.config = config
self.lora_config = lora_config
self.quant_config = quant_config
self.llm_input_size = 896
self.llm_output_size = 896
self.speech_token_size = 6561+3
self.llm_token_size = config.vocab_size
# 2. build speech token language model related modules
self.sos_eos = 0
self.task_id = 1
self.fill_token = 2
self.allow_patterns_overrides = ["llm.*"]
self.llm_embedding = torch.nn.Embedding(2, self.llm_input_size)
# self.model = Qwen2Model(vllm_config=vllm_config,
# prefix=maybe_prefix(prefix, "model"))
print("-----------------------------")
print("config",config," cache_config",cache_config," quant_config",quant_config)
self.model = Qwen2Model(config, cache_config, quant_config)
# self.llm_decoder = nn.Linear(self.llm_output_size, self.speech_token_size)
self.llm_decoder = ParallelLMHead(self.speech_token_size,
self.llm_output_size,
bias=True,
quant_config=quant_config,
# prefix=maybe_prefix(
# prefix, "llm_decoder")
)
print("-----------------------------")
print("llm_decoder weight norm:", self.llm_decoder.weight.norm())
self.logits_processor = LogitsProcessor(self.speech_token_size)
# length_normalized_loss: bool = True,
# lsm_weight: float = 0.0,
# self.criterion_ce = LabelSmoothingLoss(
# size=self.speech_token_size,
# padding_idx=IGNORE_ID,
# smoothing=lsm_weight,
# normalize_length=length_normalized_loss,
# )
# 3. [Optional] build speech token related modules
self.speech_embedding = torch.nn.Embedding(self.speech_token_size, self.llm_input_size)
# 4. sampling method
## use vllm sampling method
self.sampler = Sampler()
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors)
self.mix_ratio: List[int] = [5, 15]
# define the special-token constants
self.llm_token_id_delta = torch.tensor(self.speech_token_size, dtype=torch.int32)
self.sos_eos_token_id = torch.tensor((self.llm_token_id_delta + self.llm_token_size + 1), dtype=torch.int32)  # llm_token_id_delta + vocab_size + 1; the hard-coded 151936 in llm_vllm.py must match config.vocab_size here
self.task_token_id = self.sos_eos_token_id + torch.tensor(1, dtype=torch.int32) # 170405
self.zero_token_id = self.task_token_id + torch.tensor(1, dtype=torch.int32)
self.zero_embed_buffer = torch.zeros(
# (vllm_config.scheduler_config.max_num_seqs, self.llm_input_size),
(128, self.llm_input_size),
dtype=self.llm_embedding.weight.dtype,
device=self.llm_embedding.weight.device
)
self.inputs_embed_buffer = torch.zeros(
# (vllm_config.scheduler_config.max_num_batched_tokens, self.llm_input_size),
(2048, self.llm_input_size),
dtype=self.llm_embedding.weight.dtype,
device=self.llm_embedding.weight.device,
)
def get_sos_eos_emb(self):
return self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
def get_task_id_emb(self):
return self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
def get_input_embeddings(
self,
input_ids: torch.Tensor,
# multimodal_embeddings: Optional[T] = None,
attn_metadata: Optional["AttentionMetadata"] = None,
) -> torch.Tensor:
"""
Returns the input embeddings merged from the text embeddings from
input_ids and the multimodal embeddings generated from multimodal
kwargs.
"""
# build a mask marking which token_ids are speech tokens
mask = input_ids < self.speech_token_size
# keep the original shape of input_ids
input_shape = input_ids.shape
# flatten input_ids and the mask so they can be processed uniformly
flat_input_ids = input_ids.view(-1)
flat_mask = mask.view(-1)
inputs_embeds = self.inputs_embed_buffer[:flat_input_ids.shape[0]]
inputs_embeds.zero_()
# Process speech tokens
if flat_mask.any():
speech_token_ids = flat_input_ids[flat_mask]
inputs_embeds[flat_mask] = self.speech_embedding(speech_token_ids)
# handle token_ids outside the speech-token range (LLM and special tokens)
if (~flat_mask).any():
llm_token_ids = flat_input_ids[~flat_mask]
llm_embeds = torch.zeros_like(inputs_embeds[~flat_mask])
sos_eos_mask = llm_token_ids == self.sos_eos_token_id
task_mask = llm_token_ids == self.task_token_id
zero_mask = llm_token_ids == self.zero_token_id
normal_mask = ~(sos_eos_mask | task_mask | zero_mask)
# handle the remaining special / LLM tokens by priority
# first priority: SOS/EOS token
if sos_eos_mask.any():
llm_embeds[sos_eos_mask] = self.llm_embedding.weight[self.sos_eos].unsqueeze(0)
# second priority: task token
if task_mask.any():
llm_embeds[task_mask] = self.llm_embedding.weight[self.task_id].unsqueeze(0)
# next priority: zero (empty-audio) token
if zero_mask.any():
llm_embeds[zero_mask] = self.zero_embed_buffer[:len(llm_embeds[zero_mask])]
# regular LLM tokens
if normal_mask.any():
original_ids = llm_token_ids[normal_mask] - self.llm_token_id_delta
# print('original_ids: ',original_ids)
llm_embeds[normal_mask] = self.model.get_input_embeddings(original_ids)
inputs_embeds[~flat_mask] = llm_embeds
inputs_embeds = inputs_embeds.view(*input_shape, self.llm_input_size)
# # merge multimodal embeddings (if any)
# if multimodal_embeddings is not None:
# inputs_embeds = merge_multimodal_embeddings(
# input_ids, inputs_embeds, multimodal_embeddings,
# self.config.audio_token_index
# )
return inputs_embeds
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings(
input_ids,
attn_metadata=attn_metadata,
)
return self.model(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
inputs_embeds)
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.llm_decoder, hidden_states,
sampling_metadata)
return logits
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
@staticmethod
def convert_weights(weights: Iterable[Tuple[str, torch.Tensor]]) -> Iterable[Tuple[str, torch.Tensor]]:
for name, param in weights:
# handle the core Qwen2Model parameters
if name.startswith("llm."):
if name.startswith("llm.model.model."):
name = name.replace("llm.model.model.", "model.")
else:
continue
# print('weights name: ', name)
yield name, param
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
weights = self.convert_weights(weights)
loader = AutoWeightsLoader(self)
loader.load_weights(weights)
I ran into exactly the same problem! With vLLM acceleration, the speech tokens CosyVoice2 outputs are wrong.
Take a look at the https://github.com/qi-hua/async_cosyvoice project, vLLM works there.
I've already tried that one; it no longer runs with the updated model.
The latest 0.5B model does work with it. But that project keeps its own copy of the inference code instead of using the latest base repo's inference code; it runs, anyway. Waiting for the official merge.
May I ask whether the inference results you get are normal? I adapted that project to the latest inference code, and the 0.5B output is abnormal, with very loud noise in the audio.
I also traced it to incorrect token generation.
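A hedged way to confirm that (my own sketch, not from the thread): dump the speech-token list each backend produces for the same input, for example the list that llm_job appends to self.tts_speech_token_dict, and compare them:
# Hypothetical helper for comparing speech tokens from the original LLM and from the vLLM path.
def compare_speech_tokens(ref_tokens, vllm_tokens, speech_vocab=6561):
    # ids outside [0, speech_vocab) are special tokens and should never reach the flow model
    out_of_range = [t for t in vllm_tokens if not (0 <= t < speech_vocab)]
    prefix = 0
    for a, b in zip(ref_tokens, vllm_tokens):
        if a != b:
            break
        prefix += 1
    print('ref len {}, vllm len {}, matching prefix {}, out-of-range {}'.format(
        len(ref_tokens), len(vllm_tokens), prefix, len(out_of_range)))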
The official vLLM code is not finished yet; once it is done, the README will be updated with usage instructions.
Thanks for the reply. May I also ask roughly when this is expected to be finished?
We are also waiting for a feature release in the upstream vLLM library, probably next month.
It is normal now. Result of vLLM inference: https://jim-aibuy-site.oss-cn-guangzhou.aliyuncs.com/baba.m4a
Hello, could I ask why I keep getting this error:
Traceback (most recent call last):
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 206, in <module>
main(args)
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 190, in main
cosyvoice = AsyncCosyVoice2(args.model_dir, load_jit=args.load_jit, load_trt=args.load_trt, fp16=args.fp16)
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/async_cosyvoice.py", line 60, in __init__
self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
File "/home/dqy/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/model.py", line 142, in load_trt
convert_onnx_to_trt(flow_decoder_estimator_model, flow_decoder_onnx_model, fp16)
TypeError: convert_onnx_to_trt() missing 1 required positional argument: 'fp16'
With jit and trt enabled I also get errors, so I turned both off. The project author did not provide the mygpu.plan file.
Hi, could you share vLLM code that runs inference correctly?
I just ran the https://github.com/qi-hua/async_cosyvoice project, with jit and trt acceleration turned off.
@foxmale007 Hi, are you using the latest CosyVoice2 model?
That project's vLLM inference code differs from the latest official code; the model can be the latest one, just overwrite the files as its documentation describes.
@foxmale007 @sjq19960802 @aluminumbox 您们好,请问 vllm 推理代码可以跑通了吗? 我用了 https://github.com/qi-hua/async_cosyvoice 也跑不通。。报错: Traceback (most recent call last): File "/root/miniconda3/envs/cosyvoice2/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/root/miniconda3/envs/cosyvoice2/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/main.py", line 71, in cli.main() File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 501, in main run() File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 351, in run_file runpy.run_path(target, run_name="main") File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 310, in run_path return _run_module_code(code, init_globals, run_name, pkg_name=pkg_name, script_name=fname) File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 127, in _run_module_code _run_code(code, mod_globals, init_globals, mod_name, mod_spec, pkg_name, script_name) File "/root/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 118, in _run_code exec(code, run_globals) File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 206, in main(args) File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/server.py", line 190, in main cosyvoice = AsyncCosyVoice2(args.model_dir, load_jit=args.load_jit, load_trt=args.load_trt, fp16=args.fp16) File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/async_cosyvoice.py", line 53, in init self.model.load( File "/workspace/CosyVoice/async_cosyvoice/runtime/fastapi/../../../async_cosyvoice/model.py", line 128, in load self.flow.load_state_dict(torch.load(flow_model, weights_only=True, map_location=self.device), strict=True) File "/root/miniconda3/envs/cosyvoice2/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2584, in load_state_dict raise RuntimeError( RuntimeError: Error(s) in loading state_dict for CausalMaskedDiffWithXvec: Missing key(s) in state_dict: "decoder.estimator.down_blocks.0.0.block1.block.1.weight", "decoder.estimator.down_blocks.0.0.block1.block.1.bias", "decoder.estimator.down_blocks.0.0.block2.block.1.weight", "decoder.estimator.down_blocks.0.0.block2.block.1.bias", "decoder.estimator.mid_blocks.0.0.block1.block.1.weight", "decoder.estimator.mid_blocks.0.0.block1.block.1.bias", "decoder.estimator.mid_blocks.0.0.block2.block.1.weight", "decoder.estimator.mid_blocks.0.0.block2.block.1.bias", "decoder.estimator.mid_blocks.1.0.block1.block.1.weight", "decoder.estimator.mid_blocks.1.0.block1.block.1.bias", "decoder.estimator.mid_blocks.1.0.block2.block.1.weight", "decoder.estimator.mid_blocks.1.0.block2.block.1.bias", "decoder.estimator.mid_blocks.2.0.block1.block.1.weight", "decoder.estimator.mid_blocks.2.0.block1.block.1.bias", "decoder.estimator.mid_blocks.2.0.block2.block.1.weight", "decoder.estimator.mid_blocks.2.0.block2.block.1.bias", 
"decoder.estimator.mid_blocks.3.0.block1.block.1.weight", "decoder.estimator.mid_blocks.3.0.block1.block.1.bias", "decoder.estimator.mid_blocks.3.0.block2.block.1.weight", "decoder.estimator.mid_blocks.3.0.block2.block.1.bias", "decoder.estimator.mid_blocks.4.0.block1.block.1.weight", "decoder.estimator.mid_blocks.4.0.block1.block.1.bias", "decoder.estimator.mid_blocks.4.0.block2.block.1.weight", "decoder.estimator.mid_blocks.4.0.block2.block.1.bias", "decoder.estimator.mid_blocks.5.0.block1.block.1.weight", "decoder.estimator.mid_blocks.5.0.block1.block.1.bias", "decoder.estimator.mid_blocks.5.0.block2.block.1.weight", "decoder.estimator.mid_blocks.5.0.block2.block.1.bias", "decoder.estimator.mid_blocks.6.0.block1.block.1.weight", "decoder.estimator.mid_blocks.6.0.block1.block.1.bias", "decoder.estimator.mid_blocks.6.0.block2.block.1.weight", "decoder.estimator.mid_blocks.6.0.block2.block.1.bias", "decoder.estimator.mid_blocks.7.0.block1.block.1.weight", "decoder.estimator.mid_blocks.7.0.block1.block.1.bias", "decoder.estimator.mid_blocks.7.0.block2.block.1.weight", "decoder.estimator.mid_blocks.7.0.block2.block.1.bias", "decoder.estimator.mid_blocks.8.0.block1.block.1.weight", "decoder.estimator.mid_blocks.8.0.block1.block.1.bias", "decoder.estimator.mid_blocks.8.0.block2.block.1.weight", "decoder.estimator.mid_blocks.8.0.block2.block.1.bias", "decoder.estimator.mid_blocks.9.0.block1.block.1.weight", "decoder.estimator.mid_blocks.9.0.block1.block.1.bias", "decoder.estimator.mid_blocks.9.0.block2.block.1.weight", "decoder.estimator.mid_blocks.9.0.block2.block.1.bias", "decoder.estimator.mid_blocks.10.0.block1.block.1.weight", "decoder.estimator.mid_blocks.10.0.block1.block.1.bias", "decoder.estimator.mid_blocks.10.0.block2.block.1.weight", "decoder.estimator.mid_blocks.10.0.block2.block.1.bias", "decoder.estimator.mid_blocks.11.0.block1.block.1.weight", "decoder.estimator.mid_blocks.11.0.block1.block.1.bias", "decoder.estimator.mid_blocks.11.0.block2.block.1.weight", "decoder.estimator.mid_blocks.11.0.block2.block.1.bias", "decoder.estimator.up_blocks.0.0.block1.block.1.weight", "decoder.estimator.up_blocks.0.0.block1.block.1.bias", "decoder.estimator.up_blocks.0.0.block2.block.1.weight", "decoder.estimator.up_blocks.0.0.block2.block.1.bias", "decoder.estimator.final_block.block.1.weight", "decoder.estimator.final_block.block.1.bias". 
Unexpected key(s) in state_dict: "decoder.estimator.down_blocks.0.0.block1.block.2.weight", "decoder.estimator.down_blocks.0.0.block1.block.2.bias", "decoder.estimator.down_blocks.0.0.block2.block.2.weight", "decoder.estimator.down_blocks.0.0.block2.block.2.bias", "decoder.estimator.mid_blocks.0.0.block1.block.2.weight", "decoder.estimator.mid_blocks.0.0.block1.block.2.bias", "decoder.estimator.mid_blocks.0.0.block2.block.2.weight", "decoder.estimator.mid_blocks.0.0.block2.block.2.bias", "decoder.estimator.mid_blocks.1.0.block1.block.2.weight", "decoder.estimator.mid_blocks.1.0.block1.block.2.bias", "decoder.estimator.mid_blocks.1.0.block2.block.2.weight", "decoder.estimator.mid_blocks.1.0.block2.block.2.bias", "decoder.estimator.mid_blocks.2.0.block1.block.2.weight", "decoder.estimator.mid_blocks.2.0.block1.block.2.bias", "decoder.estimator.mid_blocks.2.0.block2.block.2.weight", "decoder.estimator.mid_blocks.2.0.block2.block.2.bias", "decoder.estimator.mid_blocks.3.0.block1.block.2.weight", "decoder.estimator.mid_blocks.3.0.block1.block.2.bias", "decoder.estimator.mid_blocks.3.0.block2.block.2.weight", "decoder.estimator.mid_blocks.3.0.block2.block.2.bias", "decoder.estimator.mid_blocks.4.0.block1.block.2.weight", "decoder.estimator.mid_blocks.4.0.block1.block.2.bias", "decoder.estimator.mid_blocks.4.0.block2.block.2.weight", "decoder.estimator.mid_blocks.4.0.block2.block.2.bias", "decoder.estimator.mid_blocks.5.0.block1.block.2.weight", "decoder.estimator.mid_blocks.5.0.block1.block.2.bias", "decoder.estimator.mid_blocks.5.0.block2.block.2.weight", "decoder.estimator.mid_blocks.5.0.block2.block.2.bias", "decoder.estimator.mid_blocks.6.0.block1.block.2.weight", "decoder.estimator.mid_blocks.6.0.block1.block.2.bias", "decoder.estimator.mid_blocks.6.0.block2.block.2.weight", "decoder.estimator.mid_blocks.6.0.block2.block.2.bias", "decoder.estimator.mid_blocks.7.0.block1.block.2.weight", "decoder.estimator.mid_blocks.7.0.block1.block.2.bias", "decoder.estimator.mid_blocks.7.0.block2.block.2.weight", "decoder.estimator.mid_blocks.7.0.block2.block.2.bias", "decoder.estimator.mid_blocks.8.0.block1.block.2.weight", "decoder.estimator.mid_blocks.8.0.block1.block.2.bias", "decoder.estimator.mid_blocks.8.0.block2.block.2.weight", "decoder.estimator.mid_blocks.8.0.block2.block.2.bias", "decoder.estimator.mid_blocks.9.0.block1.block.2.weight", "decoder.estimator.mid_blocks.9.0.block1.block.2.bias", "decoder.estimator.mid_blocks.9.0.block2.block.2.weight", "decoder.estimator.mid_blocks.9.0.block2.block.2.bias", "decoder.estimator.mid_blocks.10.0.block1.block.2.weight", "decoder.estimator.mid_blocks.10.0.block1.block.2.bias", "decoder.estimator.mid_blocks.10.0.block2.block.2.weight", "decoder.estimator.mid_blocks.10.0.block2.block.2.bias", "decoder.estimator.mid_blocks.11.0.block1.block.2.weight", "decoder.estimator.mid_blocks.11.0.block1.block.2.bias", "decoder.estimator.mid_blocks.11.0.block2.block.2.weight", "decoder.estimator.mid_blocks.11.0.block2.block.2.bias", "decoder.estimator.up_blocks.0.0.block1.block.2.weight", "decoder.estimator.up_blocks.0.0.block1.block.2.bias", "decoder.estimator.up_blocks.0.0.block2.block.2.weight", "decoder.estimator.up_blocks.0.0.block2.block.2.bias", "decoder.estimator.final_block.block.2.weight", "decoder.estimator.final_block.block.2.bias". [rank0]:[W619 08:17:38.670858366 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. 
On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
Did you solve this problem? I'm getting the same error.