[Model] Add Qwen2-Audio model support
This PR adds support for the Qwen2-Audio model.
FIX #8394 FIX #8461
Requirements
Use transformers>=4.45.1 and install vLLM from source.
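A quick way to confirm the environment satisfies the version requirement is a small check like the one below (not part of the PR, just a convenience):

```python
# Optional sanity check: make sure the installed transformers is new enough
# for Qwen2-Audio. Not part of the PR, just a convenience.
from packaging.version import Version
import transformers

assert Version(transformers.__version__) >= Version("4.45.1"), (
    f"transformers {transformers.__version__} found; Qwen2-Audio needs >= 4.45.1")
```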
Example Usage
```python
import requests
from transformers import AutoProcessor
from transformers.pipelines.audio_utils import ffmpeg_read
from vllm import LLM, SamplingParams

MODEL_PATH = 'Qwen/Qwen2-Audio-7B-Instruct'


def qwen2_audio_batch():
    """Build a batch of vLLM inputs (prompt text plus decoded audio) for Qwen2-Audio."""
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    conversation1 = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
            {"type": "text", "text": "What's that sound?"},
        ]},
        {"role": "assistant", "content": "It is the sound of glass shattering."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
            {"type": "text", "text": "What can you hear?"},
        ]},
    ]

    conversation2 = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
            {"type": "text", "text": "What does the person say?"},
        ]},
    ]

    conversation3 = [
        {"role": "user", "content": [
            {"type": "text", "text": "How to make a pizza?"},
        ]},
    ]

    conversations = [conversation1, conversation2, conversation3]
    # Render each conversation into a text prompt with the model's chat template.
    text = [
        processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=False, add_audio_id=True)
        for conversation in conversations
    ]
    # Download every referenced audio clip and decode it to a waveform at the
    # feature extractor's sampling rate; vLLM expects each prompt's audio as a
    # list of (waveform, sampling_rate) tuples.
    sampling_rate = processor.feature_extractor.sampling_rate
    audios = []
    for conversation in conversations:
        audio_infos_vllm = []
        for message in conversation:
            if isinstance(message["content"], list):
                for ele in message["content"]:
                    if ele["type"] == "audio":
                        waveform = ffmpeg_read(requests.get(ele['audio_url']).content,
                                               sampling_rate=sampling_rate)
                        audio_infos_vllm.append((waveform, sampling_rate))
        audios.append(audio_infos_vllm)
    inputs = [
        {
            'prompt': text[i],
            'multi_modal_data': {
                'audio': audios[i]
            },
        } for i in range(len(conversations))
    ]
    return inputs


def main():
    llm = LLM(
        model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.98,
        enforce_eager=True,  # Disable CUDA graphs and run an eager forward pass at every decode step.
        limit_mm_per_prompt={"audio": 5},  # Allow up to 5 audio clips per prompt.
    )
    sampling_params = SamplingParams(
        temperature=0.7, top_p=0.01, top_k=1, repetition_penalty=1.1, max_tokens=256,
        stop_token_ids=[],
    )
    inputs = qwen2_audio_batch()
    print(f"{inputs=}")

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for i, output in enumerate(outputs):
        generated_text = output.outputs[0].text
        print()
        print('=' * 40)
        print(f"Inputs[{i}]: {inputs[i]['prompt']!r}")
        print(f"Generated text: {generated_text!r}")


if __name__ == '__main__':
    main()
```
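For reference, here is a minimal single-prompt variant of the same flow. It is a sketch rather than part of the PR: it assumes a local file named sample.wav exists and that librosa is installed, and it otherwise uses the same prompt/audio format as the batch example above.

```python
# Minimal single-prompt sketch (assumptions: a local "sample.wav" and librosa installed).
import librosa
from transformers import AutoProcessor
from vllm import LLM, SamplingParams

MODEL_PATH = 'Qwen/Qwen2-Audio-7B-Instruct'
processor = AutoProcessor.from_pretrained(MODEL_PATH)

conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "sample.wav"},
        {"type": "text", "text": "Describe this audio clip."},
    ]},
]
prompt = processor.apply_chat_template(
    conversation, add_generation_prompt=True, tokenize=False)

# Decode the clip at the sampling rate the feature extractor expects.
sampling_rate = processor.feature_extractor.sampling_rate
waveform, _ = librosa.load("sample.wav", sr=sampling_rate)

llm = LLM(model=MODEL_PATH, enforce_eager=True, limit_mm_per_prompt={"audio": 1})
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"audio": [(waveform, sampling_rate)]}},
    sampling_params=SamplingParams(temperature=0.7, max_tokens=128),
)
print(outputs[0].outputs[0].text)
```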