[Model] Add Qwen2-Audio model support

This PR adds support for the Qwen2-Audio model.

FIX #8394 FIX #8461

Requirements

Use transformers>=4.45.1, and install vLLM from source.
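
A minimal setup sketch, assuming a working CUDA build environment (exact steps may differ on your platform):

pip install "transformers>=4.45.1"
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .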

Example Usage

import requests

from transformers import AutoProcessor
from transformers.pipelines.audio_utils import ffmpeg_read

from vllm import LLM, SamplingParams

MODEL_PATH = 'Qwen/Qwen2-Audio-7B-Instruct'


def qwen2_audio_batch():
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    
    conversation1 = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
            {"type": "text", "text": "What's that sound?"},
        ]},
        {"role": "assistant", "content": "It is the sound of glass shattering."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
            {"type": "text", "text": "What can you hear?"},
        ]},
    ]

    conversation2 = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
            {"type": "text", "text": "What does the person say?"},
        ]},
    ]

    conversation3 = [
        {"role": "user", "content": [
            {"type": "text", "text": "How to make a pizza?"},
        ]},
    ]

    conversations = [conversation1, conversation2, conversation3]

    # Render each conversation into a text prompt with the chat template.
    text = [
        processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=False, add_audio_id=True)
        for conversation in conversations
    ]
    
    # Fetch each referenced audio file and decode it; each audio is passed to
    # vLLM as a (waveform, sampling_rate) tuple.
    audios = []
    for conversation in conversations:
        audio_infos_vllm = []
        for message in conversation:
            if isinstance(message["content"], list):
                for ele in message["content"]:
                    if ele["type"] == "audio":
                        sampling_rate = processor.feature_extractor.sampling_rate
                        waveform = ffmpeg_read(requests.get(ele["audio_url"]).content,
                                               sampling_rate=sampling_rate)
                        audio_infos_vllm.append((waveform, sampling_rate))
        audios.append(audio_infos_vllm)
        
    inputs = [
        {
            'prompt': text[i],
            'multi_modal_data': {
                'audio': audios[i]
            }
        } for i in range(len(conversations))
    ]
    return inputs



def main():
    llm = LLM(
        model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.98,
        enforce_eager=True,     # Disable CUDA graphs; run the forward pass eagerly at every decode step.
        limit_mm_per_prompt={"audio": 5},   # Allow at most 5 audio items per prompt.
    )
    sampling_params = SamplingParams(
        temperature=0.7, top_p=0.01, top_k=1, repetition_penalty=1.1, max_tokens=256,
        stop_token_ids=[],
    )

    inputs = qwen2_audio_batch()
    print(f"{inputs=}")

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for i, output in enumerate(outputs):
        generated_text = output.outputs[0].text
        print()
        print('=' * 40)
        print(f"Inputs[{i}]: {inputs[i]['prompt']!r}")
        print(f"Generated text: {generated_text!r}")


if __name__ == '__main__':
    main()
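
Each element of multi_modal_data["audio"] above is a (waveform, sampling_rate) tuple. If the audio files are already on disk, the same tuples can be built without requests/ffmpeg_read; below is a minimal sketch using librosa (the file name local_audio.wav is a placeholder, not part of this PR):

import librosa
from transformers import AutoProcessor

MODEL_PATH = 'Qwen/Qwen2-Audio-7B-Instruct'
processor = AutoProcessor.from_pretrained(MODEL_PATH)
target_sr = processor.feature_extractor.sampling_rate

# librosa.load resamples to the requested rate and returns (waveform, sr),
# which matches the per-audio tuple used in multi_modal_data["audio"] above.
waveform, sr = librosa.load("local_audio.wav", sr=target_sr)
audio_entry = (waveform, sr)  # drop-in replacement for one ffmpeg_read entry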

faychu · Oct 10, 2024