FunASR
Online model response time degrades after running for a long time
1. Runtime environment: OS: Linux, Python: 3.8, torch: 2.0.0, modelscope: 1.9.3, GPU: P100, GPU driver: 535, CUDA: 11.7
2. Code:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import tracemalloc
import logging

tracemalloc.start()
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)

n_gpu = 1
n_cpu = 4
wav_file = "football.wav"
asr_model_online = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online'
inference_pipeline_asr_online = pipeline(
    task=Tasks.auto_speech_recognition,
    model=asr_model_online,
    ngpu=n_gpu,
    ncpu=n_cpu,
    model_revision='v1.0.7',
    update_model='v1.0.7',
    num_cache_chunks=10,
    mode='paraformer_streaming')
param_dict_asr_online = {"cache": dict(), "chunk_size": [5, 5, 5]}
wav_data = open(wav_file, mode='rb').read()

from tqdm import tqdm

step = 1920 * 5  # bytes of raw audio fed per streaming call (~300ms at 16kHz/16-bit)
for i in tqdm(range(0, len(wav_data), step)):
    audio_in = wav_data[i:i + step]
    rec_result = inference_pipeline_asr_online(audio_in=audio_in,
                                               param_dict=param_dict_asr_online)
    print(rec_result)
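As an aside, the script above calls tracemalloc.start() but never inspects the tracer. A minimal stdlib sketch of how snapshots could be compared to localize host-side memory growth (the "few hundred calls" interval is an arbitrary assumption; note tracemalloc only covers Python heap allocations, not GPU memory):

import tracemalloc

tracemalloc.start()
baseline = tracemalloc.take_snapshot()

# ... run a few hundred streaming calls here ...

snapshot = tracemalloc.take_snapshot()
# Print the ten call sites whose Python-side allocations grew the most
# relative to the baseline, grouped by source line.
for stat in snapshot.compare_to(baseline, "lineno")[:10]:
    print(stat)

If the streaming cache dict is what keeps growing, it should surface near the top of this list.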
The test audio is a football commentary recording about three hours long.
3. Problem: during testing, the response time grows noticeably over time; after roughly half an hour it rises from about 50ms to 400ms.
I ran the model over the full three hours following the code here, printing the call count and per-call runtime; the complete code is below:
import os
import time
import logging
import torch
import soundfile
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)
os.environ["MODELSCOPE_CACHE"] = "./"

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
    model_revision='v1.0.7',
    update_model=False,
    mode="paraformer_streaming"
)
model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"],
                         "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online")

speech, sample_rate = soundfile.read("basketball.wav")
speech_length = speech.shape[0]

sample_offset = 0
chunk_size = [0, 10, 5]      # [0, 10, 5] 600ms, [0, 8, 4] 480ms
encoder_chunk_look_back = 4  # number of chunks to look back for encoder self-attention
decoder_chunk_look_back = 1  # number of encoder chunks to look back for decoder cross-attention
stride_size = chunk_size[1] * 960  # samples per 600ms chunk at 16kHz
param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size,
              "encoder_chunk_look_back": encoder_chunk_look_back,
              "decoder_chunk_look_back": decoder_chunk_look_back}
final_result = ""

for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
    if sample_offset + stride_size >= speech_length - 1:
        stride_size = speech_length - sample_offset
        param_dict["is_final"] = True
    start = time.time()
    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                    param_dict=param_dict)
    end = time.time()
    cost = end - start
    if len(rec_result) != 0:
        final_result += rec_result['text']
        print(rec_result['text'], f"call count: {sample_offset // stride_size}, cost: {cost}", sep='\t')
print(final_result)
The per-call runtime is shown in the attached figure: it rises from about 60ms at the start to roughly 200ms by the end.
Full log file: asr_online.log
Also, GPU memory usage seems to grow continuously during the run: it starts at only about 3GB and is nearly exhausted by the end, while CPU usage climbs from 600% to 900%.
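A quick way to confirm whether live tensors (rather than just allocator caching) are accumulating is to log the PyTorch allocator counters every few hundred calls. A minimal sketch using standard PyTorch calls (the logging interval and helper name are assumptions):

import torch

def log_gpu_memory(step):
    # memory_allocated: bytes currently held by live tensors;
    # memory_reserved: bytes retained by PyTorch's caching allocator.
    alloc = torch.cuda.memory_allocated() / 2**20
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f"step={step}\tallocated={alloc:.1f} MiB\treserved={reserved:.1f} MiB")

# Inside the streaming loop, e.g.:
# if (sample_offset // stride_size) % 500 == 0:
#     log_gpu_memory(sample_offset // stride_size)

If `allocated` (not just `reserved`) grows linearly with the number of calls, tensors are being retained across calls, e.g. in the streaming cache.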
I'm running into the same problem. Have you solved it?
I joined the official community group; according to the author, this problem should be gone after upgrading to the 2.0 code. I'm not going to dig into this further, but you can give it a try @wwfcnu
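For anyone landing here later: the newer FunASR AutoModel streaming interface referred to above looks roughly like this (a sketch following the FunASR README streaming demo; the model alias "paraformer-zh-streaming" and the parameter names come from the upstream docs and may differ across versions):

from funasr import AutoModel
import soundfile

chunk_size = [0, 10, 5]      # [0, 10, 5] is 600ms per chunk, [0, 8, 4] is 480ms
encoder_chunk_look_back = 4  # chunks of lookback for encoder self-attention
decoder_chunk_look_back = 1  # encoder chunks of lookback for decoder cross-attention

model = AutoModel(model="paraformer-zh-streaming")

speech, sample_rate = soundfile.read("football.wav")
chunk_stride = chunk_size[1] * 960  # samples per 600ms chunk at 16kHz

cache = {}  # streaming state carried across calls
total_chunk_num = (len(speech) - 1) // chunk_stride + 1
for i in range(total_chunk_num):
    speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    is_final = i == total_chunk_num - 1
    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final,
                         chunk_size=chunk_size,
                         encoder_chunk_look_back=encoder_chunk_look_back,
                         decoder_chunk_look_back=decoder_chunk_look_back)
    print(res)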