Failed to customize PaddleSpeech service for PaddleHub
env:
OS:
- Windows 10
Python:
- 3.8.10
Requirements:
- paddlepaddle==2.4.2
- paddlenlp==2.5.2
- paddlehub==2.3.1
- paddlespeech==1.4.1
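A quick way to double-check that these versions are the ones the interpreter actually picks up (assuming the packages were installed under these pip distribution names):

# Print the installed versions of the packages listed above.
from importlib.metadata import version

for pkg in ("paddlepaddle", "paddlenlp", "paddlehub", "paddlespeech"):
    print(pkg, version(pkg))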
File Structure:
module.py:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import base64
import io
import sys
import time
import paddlehub as hub
from paddlehub.module.module import moduleinfo, runnable, serving
from paddlehub.utils.log import logger
# from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.server.engine.asr.python.asr_engine import ASREngine
from paddlespeech.server.engine.asr.python.asr_engine import PaddleASRConnectionHandler
from yacs.config import CfgNode
def base64_to_audio(b64str) -> bytes:
    data = base64.b64decode(b64str.encode('utf8'))
    # return io.BytesIO(data)
    return data
@moduleinfo(
    name="ASR",
    version="1.0.0",
    summary="A custom PaddleHub module for Automatic Speech Recognition. Powered by Mercedes-Benz RDCP/SI Team.",
    author="ruitian",
    author_email="",
    type="audio/asr"
)
class ASR(hub.Module):
    def _initialize(self):
        _config = dict()
        _config['device'] = 'gpu'
        _config['model'] = 'conformer_online_aishell'
        _config['lang'] = 'zh'
        _config['sample_rate'] = 16000
        _config['cfg_path'] = None
        _config['decode_method'] = 'attention_rescoring'
        _config['ckpt_path'] = None
        _config['force_yes'] = False
        config = CfgNode(_config)
        asr_engine = ASREngine()
        asr_engine.init(config)
        self.connection_handler = PaddleASRConnectionHandler(asr_engine)
    @serving
    def speech_recognize(self, audios, **kwargs):
        def predict(audios, **kwargs):
            res = []
            if isinstance(audios, io.BytesIO):
                # if not self.asr._check(audios, sample_rate=16000):
                #     sys.exit(-1)
                # self.asr.preprocess(self.model, audios)
                # self.asr.infer(self.model)
                # _result = self.asr.postprocess()
                self.connection_handler.run(audios)
                asr_results = self.connection_handler.postprocess()
                res.append(asr_results)
            elif isinstance(audios, list):
                for audio in audios:
                    # if not self.asr._check(audio, sample_rate=16000):
                    #     sys.exit(-1)
                    # self.asr.preprocess(self.model, audio)
                    # self.asr.infer(self.model)
                    self.connection_handler.run(audio)
                    asr_results = self.connection_handler.postprocess()
                    res.append(asr_results)
            else:
                raise RuntimeError(
                    'Unsupported audio input for serving; make sure the input is base64-encoded audio file data.')
            return res

        audio_decode = [base64_to_audio(audio) for audio in audios]
        starttime = time.time()
        results = predict(audio_decode, **kwargs)
        elapse = time.time() - starttime
        logger.info("Predict time: {}".format(elapse))
        return [{"results": results, "elapse": elapse}]
if __name__ == '__main__':
    lib = ASR()
    lib._initialize()

    def readwav2base64(wav_file):
        """
        Read a wave file and convert it to a base64 string.
        """
        with open(wav_file, 'rb') as f:
            base64_bytes = base64.b64encode(f.read())
            base64_string = base64_bytes.decode('utf-8')
        return base64_string

    file_path = r"C:\Users\user\Desktop\zh.wav"
    b64_data = readwav2base64(file_path)
    a = lib.speech_recognize([b64_data])
    # a = lib.speech_recognize([b64_data])
    print(a)
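For context, the end goal is to call this module through PaddleHub serving after hub install and hub serving start -m ASR. A minimal client sketch follows; the module name, port, and payload layout are assumptions based on PaddleHub's default serving conventions, not something verified against this deployment:

# Hypothetical serving client; the endpoint and payload shape follow
# PaddleHub serving conventions (POST /predict/<module_name> with the
# @serving method's keyword arguments in the JSON body).
import base64
import json
import requests

with open(r"C:\Users\user\Desktop\zh.wav", 'rb') as f:
    b64_data = base64.b64encode(f.read()).decode('utf-8')

payload = {"audios": [b64_data]}
resp = requests.post(
    "http://127.0.0.1:8866/predict/ASR",
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
)
print(resp.json())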
Run "main" error logs:
2024-01-11 15:31:57.691 | INFO | paddlespeech.s2t.modules.ctc:<module>:45 - paddlespeech_ctcdecoders not installed!
2024-01-11 15:31:57.803 | INFO | paddlespeech.s2t.modules.embedding:__init__:150 - max len: 5000
[2024-01-11 15:32:00,635] [ INFO] - Initialize ASR server engine successfully on device: gpu.
[2024-01-11 15:32:00] [CRITICAL] [transformation.py:149] Catch a exception from 0th func: LogMelSpectrogramKaldi(fs=16000, n_mels=80, n_frame_shift=10.0, n_frame_length=25.0, dither=0.1))
[2024-01-11 15:32:00,642] [ INFO] - When the type of 'input' in assign is numpy.ndarray, the data type of 'input' must be bool, float32, int32 or int64, but received int16.
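The last log line suggests the waveform handed to the feature pipeline is int16, which paddle.assign rejects. Below is a minimal standalone check of the decoded payload's dtype; the soundfile-based decoding is only my assumption about how the ASR executor reads the buffer, not taken from the paddlespeech source:

# Hypothetical dtype check, independent of module.py.
import io
import numpy as np
import soundfile as sf

with open(r"C:\Users\user\Desktop\zh.wav", 'rb') as f:
    raw = f.read()

samples, sr = sf.read(io.BytesIO(raw), dtype='int16')
print(samples.dtype, sr)                  # int16, 16000 -> the dtype the error complains about
print(samples.astype(np.float32).dtype)   # float32 is one of the dtypes assign accepts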
NOTE: