TensorFlowASR
Inconsistent infer results for stream-transducer and stream-transducer-tflite models
After getting help from this, I successfully trained a stream-transducer model and converted it to tflite.
I first tested the trained model with "test_subword_streaming_transducer.py", and the predicted results seem to be correct:
| PATH | GROUNDTRUTH | GREEDY | BEAMSEARCH | BEAMSEARCHLM |
| --- | --- | --- | --- | --- |
| /data/audio_data/asr_cn/data_aishell/wav/test/S0767/BAC009S0767W0288.wav | gao kong fei hang shi zhan lve wu ren ji | gao kong fei hang shi zhan lve wu ren ji | gao kong fei hang shi zhan lve wu ren ji | |
| /data/audio_data/asr_cn/data_aishell/wav/test/S0767/BAC009S0767W0308.wav | zai li dian ye wu bao fa de qing kuang xia | zai li dian ye wu bao fa de qing kuang xia | | |
| /data/audio_data/asr_cn/data_aishell/wav/test/S0767/BAC009S0767W0234.wav | shi di fang zheng fu de zhai wu rong zi gui mo kong zhi zai an quan fan wei nei | shi di fang zheng fu de zhai wu rong zi gui mo kong zhi zai an quan fan wei nei nei | | |
| /data/audio_data/asr_cn/data_aishell/wav/test/S0767/BAC009S0767W0407.wav | ke lin ye tui chu le ju zu | ke lian ye tui chu de ju | | |
| /data/audio_data/asr_cn/data_aishell/wav/test/S0767/BAC009S0767W0363.wav | de dao dang di qi ye de qiang lie ji ji xiang ying | de dao dang li qi ye de qiang lie ji ji xiang ying | | |
Then I also tried to run inference on wav audio with the converted stream-transducer tflite model, but the results look abnormal. The parameter 'blocksize' seems to have a big impact on the inference results. E.g. I am trying to infer the audio "/data/audio_data/asr_cn/data_aishell/wav/test/S0767/BAC009S0767W0308.wav" with the script below:
- blocksize == 28000: infer_result = "zai li dian ye wu bao fa de qing kuang"
- blocksize == 16000: infer_result = "zai li dian ye wu bao fa de xing hua"
- blocksize == 8000: infer_result = "zi li ye ye wu bao fa de qing kuang xia"
- blocksize == 4096: FAILED
As far as I understand, if blocksize is set closer to the real length of the audio, the inference result might be more accurate. However, in a real application, if the audio frames are fed into the tflite model in STREAM format, blocksize cannot be set to a very large value. For my use case, blocksize <= 5000 might be reasonable.
I tried to set beam_width to 5, 1, or 0, but the results are still the same.
Is it possible to find a way to get around this even if I set blocksize <= 5000?
```python
import sys
import argparse
import soundfile as sf
import sounddevice as sd
from multiprocessing import Process, Event, Manager
import queue
import numpy as np
import tensorflow as tf


def int_or_str(text):
    """Helper function for argument parsing."""
    try:
        return int(text)
    except ValueError:
        return text


parser = argparse.ArgumentParser(prog="Conformer audio file streaming")
parser.add_argument('-l', '--list-devices', action='store_true',
                    help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
    print(sd.query_devices())
    parser.exit(0)
parser.add_argument('filename', metavar='FILENAME',
                    help='audio file to be played back')
parser.add_argument('-d', '--device', type=int_or_str,
                    help='output device (numeric ID or substring)')
parser.add_argument('-b', '--blocksize', type=int, default=4096,
                    help='block size (default: %(default)s)')
parser.add_argument('-q', '--buffersize', type=int, default=20,
                    help='number of blocks used for buffering (default: %(default)s)')
parser.add_argument("--tflite", type=str, default=None,
                    help="Path to conformer tflite")
parser.add_argument("--blank", type=int, default=0,
                    help="Blank token index")
parser.add_argument("--num_rnns", type=int, default=4,
                    help="Number of RNN layers in prediction network")
parser.add_argument("--nstates", type=int, default=2,
                    help="Number of RNN states in prediction network (1 for GRU and 2 for LSTM)")
parser.add_argument("--statesize", type=int, default=512,
                    help="Size of RNN state in prediction network")
args = parser.parse_args(remaining)
if args.blocksize == 0:
    parser.error('blocksize must not be zero')
if args.buffersize < 1:
    parser.error('buffersize must be at least 1')


def recognizer():
    tflitemodel = tf.lite.Interpreter(model_path=args.tflite)
    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], [args.blocksize])
    tflitemodel.allocate_tensors()

    print("=== INPUT ===")
    for detail in tflitemodel.get_input_details():
        print(detail)
    print("=== OUTPUT ===")
    for detail in tflitemodel.get_output_details():
        print(detail)

    def recognize(signal, lastid, states, states_2):
        if signal.shape[0] < args.blocksize:
            signal = tf.pad(signal, [[0, args.blocksize - signal.shape[0]]])
        # print(signal.shape)
        tflitemodel.set_tensor(input_details[0]["index"], signal)
        tflitemodel.set_tensor(input_details[1]["index"], lastid)
        tflitemodel.set_tensor(input_details[2]["index"], states)
        tflitemodel.set_tensor(input_details[3]["index"], states_2)
        tflitemodel.invoke()
        upoints = tflitemodel.get_tensor(output_details[0]["index"])
        lastid = tflitemodel.get_tensor(output_details[1]["index"])
        states = tflitemodel.get_tensor(output_details[2]["index"])
        states_2 = tflitemodel.get_tensor(output_details[3]["index"])
        text = "".join([chr(u) for u in upoints])
        return text, lastid, states, states_2

    lastid = args.blank * tf.ones(shape=[], dtype=tf.int32)
    states = tf.zeros(shape=[args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)
    states_2 = tf.zeros(shape=[args.nstates, args.nstates, 1, args.statesize], dtype=tf.float32)
    transcript = ""

    with open(args.filename, 'rb') as fr:
        data, samplerate = sf.read(fr)
        data = data.astype(np.float32)
        start = 0
        end = start + args.blocksize
        while end < data.shape[0]:
            text, lastid, states, states_2 = recognize(data[start:end], lastid, states, states_2)
            transcript += text
            print(transcript, flush=True)
            start += args.blocksize
            end = start + args.blocksize
            # try:
            #     text, lastid, states, states_2 = recognize(data[start:end], lastid, states, states_2)
            #     transcript += text
            #     print(transcript, flush=True)
            # except:
            #     print("failed")
            # finally:
            #     start += args.blocksize
            #     end = start + args.blocksize


if __name__ == "__main__":
    recognizer()
```
@iamweiweishi The reason is that the mel spectrogram of the chunks is not the same as the mel spectrogram of the whole signal. I'm working on this to reduce the differences 😄
@iamweiweishi You can see the incorrect features of the chunked signal compared to the whole signal here
@iamweiweishi I found that one problem is the feature normalization: if I don't apply feature normalization, or apply it "per frame", the difference between the chunked signal and the whole signal is insignificant: from this to this
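To illustrate why per-frame normalization avoids the mismatch, here is a minimal sketch (the function names are illustrative, not the repo's actual featurizer API): per-frame statistics depend only on the frame itself, while per-utterance statistics change with how much signal is in the chunk.

```python
import numpy as np

def normalize_per_frame(features):
    # Each frame (row) is normalized by its own mean/std over the feature bins,
    # so the result does not depend on how many frames are in the chunk.
    mean = features.mean(axis=-1, keepdims=True)
    std = features.std(axis=-1, keepdims=True) + 1e-9
    return (features - mean) / std

def normalize_per_utterance(features):
    # Statistics are computed over the whole feature matrix, so a chunk and
    # the full utterance are normalized differently.
    return (features - features.mean()) / (features.std() + 1e-9)

features = np.random.randn(100, 80).astype(np.float32)  # [frames, feature_bins]
chunk = features[:10]                                    # first chunk's frames

# Per-frame: the chunk's normalized frames match the full utterance's.
assert np.allclose(normalize_per_frame(features)[:10], normalize_per_frame(chunk))
# Per-utterance: they do not.
assert not np.allclose(normalize_per_utterance(features)[:10], normalize_per_utterance(chunk))
```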
The key point is that the chunk size (aka block_size) must be frame_length + k * frame_step, the chunks are split with stride = (k + 1) * frame_step, and top_db = None. This means the prediction will be delayed by roughly ((chunk_size - (k + 1) * frame_step) / sample_rate) + time_for_model_to_predict_per_chunk. So for example, with frame_length = 512, frame_step = 160 and sample_rate = 16000, the delay will be (512 - 160) / 16000 + model_time = 0.022s + model_time_in_sec = 22ms + model_time_in_ms.
In this repo, we use frame_length = speech_featurizer.nfft, like librosa does.
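A minimal sketch of that chunking rule (plain NumPy framing, not the repo's featurizer; frame_length, frame_step and k are the values discussed above), showing that chunks split this way produce exactly the same frames as framing the whole signal:

```python
import numpy as np

frame_length = 512   # frame_length = speech_featurizer.nfft in this repo
frame_step = 160
k = 5

chunk_size = frame_length + k * frame_step   # samples per chunk
stride = (k + 1) * frame_step                # hop between consecutive chunk starts

signal = np.random.randn(16000).astype(np.float32)  # 1 second of dummy audio

def frames(x, length, step):
    # Split x into overlapping frames, like the framing step before an STFT.
    n = 1 + (len(x) - length) // step
    return np.stack([x[i * step: i * step + length] for i in range(n)])

whole = frames(signal, frame_length, frame_step)     # frames of the whole signal

chunked = []                                         # frames computed chunk by chunk
start = 0
while start + chunk_size <= len(signal):
    chunked.append(frames(signal[start:start + chunk_size], frame_length, frame_step))
    start += stride
chunked = np.concatenate(chunked)

# Each chunk yields k + 1 frames, and they line up exactly with the
# leading frames of the whole signal.
assert np.array_equal(whole[:len(chunked)], chunked)
```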
Thank you so much. @usimarit I am trying to retrain a stream-model with params:
```yaml
speech_config:
  sample_rate: 16000
  frame_ms: 25
  stride_ms: 10
  num_feature_bins: 80
  feature_type: log_mel_spectrogram
  preemphasis: 0.97
  normalize_signal: False
  normalize_feature: True
  normalize_per_frame: True
  center: False
  top_db: None
```
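As a side note on how these settings map to the chunking formula above (a rough sketch; the rounding of nfft to the next power of two is my assumption and should be checked against the repo's speech featurizer):

```python
sample_rate = 16000
frame_ms, stride_ms = 25, 10

frame_length = int(sample_rate * frame_ms / 1000)   # 400 samples per frame
frame_step = int(sample_rate * stride_ms / 1000)    # 160 samples per hop

# Assumption: nfft is the next power of two >= frame_length, i.e. 512,
# matching the frame_length = speech_featurizer.nfft note above.
nfft = 2 ** (frame_length - 1).bit_length()
print(frame_length, frame_step, nfft)  # 400 160 512
```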
I will get back when I get some results.
@usimarit I trained a new stream-transducer model with the modified params above. Then I tried to run inference with the script below, which refers to this, but still could not get correct results.
BTW, statesize is set to 320, and I also tested different values of k (1, 3, 5, 10).
```python
import sys
import argparse
import soundfile as sf
import sounddevice as sd
from multiprocessing import Process, Event, Manager
import queue
import numpy as np
import tensorflow as tf


def int_or_str(text):
    """Helper function for argument parsing."""
    try:
        return int(text)
    except ValueError:
        return text


parser = argparse.ArgumentParser(prog="Conformer audio file streaming")
parser.add_argument('-l', '--list-devices', action='store_true',
                    help='show list of audio devices and exit')
args, remaining = parser.parse_known_args()
if args.list_devices:
    print(sd.query_devices())
    parser.exit(0)
parser.add_argument('filename', metavar='FILENAME',
                    help='audio file to be played back')
parser.add_argument('-d', '--device', type=int_or_str,
                    help='output device (numeric ID or substring)')
parser.add_argument('-b', '--blocksize', type=int, default=512,
                    help='block size (default: %(default)s)')
parser.add_argument('-nfft', '--nfft', type=int, default=512,
                    help='nfft (default: %(default)s)')
parser.add_argument('-q', '--buffersize', type=int, default=20,
                    help='number of blocks used for buffering (default: %(default)s)')
parser.add_argument("--tflite", type=str, default=None,
                    help="Path to conformer tflite")
parser.add_argument("--blank", type=int, default=0,
                    help="Blank token index")
parser.add_argument("--num_rnns", type=int, default=8,
                    help="Number of RNN layers in prediction network")
parser.add_argument("--nstates", type=int, default=2,
                    help="Number of RNN states in prediction network (1 for GRU and 2 for LSTM)")
parser.add_argument("--statesize", type=int, default=512,
                    help="Size of RNN state in prediction network")
args = parser.parse_args(remaining)

# Chunk size and stride following the rule discussed above.
k = 5
frame_step = 160
stride = k * frame_step
chunk_size = args.nfft + stride - frame_step
args.blocksize = chunk_size
print("block size = {}, stride = {}".format(args.blocksize, stride))

if args.blocksize == 0:
    parser.error('blocksize must not be zero')
if args.buffersize < 1:
    parser.error('buffersize must be at least 1')


def recognizer():
    tflitemodel = tf.lite.Interpreter(model_path=args.tflite)
    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], [args.blocksize])
    tflitemodel.allocate_tensors()

    print("=== INPUT ===")
    for detail in tflitemodel.get_input_details():
        print(detail)
    print("=== OUTPUT ===")
    for detail in tflitemodel.get_output_details():
        print(detail)

    def recognize(signal, lastid, states, states_2):
        if signal.shape[0] < args.blocksize:
            signal = tf.pad(signal, [[0, args.blocksize - signal.shape[0]]])
        # print(signal.shape)
        tflitemodel.set_tensor(input_details[0]["index"], signal)
        tflitemodel.set_tensor(input_details[1]["index"], lastid)
        tflitemodel.set_tensor(input_details[2]["index"], states)
        tflitemodel.set_tensor(input_details[3]["index"], states_2)
        tflitemodel.invoke()
        upoints = tflitemodel.get_tensor(output_details[0]["index"])
        lastid = tflitemodel.get_tensor(output_details[1]["index"])
        states = tflitemodel.get_tensor(output_details[2]["index"])
        states_2 = tflitemodel.get_tensor(output_details[3]["index"])
        text = "".join([chr(u) for u in upoints])
        return text, lastid, states, states_2

    lastid = args.blank * tf.ones(shape=[], dtype=tf.int32)
    states = tf.zeros(shape=[args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)
    states_2 = tf.zeros(shape=[args.nstates, args.nstates, 1, args.statesize], dtype=tf.float32)
    transcript = ""

    with open(args.filename, 'rb') as fr:
        data, samplerate = sf.read(fr)
        data = data.astype(np.float32)
        start = 0
        end = start + args.blocksize
        while end < data.shape[0]:
            # text, lastid, states, states_2 = recognize(data[start:end], lastid, states, states_2)
            # transcript += text
            # print(transcript, flush=True)
            # start += args.blocksize
            # end = start + args.blocksize
            try:
                text, lastid, states, states_2 = recognize(data[start:end], lastid, states, states_2)
                transcript += text
                print(transcript, flush=True)
            except Exception:
                print("failed")
            finally:
                start += stride
                end = start + args.blocksize


if __name__ == "__main__":
    recognizer()
```
@iamweiweishi Can you show me the results?
@iamweiweishi Can you try with normalize_signal: True?
- k = 20: failed failed failed shui shui di shui di di yi shui di di yi yan yan shui di di yi yan yan yi shui di di yi yan yan yi wu shui di di yi yan yan yi wu bao gao shui di di yi yan yan yi wu bao gao ha shui di di yi yan yan yi wu bao gao ha er shui di di yi yan yan yi wu bao gao ha er qian shui di di yi yan yan yi wu bao gao ha er qian er failed failed failed failed
- k = 10: failed failed failed failed failed failed failed failed failed failed failed failed n failed failed n yi failed n yi you failed failed failed failed failed n yi you hao failed failed failed failed failed failed failed failed n yi you hao ji failed failed failed
- k = 5: failed ......
When I set normalize_signal: True, the prediction results are the same. It seems this parameter does not have much effect on the results.