PaddleSpeech
PaddleSpeech copied to clipboard
[S2T] [ASRExecutor] list index out of range || The size of SequenceLength has to equal the batch_size
python==3.10 paddlepaddle==2.5.0 paddlespeech==1.4.1
错误信息1:
KeyError Traceback (most recent call last)
Cell In[4], line 11
4 # 读取wav
5 # 格式要求: 16k 16 bit 1 channel
6 # 音频时长 < 200s
7 # default_model = conformer_u2pp_online_wenetspeech
8 # better_model = deepspeech2online_wenetspeech
9 asr = ASRExecutor()
---> 11 result = asr(
12 audio_file='1.wav',
13 force_yes=True
14 )
16 print(asr._outputs)
18 # 给输出文本加上标点
19 # 使用 ernie_linear_p3_wudao 提升效果
File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/utils.py:328, in stats_wrapper.<locals>._warpper(self, *args, **kwargs)
326 except Exception:
327 pass
--> 328 return executor_func(self, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:512, in ASRExecutor.__call__(self, audio_file, model, lang, codeswitch, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
510 self.preprocess(model, audio_file)
511 self.infer(model)
--> 512 res = self.postprocess() # Retrieve result of asr.
514 if rtf:
515 CLI_TIMER[k]['end'].append(time.time())
File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:335, in ASRExecutor.postprocess(self)
331 def postprocess(self) -> Union[str, os.PathLike]:
332 """
333 Output postprocess and return human-readable results such as texts and audio files.
334 """
--> 335 return self._outputs["result"]
KeyError: 'result'
错误信息2:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[5], line 11
4 # 读取wav
5 # 格式要求: 16k 16 bit 1 channel
6 # 音频时长 < 200s
7 # default_model = conformer_u2pp_online_wenetspeech
8 # better_model = deepspeech2online_wenetspeech
9 asr = ASRExecutor()
---> 11 result = asr(
12 audio_file='1.wav',
13 force_yes=True,
14 rtf=True,
15 model='deepspeech2online_wenetspeech'
16 )
18 print(asr._outputs)
20 # 给输出文本加上标点
21 # 使用 ernie_linear_p3_wudao 提升效果
File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/utils.py:328, in stats_wrapper.<locals>._warpper(self, *args, **kwargs)
326 except Exception:
327 pass
--> 328 return executor_func(self, *args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:511, in ASRExecutor.__call__(self, audio_file, model, lang, codeswitch, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
508 CLI_TIMER[k]['start'].append(time.time())
510 self.preprocess(model, audio_file)
--> 511 self.infer(model)
512 res = self.postprocess() # Retrieve result of asr.
514 if rtf:
File /opt/conda/lib/python3.10/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw)
230 if not kwsyntax:
231 args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)
File /opt/conda/lib/python3.10/site-packages/paddle/fluid/dygraph/base.py:347, in _DecoratorContextManager.__call__.<locals>._decorate_function(func, *args, **kwargs)
344 @decorator.decorator
345 def _decorate_function(func, *args, **kwargs):
346 with self:
--> 347 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/paddlespeech/cli/asr/infer.py:306, in ASRExecutor.infer(self, model_type)
299 decode_batch_size = audio.shape[0]
300 self.model.decoder.init_decoder(
301 decode_batch_size, self.text_feature.vocab_list,
302 cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
303 cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
304 cfg.num_proc_bsearch)
--> 306 result_transcripts = self.model.decode(audio, audio_len)
307 self.model.decoder.del_decoder()
308 self._outputs["result"] = result_transcripts[0]
File /opt/conda/lib/python3.10/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw)
230 if not kwsyntax:
231 args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)
File /opt/conda/lib/python3.10/site-packages/paddle/fluid/dygraph/base.py:347, in _DecoratorContextManager.__call__.<locals>._decorate_function(func, *args, **kwargs)
344 @decorator.decorator
345 def _decorate_function(func, *args, **kwargs):
346 with self:
--> 347 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/paddlespeech/s2t/models/ds2/deepspeech2.py:299, in DeepSpeech2Model.decode(self, audio, audio_len)
295 @paddle.no_grad()
296 def decode(self, audio, audio_len):
297 # decoders only accept string encoded in utf-8
298 # Make sure the decoder has been initialized
--> 299 eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
300 audio, audio_len, None, None)
301 probs = self.decoder.softmax(eouts)
302 batch_size = probs.shape[0]
File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1254, in Layer.__call__(self, *inputs, **kwargs)
1245 if (
1246 (not in_declarative_mode())
1247 and (not self._forward_pre_hooks)
(...)
1251 and (not in_profiler_mode())
1252 ):
1253 self._build_once(*inputs, **kwargs)
-> 1254 return self.forward(*inputs, **kwargs)
1255 else:
1256 return self._dygraph_call_func(*inputs, **kwargs)
File /opt/conda/lib/python3.10/site-packages/paddlespeech/s2t/models/ds2/deepspeech2.py:130, in CRNNEncoder.forward(self, x, x_lens, init_state_h_box, init_state_c_box)
128 final_chunk_state_list = []
129 for i in range(0, self.num_rnn_layers):
--> 130 x, final_state = self.rnn[i](x, init_state_list[i],
131 x_lens) #[B, T, D]
132 final_chunk_state_list.append(final_state)
133 x = self.layernorm_list[i](x)
File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1254, in Layer.__call__(self, *inputs, **kwargs)
1245 if (
1246 (not in_declarative_mode())
1247 and (not self._forward_pre_hooks)
(...)
1251 and (not in_profiler_mode())
1252 ):
1253 self._build_once(*inputs, **kwargs)
-> 1254 return self.forward(*inputs, **kwargs)
1255 else:
1256 return self._dygraph_call_func(*inputs, **kwargs)
File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/rnn.py:1580, in RNNBase.forward(self, inputs, initial_states, sequence_length)
1570 initial_states = (
1571 [initial_states]
1572 if isinstance(initial_states, paddle.static.Variable)
1573 else initial_states
1574 )
1576 if self.could_use_cudnn and (
1577 not paddle.device.is_compiled_with_rocm() or sequence_length is None
1578 ):
1579 # Add CPU kernel and dispatch in backend later
-> 1580 return self._cudnn_impl(inputs, initial_states, sequence_length)
1582 states = split_states(
1583 initial_states, self.num_directions == 2, self.state_components
1584 )
1585 final_states = []
File /opt/conda/lib/python3.10/site-packages/paddle/nn/layer/rnn.py:1470, in RNNBase._cudnn_impl(self, inputs, initial_states, sequence_length)
1467 inputs = paddle.tensor.transpose(inputs, [1, 0, 2])
1469 if in_dygraph_mode():
-> 1470 out, _, state = _C_ops.rnn(
1471 inputs,
1472 initial_states,
1473 self._all_weights,
1474 sequence_length,
1475 self._dropout_state,
1476 self.dropout,
1477 self.num_directions == 2,
1478 self.input_size,
1479 self.hidden_size,
1480 self.num_layers,
1481 self.mode,
1482 0,
1483 not self.training,
1484 )
1485 elif in_dynamic_mode():
1486 _, _, out, state = _legacy_C_ops.rnn(
1487 inputs,
1488 initial_states,
(...)
1506 not self.training,
1507 )
ValueError: (InvalidArgument) The size of SequenceLength has to equal the batch_size. But received batch_size is 1 and the size of SequenceLength is 0.
[Hint: Expected in_dims[1] == seq_dims[0], but received in_dims[1]:1 != seq_dims[0]:0.] (at ../paddle/phi/infermeta/multiary.cc:2690)
直接编译dev分支就没有上述问题,还是尽快发布新版本吧
感谢您的使用,给您带来的不便我们深感抱歉。我们目前没有发版计划,如果您定位到了问题,也欢迎贡献代码,感谢您的支持与理解。
你试着使用 develop 分支编译
- python setup.py build
- python setup.py install
我也遇到过类似的问题
使用命令行paddlespeech asr --lang zh --input demo.wav
也会出现 KeyError: 'result'
我也遇到这个问题,训练完asr,测试的时候报这个错,请问有解决的办法吗