echomimic
echomimic copied to clipboard
关于在您的推理代码中音频处理的一个问题
def audio2feat(self, audio_path):
    """Build one feature array from the encoder embeddings of every segment.

    ``self.model.transcribe`` is run on ``audio_path`` and each entry of
    ``result['segments']`` contributes a slice of its
    ``'encoder_embeddings'`` array; the slices are joined along axis 0.

    Args:
        audio_path: Passed unchanged to ``self.model.transcribe``.

    Returns:
        The ``np.concatenate(..., axis=0)`` of the per-segment slices.

    Raises:
        ValueError: Raised by ``np.concatenate`` when there are no segments.
    """
    # Run the model over the audio; only the 'segments' entry is used below.
    transcription = self.model.transcribe(audio_path)

    def _leading_frames(segment):
        # Swap axes 1 and 2 of the 4-D embedding, then drop the leading
        # axis (squeeze(0) requires it to have length 1).
        features = segment['encoder_embeddings'].transpose(0, 2, 1, 3).squeeze(0)
        first = int(segment['start'])
        last = int(segment['end'])
        # ? Why is the slice cut at half of the segment span?
        # NOTE(review): possibly the embeddings hold one row per two
        # start/end index units -- TODO confirm against the model code.
        keep = int((last - first) / 2)
        return features[:keep]

    pieces = [_leading_frames(segment) for segment in transcription['segments']]
    return np.concatenate(pieces, axis=0)
问题在上面的注释中。在训练时，您对音频的处理也是这样的吗？