PyTorch_Speaker_Verification
Single inference
Hello guys, I'm new to this tool. I would like to know the steps to calculate the similarity between two audio files. I started writing the code below, but it still has a lot of errors. Can someone help me? Thanks...

```python
import numpy as np
import torch
import librosa

from hparam import hparam as hp
from speech_embedder_net import SpeechEmbedder, get_centroids, get_cossim
def get_utterance(utter_path):
    utter_min_len = (hp.data.tisv_frame * hp.data.hop + hp.data.window) * hp.data.sr  # lower bound of utterance length
    utter, sr = librosa.core.load(utter_path, sr=hp.data.sr)   # load utterance audio
    intervals = librosa.effects.split(utter, top_db=30)        # voice activity detection
    # this works fine for TIMIT, but if you get an array of shape 0 for any other audio, change the value of top_db
    # for the VCTK dataset use top_db=100
    utterances_spec = []
    for interval in intervals:
        if (interval[1] - interval[0]) > utter_min_len:        # if the partial utterance is sufficiently long,
            utter_part = utter[interval[0]:interval[1]]        # keep the first and last 180 frames of its spectrogram
            S = librosa.core.stft(y=utter_part, n_fft=hp.data.nfft,
                                  win_length=int(hp.data.window * sr),
                                  hop_length=int(hp.data.hop * sr))
            S = np.abs(S) ** 2
            mel_basis = librosa.filters.mel(sr=hp.data.sr, n_fft=hp.data.nfft, n_mels=hp.data.nmels)
            S = np.log10(np.dot(mel_basis, S) + 1e-6)          # log mel spectrogram of the utterance
            utterances_spec.append(S[:, :hp.data.tisv_frame])  # first 180 frames of the partial utterance
            utterances_spec.append(S[:, -hp.data.tisv_frame:]) # last 180 frames of the partial utterance
    utterances_spec = np.array(utterances_spec)
    return utterances_spec
def get_data(sample_path):
    utterance = get_utterance(sample_path)
    utterance = utterance[:, :, :160]  # TODO: implement variable-length batch size
    utterance = torch.tensor(np.transpose(utterance, axes=(0, 2, 1)))  # transpose to [batch, frames, n_mels]
    return utterance
def get_similarity(model_path, sample_a, sample_b):
    data_a = get_data(sample_a)
    data_b = get_data(sample_b)
    embedder_net = SpeechEmbedder()
    embedder_net.load_state_dict(torch.load(model_path))
    embedder_net.eval()
    enrollment_embeddings = embedder_net(data_a)
    verification_embeddings = embedder_net(data_b)
    #enrollment_embeddings = torch.reshape(enrollment_embeddings, (1, 1//2, enrollment_embeddings.size(1)))
    enrollment_centroids = get_centroids(enrollment_embeddings)
    return get_cossim(verification_embeddings, enrollment_centroids)
if __name__ == "__main__":
    file1 = "test/audio_1.wav"
    file2 = "test/audio_2.wav"
    print(get_similarity(hp.model.model_path, file1, file2))
```
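What I want in the end is a single similarity score for the two files. A simpler variant I've been considering (just a sketch, not using the repo's get_cossim; it reuses the get_data function above and assumes SpeechEmbedder returns one embedding per partial utterance, shaped [batch, emb_dim]) would be to average each file's embeddings and compare them directly with cosine similarity:

```python
import torch
import torch.nn.functional as F

from hparam import hparam as hp
from speech_embedder_net import SpeechEmbedder

def simple_similarity(model_path, sample_a, sample_b):
    # load the trained embedder once
    embedder_net = SpeechEmbedder()
    embedder_net.load_state_dict(torch.load(model_path))
    embedder_net.eval()

    with torch.no_grad():                          # no gradients needed for inference
        emb_a = embedder_net(get_data(sample_a))   # [n_partials_a, emb_dim]
        emb_b = embedder_net(get_data(sample_b))   # [n_partials_b, emb_dim]

    # average over partial utterances to get one d-vector per file
    dvec_a = emb_a.mean(dim=0)
    dvec_b = emb_b.mean(dim=0)

    # cosine similarity in [-1, 1]; closer to 1 means more likely the same speaker
    return F.cosine_similarity(dvec_a, dvec_b, dim=0).item()
```

If that is a reasonable approach, I guess the score would then be compared against a threshold (tuned on held-out data) to decide whether the two files come from the same speaker. Does that make sense, or should I stick with get_centroids/get_cossim?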