soft-vc
soft-vc copied to clipboard
Is real-time voice conversion possible?
Hi, I'm very impressed by the VC framework: it's very fast and accurate. I'm wondering whether real-time conversion is possible. I have a simple WebSocket (WS) server that receives audio, but when I push the data through soft-vc, the end result is just noise. In the code below, I also save the input stream to confirm that the audio is being received correctly (which it is). Here is a snippet of my code:
# Pick the GPU when one is available, otherwise fall back to the CPU. Every
# model below is placed on this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HuBERT-soft content encoder, fetched through torch.hub.
# Moved with .to(device) instead of a hard-coded .cuda(), which raises on a
# machine without CUDA and ignored the fallback computed above.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").to(device)

# Acoustic model: its weights live under the "acoustic-model" key of the
# checkpoint file.
acoustic_load_path = "./pretrained_models/acoustic.pt"
checkpoint = torch.load(acoustic_load_path, map_location=device)["acoustic-model"]
acoustic = AcousticModel().to(device)
acoustic.load_state_dict(checkpoint)
acoustic.eval()

# load custom vocoder
# Its weights live under checkpoint["generator"]["model"].
hifigan_load_path = "./pretrained_models/hifigan.pt"
checkpoint = torch.load(hifigan_load_path, map_location=device)["generator"]["model"]
hifigan = HifiganGenerator().to(device)
# Drop a leading "module." from the state-dict keys (done in place) so they
# match the bare generator; presumably the prefix comes from training under
# DataParallel/DistributedDataParallel -- TODO confirm.
consume_prefix_in_state_dict_if_present(checkpoint, "module.")
hifigan.load_state_dict(checkpoint)
hifigan.eval()
hifigan.remove_weight_norm()
inputs = []
outputs = []
while True:
data = None
try:
data = await websocket.recv()
except:
break
if isinstance(data, str):
print(f"string -> {data}")
continue
source = torch.from_numpy(numpy.frombuffer(
data, dtype=numpy.int16).astype('float32') / 32767)
source = source.reshape((1, -1))
source = source.unsqueeze(0).cuda()
# # Convert to the target speaker
with torch.inference_mode():
# Extract speech units
units = hubert.units(source)
# Generate target spectrogram
mel = acoustic.generate(units).transpose(1, 2)
# Generate audio waveform
target = hifigan(mel)
inputs.append(source.squeeze(0).cpu())
outputs.append(target.squeeze(0).cpu())
await ws.send(data)
# Write everything received and everything generated to WAV files so the two
# can be compared by ear. Chunks are joined along dim 1, and both files are
# written at 16 kHz.
print("saving files...")
# torch.cat raises on an empty list, so skip a file when no chunk was
# collected for it (e.g. the client disconnected before sending any audio).
if inputs:
    input_result = torch.cat(inputs, dim=1)
    torchaudio.save("inputs.wav", input_result, sample_rate=16_000)
if outputs:
    output_result = torch.cat(outputs, dim=1)
    torchaudio.save("outputs.wav", output_result, sample_rate=16_000)