TTS
TTS copied to clipboard
[Bug] xtts ft demo: empty csv files with the format_audio_list
Describe the bug
I use the formatter method to process my audio files(Chinese language), but I got the csv files with no data. Because it has never met the condition of if word.word[-1] in ["!", ".", "?"]:
To Reproduce
below is my code:
datapath = "/mnt/workspace/tdy.tdy/mp3_lww"
out_path = "/mnt/workspace/tdy.tdy/mp3_lww_train"
os.makedirs(out_path, exist_ok=True)
whisper_path = "/mnt/workspace/.cache/modelscope/keepitsimple/faster-whisper-large-v3"
target_language = 'zh'
buffer=0.2
eval_percentage=0.15
speaker_name="lww"
import os
from os import path as osp
import torchaudio
from matplotlib import pyplot as plt
import torch
from faster_whisper import WhisperModel
import pandas
import gc
# Loading Whisper
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading Whisper Model!")
asr_model = WhisperModel(whisper_path, device=device, compute_type="float16", local_files_only=True)
def plot_waveform(waveform, sample_rate):
waveform = waveform.numpy()
num_channels, num_frames = waveform.shape
time_axis = torch.arange(0, num_frames) / sample_rate
figure, axes = plt.subplots(num_channels, 1)
if num_channels == 1:
axes = [axes]
for c in range(num_channels):
axes[c].plot(time_axis, waveform[c], linewidth=1)
axes[c].grid(True)
if num_channels > 1:
axes[c].set_ylabel(f"Channel {c+1}")
figure.suptitle("waveform")
print("Reading audio files!")
audio_files = os.listdir(datapath)
audio_total_size = 0
metadata = {"audio_file": [], "text": [], "speaker_name": []}
for f in audio_files:
if f.endswith('mp3'):
audio_path = osp.join(datapath, f)
wav, sr = torchaudio.load(audio_path)
if wav.size(0) != 1:
wav = torch.mean(wav, dim=0, keepdim=True)
wav = wav.squeeze()
audio_total_size += (wav.size(-1) / sr)
# plot_waveform(wav, sr)
segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
segments = list(segments)
i = 0
sentence = ""
sentence_start = None
first_word = True
# added all segments words in a unique list
words_list = []
for _, segment in enumerate(segments):
words = list(segment.words)
words_list.extend(words)
# process each word
for word_idx, word in enumerate(words_list):
if first_word:
sentence_start = word.start
# If it is the first sentence, add buffer or get the begining of the file
if word_idx == 0:
sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
else:
# get previous sentence end
previous_word_end = words_list[word_idx - 1].end
# add buffer or get the silence midle between the previous sentence and the current one
sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
sentence = word.word
first_word = False
else:
sentence += word.word
if word.word[-1] in ["!", ".", "?"]:
sentence = sentence[1:]
# Expand number and abbreviations plus normalization
sentence = multilingual_cleaners(sentence, target_language)
audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
# Check for the next word's existence
if word_idx + 1 < len(words_list):
next_word_start = words_list[word_idx + 1].start
else:
# If don't have more words it means that it is the last sentence then use the audio len as next word start
next_word_start = (wav.shape[0] - 1) / sr
# Average the current word end and next word start
word_end = min((word.end + next_word_start) / 2, word.end + buffer)
absoulte_path = os.path.join(out_path, audio_file)
os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
i += 1
first_word = True
audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
# if the audio is too short ignore it (i.e < 0.33 seconds)
if audio.size(-1) >= sr/3:
torchaudio.save(absoulte_path,
audio,
sr
)
else:
continue
metadata["audio_file"].append(audio_file)
metadata["text"].append(sentence)
metadata["speaker_name"].append(speaker_name)
df = pandas.DataFrame(metadata)
df = df.sample(frac=1)
num_val_samples = int(len(df)*eval_percentage)
df_eval = df[:num_val_samples]
df_train = df[num_val_samples:]
df_train = df_train.sort_values('audio_file')
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
df_train.to_csv(train_metadata_path, sep="|", index=False)
eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
df_eval = df_eval.sort_values('audio_file')
df_eval.to_csv(eval_metadata_path, sep="|", index=False)
# deallocate VRAM and RAM
del asr_model, df_train, df_eval, df, metadata
gc.collect()
print('audio total size: ', audio_total_size)
Expected behavior
there are data lines in metadata_train.csv and metadata_eval.csv
Logs
root@dsw-297768-d54489667-bcrfv:/mnt/workspace/clone_voice_sft_xtts# python process_audio_files.py
2023-12-31 21:37:21,419 - modelscope - INFO - PyTorch version 2.1.0+cu118 Found.
2023-12-31 21:37:21,421 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2023-12-31 21:37:21,421 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2023-12-31 21:37:21,462 - modelscope - INFO - Loading done! Current index file version is 1.10.0, with md5 44f0b88effe82ceea94a98cf99709694 and a total number of 946 components indexed
Loading Whisper Model!
/mnt/workspace/.cache/modelscope/keepitsimple/faster-whisper-large-v3
Reading audio files!
> /mnt/workspace/clone_voice_sft_xtts/process_audio_files.py(82)<module>()
-> if word.word[-1] in ["!", ".", "?"]:
(Pdb) words_list
[Word(start=0.0, end=0.42, word='但', probability=0.82470703125), Word(start=0.42, end=0.68, word='小', probability=0.9951171875), Word(start=0.68, end=1.06, word='狗', probability=0.99951171875), Word(start=1.06, end=1.18, word='呢', probability=0.8623046875), Word(start=1.18, end=1.34, word='它', probability=0.4169921875), Word(start=1.34, end=1.6, word='不是', probability=0.9970703125), Word(start=1.6, end=1.9, word='关', probability=0.904296875), Word(start=1.9, end=2.2, word='节', probability=0.99853515625), Word(start=2.2, end=2.38, word='它', probability=0.91015625), Word(start=2.38, end=2.64, word='是', probability=0.99951171875), Word(start=2.64, end=3.0, word='近', probability=0.362548828125), Word(start=3.0, end=3.72, word='病', probability=0.80419921875), Word(start=3.72, end=4.08, word='骨', probability=0.99072265625), Word(start=4.08, end=4.72, word='就', probability=0.9921875), Word(start=4.72, end=4.86, word='它', probability=0.9794921875), Word(start=4.86, end=5.16, word='病', probability=0.9990234375), Word(start=5.16, end=5.44, word='骨', probability=1.0), Word(start=5.44, end=5.6, word='和', probability=0.9990234375), Word(start=5.6, end=5.72, word='它', probability=0.99755859375), Word(start=5.72, end=6.0, word='那个', probability=0.99658203125), Word(start=6.0, end=6.24, word='什么', probability=0.994140625), Word(start=6.979999999999997, end=7.5, word='骨', probability=0.99853515625), Word(start=7.5, end=7.76, word='头', probability=1.0), Word(start=7.76, end=7.92, word='的', probability=1.0), Word(start=7.92, end=8.06, word='那个', probability=0.998046875), Word(start=8.06, end=8.26, word='位', probability=1.0), Word(start=8.26, end=8.54, word='置', probability=1.0), Word(start=8.54, end=8.84, word='它', probability=0.99560546875), Word(start=8.84, end=9.1, word='是', probability=1.0), Word(start=9.1, end=9.3, word='那个', probability=1.0), Word(start=9.3, end=9.74, word='地方', probability=1.0), Word(start=9.74, end=10.12, word='没', probability=0.9990234375), Word(start=10.12, end=10.32, word='长', probability=0.998046875), Word(start=10.32, end=10.66, word='好', probability=0.99951171875), Word(start=10.66, end=11.64, word='然后', probability=0.99853515625), Word(start=11.64, end=12.28, word='长', probability=0.99951171875), Word(start=12.28, end=12.7, word='期', probability=1.0), Word(start=12.7, end=12.86, word='那么', probability=0.9892578125), Word(start=12.86, end=13.16, word='走', probability=1.0), Word(start=13.16, end=13.4, word='路', probability=1.0), Word(start=13.4, end=13.52, word='呢', probability=0.990234375), Word(start=13.52, end=13.84, word='磨', probability=0.998291015625), Word(start=13.84, end=14.16, word='损', probability=0.999755859375), Word(start=14.16, end=14.5, word='导', probability=0.99951171875), Word(start=14.5, end=14.78, word='致', probability=1.0), Word(start=14.78, end=14.94, word='的', probability=0.98876953125), Word(start=14.94, end=15.92, word='就', probability=0.98681640625), Word(start=15.92, end=16.08, word='反', probability=1.0), Word(start=16.08, end=16.26, word='正', probability=1.0), Word(start=16.26, end=16.48, word='原', probability=0.9990234375), Word(start=16.48, end=16.62, word='理', probability=0.99755859375), Word(start=16.62, end=16.74, word='应', probability=0.99951171875), Word(start=16.74, end=16.84, word='该', probability=1.0), Word(start=16.84, end=16.96, word='都是', probability=1.0), Word(start=16.96, end=17.42, word='差不多', probability=0.99951171875), Word(start=17.42, end=17.7, word='反', probability=1.0), Word(start=17.7, end=17.84, word='正', probability=1.0), Word(start=17.84, end=18.08, word='就是', probability=1.0), Word(start=18.9, end=19.42, word='用', probability=0.99951171875), Word(start=19.42, end=19.7, word='力', probability=1.0), Word(start=19.7, end=19.86, word='用', probability=0.9990234375), Word(start=19.86, end=20.2, word='不对', probability=0.9990234375), Word(start=20.2, end=20.7, word='然后', probability=0.998046875), Word(start=20.7, end=21.68, word='导', probability=0.99951171875), Word(start=21.68, end=21.92, word='致', probability=1.0), Word(start=21.92, end=22.12, word='那个', probability=0.99658203125), Word(start=22.12, end=22.46, word='膝', probability=0.983154296875), Word(start=22.46, end=22.7, word='关', probability=0.99853515625), Word(start=22.7, end=22.96, word='节', probability=0.99951171875), Word(start=22.96, end=23.86, word='的', probability=0.99560546875), Word(start=23.86, end=24.04, word='那个', probability=0.9990234375), Word(start=24.04, end=24.36, word='白', probability=1.0), Word(start=24.36, end=24.66, word='色', probability=1.0), Word(start=24.66, end=24.74, word='的', probability=0.966796875), Word(start=24.74, end=24.9, word='那个', probability=0.97119140625), Word(start=24.9, end=25.22, word='软', probability=0.999267578125), Word(start=25.22, end=25.48, word='骨', probability=0.999755859375), Word(start=25.48, end=25.68, word='啊', probability=0.962890625), Word(start=25.68, end=26.58, word='就', probability=0.99853515625), Word(start=26.58, end=27.16, word='磨', probability=0.999755859375), Word(start=27.16, end=27.42, word='损', probability=1.0), Word(start=27.42, end=27.58, word='的', probability=0.9775390625), Word(start=27.58, end=27.72, word='太', probability=0.9990234375), Word(start=27.72, end=27.92, word='严', probability=0.999755859375), Word(start=27.92, end=28.16, word='重', probability=1.0), Word(start=28.16, end=28.26, word='了', probability=0.97509765625), Word(start=28.26, end=29.26, word='然后', probability=0.99560546875), Word(start=29.26, end=29.54, word='呢', probability=1.0), Word(start=29.54, end=29.82, word='现在', probability=0.38525390625), Word(start=29.82, end=30.08, word='呢', probability=0.283203125), Word(start=30.08, end=30.92, word='它', probability=0.1630859375), Word(start=30.92, end=31.16, word='走', probability=0.9970703125), Word(start=31.16, end=31.44, word='路', probability=0.99951171875), Word(start=31.44, end=31.52, word='呢', probability=0.89697265625), Word(start=31.52, end=31.74, word='它', probability=0.9326171875), Word(start=31.74, end=31.94, word='是', probability=0.98681640625), Word(start=31.94, end=32.18, word='骨', probability=0.991943359375), Word(start=32.18, end=32.38, word='头', probability=0.9970703125), Word(start=32.38, end=32.64, word='磨', probability=0.907470703125), Word(start=32.64, end=32.74, word='着', probability=0.76904296875), Word(start=32.74, end=32.96, word='骨', probability=0.994873046875), Word(start=32.96, end=33.2, word='头', probability=0.99951171875), Word(start=33.2, end=33.48, word='所以', probability=0.96240234375), Word(start=33.48, end=33.58, word='就', probability=0.990234375), Word(start=33.58, end=33.72, word='会', probability=0.99853515625), Word(start=33.72, end=33.94, word='很', probability=0.9990234375), Word(start=33.94, end=34.26, word='疼', probability=0.994384765625), Word(start=34.26, end=34.96, word='或者', probability=0.98193359375), Word(start=34.96, end=35.2, word='是', probability=0.9990234375), Word(start=35.2, end=35.76, word='那个', probability=0.79638671875), Word(start=35.76, end=37.32, word='软', probability=0.997314453125), Word(start=37.32, end=37.58, word='骨', probability=0.9990234375), Word(start=37.58, end=37.68, word='比', probability=0.98974609375), Word(start=37.68, end=38.1, word='较', probability=1.0), Word(start=38.92, end=38.94, word='比', probability=0.4990234375), Word(start=38.94, end=39.34, word='较', probability=1.0), Word(start=39.34, end=39.64, word='薄', probability=1.0), Word(start=39.64, end=39.78, word='了', probability=0.9990234375), Word(start=39.78, end=40.22, word='所以', probability=0.99658203125), Word(start=40.22, end=40.38, word='它', probability=0.96826171875), Word(start=40.38, end=40.56, word='就', probability=0.99755859375), Word(start=40.56, end=41.24, word='不能', probability=0.998046875), Word(start=41.24, end=41.66, word='缓', probability=0.99951171875), Word(start=41.66, end=41.98, word='冲', probability=0.99853515625), Word(start=41.98, end=42.62, word='所以', probability=0.99072265625), Word(start=42.62, end=42.78, word='就', probability=0.99951171875), Word(start=42.78, end=42.9, word='比', probability=0.99951171875), Word(start=42.9, end=43.08, word='较', probability=1.0), Word(start=43.08, end=43.4, word='疼', probability=0.999755859375)]
(Pdb) len(words_list)
129
(Pdb) words_list[0]
Word(start=0.0, end=0.42, word='但', probability=0.82470703125)
(Pdb) q
Traceback (most recent call last):
File "/mnt/workspace/clone_voice_sft_xtts/process_audio_files.py", line 82, in <module>
sentence = sentence[1:]
File "/mnt/workspace/clone_voice_sft_xtts/process_audio_files.py", line 82, in <module>
sentence = sentence[1:]
File "/opt/conda/lib/python3.10/bdb.py", line 90, in trace_dispatch
return self.dispatch_line(frame)
File "/opt/conda/lib/python3.10/bdb.py", line 115, in dispatch_line
if self.quitting: raise BdbQuit
bdb.BdbQuit
^[[A
Environment
{
"CUDA": {
"GPU": [
"Tesla V100-SXM2-16GB"
],
"available": true,
"version": "11.8"
},
"Packages": {
"PyTorch_debug": false,
"PyTorch_version": "2.1.0+cu118",
"numpy": "1.26.2"
},
"System": {
"OS": "Linux",
"architecture": [
"64bit",
"ELF"
],
"processor": "x86_64",
"python": "3.10.13",
"version": "#1 SMP Tue Jun 20 06:15:49 UTC 2023"
}
}
Additional context
I installed TTS by this:
rm -rf TTS/ # delete repo to be able to reinstall if needed
git clone --branch xtts_demo https://github.com/coqui-ai/TTS.git
pip install --use-deprecated=legacy-resolver -e TTS
pip install --use-deprecated=legacy-resolver -r TTS/TTS/demos/xtts_ft_demo/requirements.txt
pip install typing_extensions==4.8.0 numpy==1.26.2