TTS icon indicating copy to clipboard operation
TTS copied to clipboard

[Bug] xtts ft demo: empty csv files with the format_audio_list

Open dorbodwolf opened this issue 1 year ago • 0 comments

Describe the bug

I use the formatter method to process my audio files(Chinese language), but I got the csv files with no data. Because it has never met the condition of if word.word[-1] in ["!", ".", "?"]:

To Reproduce

below is my code:

datapath = "/mnt/workspace/tdy.tdy/mp3_lww"
out_path = "/mnt/workspace/tdy.tdy/mp3_lww_train"
os.makedirs(out_path, exist_ok=True)
whisper_path = "/mnt/workspace/.cache/modelscope/keepitsimple/faster-whisper-large-v3"
target_language = 'zh'
buffer=0.2
eval_percentage=0.15
speaker_name="lww"

import os
from os import path as osp
import torchaudio
from matplotlib import pyplot as plt
import torch
from faster_whisper import WhisperModel
import pandas
import gc

# Loading Whisper
device = "cuda" if torch.cuda.is_available() else "cpu" 
print("Loading Whisper Model!")
asr_model = WhisperModel(whisper_path, device=device, compute_type="float16", local_files_only=True)

def plot_waveform(waveform, sample_rate):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=1)
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle("waveform")

print("Reading audio files!")
audio_files = os.listdir(datapath)
audio_total_size = 0
metadata = {"audio_file": [], "text": [], "speaker_name": []}
for f in audio_files:
    if f.endswith('mp3'):
        audio_path = osp.join(datapath, f)
        wav, sr = torchaudio.load(audio_path)
        if wav.size(0) != 1:
            wav = torch.mean(wav, dim=0, keepdim=True)
        wav = wav.squeeze()
        audio_total_size += (wav.size(-1) / sr)
        # plot_waveform(wav, sr)
        segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
        segments = list(segments)
        i = 0
        sentence = ""
        sentence_start = None
        first_word = True
        # added all segments words in a unique list
        words_list = []
        for _, segment in enumerate(segments):
            words = list(segment.words)
            words_list.extend(words)

        # process each word
        for word_idx, word in enumerate(words_list):
            if first_word:
                sentence_start = word.start
                # If it is the first sentence, add buffer or get the begining of the file
                if word_idx == 0:
                    sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
                else:
                    # get previous sentence end
                    previous_word_end = words_list[word_idx - 1].end
                    # add buffer or get the silence midle between the previous sentence and the current one
                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)

                sentence = word.word
                first_word = False
            else:
                sentence += word.word
            
            if word.word[-1] in ["!", ".", "?"]:
                sentence = sentence[1:]
                # Expand number and abbreviations plus normalization
                sentence = multilingual_cleaners(sentence, target_language)
                audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))

                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"

                # Check for the next word's existence
                if word_idx + 1 < len(words_list):
                    next_word_start = words_list[word_idx + 1].start
                else:
                    # If don't have more words it means that it is the last sentence then use the audio len as next word start
                    next_word_start = (wav.shape[0] - 1) / sr

                # Average the current word end and next word start
                word_end = min((word.end + next_word_start) / 2, word.end + buffer)
                
                absoulte_path = os.path.join(out_path, audio_file)
                os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
                i += 1
                first_word = True

                audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
                # if the audio is too short ignore it (i.e < 0.33 seconds)
                if audio.size(-1) >= sr/3:
                    torchaudio.save(absoulte_path,
                        audio,
                        sr
                    )
                else:
                    continue
                
                metadata["audio_file"].append(audio_file)
                metadata["text"].append(sentence)
                metadata["speaker_name"].append(speaker_name)



df = pandas.DataFrame(metadata)
df = df.sample(frac=1)
num_val_samples = int(len(df)*eval_percentage)

df_eval = df[:num_val_samples]
df_train = df[num_val_samples:]

df_train = df_train.sort_values('audio_file')
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
df_train.to_csv(train_metadata_path, sep="|", index=False)

eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
df_eval = df_eval.sort_values('audio_file')
df_eval.to_csv(eval_metadata_path, sep="|", index=False)

# deallocate VRAM and RAM
del asr_model, df_train, df_eval, df, metadata
gc.collect()

print('audio total size: ', audio_total_size)

Expected behavior

there are data lines in metadata_train.csv and metadata_eval.csv

Logs

root@dsw-297768-d54489667-bcrfv:/mnt/workspace/clone_voice_sft_xtts# python process_audio_files.py 
2023-12-31 21:37:21,419 - modelscope - INFO - PyTorch version 2.1.0+cu118 Found.
2023-12-31 21:37:21,421 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2023-12-31 21:37:21,421 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2023-12-31 21:37:21,462 - modelscope - INFO - Loading done! Current index file version is 1.10.0, with md5 44f0b88effe82ceea94a98cf99709694 and a total number of 946 components indexed
Loading Whisper Model!
/mnt/workspace/.cache/modelscope/keepitsimple/faster-whisper-large-v3
Reading audio files!
> /mnt/workspace/clone_voice_sft_xtts/process_audio_files.py(82)<module>()
-> if word.word[-1] in ["!", ".", "?"]:
(Pdb) words_list
[Word(start=0.0, end=0.42, word='但', probability=0.82470703125), Word(start=0.42, end=0.68, word='小', probability=0.9951171875), Word(start=0.68, end=1.06, word='狗', probability=0.99951171875), Word(start=1.06, end=1.18, word='呢', probability=0.8623046875), Word(start=1.18, end=1.34, word='它', probability=0.4169921875), Word(start=1.34, end=1.6, word='不是', probability=0.9970703125), Word(start=1.6, end=1.9, word='关', probability=0.904296875), Word(start=1.9, end=2.2, word='节', probability=0.99853515625), Word(start=2.2, end=2.38, word='它', probability=0.91015625), Word(start=2.38, end=2.64, word='是', probability=0.99951171875), Word(start=2.64, end=3.0, word='近', probability=0.362548828125), Word(start=3.0, end=3.72, word='病', probability=0.80419921875), Word(start=3.72, end=4.08, word='骨', probability=0.99072265625), Word(start=4.08, end=4.72, word='就', probability=0.9921875), Word(start=4.72, end=4.86, word='它', probability=0.9794921875), Word(start=4.86, end=5.16, word='病', probability=0.9990234375), Word(start=5.16, end=5.44, word='骨', probability=1.0), Word(start=5.44, end=5.6, word='和', probability=0.9990234375), Word(start=5.6, end=5.72, word='它', probability=0.99755859375), Word(start=5.72, end=6.0, word='那个', probability=0.99658203125), Word(start=6.0, end=6.24, word='什么', probability=0.994140625), Word(start=6.979999999999997, end=7.5, word='骨', probability=0.99853515625), Word(start=7.5, end=7.76, word='头', probability=1.0), Word(start=7.76, end=7.92, word='的', probability=1.0), Word(start=7.92, end=8.06, word='那个', probability=0.998046875), Word(start=8.06, end=8.26, word='位', probability=1.0), Word(start=8.26, end=8.54, word='置', probability=1.0), Word(start=8.54, end=8.84, word='它', probability=0.99560546875), Word(start=8.84, end=9.1, word='是', probability=1.0), Word(start=9.1, end=9.3, word='那个', probability=1.0), Word(start=9.3, end=9.74, word='地方', probability=1.0), Word(start=9.74, end=10.12, word='没', probability=0.9990234375), Word(start=10.12, end=10.32, word='长', probability=0.998046875), Word(start=10.32, end=10.66, word='好', probability=0.99951171875), Word(start=10.66, end=11.64, word='然后', probability=0.99853515625), Word(start=11.64, end=12.28, word='长', probability=0.99951171875), Word(start=12.28, end=12.7, word='期', probability=1.0), Word(start=12.7, end=12.86, word='那么', probability=0.9892578125), Word(start=12.86, end=13.16, word='走', probability=1.0), Word(start=13.16, end=13.4, word='路', probability=1.0), Word(start=13.4, end=13.52, word='呢', probability=0.990234375), Word(start=13.52, end=13.84, word='磨', probability=0.998291015625), Word(start=13.84, end=14.16, word='损', probability=0.999755859375), Word(start=14.16, end=14.5, word='导', probability=0.99951171875), Word(start=14.5, end=14.78, word='致', probability=1.0), Word(start=14.78, end=14.94, word='的', probability=0.98876953125), Word(start=14.94, end=15.92, word='就', probability=0.98681640625), Word(start=15.92, end=16.08, word='反', probability=1.0), Word(start=16.08, end=16.26, word='正', probability=1.0), Word(start=16.26, end=16.48, word='原', probability=0.9990234375), Word(start=16.48, end=16.62, word='理', probability=0.99755859375), Word(start=16.62, end=16.74, word='应', probability=0.99951171875), Word(start=16.74, end=16.84, word='该', probability=1.0), Word(start=16.84, end=16.96, word='都是', probability=1.0), Word(start=16.96, end=17.42, word='差不多', probability=0.99951171875), Word(start=17.42, end=17.7, word='反', probability=1.0), Word(start=17.7, end=17.84, word='正', probability=1.0), Word(start=17.84, end=18.08, word='就是', probability=1.0), Word(start=18.9, end=19.42, word='用', probability=0.99951171875), Word(start=19.42, end=19.7, word='力', probability=1.0), Word(start=19.7, end=19.86, word='用', probability=0.9990234375), Word(start=19.86, end=20.2, word='不对', probability=0.9990234375), Word(start=20.2, end=20.7, word='然后', probability=0.998046875), Word(start=20.7, end=21.68, word='导', probability=0.99951171875), Word(start=21.68, end=21.92, word='致', probability=1.0), Word(start=21.92, end=22.12, word='那个', probability=0.99658203125), Word(start=22.12, end=22.46, word='膝', probability=0.983154296875), Word(start=22.46, end=22.7, word='关', probability=0.99853515625), Word(start=22.7, end=22.96, word='节', probability=0.99951171875), Word(start=22.96, end=23.86, word='的', probability=0.99560546875), Word(start=23.86, end=24.04, word='那个', probability=0.9990234375), Word(start=24.04, end=24.36, word='白', probability=1.0), Word(start=24.36, end=24.66, word='色', probability=1.0), Word(start=24.66, end=24.74, word='的', probability=0.966796875), Word(start=24.74, end=24.9, word='那个', probability=0.97119140625), Word(start=24.9, end=25.22, word='软', probability=0.999267578125), Word(start=25.22, end=25.48, word='骨', probability=0.999755859375), Word(start=25.48, end=25.68, word='啊', probability=0.962890625), Word(start=25.68, end=26.58, word='就', probability=0.99853515625), Word(start=26.58, end=27.16, word='磨', probability=0.999755859375), Word(start=27.16, end=27.42, word='损', probability=1.0), Word(start=27.42, end=27.58, word='的', probability=0.9775390625), Word(start=27.58, end=27.72, word='太', probability=0.9990234375), Word(start=27.72, end=27.92, word='严', probability=0.999755859375), Word(start=27.92, end=28.16, word='重', probability=1.0), Word(start=28.16, end=28.26, word='了', probability=0.97509765625), Word(start=28.26, end=29.26, word='然后', probability=0.99560546875), Word(start=29.26, end=29.54, word='呢', probability=1.0), Word(start=29.54, end=29.82, word='现在', probability=0.38525390625), Word(start=29.82, end=30.08, word='呢', probability=0.283203125), Word(start=30.08, end=30.92, word='它', probability=0.1630859375), Word(start=30.92, end=31.16, word='走', probability=0.9970703125), Word(start=31.16, end=31.44, word='路', probability=0.99951171875), Word(start=31.44, end=31.52, word='呢', probability=0.89697265625), Word(start=31.52, end=31.74, word='它', probability=0.9326171875), Word(start=31.74, end=31.94, word='是', probability=0.98681640625), Word(start=31.94, end=32.18, word='骨', probability=0.991943359375), Word(start=32.18, end=32.38, word='头', probability=0.9970703125), Word(start=32.38, end=32.64, word='磨', probability=0.907470703125), Word(start=32.64, end=32.74, word='着', probability=0.76904296875), Word(start=32.74, end=32.96, word='骨', probability=0.994873046875), Word(start=32.96, end=33.2, word='头', probability=0.99951171875), Word(start=33.2, end=33.48, word='所以', probability=0.96240234375), Word(start=33.48, end=33.58, word='就', probability=0.990234375), Word(start=33.58, end=33.72, word='会', probability=0.99853515625), Word(start=33.72, end=33.94, word='很', probability=0.9990234375), Word(start=33.94, end=34.26, word='疼', probability=0.994384765625), Word(start=34.26, end=34.96, word='或者', probability=0.98193359375), Word(start=34.96, end=35.2, word='是', probability=0.9990234375), Word(start=35.2, end=35.76, word='那个', probability=0.79638671875), Word(start=35.76, end=37.32, word='软', probability=0.997314453125), Word(start=37.32, end=37.58, word='骨', probability=0.9990234375), Word(start=37.58, end=37.68, word='比', probability=0.98974609375), Word(start=37.68, end=38.1, word='较', probability=1.0), Word(start=38.92, end=38.94, word='比', probability=0.4990234375), Word(start=38.94, end=39.34, word='较', probability=1.0), Word(start=39.34, end=39.64, word='薄', probability=1.0), Word(start=39.64, end=39.78, word='了', probability=0.9990234375), Word(start=39.78, end=40.22, word='所以', probability=0.99658203125), Word(start=40.22, end=40.38, word='它', probability=0.96826171875), Word(start=40.38, end=40.56, word='就', probability=0.99755859375), Word(start=40.56, end=41.24, word='不能', probability=0.998046875), Word(start=41.24, end=41.66, word='缓', probability=0.99951171875), Word(start=41.66, end=41.98, word='冲', probability=0.99853515625), Word(start=41.98, end=42.62, word='所以', probability=0.99072265625), Word(start=42.62, end=42.78, word='就', probability=0.99951171875), Word(start=42.78, end=42.9, word='比', probability=0.99951171875), Word(start=42.9, end=43.08, word='较', probability=1.0), Word(start=43.08, end=43.4, word='疼', probability=0.999755859375)]
(Pdb) len(words_list)
129
(Pdb) words_list[0]
Word(start=0.0, end=0.42, word='但', probability=0.82470703125)
(Pdb) q
Traceback (most recent call last):
  File "/mnt/workspace/clone_voice_sft_xtts/process_audio_files.py", line 82, in <module>
    sentence = sentence[1:]
  File "/mnt/workspace/clone_voice_sft_xtts/process_audio_files.py", line 82, in <module>
    sentence = sentence[1:]
  File "/opt/conda/lib/python3.10/bdb.py", line 90, in trace_dispatch
    return self.dispatch_line(frame)
  File "/opt/conda/lib/python3.10/bdb.py", line 115, in dispatch_line
    if self.quitting: raise BdbQuit
bdb.BdbQuit
^[[A

Environment

{
    "CUDA": {
        "GPU": [
            "Tesla V100-SXM2-16GB"
        ],
        "available": true,
        "version": "11.8"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "2.1.0+cu118",
        "numpy": "1.26.2"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            "ELF"
        ],
        "processor": "x86_64",
        "python": "3.10.13",
        "version": "#1 SMP Tue Jun 20 06:15:49 UTC 2023"
    }
}

Additional context

I installed TTS by this:

rm -rf TTS/ # delete repo to be able to reinstall if needed
git clone --branch xtts_demo  https://github.com/coqui-ai/TTS.git
pip install --use-deprecated=legacy-resolver  -e TTS
pip install --use-deprecated=legacy-resolver  -r TTS/TTS/demos/xtts_ft_demo/requirements.txt
pip install typing_extensions==4.8.0 numpy==1.26.2

dorbodwolf avatar Dec 31 '23 14:12 dorbodwolf