VidChapters Replace with faster-whisper

Replace with faster-whisper

Open Remosy opened this issue 10 months ago • 0 comments

I prefer using faster-whisper. If you need to run the demo with it too, here is the revised code:


import argparse
import torch
import os
import pickle
from args import get_args_parser, MODEL_DIR
import whisper
from faster_whisper import WhisperModel, decode_audio
import whisperx
from typing import TypedDict
class SingleSegment(TypedDict):
    """One contiguous piece of transcribed speech.

    A segment is not limited to a single sentence; it may span several.
    """

    # Where the segment begins (presumably seconds from the start of the
    # audio -- confirm against the ASR backend).
    start: float
    # Where the segment ends (same unit as ``start``).
    end: float
    # The transcribed text of the segment.
    text: str

# Demo script: transcribe ``args.video_example`` with faster-whisper, align the
# resulting segments with whisperx, and pickle the aligned result to
# ``args.asr_example``.

# Args: extend the project's shared parser and read the command line.
parser = argparse.ArgumentParser(parents=[get_args_parser()])
args = parser.parse_args()
device = torch.device(args.device)

print("load Whisper model")
# Load the ASR model on the same device that is used for alignment below,
# instead of always forcing CUDA. float16 is only requested on CUDA; elsewhere
# "default" lets the backend choose a compute type the device supports.
on_cuda = device.type == "cuda"
asr_model = WhisperModel(
    "large-v3",
    device=device.type,
    device_index=device.index or 0,  # torch reports None when no index is given
    compute_type="float16" if on_cuda else "default",
)

print("extract ASR")
# transcribe() returns a (segments, info) pair. NOTE(review): per the
# faster-whisper docs the segments iterable is lazy, so the decoding work
# presumably happens when it is consumed in the "align ASR" step below.
asr = asr_model.transcribe(
    args.video_example,
    without_timestamps=True,
    word_timestamps=False,
    beam_size=5,
    initial_prompt='Please！ add punctuations。',
    vad_filter=True,
)
segments, info = asr

print("load align model")
align_model, metadata = whisperx.load_align_model(
    language_code=info.language,
    device=args.device,
    model_dir=MODEL_DIR,
)

print("extract audio")
audio = whisperx.load_audio(args.video_example)

print("align ASR")
# Convert the ASR segment objects into plain dicts (text/start/end), which is
# the input passed to whisperx.align().
the_segments = [
    SingleSegment(text=segment.text, start=segment.start, end=segment.end)
    for segment in segments
]
print(the_segments[:3])

print("whisperx.......")
aligned_asr = whisperx.align(
    the_segments,
    align_model,
    metadata,
    audio,
    args.device,
    return_char_alignments=False,
)

print("saving")
# Use a context manager so the output file is flushed and closed even if
# pickling raises.
with open(args.asr_example, 'wb') as out_file:
    pickle.dump(aligned_asr, out_file)

Mar 29 '24 06:03 Remosy

VidChapters VidChapters copied to clipboard

Replace with faster-whisper

VidChapters
VidChapters copied to clipboard