VidChapters
VidChapters copied to clipboard
Replace with faster-whisper
I prefer using faster-whisper. If you need to run the demo with it too, here is the revised code:
import argparse
import torch
import os
import pickle
from args import get_args_parser, MODEL_DIR
import whisper
from faster_whisper import WhisperModel, decode_audio
import whisperx
from typing import TypedDict
class SingleSegment(TypedDict):
    """A single segment (up to multiple sentences) of a speech."""

    # Where the segment begins and ends in the audio
    # (presumably seconds, as reported by the ASR model -- confirm).
    start: float
    end: float
    # Transcribed text of the segment.
    text: str
# Args
parser = argparse.ArgumentParser(parents=[get_args_parser()])
args = parser.parse_args()
# NOTE(review): `device` is not used by the code visible here; kept in case
# later code relies on it (constructing it also validates args.device).
device = torch.device(args.device)

print("load Whisper model")
# NOTE(review): the ASR model is pinned to CUDA/float16 regardless of
# args.device, while the alignment steps below do honor args.device.
asr_model = WhisperModel("large-v3", device="cuda", compute_type="float16")

print("extract ASR")
# transcribe() returns a (segments, info) pair; timestamps are disabled here
# and the word-level timing is recovered by the whisperx alignment step below.
asr = asr_model.transcribe(
    args.video_example,
    without_timestamps=True,
    word_timestamps=False,
    beam_size=5,
    initial_prompt='Please! add punctuations。',
    vad_filter=True,
)
segments, info = asr

print("load align model")
align_model, metadata = whisperx.load_align_model(
    language_code=info.language,
    device=args.device,
    model_dir=MODEL_DIR,
)

print("extract audio")
audio = whisperx.load_audio(args.video_example)

print("align ASR")
# Convert the ASR segment objects into plain dicts with the same keys as
# SingleSegment, which is the form handed to whisperx.align().
the_segments = [
    {'text': segment.text, 'start': segment.start, 'end': segment.end}
    for segment in segments
]
print(the_segments[:3])

print("whisperx.......")
aligned_asr = whisperx.align(
    the_segments,
    align_model,
    metadata,
    audio,
    args.device,
    return_char_alignments=False,
)

print("saving")
# Use a context manager so the output file is flushed and closed
# deterministically instead of leaking the handle.
with open(args.asr_example, 'wb') as asr_file:
    pickle.dump(aligned_asr, asr_file)