insanely-fast-whisper
insanely-fast-whisper copied to clipboard
Cuda Index Out of Bound error on GPU
Hey guys, I attempted to use insanely-fast-whisper
without the CLI, hosted on Modal. When I run it on CPU it is slower, but it runs to completion.
However, on GPU it gets stuck with the error below. Note that I only see this error on longer videos (i.e. >30 minutes). Some of the chunks work, but after a certain point most of them fail.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [3,0,0] Assertion `-sizes[i] <= index && index < sizes[i] && "index out of bounds"` failed
Any help on this?
My setup (ignore the special decorator)
class Whisper:
    """
    Whisper model class for transcribing audio on a per-segment basis.

    ``setup`` builds a Hugging Face automatic-speech-recognition pipeline and
    stores it on ``self.pipe``; ``transcribe_segment`` cuts a time range out
    of an audio file with ffmpeg and runs it through that pipeline.
    """

    @enter()
    def setup(self):
        """Set up the Whisper model for transcription.

        Loads the model and processor from ``MODEL_DIR``, moves the model to
        the GPU when one is available (float16) or keeps it on the CPU
        (float32), and stores the resulting pipeline on ``self.pipe``.
        """
        import time

        import torch
        from transformers import (
            AutoModelForSpeechSeq2Seq,
            AutoProcessor,
            pipeline,
            utils,
        )

        logger.info("🥶 Cold starting inference")
        start = time.monotonic_ns()

        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"
        device_int = 0 if use_cuda else -1
        logger.info(f"Running on {device}")
        torch_dtype = torch.float16 if use_cuda else torch.float32

        # is_flash_attn_2_available() returns a boolean, so comparing it to
        # the string "cuda" was always False and silently forced "sdpa".
        # Use FlashAttention-2 only when it is installed AND we run on CUDA.
        use_flash_attn = use_cuda and utils.is_flash_attn_2_available()
        attn_implementation = "flash_attention_2" if use_flash_attn else "sdpa"

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            MODEL_DIR,
            torch_dtype=torch_dtype,
            use_safetensors=True,
            attn_implementation=attn_implementation,
        ).to(device)
        processor = AutoProcessor.from_pretrained(MODEL_DIR)

        # The attention implementation is fixed when the model is loaded
        # above; model_kwargs on pipeline() is not passed here because the
        # model is already instantiated, so it could only contradict the
        # choice made at load time.
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            torch_dtype=torch_dtype,
            chunk_length_s=30,
            batch_size=24,
            return_timestamps=True,
            device=device_int,
        )

        duration_s = (time.monotonic_ns() - start) / 1e9
        logger.info(f"🏎️ Engine started in {duration_s:.0f}s")

    @method()
    def transcribe_segment(self, start: float, end: float, audio_filepath: Path):
        """Transcribe a specific segment of an audio file.

        Args:
            start: Segment start, passed to ffmpeg's ``atrim`` filter and
                added back onto every returned timestamp.
            end: Segment end, passed to ffmpeg's ``atrim`` filter.
            audio_filepath: Path of the full audio file to cut from.

        Returns:
            The pipeline result; each entry of ``result["chunks"]`` has its
            ``"timestamp"`` tuple shifted by ``start`` so that it refers to
            the original audio timeline. An end timestamp of ``None`` is
            kept as ``None``.
        """
        import tempfile
        import time

        import ffmpeg

        # Monotonic clock: elapsed time is unaffected by wall-clock changes.
        t0 = time.monotonic()
        with tempfile.NamedTemporaryFile(suffix=".mp3") as f:
            (
                ffmpeg.input(str(audio_filepath))
                .filter("atrim", start=start, end=end)
                .output(f.name)
                .overwrite_output()
                .run(quiet=True)
            )
            # Transcribe the processed audio segment while the temporary
            # file still exists (it is removed when the with-block exits).
            result = self.pipe(f.name)

        logger.info(
            f"Transcribed segment from {start:.2f} to {end:.2f} in {time.monotonic() - t0:.2f} seconds."
        )

        # Adjust timestamps to original audio timeline
        for segment in result["chunks"]:
            seg_start, seg_end = segment["timestamp"][0], segment["timestamp"][1]
            # Fix for error faced:
            # "Whisper did not predict an ending timestamp, which can happen
            # if audio is cut off in the middle of a word. Also make sure
            # WhisperTimeStampLogitsProcessor was used during generation."
            # In that case the end timestamp is None and must stay None.
            segment["timestamp"] = (
                seg_start + start,
                seg_end + start if seg_end is not None else None,
            )
        return result