whisper-cpp-python icon indicating copy to clipboard operation
whisper-cpp-python copied to clipboard

Segmentation fault

Open samhaaf opened this issue 8 months ago • 0 comments

I've got this minimal script that records 1 second of audio and then transcribes it:

import os
import numpy as np
import speech_recognition as sr
import whisper_cpp_python  # Import the whisper-cpp-python library
import tempfile
import wave
from tqdm import tqdm
import urllib.request
from time import sleep

def download_model(model_name="base"):
    """Ensure the ggml whisper model file exists locally and return its path.

    The model is stored as ``models/ggml-<model_name>.bin`` relative to the
    current working directory. If that file is already present, nothing is
    downloaded.

    Args:
        model_name: Name inserted into the ``ggml-<model_name>.bin`` file name.

    Returns:
        The relative path to the model file.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed download
        (e.g. ``urllib.error.URLError``); no partial model file is left behind.
    """
    print(f"Downloading model {model_name} if not already downloaded...")
    base_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/"
    model_path = os.path.join("models", f"ggml-{model_name}.bin")

    os.makedirs("models", exist_ok=True)

    if not os.path.isfile(model_path):
        download_url = base_url + f"ggml-{model_name}.bin"
        # Download to a side file first: an interrupted transfer must never
        # leave a truncated file at model_path, because the isfile() check
        # above would then treat it as a complete model on the next run.
        partial_path = model_path + ".part"

        with tqdm(total=100, desc="Downloading Model", unit='%') as progress_bar:
            def reporthook(block_num, block_size, total_size):
                # total_size is not positive when the server sends no length.
                if total_size > 0:
                    downloaded = block_num * block_size
                    progress_bar.n = min(100, downloaded * 100 / total_size)
                    progress_bar.refresh()

            try:
                urllib.request.urlretrieve(download_url, partial_path, reporthook)
                os.replace(partial_path, model_path)
            finally:
                # Only still present if the download or the rename failed.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            print(f"\nModel {model_name} downloaded to {model_path}.")

    return model_path

def record_audio(duration=1):
    """Record from the default microphone and return the raw sample bytes.

    Args:
        duration: Length of the recording in seconds, passed to
            ``Recognizer.record``.

    Returns:
        The recorded audio as raw bytes, requested at 16000 Hz with 2 bytes
        per sample so it matches the WAV header written by
        ``save_audio_to_file``.
    """
    # Report the duration actually requested rather than a fixed "1 second".
    unit = "second" if duration == 1 else "seconds"
    print(f"Recording audio for {duration} {unit}...")
    recognizer = sr.Recognizer()
    with sr.Microphone(sample_rate=16000) as source:
        # Spends 0.5 s on ambient-noise adjustment before recording starts.
        recognizer.adjust_for_ambient_noise(source, duration=0.5)
        audio = recognizer.record(source, duration=duration)
        print("Recording complete.")
        # Pin rate and width explicitly instead of relying on the device's
        # native sample width happening to be 16-bit.
        return audio.get_raw_data(convert_rate=16000, convert_width=2)

def save_audio_to_file(audio_data):
    """Write raw PCM bytes to a new temporary ``.wav`` file.

    The WAV header is written as mono, 2 bytes per sample, 16000 Hz. The file
    is created with ``delete=False``, so the caller is responsible for
    removing it.

    Args:
        audio_data: Raw frame bytes to store in the file.

    Returns:
        The path of the temporary WAV file.
    """
    print("Saving audio to a temporary file...")
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as wav_handle:
        writer = wave.open(wav_handle, 'wb')
        try:
            writer.setframerate(16000)  # Sample rate
            writer.setsampwidth(2)      # 16 bits per sample
            writer.setnchannels(1)      # Mono channel
            writer.writeframes(audio_data)
        finally:
            # Closing the writer finalises the WAV header.
            writer.close()
        saved_path = wav_handle.name
        print(f"Audio saved to {saved_path}")
    return saved_path

def transcribe_audio(file_path, model_path):
    """Load a whisper model, transcribe one audio file and print the text.

    Args:
        file_path: Audio file handed to ``Whisper.transcribe``.
        model_path: Model file handed to the ``Whisper`` constructor.

    Returns:
        None; the stripped ``'text'`` field of the result is printed.
    """
    print(f"Transcribing audio using model at {model_path}...")
    model = whisper_cpp_python.Whisper(model_path)
    # The result is indexed by 'text'; surrounding whitespace is dropped.
    text = model.transcribe(file_path)['text'].strip()
    print(f"Transcription: {text}")

def main():
    """Run the full flow: fetch model, record, save to WAV, transcribe.

    Any ``Exception`` raised by a step is reported on stdout rather than
    propagated. The temporary WAV file, if one was created, is removed
    whether or not the run succeeded.
    """
    # Set up front so the cleanup below can test a plain sentinel instead of
    # introspecting locals() to find out whether the save step was reached.
    audio_file_path = None
    try:
        # Step 1: Download the model
        model_path = download_model("base")
        print(f"Model path: {model_path}")

        # Step 2: Record audio
        audio_data = record_audio(duration=1)
        print(f"Audio data length: {len(audio_data)}")

        # Step 3: Save audio to a file
        audio_file_path = save_audio_to_file(audio_data)
        print(f"Audio file path: {audio_file_path}")

        # Step 4: Transcribe the audio
        transcribe_audio(audio_file_path, model_path)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Clean up
        if audio_file_path is not None and os.path.exists(audio_file_path):
            print(f"Removing temporary file: {audio_file_path}")
            os.remove(audio_file_path)

# Run the record-and-transcribe flow only when executed as a script, not on import.
if __name__ == "__main__":
    main()

And I'm getting a seg fault:

poetry run python test_whisper_cpp_python.py
Downloading model base if not already downloaded...
Model path: models/ggml-base.bin
Recording audio for 1 second...
Recording complete.
Audio data length: 30720
Saving audio to a temporary file...
Audio saved to /var/folders/rk/v7pd4lnx2c514f1qjkkb5s7c0000gn/T/tmpamg7etjh.wav
Audio file path: /var/folders/rk/v7pd4lnx2c514f1qjkkb5s7c0000gn/T/tmpamg7etjh.wav
Transcribing audio using model at models/ggml-base.bin...
whisper_init_from_file_no_state: loading model from 'models/ggml-base.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51865
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head  = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 512
whisper_model_load: n_text_head   = 8
whisper_model_load: n_text_layer  = 6
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 2
whisper_model_load: mem required  =  310.00 MB (+    6.00 MB per decoder)
whisper_model_load: adding 1608 extra tokens
whisper_model_load: model ctx     =  140.66 MB
whisper_model_load: model size    =  140.54 MB
whisper_init_state: kv self size  =    5.25 MB
whisper_init_state: kv cross size =   17.58 MB
zsh: segmentation fault  poetry run python test_whisper_cpp_python.py

I'm on macOS (M1), and I installed whisper roughly like this:

brew install ffmpeg

git clone git@github.com:ggerganov/whisper.cpp.git
cd whisper.cpp || exit 1

# Create a virtual environment if it doesn't exist
if [ ! -d "venv" ]; then
  python --version 3.10 --no-default -m venv venv
  source venv/bin/activate
fi

WHISPER_COREML=1 make -j

chmod +x ./models/generate-coreml-model.sh

add-to-path /Applications/Xcode.app/Contents/Developer/usr/bin

mkdir build
cd build
cmake ..
make

add-to-profile WHISPER_CPP_LIB "$WHISPER_PATH/build/libwhisper.dylib"

Any ideas on next steps, or additional info I should share?

P.S. I was hitting a different issue before, which I resolved by switching to a cmake build and pointing WHISPER_CPP_LIB at the resulting ".dylib". That issue is now gone, but the change made running whisper the classic way a little weird. I haven't yet gone through a debug cycle to check whether direct execution still works, but I can do that if you suspect it is the cause.

Thanks for the project 🥳

samhaaf avatar Jun 20 '24 03:06 samhaaf