
faster-whisper and distil-whisper implementations

sujitvasanth opened this issue 1 year ago · 4 comments

Hi @Uberi, I wrote some extensions to your API for faster-whisper and distil-whisper that just need to be added to the __init__.py file to work; they will load the models automatically.

def recognize_whisper(self, audio_data, model="base.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    # existing recognizer, shown for the signature the new ones follow
    ...
    return result["text"]
def recognize_fasterwhisper(self, audio_data, model="small", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    # custom recognizer for faster-whisper
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    import io

    import numpy as np
    import soundfile as sf
    from faster_whisper import WhisperModel

    # load the requested model once and cache it on the recognizer instance
    if load_options or not hasattr(self, "whisper_model") or self.whisper_model.get(model) is None:
        self.whisper_model = getattr(self, "whisper_model", {})
        # self.whisper_model[model] = WhisperModel(model, device="cpu", compute_type="int8")
        self.whisper_model[model] = WhisperModel(model, device="cuda", compute_type="auto")

    # convert the captured audio to the 16 kHz float32 array faster-whisper expects
    wav_bytes = audio_data.get_wav_data(convert_rate=16000)
    wav_stream = io.BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    audio_array = audio_array.astype(np.float32)

    # forward the language/translate arguments instead of silently ignoring them
    segments, info = self.whisper_model[model].transcribe(
        audio_array,
        beam_size=5,
        language=language,
        task="translate" if translate else "transcribe",
        **transcribe_options,
    )
    segments = list(segments)  # transcribe() returns a generator; realize it once
    text = " ".join(segment.text.strip() for segment in segments)

    if show_dict:
        # assemble a result dict analogous to recognize_whisper's output
        return {"text": text, "segments": segments, "language": info.language}
    else:
        return text.lower()
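For anyone who wants to try it before it lands in the library, here is a rough usage sketch: it assumes the function above is defined (or imported) in your script and simply attaches it to the Recognizer class at runtime; test.wav is just a placeholder file name.

import speech_recognition as sr

# attach the custom recognizer to the Recognizer class at runtime
sr.Recognizer.recognize_fasterwhisper = recognize_fasterwhisper

r = sr.Recognizer()
with sr.AudioFile("test.wav") as source:  # placeholder path
    audio = r.record(source)
print(r.recognize_fasterwhisper(audio, model="small"))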

def recognize_distilwhisper(self, audio_data, model="distil-whisper/distil-small.en", show_dict=False, load_options=None, language=None, translate=False, **transcribe_options):
    # custom recognizer for distil-whisper
    assert isinstance(audio_data, AudioData), "Data must be audio data"
    import io

    import numpy as np
    import soundfile as sf
    import torch
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # use the requested checkpoint rather than shadowing the model argument
    model_id = model
    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
    asr_model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    whisper = pipeline(
        "automatic-speech-recognition",
        model=asr_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )

    # convert to 16 kHz float32; the feature extractor expects float32 input
    wav_bytes = audio_data.get_wav_data(convert_rate=16000)
    wav_stream = io.BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    audio_array = audio_array.astype(np.float32)

    result = whisper(audio_array,
                     chunk_length_s=50,
                     stride_length_s=10,
                     batch_size=8)
    if show_dict:
        return result
    else:
        return result["text"]
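One more minor alteration worth folding in before merging: unlike recognize_fasterwhisper above, this version rebuilds the model and pipeline on every call. A small cache in the same style would avoid that. Sketch only; the distil_pipelines attribute name is my own suggestion.

# sketch: cache one pipeline per checkpoint, mirroring the whisper_model
# dict used by recognize_fasterwhisper above
self.distil_pipelines = getattr(self, "distil_pipelines", {})
if model_id not in self.distil_pipelines:
    # ...build asr_model, processor, and the pipeline exactly as above...
    self.distil_pipelines[model_id] = whisper
whisper = self.distil_pipelines[model_id]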

It would be great if you would consider adding these to your API. They just need some minor alterations to let users choose the model and whether to run on GPU or CPU; see the sketch below.
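For example (sketch only; the device and compute_type parameter names are my suggestion, passed straight through to WhisperModel), the hard-coded "cuda" line could become:

# sketch: let callers pick the device/precision instead of hard-coding "cuda"
def recognize_fasterwhisper(self, audio_data, model="small", show_dict=False,
                            device="auto", compute_type="auto",
                            language=None, translate=False, **transcribe_options):
    ...
    # WhisperModel accepts device="auto"/"cpu"/"cuda" and picks the compute type itself
    self.whisper_model[model] = WhisperModel(model, device=device, compute_type=compute_type)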

New dependencies required:

pip3 install faster-whisper
pip3 install transformers optimum accelerate

sujitvasanth · Jan 14 '24

Great idea, nice work!

MiskaWasTaken · Feb 22 '24

@MiskaWasTaken please test the code out and let me know what you think.

sujitvasanth · Feb 25 '24

I might have to copy you and add support for WhisperX.

Masame · Jun 05 '24

This is good work. I am using faster-whisper to transcribe anyway; trying it out tomorrow.

Genesis1231 · Jun 14 '24