vosk-api icon indicating copy to clipboard operation
vosk-api copied to clipboard

No results with node.js with WASM (no binding)

Open zavalyshyn opened this issue 2 years ago • 5 comments

Hi,

I'm working on a node.js version that relies on WASM version of vosk rather than a locally installed one (i.e., binding). I have successfully compiled a wasm version using vosk-browser project as a reference. I'm now trying to run a base test with a wav file but the recognizer's results are always empty when using a small en-us model. When using a bigger en-us model I only get "the" as a final result which is wrong.

I've tried many things and at this point I need an advise. Is there anything I'm still missing or should check? Thanks for any suggestions.

My code is as follows:

const vosk = require('./vosk');
const { Readable } = require("stream");
const wav = require("wav");
const fs = require("fs");

FILE_NAME = "2830-3980-0043.wav"
// FILE_NAME = "test.wav"
SAMPLE_RATE = 16000
// MODEL_PATH = "small-model"
MODEL_PATH = "big-model"

async function init() {
  vosk.FS.mkdir('/model');
  vosk.FS.mount(vosk.NODEFS, { root: MODEL_PATH }, '/model');
  
  vosk.SetLogLevel(2);
  
  const model = new vosk.Model('/model')
  
  const rec = new vosk.KaldiRecognizer(model, SAMPLE_RATE,);
  // rec.setMaxAlternatives(10);
  rec.SetWords(true);
  // rec.setPartialWords(true);
  
  const wfReader = new wav.Reader();
  const wfReadable = new Readable().wrap(wfReader);
  
  let bufferAddr = null // address of buffer on the heap, has to be _malloc/_free-d
  let bufferSize = null
  
  let testBuffer = [];
  
  function allocateBuffer(size) {
    if (bufferAddr !== null && bufferSize === size) {
		    return
    }
    freeBuffer()
    bufferAddr = vosk._malloc(size)
    bufferSize = size
    console.debug(`RecognizerWorker: allocated buffer of ${bufferSize} bytes`);
  }
  function freeBuffer() {
    if (bufferAddr === null) {
	    return
    }
    vosk._free(bufferAddr)
    console.debug(`RecognizerWorker: freed buffer of ${bufferSize} bytes`);
    bufferAddr = null
    bufferSize = null
  }
  
  wfReader.on('format', async ({ audioFormat, sampleRate, channels }) => {
    if (audioFormat != 1 || channels != 1) {
	      console.error("Audio file must be WAV format mono PCM.");
	      process.exit(1);
    }
    
    for await (let data of wfReadable) { // data is instanceof Uint8Array
	if (bufferAddr !== null) {
          vosk._free(bufferAddr);
          bufferAddr = null
          bufferSize = null
	}

	if (!(data instanceof Int32Array)) {
          data = new Int32Array(data);
          // throw new Error(`Channel data is not a Float32Array as expected`);
        }
	const requiredSize = data.length * data.BYTES_PER_ELEMENT
	allocateBuffer(requiredSize)
        vosk.HEAP32.set(data, bufferAddr / data.BYTES_PER_ELEMENT);

	const end_of_speech = rec.AcceptWaveform(bufferAddr, data.length);
	if (end_of_speech) {
	      console.log(JSON.parse(rec.Result()));
	} else {
	      console.log(JSON.parse(rec.PartialResult()));
	}
	freeBuffer();
    }
    console.log(JSON.parse(rec.FinalResult()));
    model.delete();
    rec.delete();
  });
  
  fs.createReadStream(FILE_NAME, {'highWaterMark': 4096}).pipe(wfReader).on('finish',
    function (err) {
      if (err) console.log(err);
  });

}

init()

And here is an output log

$ node test
LOG (VoskAPI:ReadDataFiles():src/model.cc:211) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():src/model.cc:214) Silence phones 1:2:3:4:5:11:12:13:14:15
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:CompileLooped():nnet-compile-looped.cc:345) Spent 0.127 seconds in looped compilation.
LOG (VoskAPI:ReadDataFiles():src/model.cc:238) Loading i-vector extractor from /model/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:198) Done.
LOG (VoskAPI:ReadDataFiles():src/model.cc:271) Loading HCL and G from /model/graph/HCLr.fst /model/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():src/model.cc:292) Loading winfo /model/graph/phones/word_boundary.int
VLOG[2] (VoskAPI:KaldiRecognizer_SetWords():src/bindings.cc:49) Setting words to 1
RecognizerWorker: allocated buffer of 16208 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16208 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 16384 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 16384 bytes
RecognizerWorker: allocated buffer of 7216 bytes
{ partial: '' }
RecognizerWorker: freed buffer of 7216 bytes
VLOG[2] (VoskAPI:AccStats():sausages.cc:197) L = 0
VLOG[2] (VoskAPI:MbrDecode():sausages.cc:98) Iter = 0, delta-Q = 0
VLOG[2] (VoskAPI:PrintDiagnostics():online-ivector-feature.cc:369) By the end of the utterance, objf change/frame from estimating iVector (vs. default) was 33.5486 and iVector length was 3.68048
{
  result: [ { conf: 1, end: 3.93, start: 0.27, word: 'the' } ],
  text: 'the'
}

zavalyshyn avatar Jun 15 '22 12:06 zavalyshyn

Something wrong with the data format. You need to try the same file from python. Then you need to dump the audio data as you feed into recognizer and listen/check the expected values.

nshmyrev avatar Jun 15 '22 12:06 nshmyrev

Thanks. let me try this and get back with the results. Just to be clear, what type of data KaldiRecognizer expects? Is it Int32Array or Float32Array?

zavalyshyn avatar Jun 15 '22 12:06 zavalyshyn

It depends on the bindings. There are different versions with float/short/byte inputs. See here:

https://github.com/alphacep/vosk-api/blob/master/src/vosk_api.h#L216

nshmyrev avatar Jun 15 '22 12:06 nshmyrev

Hi, It took me a while to get back to this issue. I still can't figure out why it doesn't work but now I have some more insights.

I did try the same audio file using the same model file with a python vosk version (no wasm, regular pip module) and the recognition worked properly. So the audio should be recognized.

I then tried to dump each chunk of the audio I'm sending to rec.AcceptWaveform() into a file and listen to it. It was the original audio file sans the wave header. So it receives proper audio stream and in the proper format.

One thing I've noticed was the difference in final log outputs. On my failing code (node.js + wasm) I get the following:

VLOG[2] (VoskAPI:PrintDiagnostics():online-ivector-feature.cc:369) 
By the end of the utterance, objf change/frame from 
estimating iVector (vs. default) was 15.8403 and iVector length was 3.93156

and on a working Python version I get this

VLOG[2] (VoskAPI:PrintDiagnostics():online-ivector-feature.cc:369) 
By the end of the utterance, objf change/frame from 
estimating iVector (vs. default) was 9.69913 and iVector length was 4.49303

I see a significant difference in iVector estimation. Is it something I need to dig deeper? If so where should I begin? Thanks

zavalyshyn avatar Jul 27 '22 14:07 zavalyshyn

If so where should I begin?

Print audio waveform values on cpp side and compare

nshmyrev avatar Jul 28 '22 17:07 nshmyrev

After trying out several things (including checking the audio waveform which was identical to the one from python vosk) I'm still missing something. The weird thing I don't get any error message but no recognition results either. I don't have more time and resources to dig deeper into this issues so I'll just close this issue for now. Thanks for your help and suggestions along the way.

zavalyshyn avatar Aug 11 '22 12:08 zavalyshyn