vosk-api icon indicating copy to clipboard operation
vosk-api copied to clipboard

Vosk empty results from RTP packet with gstreamer

Open Brexard opened this issue 1 month ago • 4 comments

Hi, I am currently working with vosk and mediasoup to create live speech-to-text recognition. My web video conferencing app is working really well. For the speech recognition, I am currently getting RTP packets one by one with mediasoup. I use gstreamer to create a pipeline and to format the RTP packets for vosk. Vosk is getting some packets from my gstreamer pipeline but every result/partial result is empty. I was wondering if I was doing something wrong when formatting my RTP packets, or even whether what I am doing is possible.

I already tried multiple gstreamer configurations to make this work without success. Maybe it is preferable to manually create the gstreamer process instead of using "gstreamer-superficial" in node ?

Thanks for your help

My code from my mediasoup server :

// Create a DirectTransport so RTP packets are delivered in-process via the
// consumer's 'rtp' event instead of over the network.
this._directTransport = await this._mediasoupRouter.createDirectTransport();
const codecs = [];
// NOTE(review): find() returns undefined when no router codec matches the
// producer's kind; consume() below would then fail — consider guarding.
const routerCodec = this._mediasoupRouter.rtpCapabilities.codecs.find(
      codec => codec.kind === producer.kind
);
codecs.push(routerCodec);
// NOTE(review): in mediasoup, rtcpFeedback belongs on each codec entry, not on
// the capabilities object itself — verify against the mediasoup docs.
const rtpCapabilities = {
      codecs,
      rtcpFeedback: []
};
const directConsumer = await this._directTransport.consume({
       producerId: producer.id,
       rtpCapabilities: rtpCapabilities,
       paused: false
});
console.log(`DirectConsumer (${directConsumer.id}) is created on transport ${this._directTransport.id}`);

const model = await rtpConverter.createVoskModel();
// NOTE(review): createVoskModel() returns null on failure; Recognizer would
// then be constructed with a null model — TODO confirm a guard exists.
const recognizer = new Recognizer({model: model, sampleRate: 16000 });

const { payloadType, clockRate } = rtpParameters.codecs[0];
const pipelineInfo = rtpConverter.createGstreamerPipeline(payloadType, clockRate);
//console.log(pipelineInfo);
rtpConverter.startPipeline(pipelineInfo);

// NOTE(review): rtpConverter.processAudioWithVosk(pipelineInfo, recognizer) is
// never called in this snippet — presumably invoked elsewhere; if not, the
// appsink is never drained and no transcript can ever be produced. Verify.
directConsumer.on('rtp', (rtpPacket) => {
       //console.log(rtpPacket.dump());
       const parsedPacket = parseRtpPacket(rtpPacket);
       //console.log(parsedPacket);
       const buffer = new Uint8Array(rtpPacket.buffer);
       rtpConverter.receiveRtp(buffer, pipelineInfo);
});


My rtpConverter.js file :



const { v4: uuidv4 } = require("uuid");
const gstreamer = require("gstreamer-superficial");
const { Model } = require("vosk");

// Directory containing the Vosk model folders.
// NOTE(review): machine-specific absolute path — consider an env variable.
const modelPath = '/home/busseau/temma/dev/auxamed-media-server/lib/';

/**
 * Build a GStreamer pipeline that depayloads and decodes Opus RTP, then
 * resamples it to 16 kHz mono S16LE raw audio (the format Vosk expects).
 *
 * BUG FIX: the appsrc caps previously hard-coded `payload=96`, ignoring the
 * payloadType parameter. When mediasoup negotiates a different dynamic PT,
 * rtpopusdepay silently drops every packet, so the recognizer only ever sees
 * silence — matching the "empty results" symptom.
 *
 * @param {number} payloadType - RTP payload type negotiated by mediasoup.
 * @param {number} clockRate - RTP clock rate of the codec (48000 for Opus).
 * @returns {{pipeline: object, appsrc: object, appsink: object}} handles to
 *          the pipeline and its named appsrc/appsink elements.
 */
function createGstreamerPipeline(payloadType, clockRate) {
       console.log("Creating Gstreamer Pipeline...");
       console.log("recevied payloadType :", payloadType, "and clockRate :", clockRate);
       // Unique element names so several pipelines can coexist in one process.
       const sourceId = uuidv4();
       const sinkId = uuidv4();
       const pipelineElements = [
              // Use the actual negotiated payload type, not a constant.
              `appsrc name=${sourceId} format=time is-live=true do-timestamp=true caps="application/x-rtp,media=audio,clock-rate=${clockRate},encoding-name=OPUS,channels=2,payload=${payloadType}"`,
              `rtpopusdepay`,
              `opusparse`,
              `opusdec`,
              `audioconvert`,
              `audioresample`,
              // Caps filter: 16-bit little-endian, mono, 16 kHz raw audio.
              `audio/x-raw,format=S16LE,channels=1,rate=16000`,
              `appsink name=${sinkId}`
       ];
       const elements = pipelineElements.join(` ! `);
       const pipeline = new gstreamer.Pipeline(elements);
       const gstreamerInfo = {
              pipeline: pipeline,
              appsrc: pipeline.findChild(sourceId),
              appsink: pipeline.findChild(sinkId),
       };
       return gstreamerInfo;
}

/**
 * Set a previously-created pipeline to the PLAYING state.
 * @param {{pipeline: object}} gstreamerInfo - handles from createGstreamerPipeline.
 */
function startPipeline(gstreamerInfo) {
       const { pipeline } = gstreamerInfo;
       pipeline.play();
       console.log("pipeline started");
}

/**
 * Stop a running pipeline.
 * @param {{pipeline: object}} gstreamerInfo - handles from createGstreamerPipeline.
 */
function stopPipeline(gstreamerInfo) {
       const { pipeline } = gstreamerInfo;
       pipeline.stop();
       console.log("pipeline Stoped");
}

/**
 * Push a single RTP packet into the pipeline's appsrc element.
 * @param {Uint8Array} buffer - raw RTP packet bytes.
 * @param {{appsrc: object}} gstreamerInfo - handles from createGstreamerPipeline.
 * @returns {Promise<void>} resolves once the packet has been queued.
 */
async function receiveRtp(buffer, gstreamerInfo) {
       const { appsrc } = gstreamerInfo;
       await appsrc.push(buffer);
}

/**
 * Load the French Vosk model from disk.
 * @returns {Promise<Model|null>} the loaded model, or null when loading failed
 *          (the error is logged, not rethrown).
 */
async function createVoskModel() {
       console.log("Loading Vosk model...");
       const fullPath = modelPath + 'vosk-model-small-fr-0.22/';
       try {
              const loaded = new Model(fullPath);
              console.log("Vosk model loaded successfully");
              return loaded;
       } catch (error) {
              console.error("Error loading Vosk model:", error);
              return null;
       }
}

/**
 * Continuously pull decoded audio from the pipeline's appsink and feed it to
 * the Vosk recognizer, logging final and partial transcripts.
 *
 * BUG FIX: the original called recognizer.resultString(), which is not part of
 * the vosk Node binding (Recognizer exposes result(), partialResult() and
 * finalResult()); the resulting TypeError was swallowed by the catch block on
 * every sample, so no transcript was ever printed. It also ignored the boolean
 * returned by acceptWaveform(), which signals whether an utterance is complete
 * (true => a final result is available, false => only a partial one).
 *
 * @param {{appsink: object}} gstreamerInfo - handles from createGstreamerPipeline.
 * @param {object} recognizer - a vosk Recognizer configured for 16 kHz audio.
 */
async function processAudioWithVosk(gstreamerInfo, recognizer) {
       const appsink = gstreamerInfo.appsink;
       const idleTimeInMs = 30;

       const poll = function() {
              appsink.pull((sample) => {
                     // No sample ready yet: retry shortly.
                     if (!sample) {
                            setTimeout(poll, idleTimeInMs);
                            return;
                     }

                     const arrayBuffer = sample.buffer;
                     if (!arrayBuffer || arrayBuffer.byteLength === 0) {
                            console.error('ArrayBuffer is undefined or empty:', arrayBuffer);
                            setTimeout(poll, idleTimeInMs);
                            return;
                     }

                     // Wrap the sample bytes in a Node Buffer for Vosk.
                     // (Buffer.from always yields a Buffer; the old isBuffer
                     // check was dead code and has been removed.)
                     const buffer = Buffer.from(new Uint8Array(arrayBuffer));

                     try {
                            if (recognizer.acceptWaveform(buffer)) {
                                   // Utterance boundary detected: final result.
                                   console.log('Result =', recognizer.result());
                            } else {
                                   console.log('Partial result =', recognizer.partialResult());
                            }
                     } catch (error) {
                            console.error('Error during recognition:', error);
                     }

                     // Immediately pull the next sample.
                     poll();
              });
       };

       console.log("Starting audio processing with Vosk");
       poll();
}

// Public API of the RTP-to-Vosk conversion helper module.
module.exports = {
      createGstreamerPipeline,
      startPipeline,
      stopPipeline,
      receiveRtp,
      createVoskModel,
      processAudioWithVosk,
}


Brexard avatar May 28 '24 15:05 Brexard