vosk-api
vosk-api copied to clipboard
Vosk empty results from RTP packet with gstreamer
Hi, I am currently working with vosk and mediasoup to create a live speech to text recognition. My web video conferencing app is working really well. For the speech recongnition, I am currently getting rtp packet one by one with mediasoup. I use gstreamer to create a pipeline and to format the rtp packets for vosk. Vosk is getting some packets from my gstreamer pipeline but every results/partial results are empty. I was wondering if I was doing something wrong when formating my rtp packets or even if what I am doing is posible.
I already tried multiple gstreamer configurations to make this work without success. Maybe it is preferable to manually create the gstreamer process instead of using "gstreamer-superficial" in node ?
Thanks for your help
My code from my mediasoup server :
this._directTransport = await this._mediasoupRouter.createDirectTransport();
const codecs = [];
const routerCodec = this._mediasoupRouter.rtpCapabilities.codecs.find(
codec => codec.kind === producer.kind
);
codecs.push(routerCodec);
const rtpCapabilities = {
codecs,
rtcpFeedback: []
};
const directConsumer = await this._directTransport.consume({
producerId: producer.id,
rtpCapabilities: rtpCapabilities,
paused: false
});
console.log(`DirectConsumer (${directConsumer.id}) is created on transport ${this._directTransport.id}`);
const model = await rtpConverter.createVoskModel();
const recognizer = new Recognizer({model: model, sampleRate: 16000 });
const { payloadType, clockRate } = rtpParameters.codecs[0];
const pipelineInfo = rtpConverter.createGstreamerPipeline(payloadType, clockRate);
//console.log(pipelineInfo);
rtpConverter.startPipeline(pipelineInfo);
directConsumer.on('rtp', (rtpPacket) => {
//console.log(rtpPacket.dump());
const parsedPacket = parseRtpPacket(rtpPacket);
//console.log(parsedPacket);
const buffer = new Uint8Array(rtpPacket.buffer);
rtpConverter.receiveRtp(buffer, pipelineInfo);
});
My rtpConverter.js file :
const { v4: uuidv4 } = require("uuid");
const gstreamer = require("gstreamer-superficial");
const { Model } = require("vosk");
const modelPath = '/home/busseau/temma/dev/auxamed-media-server/lib/';
function createGstreamerPipeline(payloadType, clockRate) {
console.log("Creating Gstreamer Pipeline...");
console.log("recevied payloadType :", payloadType, "and clockRate :", clockRate);
const sourceId = uuidv4();
const sinkId = uuidv4();
const pipelineElements = [
`appsrc name=${sourceId} format=time is-live=true do-timestamp=true caps="application/x-rtp,media=audio,clock-rate=${clockRate},encoding-name=OPUS,channels=2,payload=96"`,
`rtpopusdepay`,
`opusparse`,
`opusdec`,
`audioconvert`,
`audioresample`,
`audio/x-raw,format=S16LE,channels=1,rate=16000`,
`appsink name=${sinkId}`
];
const elements = pipelineElements.join(` ! `);
const pipeline = new gstreamer.Pipeline(elements);
const gstreamerInfo = {
pipeline: pipeline,
appsrc: pipeline.findChild(sourceId),
appsink: pipeline.findChild(sinkId),
};
return gstreamerInfo;
}
function startPipeline(gstreamerInfo) {
//Start pipeline according to pipeline information
let pipeline = gstreamerInfo.pipeline;
pipeline.play();
console.log("pipeline started");
}
function stopPipeline(gstreamerInfo) {
//Stop pipeline according to pipeline information
let pipeline = gstreamerInfo.pipeline;
pipeline.stop();
console.log("pipeline Stoped");
}
async function receiveRtp(buffer, gstreamerInfo) {
//add every rtp packet to the pipeline
await gstreamerInfo.appsrc.push(buffer);
}
async function createVoskModel() {
console.log("Loading Vosk model...");
try {
const model = new Model(modelPath + 'vosk-model-small-fr-0.22/');
console.log("Vosk model loaded successfully");
return model;
} catch (error) {
console.error("Error loading Vosk model:", error);
return null;
}
}
async function processAudioWithVosk(gstreamerInfo, recognizer) {
const appsink = gstreamerInfo.appsink;
const idleTimeInMs = 30;
const poll = function() {
appsink.pull((sample) => {
if (!sample) {
setTimeout(poll, idleTimeInMs);
return;
}
const arrayBuffer = sample.buffer;
if (!arrayBuffer || arrayBuffer.byteLength === 0) {
console.error('ArrayBuffer is undefined or empty:', arrayBuffer);
setTimeout(poll, idleTimeInMs);
return;
}
const buffer = Buffer.from(new Uint8Array(arrayBuffer));
if (!Buffer.isBuffer(buffer)) {
console.error('Error: Converted buffer is not a Buffer instance');
setTimeout(poll, idleTimeInMs);
return;
}
try {
//console.log("Sample received from pipeline: ", sample);
//console.log("Buffer from sample : ", sample.buffer);
//console.log("Uint8Array Buffer sent to Vosk: ", buffer);
recognizer.acceptWaveform(buffer);
const result = recognizer.resultString();
const partialResult = recognizer.partialResult();
if (result) {
console.log('Result =', result);
}
if (partialResult) {
console.log('Partial result =', partialResult);
}
} catch (error) {
console.error('Error during recognition:', error);
}
poll();
});
};
console.log("Starting audio processing with Vosk");
poll();
}
module.exports = {
createGstreamerPipeline,
startPipeline,
stopPipeline,
receiveRtp,
createVoskModel,
processAudioWithVosk,
}