google-cloud-node
Speech v2 is not a drop-in replacement for v1, and it is undocumented
Environment details
- OS: Windows 10
- Node.js version: 18.13
- npm version: 8.19.3
- google-cloud-node version: 5.3.0
Steps to reproduce
Running the following program (either with or without `doStream`) fails with errors:
```js
const speech = require('@google-cloud/speech');
const fs = require('fs');

const protos = speech.protos.google.cloud.speech;

const client = new speech.v2.SpeechClient({
  credentials: {
    client_email: 'MY_MAIL',
    private_key: 'MY_KEY',
  },
  projectId: 'MY_PROJECT',
});

const doStream = true;

(async () => {
  try {
    if (doStream) {
      const file = fs.createReadStream('test.wav');
      const recognizer = await client.streamingRecognize({
        config: {
          encoding: protos.v1.RecognitionConfig.AudioEncoding.MULAW,
          languageCode: 'en-US',
          sampleRateHertz: 16000,
          model: 'phone_call',
        },
        interimResults: true,
      });
      recognizer.on('error', console.error);
      recognizer.on('data', (data) =>
        console.log('data', data.results[0].alternatives[0].transcript));
      file.pipe(recognizer);
    } else {
      const [response] = await client.recognize({
        config: {
          encoding: protos.v1.RecognitionConfig.AudioEncoding.MULAW,
          languageCode: 'en-US',
          sampleRateHertz: 16000,
          model: 'phone_call',
        },
        audio: { content: fs.readFileSync('test.wav') },
        interimResults: true,
      });
      const transcription = response.results
        .map(result => result.alternatives[0].transcript)
        .join('\n');
      console.log(`Transcription: ${transcription}`);
    }
  } catch (err) {
    console.error(err);
  }
})();
```
Error:
```
Error: 3 INVALID_ARGUMENT: Invalid resource field value in the request.
    at callErrorFromStatus (F:\Projects\jstest\node_modules\@grpc\grpc-js\build\src\call.js:31:19)
    ...
for call at
    at ServiceClientImpl.makeUnaryRequest (F:\Projects\jstest\node_modules\@grpc\grpc-js\build\src\client.js:160:34)
    ...
reason: 'RESOURCE_PROJECT_INVALID',
```
Changing v2 to v1 works as expected.
I've noticed that there's a helper function in `helpers.js` which translates the request. I'm not sure if or how it should be adapted for v2.
I couldn't find any documentation or a running example of how to correctly use v2.
@orgads Please refer to the client library reference page here: https://cloud.google.com/nodejs/docs/reference/speech/latest/speech/v2.speechclient-class
There are also some generated code samples here that might help you get started: https://github.com/googleapis/google-cloud-node/tree/main/packages/google-cloud-speech/samples/generated/v2
@dizcology Thank you very much!
I'll test it next week. Closing this issue.
@dizcology I tried the `streamingRecognize` sample and couldn't get it to work. The documentation is also unclear. What is `SpeechClient` and what is `Recognizer`? What's the relation between them? What is the right way to create and use a `Recognizer`?
This is my code:
```js
import { v2, protos } from '@google-cloud/speech';
import * as fs from 'fs';

const speech = protos.google.cloud.speech.v2;

function main() {
  const audio = fs.createReadStream('test.wav');
  const speechClient = new v2.SpeechClient({
    credentials: {
      client_email: '[email protected]',
      private_key: '***&&&',
    },
    projectId: 'project',
  });
  // As discovered later in this thread, this request is missing the
  // required `recognizer` resource name, which is what triggers the error.
  const request = {
    config: {
      explicitDecodingConfig: {
        encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
        sampleRateHertz: 16000,
      },
    },
  };
  const stream = speechClient.streamingRecognize();
  stream.on('data', (response) => { console.log(response); });
  stream.on('error', (err) => { throw err; });
  stream.on('end', () => { });
  stream.write(request);
  audio.pipe(stream);
  audio.on('end', () => stream.end());
}

process.on('unhandledRejection', err => {
  console.error(err);
  process.exitCode = 1;
});

main();
```
Result:
```
Uncaught Error Error: 3 INVALID_ARGUMENT: Invalid resource field value in the request.
```
cc @yoshigev
Ok, we did it!
If someone gets here, this is the complete example that works for me. First, a streaming helper, adapted for v2 from v1's `helpers.js`:
```js
import { v2, protos } from '@google-cloud/speech';
import * as fs from 'fs';
import * as stream from 'stream';
import throttle from 'throttle';
import streamEvents from 'stream-events';
import pumpify from 'pumpify';
import common from '@google-cloud/common';

const speech = protos.google.cloud.speech.v2;

// v2 port of the request translation done by v1's helpers.js.
export function streamingRecognize(client, streamingConfig, options = undefined) {
  options = options || {};
  streamingConfig = streamingConfig || {};
  const recognizeStream = streamEvents(new pumpify.obj());
  const requestStream = client
    ._streamingRecognize(options)
    .on('error', (err) => { recognizeStream.destroy(err); })
    .on('response', (response) => { recognizeStream.emit('response', response); });
  // Defer until the first write so the config message is always sent first.
  recognizeStream.once('writing', () => {
    requestStream.write(streamingConfig);
    recognizeStream.setPipeline([
      // Wrap each raw audio chunk as an `{ audio }` request message.
      new stream.PassThrough({
        objectMode: true,
        transform: (audio, _, next) => {
          if (audio !== undefined) {
            next(undefined, { audio });
            return;
          }
          next();
        },
      }),
      requestStream,
      // Surface in-band errors from responses as ApiErrors.
      new stream.PassThrough({
        objectMode: true,
        transform: (response, enc, next) => {
          if (response.error) {
            next(new common.util.ApiError(response.error));
            return;
          }
          next(undefined, response);
        },
      }),
    ]);
  });
  return recognizeStream;
}
```
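The helper mirrors the request translation that v1's `helpers.js` performs: the first message written on the gRPC stream carries the streaming config, every subsequent chunk is wrapped as an `{ audio }` message, and responses that carry an in-band `error` field are re-emitted as `ApiError`s. The other missing piece was the `Recognizer` resource, which has to exist before you can stream against it: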
```js
// Returns the full resource name of the `test-rec` recognizer,
// creating it first if it does not exist yet.
async function getRecognizer(speechClient, projectId) {
  const name = speechClient.recognizerPath(projectId, 'global', 'test-rec');
  try {
    const [existing] = await speechClient.getRecognizer({ name });
    return existing.name;
  } catch (err) {
    if (err.code !== 5) throw err; // 5 = gRPC NOT_FOUND
  }
  // createRecognizer returns a long-running operation; wait for it to finish.
  const [operation] = await speechClient.createRecognizer({
    recognizer: {
      languageCodes: ['en-US'],
      model: 'telephony',
    },
    recognizerId: 'test-rec',
    parent: speechClient.locationPath(projectId, 'global'),
  });
  return (await operation.promise())[0].name;
}
```
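`createRecognizer` starts a long-running operation, so the code waits for it to complete before reading the recognizer's full resource name. Finally, the main program ties it together; the `throttle` stream paces the file at 16000 bytes per second, which is real time for 8-bit μ-law audio at 16 kHz: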
```js
async function main() {
  // Pace the file read so the audio arrives roughly in real time.
  const audio = fs.createReadStream('test.wav').pipe(new throttle(16000));
  const projectId = 'project';
  const speechClient = new v2.SpeechClient({
    credentials: {
      client_email: '[email protected]',
      private_key: '***&&&',
    },
    projectId: projectId,
  });
  const recognizerName = await getRecognizer(speechClient, projectId);
  const request = {
    recognizer: recognizerName,
    streamingConfig: {
      config: {
        explicitDecodingConfig: {
          encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
          sampleRateHertz: 16000,
          audioChannelCount: 1,
        },
      },
    },
  };
  const recognizeStream = streamingRecognize(speechClient, request);
  recognizeStream.on('data', (response) => {
    console.log(response);
  });
  recognizeStream.on('error', (err) => { throw err; });
  recognizeStream.on('end', () => { console.log('stream end'); });
  audio.pipe(recognizeStream);
}

main().catch(console.error);
```
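So, to answer my own questions: `SpeechClient` is the API client, while a `Recognizer` is a server-side resource that holds recognition settings (language codes, model), and every v2 request must reference one by its full resource name; that is presumably what the `RESOURCE_PROJECT_INVALID` error was complaining about. The unary case looks analogous; here is a minimal, untested sketch based on the v2 `RecognizeRequest` shape (note the raw bytes go in `content`, not v1's `audio: { content }`):

```js
// Untested sketch: unary (non-streaming) v2 recognition, reusing the
// recognizer created by getRecognizer() above and the same µ-law test file.
async function recognizeOnce(speechClient, recognizerName) {
  const [response] = await speechClient.recognize({
    recognizer: recognizerName,
    config: {
      explicitDecodingConfig: {
        encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
        sampleRateHertz: 16000,
        audioChannelCount: 1,
      },
    },
    content: fs.readFileSync('test.wav'),
  });
  return response.results
    .map((result) => result.alternatives[0].transcript)
    .join('\n');
}
```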
Actually, this shouldn't have been closed. The API should either be a drop-in replacement, or at the very least the differences should be documented.
+1, let's update the docs to the v2 API, or at least note explicitly that the docs do NOT cover v2.