
Speech v2 is not a drop-in replacement for v1, and it is undocumented

orgads opened this issue 2 years ago • 7 comments

Environment details

  • OS: Windows 10
  • Node.js version: 18.13
  • npm version: 8.19.3
  • google-cloud-node version: 5.3.0

Steps to reproduce

Running the following program (with doStream set to either true or false) fails with errors.

const speech = require('@google-cloud/speech');
const fs = require('fs');
const protos = speech.protos.google.cloud.speech;
const client = new speech.v2.SpeechClient({
  credentials: {
    client_email: 'MY_MAIL',
    private_key: 'MY_KEY',
  },
  projectId: 'MY_PROJECT',
});

const doStream = true;
(async () => {
  try {
    if (doStream) {
      const file = fs.createReadStream('test.wav');
      const recognizer = await client.streamingRecognize({
        config: {
          encoding: protos.v1.RecognitionConfig.AudioEncoding.MULAW,
          languageCode: 'en-US',
          sampleRateHertz: 16000,
          model: 'phone_call'
        },
        interimResults: true
      });
      recognizer.on('error', console.error);
      recognizer.on('data', (data) => console.log('data', data.results[0].alternatives[0].transcript));
      file.pipe(recognizer);
    } else {
      const [response] = await client.recognize({
        config: {
          encoding: protos.v1.RecognitionConfig.AudioEncoding.MULAW,
          languageCode: 'en-US',
          sampleRateHertz: 16000,
          model: 'phone_call'
        },
        audio: { content: fs.readFileSync('test.wav') },
        interimResults: true
      });
      const transcription = response.results
        .map(result => result.alternatives[0].transcript)
        .join('\n');
      console.log(`Transcription: ${transcription}`);
    }
  }
  catch (err) {
    console.error(err);
  }
})();

Error:

Error: 3 INVALID_ARGUMENT: Invalid resource field value in the request.
    at callErrorFromStatus (F:\Projects\jstest\node_modules\@grpc\grpc-js\build\src\call.js:31:19)
    ...
for call at
    at ServiceClientImpl.makeUnaryRequest (F:\Projects\jstest\node_modules\@grpc\grpc-js\build\src\client.js:160:34)
    ...
reason: 'RESOURCE_PROJECT_INVALID',

Changing v2 to v1 works as expected.

I've noticed that there's a helper function in helpers.js, which translates the request. I'm not sure if/how it should be adapted for v2.

I couldn't find any documentation or a running example of how to correctly use v2.

orgads avatar Feb 05 '23 20:02 orgads

@orgads Please refer to the client library reference page here: https://cloud.google.com/nodejs/docs/reference/speech/latest/speech/v2.speechclient-class

There are also some generated code samples here that might help you get started: https://github.com/googleapis/google-cloud-node/tree/main/packages/google-cloud-speech/samples/generated/v2
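
For example, the generated unary sample boils down to roughly the sketch below. The default recognizer path recognizers/_, autoDecodingConfig, and the long model are assumptions drawn from the v2 protos rather than tested values, so adjust as needed:

const speech = require('@google-cloud/speech');
const fs = require('fs');

async function quickstartV2(projectId) {
  // Uses Application Default Credentials; pass credentials explicitly if needed.
  const client = new speech.v2.SpeechClient();
  const [response] = await client.recognize({
    // v2 addresses a Recognizer resource; `_` is the default recognizer.
    recognizer: `projects/${projectId}/locations/global/recognizers/_`,
    config: {
      autoDecodingConfig: {},   // let the service detect the encoding
      languageCodes: ['en-US'], // v1's languageCode becomes languageCodes[]
      model: 'long',
    },
    content: fs.readFileSync('test.wav'),
  });
  for (const result of response.results) {
    console.log(result.alternatives[0].transcript);
  }
}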

dizcology avatar Feb 09 '23 00:02 dizcology

@dizcology Thank you very much!

I'll test it next week. Closing this issue.

orgads avatar Feb 09 '23 20:02 orgads

@dizcology I tried the streamingRecognize sample and couldn't get it to work. The documentation is also unclear. What is a SpeechClient and what is a Recognizer? What is the relation between them? What is the right way to create and use a Recognizer?

This is my code:

import { v2, protos } from '@google-cloud/speech';
const speech = protos.google.cloud.speech.v2;
import * as fs from 'fs';
function main() {
  const audio = fs.createReadStream('test.wav');
  const speechClient = new v2.SpeechClient({
    credentials: {
      client_email: '[email protected]',
      private_key: '***&&&'
    },
    projectId: 'project'
  });
  const request = {
    config: {
      explicitDecodingConfig: {
        encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
        sampleRateHertz: 16000
      },
    }
  };
  const stream = speechClient.streamingRecognize();
  stream.on('data', (response) => { console.log(response); });
  stream.on('error', (err) => { throw (err); });
  stream.on('end', () => { });
  stream.write(request);
  audio.pipe(stream);
  audio.on('end', () => stream.end());
}
process.on('unhandledRejection', err => {
  console.error(err);
  process.exitCode = 1;
});
main();

Result:

Uncaught Error Error: 3 INVALID_ARGUMENT: Invalid resource field value in the request.

orgads avatar Feb 21 '23 17:02 orgads

cc @yoshigev

orgads avatar Feb 22 '23 11:02 orgads

Ok, we did it!

In case someone else lands here, this is a complete example that works for me:

import { v2, protos } from '@google-cloud/speech';
const speech = protos.google.cloud.speech.v2;
import * as fs from 'fs';
import Throttle from 'throttle';
import streamEvents from 'stream-events';
import pumpify from 'pumpify';
import * as stream from 'stream';
import common from '@google-cloud/common';

export function streamingRecognize(client, streamingConfig, options = undefined) {
  options = options || {};
  streamingConfig = streamingConfig || {};
  const recognizeStream = streamEvents(new pumpify.obj());
  const requestStream = client
    ._streamingRecognize(options)
    .on('error', (err) => { recognizeStream.destroy(err); })
    .on('response', (response) => { recognizeStream.emit('response', response); });
  recognizeStream.once('writing', () => {
    requestStream.write(streamingConfig);
    recognizeStream.setPipeline([
      new stream.PassThrough({
        objectMode: true,
        transform: (audio, _, next) => {
          if (audio !== undefined) {
            next(undefined, { audio });
            return;
          }
          next();
        },
      }),
      requestStream,
      new stream.PassThrough({
        objectMode: true,
        transform: (response, enc, next) => {
          if (response.error) {
            next(new common.util.ApiError(response.error));
            return;
          }
          next(undefined, response);
        },
      }),
    ]);
  });
  return recognizeStream;
}
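
// How the helper above works: the first message written to the gRPC stream
// carries the recognizer name and streamingConfig; every subsequent message
// must wrap a raw audio chunk as { audio: <bytes> }. v1's helpers.js performs
// this translation automatically; v2 (at least in this version) does not,
// hence the PassThrough transforms on both sides of the request stream.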

async function getRecognizer(speechClient, projectId) {
  const name = speechClient.recognizerPath(projectId, 'global', 'test-rec');
  try {
    // getRecognizer rejects with NOT_FOUND if the recognizer doesn't exist,
    // so the lookup has to be wrapped in try/catch.
    const [existing] = await speechClient.getRecognizer({ name });
    return existing.name;
  } catch (err) {
    if (err.code !== 5) throw err; // 5 = gRPC NOT_FOUND
  }
  const [operation] = await speechClient.createRecognizer({
    recognizer: {
      languageCodes: ['en-US'],
      model: 'telephony',
    },
    recognizerId: 'test-rec',
    parent: speechClient.locationPath(projectId, 'global')
  });
  const [recognizer] = await operation.promise();
  return recognizer.name;
}
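
// Note: creating a named recognizer may not be strictly required. The v2
// reference suggests the default path projects/{project}/locations/global/recognizers/_
// can be used together with a per-request config (untested assumption).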

async function main() {
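  // Throttle the file to roughly real time: 16 kHz mono 8-bit mulaw = 16000 bytes/sec.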
  const audio = fs.createReadStream('test.wav').pipe(new Throttle(16000));
  const projectId = 'project';
  const speechClient = new v2.SpeechClient({
    credentials: {
      client_email: '[email protected]',
      private_key: '***&&&'
    },
    projectId: projectId
  });
  const recognizerName = await getRecognizer(speechClient, projectId);
  const request = {
    recognizer: recognizerName,
    streamingConfig: {
      config: {
        explicitDecodingConfig: {
          encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
          sampleRateHertz: 16000,
          audioChannelCount: 1
        }
      }
    }
  };
  const stream = streamingRecognize(speechClient, request);
  stream.on('data', (response) => {
    console.log(response);
  });
  stream.on('error', (err) => { throw (err); });
  stream.on('end', () => { console.log('stream end') });
  audio.pipe(stream);
}
main();
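
In short, compared to v1: every request must name a Recognizer resource (created up front or resolved via recognizerPath), the recognition options move under streamingConfig.config with explicitDecodingConfig replacing the old top-level encoding/sampleRateHertz fields, and each audio chunk must be wrapped as { audio } because v2 has no equivalent of v1's helpers.js translation. That is what the custom streamingRecognize wrapper above takes care of.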

orgads avatar Feb 22 '23 16:02 orgads

Actually, this shouldn't have been closed. The API should either be a drop-in replacement, or at the very least the differences should be documented.

orgads avatar Jan 07 '24 15:01 orgads

+1. Let's update the docs for the v2 API, or at least note explicitly that the docs do NOT cover v2.

govindrai avatar May 02 '24 16:05 govindrai