
Speech v2 is not a drop-in replacement for v1, and it is undocumented

orgads opened this issue 2 years ago • 7 comments

Environment details

  • OS: Windows 10
  • Node.js version: 18.13
  • npm version: 8.19.3
  • google-cloud-node version: 5.3.0

Steps to reproduce

Running the following program (with doStream set to either true or false) fails with errors.

const speech = require('@google-cloud/speech');
const fs = require('fs');
const protos = speech.protos.google.cloud.speech;
const client = new speech.v2.SpeechClient({
  credentials: {
    client_email: 'MY_MAIL',
    private_key: 'MY_KEY',
  },
  projectId: 'MY_PROJECT',
});

const doStream = true;
(async () => {
  try {
    if (doStream) {
      const file = fs.createReadStream('test.wav');
      const recognizer = await client.streamingRecognize({
        config: {
          encoding: protos.v1.RecognitionConfig.AudioEncoding.MULAW,
          languageCode: 'en-US',
          sampleRateHertz: 16000,
          model: 'phone_call'
        },
        interimResults: true
      });
      recognizer.on('error', console.error);
      recognizer.on('data', (data) => console.log('data', data.results[0].alternatives[0].transcript));
      file.pipe(recognizer);
    } else {
      const [response] = await client.recognize({
        config: {
          encoding: protos.v1.RecognitionConfig.AudioEncoding.MULAW,
          languageCode: 'en-US',
          sampleRateHertz: 16000,
          model: 'phone_call'
        },
        audio: { content: fs.readFileSync('test.wav') },
        interimResults: true
      });
      const transcription = response.results
        .map(result => result.alternatives[0].transcript)
        .join('\n');
      console.log(`Transcription: ${transcription}`);
    }
  }
  catch (err) {
    console.error(err);
  }
})();

Error:

Error: 3 INVALID_ARGUMENT: Invalid resource field value in the request.
    at callErrorFromStatus (F:\Projects\jstest\node_modules\@grpc\grpc-js\build\src\call.js:31:19)
    ...
for call at
    at ServiceClientImpl.makeUnaryRequest (F:\Projects\jstest\node_modules\@grpc\grpc-js\build\src\client.js:160:34)
    ...
reason: 'RESOURCE_PROJECT_INVALID',

Changing v2 to v1 works as expected.

I've noticed that there's a helper function in helpers.js, which translates the request. I'm not sure if/how it should be adapted for v2.

I couldn't find any documentation or a running example of how to correctly use v2.

orgads avatar Feb 05 '23 20:02 orgads

@orgads Please refer to the client library reference page here: https://cloud.google.com/nodejs/docs/reference/speech/latest/speech/v2.speechclient-class

There are also some generated code samples here that might help you get started: https://github.com/googleapis/google-cloud-node/tree/main/packages/google-cloud-speech/samples/generated/v2
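
For example, the generated unary sample boils down to roughly the sketch below. The default recognizer path recognizers/_, autoDecodingConfig, and the long model are assumptions drawn from the v2 protos rather than tested values, so adjust as needed:

const speech = require('@google-cloud/speech');
const fs = require('fs');

async function quickstartV2(projectId) {
  // Uses Application Default Credentials; pass credentials explicitly if needed.
  const client = new speech.v2.SpeechClient();
  const [response] = await client.recognize({
    // v2 addresses a Recognizer resource; `_` is the default recognizer.
    recognizer: `projects/${projectId}/locations/global/recognizers/_`,
    config: {
      autoDecodingConfig: {},   // let the service detect the encoding
      languageCodes: ['en-US'], // v1's languageCode becomes languageCodes[]
      model: 'long',
    },
    content: fs.readFileSync('test.wav'),
  });
  for (const result of response.results) {
    console.log(result.alternatives[0].transcript);
  }
}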

dizcology avatar Feb 09 '23 00:02 dizcology

@dizcology Thank you very much!

I'll test it next week. Closing this issue.

orgads avatar Feb 09 '23 20:02 orgads

@dizcology I tried the streamingRecognize sample and couldn't get it to work. The documentation is also unclear. What is a SpeechClient and what is a Recognizer? What is the relation between them? What is the right way to create and use a Recognizer?

This is my code:

import { v2, protos } from '@google-cloud/speech';
const speech = protos.google.cloud.speech.v2;
import * as fs from 'fs';
function main() {
  const audio = fs.createReadStream('test.wav');
  const speechClient = new v2.SpeechClient({
    credentials: {
      client_email: '[email protected]',
      private_key: '***&&&'
    },
    projectId: 'project'
  });
  const request = {
    config: {
      explicitDecodingConfig: {
        encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
        sampleRateHertz: 16000
      },
    }
  };
  const stream = speechClient.streamingRecognize();
  stream.on('data', (response) => { console.log(response); });
  stream.on('error', (err) => { throw (err); });
  stream.on('end', () => { });
  stream.write(request);
  audio.pipe(stream);
  audio.on('end', () => stream.end());
}
process.on('unhandledRejection', err => {
  console.error(err);
  process.exitCode = 1;
});
main();

Result:

Uncaught Error Error: 3 INVALID_ARGUMENT: Invalid resource field value in the request.

orgads avatar Feb 21 '23 17:02 orgads

cc @yoshigev

orgads avatar Feb 22 '23 11:02 orgads

Ok, we did it!

In case someone else lands here, this is a complete example that works for me:

import { v2, protos } from '@google-cloud/speech';
const speech = protos.google.cloud.speech.v2;
import * as fs from 'fs';
import Throttle from 'throttle';
import streamEvents from 'stream-events';
import pumpify from 'pumpify';
import * as stream from 'stream';
import common from '@google-cloud/common';

export function streamingRecognize(client, streamingConfig, options = undefined) {
  options = options || {};
  streamingConfig = streamingConfig || {};
  const recognizeStream = streamEvents(new pumpify.obj());
  const requestStream = client
    ._streamingRecognize(options)
    .on('error', (err) => { recognizeStream.destroy(err); })
    .on('response', (response) => { recognizeStream.emit('response', response); });
  recognizeStream.once('writing', () => {
    requestStream.write(streamingConfig);
    recognizeStream.setPipeline([
      new stream.PassThrough({
        objectMode: true,
        transform: (audio, _, next) => {
          if (audio !== undefined) {
            next(undefined, { audio });
            return;
          }
          next();
        },
      }),
      requestStream,
      new stream.PassThrough({
        objectMode: true,
        transform: (response, enc, next) => {
          if (response.error) {
            next(new common.util.ApiError(response.error));
            return;
          }
          next(undefined, response);
        },
      }),
    ]);
  });
  return recognizeStream;
}
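
// How the helper above works: the first message written to the gRPC stream
// carries the recognizer name and streamingConfig; every subsequent message
// must wrap a raw audio chunk as { audio: <bytes> }. v1's helpers.js performs
// this translation automatically; v2 (at least in this version) does not,
// hence the PassThrough transforms on both sides of the request stream.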

async function getRecognizer(speechClient, projectId) {
  const name = speechClient.recognizerPath(projectId, 'global', 'test-rec');
  try {
    // getRecognizer rejects with NOT_FOUND if the recognizer doesn't exist,
    // so the lookup has to be wrapped in try/catch.
    const [existing] = await speechClient.getRecognizer({ name });
    return existing.name;
  } catch (err) {
    if (err.code !== 5) throw err; // 5 = gRPC NOT_FOUND
  }
  const [operation] = await speechClient.createRecognizer({
    recognizer: {
      languageCodes: ['en-US'],
      model: 'telephony',
    },
    recognizerId: 'test-rec',
    parent: speechClient.locationPath(projectId, 'global')
  });
  const [recognizer] = await operation.promise();
  return recognizer.name;
}
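
// Note: creating a named recognizer may not be strictly required. The v2
// reference suggests the default path projects/{project}/locations/global/recognizers/_
// can be used together with a per-request config (untested assumption).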

async function main() {
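  // Throttle the file to roughly real time: 16 kHz mono 8-bit mulaw = 16000 bytes/sec.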
  const audio = fs.createReadStream('test.wav').pipe(new Throttle(16000));
  const projectId = 'project';
  const speechClient = new v2.SpeechClient({
    credentials: {
      client_email: '[email protected]',
      private_key: '***&&&'
    },
    projectId: projectId
  });
  const recognizerName = await getRecognizer(speechClient, projectId);
  const request = {
    recognizer: recognizerName,
    streamingConfig: {
      config: {
        explicitDecodingConfig: {
          encoding: speech.ExplicitDecodingConfig.AudioEncoding.MULAW,
          sampleRateHertz: 16000,
          audioChannelCount: 1
        }
      }
    }
  };
  const stream = streamingRecognize(speechClient, request);
  stream.on('data', (response) => {
    console.log(response);
  });
  stream.on('error', (err) => { throw (err); });
  stream.on('end', () => { console.log('stream end') });
  audio.pipe(stream);
}
main();
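
In short, compared to v1: every request must name a Recognizer resource (created up front or resolved via recognizerPath), the recognition options move under streamingConfig.config with explicitDecodingConfig replacing the old top-level encoding/sampleRateHertz fields, and each audio chunk must be wrapped as { audio } because v2 has no equivalent of v1's helpers.js translation. That is what the custom streamingRecognize wrapper above takes care of.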

orgads avatar Feb 22 '23 16:02 orgads

Actually, this shouldn't have been closed. The API should either be a drop-in replacement, or at the very least the differences should be documented.

orgads avatar Jan 07 '24 15:01 orgads

+1. Let's update the docs for the v2 API, or at least note explicitly that the docs do NOT cover v2.

govindrai avatar May 02 '24 16:05 govindrai