
Streaming mode

Open · maelp opened this issue 11 months ago

I wanted to build my own (before I saw this), so I wrote a script that breaks the text into paragraphs and converts them asynchronously (in batches), so that it starts talking early and keeps synthesizing the upcoming text on the fly while playback progresses.

I use this to read long articles from NYTimes or wherever.

Pasting my rough JS code here in case it's of interest for a streaming feature. (There's also code that takes just a URL and uses Puppeteer with my Chrome cookies, so it can access sites I'm logged into, etc.)

#!/usr/bin/env node

import clipboardy from "clipboardy";
import fetch from "node-fetch";
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import OpenAI from "openai";
import puppeteer from "puppeteer";
import chrome from "chrome-cookies-secure";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import fs from "fs";
import { exec } from "child_process";
import path from "path";

const MAX_PARAGRAPH_SIZE_CHARS = 1000;
const TEMP_AUDIO_DIR = "/tmp/tts-temp-audio/"; // Directory for temp audio files
const RESPONSE_FORMAT = "mp3";

const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
if (!OPENAI_API_KEY) {
  throw new Error("Please set OPENAI_API_KEY");
}
const openai = new OpenAI({ apiKey: OPENAI_API_KEY });

// Check that a required command-line tool (ffmpeg, afplay) is available
const checkCommandAvailability = (command) => {
  return new Promise((resolve, reject) => {
    exec(`which ${command}`, (error, stdout) => {
      if (error || !stdout) {
        reject(new Error(`${command} not found. Please install it.`));
      } else {
        resolve(true);
      }
    });
  });
};

// Verify the tools we actually shell out to; top-level await is fine in an ES module
await Promise.all([
  checkCommandAvailability("ffmpeg"),
  checkCommandAvailability("afplay"),
]).catch((error) => {
  console.error(error.message);
  process.exit(1);
});

const argv = yargs(hideBin(process.argv))
  .option("speed", {
    alias: "s",
    description: "Speed of the speech output",
    type: "number",
    default: 1.5,
  })
  .option("summary", {
    alias: "r",
    description: "Summarize text",
    type: "boolean",
  })
  .option("text", {
    alias: "t",
    description: "Text to process",
    type: "string",
  })
  .option("clipboard", {
    alias: "c",
    description: "Process text from clipboard",
    type: "boolean",
  })
  .option("url", {
    alias: "u",
    description: "URL to process",
    type: "string",
  })
  .option("curl", {
    description: "Fetch content via curl and process it",
    type: "string",
  })
  .help()
  .alias("help", "h").argv;

// Create temp directory if it doesn't exist
if (!fs.existsSync(TEMP_AUDIO_DIR)) {
  fs.mkdirSync(TEMP_AUDIO_DIR);
}

// Fetch TTS audio using OpenAI API
const getTTSAudio = async (paragraph, outputPath) => {
  console.log("Processing paragraph:", paragraph);
  const response = await openai.audio.speech.create({
    model: "tts-1",
    voice: "nova",
    input: paragraph,
    response_format: RESPONSE_FORMAT,
  });
  const audioData = await response.arrayBuffer();
  fs.writeFileSync(outputPath, Buffer.from(audioData));
};

// Adjust the speed using FFMPEG
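// Note: some ffmpeg builds cap atempo at 2.0 per filter instance; chain it
// (e.g. "atempo=2.0,atempo=1.25") if higher speeds fail.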
const adjustSpeedWithFFMPEG = (inputFile, outputFile, speed) => {
  return new Promise((resolve, reject) => {
    const command = `ffmpeg -i "${inputFile}" -filter:a "atempo=${speed}" -y "${outputFile}"`;
    exec(command, (error) => {
      if (error) {
        reject(`Error adjusting speed: ${error}`);
      } else {
        resolve();
      }
    });
  });
};

// Play the audio using afplay (the macOS built-in player)
const playAudio = (audioFile) => {
  return new Promise((resolve, reject) => {
    exec(`afplay "${audioFile}"`, (error) => {
      if (error) {
        reject(new Error(`Error playing audio: ${error.message}`));
      } else {
        resolve();
      }
    });
  });
};

// Clean up temp files
const cleanupFiles = (files) => {
  files.forEach((file) => {
    if (fs.existsSync(file)) {
      fs.unlinkSync(file);
    }
  });
};

const getSummarizedText = async (text) => {
  console.log("Summarizing text...");
  const instructions =
    "You are a professional summarizer. Your task is to concisely and precisely summarize the following text, in 2 to 5 paragraphs:";
  const response = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [
      { role: "system", content: instructions },
      { role: "user", content: text },
    ],
  });
  const summarizedText = response.choices[0].message.content;
  return summarizedText;
};

// Break text into paragraphs and process them
const processText = async (text, summary) => {
  console.log("------------------");
  console.log(text);
  console.log("------------------");

  if (summary) {
    text = await getSummarizedText(text);
    console.log("-----SUMMARY------");
    console.log(text);
    console.log("------------------");
  }

  const paragraphs = breakIntoParagraphs(text);
  const wellSizedParagraphs = paragraphs.flatMap((paragraph) =>
    breakIntoWellSizedParagraphs(paragraph)
  );

  const N_LOOKAHEAD = 2;
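  // Map of paragraph index -> in-flight TTS promise, so each paragraph is
  // synthesized at most once while we pipeline ahead of playback.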
  const paragraphsAudioPromises = new Map();

  for (let i = 0; i < wellSizedParagraphs.length; i++) {
    // Add the current and next N_LOOKAHEAD paragraphs to the processing queue
    for (
      let j = i;
      j < Math.min(i + N_LOOKAHEAD, wellSizedParagraphs.length);
      j++
    ) {
      if (!paragraphsAudioPromises.has(j)) {
        // Only add to queue if it's not already processing
        paragraphsAudioPromises.set(
          j,
          processParagraph(j, wellSizedParagraphs[j])
        );
      }
    }

    // Wait for the current paragraph's processing to finish
    const { spedUpAudioPath } = await paragraphsAudioPromises.get(i);

    // Play audio after processing
    await playAudio(spedUpAudioPath);

    // Clean up the files after audio is played
    cleanupFiles([spedUpAudioPath]);

    // Optionally remove processed paragraphs from the map to free up memory
    paragraphsAudioPromises.delete(i);
  }
};

const processParagraph = async (idx, paragraph) => {
  const tempAudioPath = path.join(
    TEMP_AUDIO_DIR,
    `audio-${idx}.${RESPONSE_FORMAT}`
  );
  const spedUpAudioPath = path.join(
    TEMP_AUDIO_DIR,
    `audio-speed-${idx}.${RESPONSE_FORMAT}`
  );

  // Get TTS audio and adjust speed
  await getTTSAudio(paragraph, tempAudioPath);
  await adjustSpeedWithFFMPEG(tempAudioPath, spedUpAudioPath, argv.speed);

  // The original-speed file is no longer needed once the sped-up copy exists
  cleanupFiles([tempAudioPath]);

  // Return the path to the sped-up audio file for further processing
  return { spedUpAudioPath };
};

// Helper functions for breaking text into parts
const breakIntoParagraphs = (text) => text.split(/\n\n+/);
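// Re-chunk a long paragraph at word boundaries so each TTS request stays
// well under the API's input size limit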
const breakIntoWellSizedParagraphs = (paragraph) => {
  const parts = [];
  let currentPart = "";

  paragraph.split(" ").forEach((word) => {
    if (currentPart.length + word.length + 1 > MAX_PARAGRAPH_SIZE_CHARS) {
      // Avoid pushing an empty chunk when a single word exceeds the limit
      if (currentPart.length > 0) parts.push(currentPart);
      currentPart = word;
    } else {
      currentPart += (currentPart.length > 0 ? " " : "") + word;
    }
  });

  if (currentPart.length > 0) {
    parts.push(currentPart);
  }

  return parts;
};

// Fetch and process URL with Puppeteer
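// Reuses cookies from the local Chrome profile, so pages behind a login work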
const fetchAndProcessURLPuppeteer = async (url, summary) => {
  const cookies = await chrome.getCookiesPromised(url, "puppeteer");

  const browser = await puppeteer.launch({
    headless: true,
  });
  const page = await browser.newPage();
  await page.setCookie(...cookies);

  await page.setViewport({ width: 1280, height: 800 });
  await page.goto(url, { waitUntil: "networkidle2" });

  const html = await page.content();
  const doc = new JSDOM(html, { url }).window.document;
  const reader = new Readability(doc);
  const article = reader.parse();
  await browser.close();
  if (!article) {
    throw new Error(`Readability could not extract an article from ${url}`);
  }
  await processText(article.textContent, summary);
};

// Fetch and process URL using fetch
const fetchAndProcessURL = async (url, summary) => {
  const response = await fetch(url);
  const html = await response.text();
  const doc = new JSDOM(html, { url }).window.document;
  const reader = new Readability(doc);
  const article = reader.parse();
  if (!article) {
    throw new Error(`Readability could not extract an article from ${url}`);
  }
  await processText(article.textContent, summary);
};

// Handle different input methods based on CLI arguments
if (argv.text) {
  await processText(argv.text, argv.summary);
} else if (argv.clipboard) {
  await processText(clipboardy.readSync(), argv.summary);
} else if (argv.url) {
  await fetchAndProcessURLPuppeteer(argv.url, argv.summary);
} else if (argv.curl) {
  await fetchAndProcessURL(argv.curl, argv.summary);
} else {
  console.log("Reading from stdin...");
  let data = "";
  process.stdin.on("data", function (chunk) {
    data += chunk;
  });
  process.stdin.on("end", function () {
    processText(data, argv.summary);
  });
}
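
Example invocations, in case it helps (assuming you save this as tts.mjs and chmod +x it; the flags are the ones from the yargs config above, and the URL is just a placeholder):

./tts.mjs --text "Hello there" --speed 1.25
./tts.mjs --clipboard --summary
./tts.mjs --url "https://example.com/some-article"
cat article.txt | ./tts.mjs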

maelp · Dec 14 '24