skip to Main Content

I’m using Microsoft Azure’s Text-to-speech API with a simple goal: play the synthesized speech on browser when I click a button.

I am using nextJS API routes to make a request to Azure, and then use this route from the client-side button to play the audio.

blob:http://localhost:3000/aab03e2a-14c1-48a7-9dae-4eac158325a5:1
GET blob:http://localhost:3000/aab03e2a-14c1-48a7-9dae-4eac158325a5 

net::ERR_REQUEST_RANGE_NOT_SATISFIABLE

localhost/:1 Uncaught (in promise) DOMException: Failed to load because no supported source was found.

/pages/api/synthesizeSpeech.tsx

import { NextApiRequest, NextApiResponse } from "next";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";

export default async (req: NextApiRequest, res: NextApiResponse) => {
  if (req.method !== "POST") {
    return res.status(405).end();
  }

  const speechConfig = sdk.SpeechConfig.fromSubscription(process.env.SPEECH_KEY, process.env.SPEECH_REGION);
  speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural";

  // Create a pull stream
  const pullStream = sdk.AudioOutputStream.createPullStream();

  const audioConfig = sdk.AudioConfig.fromStreamOutput(pullStream);

  const synthesizer = new sdk.SpeechSynthesizer(speechConfig, audioConfig);

  const text = req.body.text;

  synthesizer.speakTextAsync(
    text,
    (result) => {
      if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
        // Set the appropriate headers for audio data
        res.setHeader("Content-Type", "audio/wav");
        res.setHeader("Content-Disposition", "attachment; filename=speech.wav");

        // Read the audio data from the pull stream and write it to the response
        const audioBuffer = [];
        const bufferSize = 10240;
        const buffer = new ArrayBuffer(bufferSize);
        let bytesRead = 0;

        do {
          // @ts-ignore
          bytesRead = pullStream.read(buffer);
          for (let i = 0; i < bytesRead; i++) {
            // @ts-ignore
            audioBuffer.push(buffer[i]);
          }
        } while (bytesRead > 0);

        res.status(200).end(Buffer.from(audioBuffer));
      } else {
        res.status(500).json({
          error: `Speech synthesis canceled, ${result.errorDetails}nDid you set the speech resource key and region values?`,
        });
      }
      synthesizer.close();
    },
    (err) => {
      res.status(500).json({ error: `Error - ${err}` });
      synthesizer.close();
    }
  );
};

pages/demo.tsx

const ButtonPanel = () => {
  const handleSynthesize = async (text: string) => {
    alert(text);
    try {
      const response = await fetch("/api/synthesizeSpeech", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({ text }),
      });

      if (!response.ok) {
        throw new Error("Failed to synthesize speech");
      }

      const blob = await response.blob();
      const audioUrl = URL.createObjectURL(blob);
      const audio = new Audio(audioUrl);
      audio.play();
    } catch (error) {
      console.error(error);
    }
  };

  return (
    <footer className="m-4 mt-0 w-[calc(100vw-2rem)] rounded-b-lg border-t-2 border-gray-200 bg-white shadow-lg">
      <div className="flex items-center justify-center space-x-4 p-4">
        <button
          onClick={() => {
            handleSynthesize("hello this is a test test hello");
          }}
          className="btn-solid w-32 disabled:cursor-not-allowed disabled:bg-gray-100"
        >
          <FaCircleArrowUp size={28} />
        </button>
      </div>
    </footer>
  );

2

Answers


  1. Chosen as BEST ANSWER

    I was able to solve with this code, although it doesn't support async audio streaming

    const ButtonPanel: React.FC = () => {
      const handleSynthesize = async (text: string) => {
        const language = "en";
        const voiceName = "en-US-JennyNeural";
        const speechConfig = sdk.SpeechConfig.fromSubscription(
          "d384d67ea6d34965bd761e46e158b247",
          "eastus"
        );
        speechConfig.speechSynthesisOutputFormat =
          SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm;
        let synthesizer = new sdk.SpeechSynthesizer(speechConfig);
        synthesizer.speakSsmlAsync(
          `<speak version="1.0" xml:lang="${language}"><voice name="${voiceName}"><prosody rate="10%" pitch="0%">${text}</prosody></voice></speak>`,
          (result) => {
            synthesizer.close();
            console.log(result.audioData);
            return result.audioData;
          }
        );
      };
    
      return (
        <footer className="m-4 mt-0 w-[calc(100vw-2rem)] rounded-b-lg border-t-2 border-gray-200 bg-white shadow-lg">
          <div className="flex items-center justify-center space-x-4 p-4">
            <button
              onClick={() => {
                handleSynthesize("hello my name is tyler kim, how can I help you?");
              }}
              className="btn-solid w-32 disabled:cursor-not-allowed disabled:bg-gray-100"
            >
              Synthesize and Play
            </button>
          </div>
        </footer>
      );
    };
    
    

  2. I made some changes to your code and can able here the audio output with input text in the browser.

    Code:

    synthesizeSpeech.ts:

    import { NextApiRequest, NextApiResponse } from "next";
    import * as sdk from "microsoft-cognitiveservices-speech-sdk";
    import fs from "fs";
    import path from "path";
    
    export default async (req: NextApiRequest, res: NextApiResponse) => {
      if (req.method !== "POST") {
        return res.status(405).end();
      }
    
      const speechConfig = sdk.SpeechConfig.fromSubscription("<speech_key>", "<speech_region>");
      speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural";
    
      const synthesizer = new sdk.SpeechSynthesizer(speechConfig);
    
      const text = req.body.text;
    
      try {
        const audioPath = path.join(process.cwd(), "public", "audio", "synthesized_audio.wav");
        const audioStream = fs.createWriteStream(audioPath);
    
        synthesizer.speakSsmlAsync(
          `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis">${text}</speak>`,
          (event) => {
            if (event.audioData) {
              audioStream.write(event.audioData);
            }
          },
          (err) => {
            console.error(err);
            res.status(500).json({ error: "An error occurred during speech synthesis" });
          }
        );
    
        audioStream.on("finish", () => {
          res.setHeader("Content-Type", "audio/wav");
          const audioFileStream = fs.createReadStream(audioPath);
          audioFileStream.pipe(res);
    
          audioFileStream.on("end", () => {
            fs.unlinkSync(audioPath); // Delete the temporary audio file after sending
          });
        });
      } catch (err) {
        console.error(err);
        res.status(500).json({ error: "An error occurred during speech synthesis" });
      }
    };
    

    ButtonPanel.tsx:

    import React from "react";
    
    const ButtonPanel: React.FC = () => {
      const handleSynthesize = async (text: string) => {
        try {
          const synthesisPromise = new Promise<void>((resolve) => {
            const utterance = new SpeechSynthesisUtterance(text);
            utterance.onend = () => resolve();
            speechSynthesis.speak(utterance);
          });
    
          await synthesisPromise;
        } catch (error) {
          console.error(error);
        }
      };
    
      return (
        <footer className="m-4 mt-0 w-[calc(100vw-2rem)] rounded-b-lg border-t-2 border-gray-200 bg-white shadow-lg">
          <div className="flex items-center justify-center space-x-4 p-4">
            <button
              onClick={() => {
                handleSynthesize("hello this is a test test hello");
              }}
              className="btn-solid w-32 disabled:cursor-not-allowed disabled:bg-gray-100"
            >
              Synthesize and Play
            </button>
          </div>
        </footer>
      );
    };
    
    export default function Home() {
      return (
        <div>
          <ButtonPanel />
        </div>
      );
    }
    

    index.tsx:

    import React from "react";
    import ButtonPanel from "../components/ButtonPanel";
    
    const Home: React.FC = () => {
      return (
        <div>
          <ButtonPanel />
        </div>
      );
    };
    
    export default Home;
    

    Output:

    It runs successfully as below,

    enter image description here

    With the above output URL, I got below in the browser. Then, click on Synthesize and Play, and I can hear the audio output.

    enter image description here

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search