ERR_REQUEST_RANGE_NOT_SATISFIABLE when using Azure Text To Speech API in nextJS

TylerKim
August 12, 2023
390 views
1 vote
2 Answers

I’m using Microsoft Azure’s Text-to-speech API with a simple goal: play the synthesized speech on browser when I click a button.

I am using nextJS API routes to make a request to Azure, and then use this route from the client-side button to play the audio.

blob:http://localhost:3000/aab03e2a-14c1-48a7-9dae-4eac158325a5:1
GET blob:http://localhost:3000/aab03e2a-14c1-48a7-9dae-4eac158325a5 

net::ERR_REQUEST_RANGE_NOT_SATISFIABLE

localhost/:1 Uncaught (in promise) DOMException: Failed to load because no supported source was found.

/pages/api/synthesizeSpeech.tsx

import { NextApiRequest, NextApiResponse } from "next";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";

export default async (req: NextApiRequest, res: NextApiResponse) => {
  if (req.method !== "POST") {
    return res.status(405).end();
  }

  const speechConfig = sdk.SpeechConfig.fromSubscription(process.env.SPEECH_KEY, process.env.SPEECH_REGION);
  speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural";

  // Create a pull stream
  const pullStream = sdk.AudioOutputStream.createPullStream();

  const audioConfig = sdk.AudioConfig.fromStreamOutput(pullStream);

  const synthesizer = new sdk.SpeechSynthesizer(speechConfig, audioConfig);

  const text = req.body.text;

  synthesizer.speakTextAsync(
    text,
    (result) => {
      if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
        // Set the appropriate headers for audio data
        res.setHeader("Content-Type", "audio/wav");
        res.setHeader("Content-Disposition", "attachment; filename=speech.wav");

        // Read the audio data from the pull stream and write it to the response
        const audioBuffer = [];
        const bufferSize = 10240;
        const buffer = new ArrayBuffer(bufferSize);
        let bytesRead = 0;

        do {
          // @ts-ignore
          bytesRead = pullStream.read(buffer);
          for (let i = 0; i < bytesRead; i++) {
            // @ts-ignore
            audioBuffer.push(buffer[i]);
          }
        } while (bytesRead > 0);

        res.status(200).end(Buffer.from(audioBuffer));
      } else {
        res.status(500).json({
          error: `Speech synthesis canceled, ${result.errorDetails}nDid you set the speech resource key and region values?`,
        });
      }
      synthesizer.close();
    },
    (err) => {
      res.status(500).json({ error: `Error - ${err}` });
      synthesizer.close();
    }
  );
};

pages/demo.tsx

const ButtonPanel = () => {
  const handleSynthesize = async (text: string) => {
    alert(text);
    try {
      const response = await fetch("/api/synthesizeSpeech", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({ text }),
      });

      if (!response.ok) {
        throw new Error("Failed to synthesize speech");
      }

      const blob = await response.blob();
      const audioUrl = URL.createObjectURL(blob);
      const audio = new Audio(audioUrl);
      audio.play();
    } catch (error) {
      console.error(error);
    }
  };

  return (
    <footer className="m-4 mt-0 w-[calc(100vw-2rem)] rounded-b-lg border-t-2 border-gray-200 bg-white shadow-lg">
      <div className="flex items-center justify-center space-x-4 p-4">
        <button
          onClick={() => {
            handleSynthesize("hello this is a test test hello");
          }}
          className="btn-solid w-32 disabled:cursor-not-allowed disabled:bg-gray-100"
        >
          <FaCircleArrowUp size={28} />
        </button>
      </div>
    </footer>
  );

Answers

Chosen as BEST ANSWER

I was able to solve with this code, although it doesn't support async audio streaming

const ButtonPanel: React.FC = () => {
  const handleSynthesize = async (text: string) => {
    const language = "en";
    const voiceName = "en-US-JennyNeural";
    const speechConfig = sdk.SpeechConfig.fromSubscription(
      "d384d67ea6d34965bd761e46e158b247",
      "eastus"
    );
    speechConfig.speechSynthesisOutputFormat =
      SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm;
    let synthesizer = new sdk.SpeechSynthesizer(speechConfig);
    synthesizer.speakSsmlAsync(
      `<speak version="1.0" xml:lang="${language}"><voice name="${voiceName}"><prosody rate="10%" pitch="0%">${text}</prosody></voice></speak>`,
      (result) => {
        synthesizer.close();
        console.log(result.audioData);
        return result.audioData;
      }
    );
  };

  return (
    <footer className="m-4 mt-0 w-[calc(100vw-2rem)] rounded-b-lg border-t-2 border-gray-200 bg-white shadow-lg">
      <div className="flex items-center justify-center space-x-4 p-4">
        <button
          onClick={() => {
            handleSynthesize("hello my name is tyler kim, how can I help you?");
          }}
          className="btn-solid w-32 disabled:cursor-not-allowed disabled:bg-gray-100"
        >
          Synthesize and Play
        </button>
      </div>
    </footer>
  );
};

(Edit)

I made some changes to your code and can able here the audio output with input text in the browser.

Code:

synthesizeSpeech.ts:

import { NextApiRequest, NextApiResponse } from "next";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
import fs from "fs";
import path from "path";

export default async (req: NextApiRequest, res: NextApiResponse) => {
  if (req.method !== "POST") {
    return res.status(405).end();
  }

  const speechConfig = sdk.SpeechConfig.fromSubscription("<speech_key>", "<speech_region>");
  speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural";

  const synthesizer = new sdk.SpeechSynthesizer(speechConfig);

  const text = req.body.text;

  try {
    const audioPath = path.join(process.cwd(), "public", "audio", "synthesized_audio.wav");
    const audioStream = fs.createWriteStream(audioPath);

    synthesizer.speakSsmlAsync(
      `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis">${text}</speak>`,
      (event) => {
        if (event.audioData) {
          audioStream.write(event.audioData);
        }
      },
      (err) => {
        console.error(err);
        res.status(500).json({ error: "An error occurred during speech synthesis" });
      }
    );

    audioStream.on("finish", () => {
      res.setHeader("Content-Type", "audio/wav");
      const audioFileStream = fs.createReadStream(audioPath);
      audioFileStream.pipe(res);

      audioFileStream.on("end", () => {
        fs.unlinkSync(audioPath); // Delete the temporary audio file after sending
      });
    });
  } catch (err) {
    console.error(err);
    res.status(500).json({ error: "An error occurred during speech synthesis" });
  }
};

ButtonPanel.tsx:

import React from "react";

const ButtonPanel: React.FC = () => {
  const handleSynthesize = async (text: string) => {
    try {
      const synthesisPromise = new Promise<void>((resolve) => {
        const utterance = new SpeechSynthesisUtterance(text);
        utterance.onend = () => resolve();
        speechSynthesis.speak(utterance);
      });

      await synthesisPromise;
    } catch (error) {
      console.error(error);
    }
  };

  return (
    <footer className="m-4 mt-0 w-[calc(100vw-2rem)] rounded-b-lg border-t-2 border-gray-200 bg-white shadow-lg">
      <div className="flex items-center justify-center space-x-4 p-4">
        <button
          onClick={() => {
            handleSynthesize("hello this is a test test hello");
          }}
          className="btn-solid w-32 disabled:cursor-not-allowed disabled:bg-gray-100"
        >
          Synthesize and Play
        </button>
      </div>
    </footer>
  );
};

export default function Home() {
  return (
    <div>
      <ButtonPanel />
    </div>
  );
}

index.tsx:

import React from "react";
import ButtonPanel from "../components/ButtonPanel";

const Home: React.FC = () => {
  return (
    <div>
      <ButtonPanel />
    </div>
  );
};

export default Home;

Output:

It runs successfully as below,