skip to Main Content

I’m working on a project using NextJS where I need to implement continuous Speech-to-Text with language detection. While I have successfully set up Speech-to-Text for a single language, I’m struggling to get automatic language detection to work. The documentation seems limited, and I can’t seem to figure out what I’m doing wrong.

According to the documentation, it should be implemented this way (source):

// Declare the candidate languages, then build the recognizer through the
// FromConfig factory. Argument order: speech config, auto-detect config, audio config.
const autoDetectSourceLanguageConfig = SpeechSDK.AutoDetectSourceLanguageConfig.fromLanguages(["en-US", "de-DE"]);
const speechRecognizer = SpeechSDK.SpeechRecognizer.FromConfig(speechConfig, autoDetectSourceLanguageConfig, audioConfig);

This is the relevant part of my component:

    // Sets up the auto-detecting speech recognizer once on mount and closes it
    // on unmount.
    useEffect(() => {
      // Flipped by the cleanup function so that a token request resolving
      // after unmount does not create a recognizer that nothing will close.
      let isCancelled = false;

      const fetchTokenAndSetupRecognizer = async () => {
        const tokenObj = await getTokenOrRefresh();
        if (isCancelled) {
          return;
        }
        if (tokenObj.authToken && tokenObj.region) {
          const autoDetectLanguages = ["en-US", "de-DE"];
          speechConfig.current = SpeechConfig.fromAuthorizationToken(
            tokenObj.authToken,
            tokenObj.region
          );
          const autoDetectConfig =
            AutoDetectSourceLanguageConfig.fromLanguages(autoDetectLanguages);

          // Create the microphone input exactly once; the original created it
          // twice and silently discarded the first instance.
          audioConfig.current = AudioConfig.fromDefaultMicrophoneInput();
          recognizer.current = SpeechRecognizer.FromConfig(
            speechConfig.current,
            autoDetectConfig,
            audioConfig.current
          );
          recognizer.current.recognized = (s, e) =>
            processRecognizedTranscript(e);
          recognizer.current.canceled = (s, e) => handleCanceled(e);
        }
        setIsDisabled(!recognizer.current);
      };

      // Do not leave the promise floating: report setup failures and keep the
      // disabled flag consistent with whether a recognizer exists.
      fetchTokenAndSetupRecognizer().catch((error) => {
        console.error("Failed to set up the speech recognizer:", error);
        if (!isCancelled) {
          setIsDisabled(!recognizer.current);
        }
      });

      return () => {
        isCancelled = true;
        recognizer.current?.close();
        // Drop the closed instance so a later run never mistakes it for a
        // usable recognizer.
        recognizer.current = null;
      };
    }, []);

I have searched this site, the documentation, and the repository, but examples and information for React/JavaScript are limited.

2

Answers


  1. I tried your code and encountered issues with implementing automatic language detection in Azure Speech-to-Text using the Azure Speech SDK.

    To enable language identification, you should use code like this.

    // List the candidate languages, then create the recognizer through the
    // FromConfig factory, which is the entry point that accepts the auto-detect
    // configuration (speech config, auto-detect config, audio config).
    const autoDetectConfig = sdk.AutoDetectSourceLanguageConfig.fromLanguages(["en-US", "de-DE", "zh-CN"]);
    const recognizer = sdk.SpeechRecognizer.FromConfig(config, autoDetectConfig, audioConfig);
    
    

    The code below recognizes speech from the microphone and from an audio file using the Azure Speech SDK; it is adapted from the Microsoft documentation (MSDOC) and the Git samples.

    const sdk = require('microsoft-cognitiveservices-speech-sdk');
    const fs = require('fs');
    require('dotenv').config();
    
    // Azure Speech credentials, read from the process environment
    // (dotenv's config() is invoked just above).
    const {
        AZURE_SPEECH_KEY: subscriptionKey,
        AZURE_SpeechRegion: serviceRegion,
    } = process.env;
    
    /**
     * Small value holder pairing a detected language with the confidence
     * reported for that detection.
     */
    class AutoDetectSourceLanguageResult {
        /**
         * @param {*} language - Detected language value to store.
         * @param {*} confidence - Detection confidence value to store.
         */
        constructor(language, confidence) {
            Object.assign(this, {
                privLanguage: language,
                privLanguageDetectionConfidence: confidence,
            });
        }

        /**
         * Builds an instance from an object exposing `language` and
         * `languageDetectionConfidence` properties.
         *
         * @param {object} result - Source object the two properties are read from.
         * @returns {AutoDetectSourceLanguageResult}
         */
        static fromResult(result) {
            const { language, languageDetectionConfidence } = result;
            return new AutoDetectSourceLanguageResult(language, languageDetectionConfidence);
        }

        /** The stored language value. */
        get language() {
            const { privLanguage } = this;
            return privLanguage;
        }

        /** The stored detection-confidence value. */
        get languageDetectionConfidence() {
            const { privLanguageDetectionConfidence } = this;
            return privLanguageDetectionConfidence;
        }
    }
    
    /**
     * Runs one single-shot recognition on the default microphone with automatic
     * language identification between en-US and zh-CN, and logs the outcome.
     *
     * NOTE(review): default-microphone input is presumably only usable in a
     * browser environment - confirm before running this under Node.js.
     *
     * @returns {Promise<void>} Settles after the recognition attempt has finished
     *   and the recognizer has been closed; rejects if the SDK reports an error
     *   through the recognizeOnceAsync error callback.
     */
    async function recognitionWithMicrophone() {
        const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput();
        const config = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);
        const autoDetectConfig = sdk.AutoDetectSourceLanguageConfig.fromLanguages(["en-US", "zh-CN"]);
        // The auto-detect configuration has to go through the FromConfig factory;
        // passing it as an extra constructor argument does not enable detection.
        const recognizer = sdk.SpeechRecognizer.FromConfig(config, autoDetectConfig, audioConfig);

        try {
            // Adapt the callback API so failures reach the caller instead of
            // being dropped.
            const result = await new Promise((resolve, reject) => {
                recognizer.recognizeOnceAsync(resolve, (err) => reject(new Error(String(err))));
            });

            if (result.reason === sdk.ResultReason.RecognizedSpeech) {
                const languageResult = AutoDetectSourceLanguageResult.fromResult(result);
                // Previously read from an undefined `languageDetectionResult`,
                // which threw a ReferenceError on every successful recognition.
                const detectedLanguage = languageResult.language;
                console.log(`RECOGNIZED: Text=${result.text}`);

                console.log(`DETECTED: Language=${detectedLanguage}; (Confidence: ${languageResult.languageDetectionConfidence})`);
            } else if (result.reason === sdk.ResultReason.NoMatch) {
                console.log("NOMATCH: Speech could not be recognized.");
            } else if (result.reason === sdk.ResultReason.Canceled) {
                const cancellation = sdk.CancellationDetails.fromResult(result);
                console.log(`CANCELED: Reason=${cancellation.reason}`);
                if (cancellation.reason === sdk.CancellationReason.Error) {
                    console.log(`CANCELED: ErrorCode=${cancellation.errorCode}`);
                    console.log(`CANCELED: ErrorDetails=${cancellation.errorDetails}`);
                }
            }
        } finally {
            // Always release the recognizer, whatever the outcome.
            recognizer.close();
        }
    }
    
    /**
     * Starts continuous recognition on the WAV file "console_en-us_zh-cn.wav"
     * with automatic language identification between en-US and zh-CN, logging
     * intermediate and final results together with the detected language.
     *
     * Returns early (after logging) when the file does not exist. Initialization
     * and start-up errors are logged rather than rethrown.
     *
     * @returns {Promise<void>} Resolves once recognition has been started (or has
     *   failed to start); results keep arriving through the event handlers.
     */
    async function multiLingualRecognitionWithAudioFile() {
        const audioFilePath = "console_en-us_zh-cn.wav";

        console.log(`Attempting to access audio file at: ${audioFilePath}`);

        if (!fs.existsSync(audioFilePath)) {
            console.error(`Error: Audio file '${audioFilePath}' not found.`);
            return;
        }

        let recognizer;
        let isClosed = false;

        // Closes the recognizer at most once; canceled and sessionStopped can
        // both fire for the same session.
        const closeRecognizer = () => {
            if (recognizer && !isClosed) {
                isClosed = true;
                recognizer.close();
            }
        };

        // Stops continuous recognition and releases the recognizer afterwards.
        const stopAndClose = () => {
            if (isClosed) {
                return;
            }
            recognizer.stopContinuousRecognitionAsync(closeRecognizer, (err) => {
                console.error("Error while stopping speech recognizer:", err);
                closeRecognizer();
            });
        };

        try {
            const audioData = fs.readFileSync(audioFilePath);
            const audioConfig = sdk.AudioConfig.fromWavFileInput(audioData);
            const config = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);

            const autoDetectConfig = sdk.AutoDetectSourceLanguageConfig.fromLanguages(["en-US", "zh-CN"]);
            // The auto-detect configuration was previously created but never
            // handed to the recognizer; FromConfig is the factory that takes it.
            recognizer = sdk.SpeechRecognizer.FromConfig(config, autoDetectConfig, audioConfig);

            recognizer.recognizing = (s, e) => {
                if (e.result.reason === sdk.ResultReason.RecognizingSpeech) {
                    const languageResult = AutoDetectSourceLanguageResult.fromResult(e.result);
                    console.log(`RECOGNIZING: Text=${e.result.text}`);
                    console.log(`DETECTED: Language=${languageResult.language} (Confidence: ${languageResult.languageDetectionConfidence})`);
                }
            };

            recognizer.recognized = (s, e) => {
                if (e.result.reason === sdk.ResultReason.RecognizedSpeech) {
                    const languageResult = AutoDetectSourceLanguageResult.fromResult(e.result);
                    console.log(`RECOGNIZED: Text=${e.result.text}`);
                    console.log(`DETECTED: Language=${languageResult.language} (Confidence: ${languageResult.languageDetectionConfidence})`);
                } else if (e.result.reason === sdk.ResultReason.NoMatch) {
                    console.log("NOMATCH: Speech could not be recognized.");
                }
            };

            recognizer.canceled = (s, e) => {
                console.log(`CANCELED: Reason=${e.reason}`);
                if (e.reason === sdk.CancellationReason.Error) {
                    console.log(`CANCELED: ErrorCode=${e.errorCode}`);
                    console.log(`CANCELED: ErrorDetails=${e.errorDetails}`);
                }
                stopAndClose();
            };

            recognizer.sessionStarted = (s, e) => {
                // Restored the "\n" escape whose backslash had been lost.
                console.log("\n    Session started event.");
            };

            recognizer.sessionStopped = (s, e) => {
                console.log("\n    Session stopped event.");
                stopAndClose();
            };

            // startContinuousRecognitionAsync reports through callbacks, so a
            // bare `await` on it neither waits nor surfaces failures; adapt it to
            // a promise so start-up errors land in the catch below.
            await new Promise((resolve, reject) => {
                recognizer.startContinuousRecognitionAsync(resolve, (err) => reject(new Error(String(err))));
            });
        } catch (error) {
            console.error("Error while initializing speech recognizer:", error);
            closeRecognizer();
        }
    }
    
    /**
     * Entry point: launches the microphone sample and the audio-file sample
     * together and waits for both; any rejection is logged instead of rethrown.
     *
     * @returns {Promise<void>}
     */
    async function main() {
        console.log("Starting Speech Recognition Samples...");
        try {
            const pendingMicrophone = recognitionWithMicrophone();
            const pendingAudioFile = multiLingualRecognitionWithAudioFile();
            await Promise.all([pendingMicrophone, pendingAudioFile]);
        } catch (failure) {
            console.error("Error occurred:", failure);
        }
    }
    
    // Run the samples and log any rejection that escapes main().
    main().catch((failure) => console.error("Error occurred:", failure));
    
    

    Output:

    enter image description here

    Login or Signup to reply.
  2. @jojak did you resolve the issue? I am also trying to capture audio from the microphone, but it doesn't work well.

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search