
I can't seem to get the pitch increase through SSML and am lost at this point. Apologies if this question is not properly asked; it is my first time on Stack Overflow. I am developing a chatbot that uses the Microsoft Azure Text-to-Speech (TTS) service to synthesize speech, and I am trying to adjust the pitch of the generated speech using SSML (Speech Synthesis Markup Language). Despite trying different approaches, the pitch adjustment does not work; the service seems to simply ignore it.

import os
import time
import json
import nltk
from azure.cognitiveservices import speech as speechsdk
from twitchio.ext import commands
from chat import *

output_file_name_with_path = '{0}\output.wav'.format(os.path.dirname(__file__))


def get_value_from_json_key(key_name):
    with open("config.json", "r") as file:
        json_data = json.load(file)
    for i in json_data:
        if str(i) == str(key_name):
            return str(json_data[i])


def get_audio_or_return_error(result):
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        stream = speechsdk.AudioDataStream(result)
        stream.save_to_wav_file(output_file_name_with_path)
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")


def get_output_audio_file(text):
    # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
    speech_config = speechsdk.SpeechConfig(subscription=get_value_from_json_key("microsoft-azure-api-key"),
                                           region=get_value_from_json_key("microsoft-azure-speech-region"))
    speech_config.speech_synthesis_voice_name = get_value_from_json_key("voice-name")
    audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
    # The language of the voice that speaks.
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
      
    # Get text from the console and synthesize to the default speaker.
    print("<Speaking...>")
    with open("output.txt", "a", encoding="utf-8") as out:
        out.write(str(text) + "\n")
    speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
    get_audio_or_return_error(speech_synthesis_result)


def generate_conversation(message_content, message_author):
    print('------------------------------------------------------')
    print(message_content)
    print(message_author)
    print(Bot.conversation)

    Bot.conversation.append(f'CHATTER: {message_content}')
    text_block = '\n'.join(Bot.conversation)
    prompt = open_file('prompt_chat.txt').replace('<<BLOCK>>', text_block)
    bot_name = get_value_from_json_key("bot-name")
    prompt = prompt + '\n' + bot_name + ':'
    print(prompt)
    response = gpt3_completion(prompt)
    print(bot_name + ': ', response)
    if Bot.conversation.count(bot_name + ': ' + response) == 0:
        Bot.conversation.append(bot_name + f': {response}')
    return response


def generate_ssml(response):
    ssml_text = f'<speak><prosody pitch="+15.00%">{response}</prosody></voice></speak>'
    return ssml_text


def get_audio_and_text(message_content, message_author):
    response = generate_conversation(message_content, message_author)
#    response = message_content + "? " + response
    generate_ssml(response)
    get_output_audio_file(str(response))
    audio_file = output_file_name_with_path
    time.sleep(2)
    open('output.txt', 'w').close()
    print('------------------------------------------------------')
    os.remove(audio_file)


class Bot(commands.Bot):
    conversation = list()

    def __init__(self):
        # Initialise our Bot with our access token, prefix and a list of channels to join on boot...
        # prefix can be a callable, which returns a list of strings or a string...
        # initial_channels can also be a callable which returns a list of strings...
        super().__init__(token=get_value_from_json_key("twitch-access-key"), prefix='!',
                         initial_channels=[get_value_from_json_key("twitch-account-name")])

    async def event_ready(self):
        # Notify us when everything is ready!
        # We are logged in and ready to chat and use commands...
        print(f'Logged in as | {self.nick}')

    async def event_message(self, message):
        # Messages with echo set to True are messages sent by the bot...
        # For now, we just want to ignore them...
        if not message.echo:
            # download the words corpus
            nltk.download('words')
            # Check if the message contains english words
            if any(word in message.content for word in nltk.corpus.words.words()):
                # Check if the message is too long
                if len(message.content) <= 100:
                    get_audio_and_text(message.content, message.author.name)
                    # Since we have commands and are overriding the default `event_message`
                    # We must let the bot know we want to handle and invoke our commands...
        await self.handle_commands(message)

    @commands.command()
    async def hello(self, ctx: commands.Context):
        # Here we have a command hello, we can invoke our command with our prefix and command name
        # e.g ?hello
        # We can also give our commands aliases (different names) to invoke with.

        # Send a hello back!
        # Sending a reply back to the channel is easy... Below is an example.
        await ctx.send(f'Hello {ctx.author.name}!')


bot = Bot()
bot.run()
# bot.run() is blocking and will stop execution of any below code here until stopped or closed.

2 Answers


  1. You can use the SSML below to tweak the pitch in your text-to-speech code:

    <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'><voice name='en-US-JennyNeural'><prosody pitch='+50%'>Hello, world!</prosody></voice></speak>
    

    Try changing the value in <prosody pitch='+50%'> from 50 to 10 and you will hear the difference in pitch; I tried the same in Speech Studio.


    My Python texttospeech.py code:

    import os
    import azure.cognitiveservices.speech as speechsdk
    
    # Replace these variables with your own values
    subscription_key = 'xxxxxc57f4a81feff3'
    region = 'eastus'
    voice_name = 'en-US-GuyRUS'
    text = 'Hello, this is a pitch test.'
    pitch_percentage = '+50%'  # Adjust the pitch by +50%. Use '-' for a lower pitch.
    
    # Create the SSML with pitch adjustment
    ssml = f"""
    <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
        <voice name='{voice_name}'>
            <prosody pitch='{pitch_percentage}'>
                {text}
            </prosody>
        </voice>
    </speak>
    """
    
    # Speech configuration
    speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=region)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    
    # Synthesize speech
    result = speech_synthesizer.speak_ssml_async(ssml).get()
    
    # Check if synthesis was successful and save the audio
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        audio_data = result.audio_data
        audio_filename = 'output.wav'
        with open(audio_filename, 'wb') as audio_file:
            audio_file.write(audio_data)
        print(f'Audio saved as {audio_filename}')
    else:
        print(f'Synthesis failed: {result.reason}')
    

    Output: (screenshot)
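    Applying the same idea to the code in the question: the SSML returned by generate_ssml is never passed to the synthesizer (speak_text_async treats its input as plain text), and the markup itself is missing the opening <voice> tag. Below is a minimal sketch of how the two functions could be rewired, assuming the question's config.json helpers and get_audio_or_return_error, and keeping the +15% pitch:

    def generate_ssml(response, voice_name):
        # Wrap the reply in a complete SSML document: <speak> needs the namespace
        # and xml:lang, and <prosody> must sit inside a <voice> element.
        return (
            "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
            f"<voice name='{voice_name}'>"
            f"<prosody pitch='+15.00%'>{response}</prosody>"
            "</voice></speak>"
        )


    def get_output_audio_file(text):
        speech_config = speechsdk.SpeechConfig(
            subscription=get_value_from_json_key("microsoft-azure-api-key"),
            region=get_value_from_json_key("microsoft-azure-speech-region"))
        audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                                         audio_config=audio_config)

        # Build the SSML around the reply and send it with speak_ssml_async;
        # speak_text_async would not interpret the prosody markup.
        ssml = generate_ssml(text, get_value_from_json_key("voice-name"))
        result = speech_synthesizer.speak_ssml_async(ssml).get()
        get_audio_or_return_error(result)

    If the chat replies can contain characters like & or <, escape them (for example with xml.sax.saxutils.escape) before embedding them in the SSML.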

  2. You can also use the named values x-high, high, medium, low, and x-low:

     <voice name="en-US-JennyNeural">
        <prosody pitch="x-high">
    

    Here is a script with a mother-daughter conversation:

    <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US' xmlns:mstts='http://www.w3.org/2001/mstts'>
      <voice name="en-US-JennyNeural">
        <prosody pitch="x-high">
          Mike, hello, do you want to play with me?
        </prosody>
      </voice>
      <voice name="en-US-GuyNeural">
        <prosody pitch="x-high">
          I first have to do my homework Jenny
        </prosody>
      </voice>
      <voice name="en-US-JennyNeural">
        <prosody pitch="x-high">
          Boring
        </prosody>
      </voice>
      <voice name="en-CA-ClaraNeural">
        Please, take Mike as an example.
      </voice>
      <voice name="en-US-JennyNeural">
        <prosody pitch="x-high">
          Ok, mama
        </prosody>
      </voice>
    </speak>
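
    Because the voices are specified inline with <voice> elements, there is no need to set speech_synthesis_voice_name on the SpeechConfig; the voices named in the SSML are the ones used. A minimal sketch for synthesizing the script above, assuming it is saved in a file named conversation.ssml (a hypothetical name) and that the SPEECH_KEY and SPEECH_REGION environment variables hold your credentials:

    import os
    import azure.cognitiveservices.speech as speechsdk

    # Read the multi-voice SSML script shown above (hypothetical file name).
    with open('conversation.ssml', 'r', encoding='utf-8') as f:
        ssml = f.read()

    speech_config = speechsdk.SpeechConfig(subscription=os.environ['SPEECH_KEY'],
                                           region=os.environ['SPEECH_REGION'])
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

    result = synthesizer.speak_ssml_async(ssml).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        # Save the whole conversation to a single wav file.
        with open('conversation.wav', 'wb') as audio_file:
            audio_file.write(result.audio_data)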
    