File size: 5,040 Bytes
1b41e6d
7877a4f
a257b37
25681f6
37a2817
1b41e6d
7877a4f
 
37a2817
 
 
 
 
25681f6
7877a4f
 
 
37a2817
 
 
 
7877a4f
 
37a2817
7877a4f
 
37a2817
7877a4f
 
37a2817
 
 
7877a4f
 
 
 
 
 
 
 
 
37a2817
7877a4f
37a2817
7877a4f
37a2817
25681f6
7877a4f
 
 
 
 
 
37a2817
 
 
 
 
 
 
 
 
7877a4f
37a2817
 
 
 
 
 
 
7877a4f
37a2817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7877a4f
 
 
37a2817
 
7877a4f
 
 
 
37a2817
 
 
 
 
 
 
 
 
7877a4f
 
 
1b41e6d
7877a4f
1b41e6d
7877a4f
37a2817
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import asyncio
import os
import uuid
from datetime import datetime

import edge_tts
import gradio as gr

# Function to get available voices
async def get_voices():
    """Return the Edge TTS voice catalog as sorted "ShortName (Gender)" labels.

    On any failure (e.g. network error) a single-element list containing an
    error message is returned instead, so the dropdown always has content.
    """
    try:
        catalog = await edge_tts.list_voices()
        labels = [f"{entry['ShortName']} ({entry['Gender']})" for entry in catalog]
        labels.sort()
        return labels
    except Exception as exc:
        return [f"Error fetching voices: {str(exc)}"]

# Function to convert text to speech
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file with edge-tts.

    Args:
        text: The text to speak. Must be non-empty.
        voice: A dropdown label of the form "en-US-AvaNeural (Female)".
        rate: Speech rate adjustment in percent (e.g. 10 -> "+10%").
        pitch: Pitch adjustment in Hz (e.g. -100 -> "-100Hz").

    Returns:
        (output_path, status_message). output_path is None on any error.
    """
    try:
        if not text or not voice:
            return None, "Error: Text and voice selection are required."

        # Extract voice ShortName (e.g., "en-US-AvaNeural (Female)" -> "en-US-AvaNeural")
        voice_short_name = voice.split(" (")[0]

        # edge-tts requires an explicit leading sign. "{:+d}" always emits
        # one ("+10%", "-10%", "+0%"), unlike the naive ">= 0" check, which
        # produced an unsigned "0%" for fractional values in (-1, 0).
        rate_str = f"{int(rate):+d}%"
        pitch_str = f"{int(pitch):+d}Hz"

        # Unique output filename: timestamp for readability plus a random
        # suffix so concurrent requests within the same second cannot
        # overwrite each other's files.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"output_{timestamp}_{uuid.uuid4().hex[:8]}.mp3"

        # Initialize edge-tts communication
        communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

        # Save the audio
        await communicate.save(output_file)

        # Check if file was created
        if os.path.exists(output_file):
            return output_file, "Audio generated successfully!"
        else:
            return None, "Error: Audio file was not generated."
    except Exception as e:
        return None, f"Error: {str(e)}"

# Gradio interface function
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the TTS app.

    Fetches the voice list once at build time; generation is wired to the
    async `text_to_speech` coroutine via the button's click handler.
    """
    # Fetch voices synchronously. asyncio.run() creates and tears down a
    # fresh event loop; asyncio.get_event_loop() without a running loop is
    # deprecated since Python 3.10 and raises in 3.12+.
    voices = asyncio.run(get_voices())

    # Custom CSS for a polished look
    css = """
    .gradio-container {background-color: #f5f7fa;}
    .title {text-align: center; color: #2c3e50;}
    .footer {text-align: center; color: #7f8c8d; font-size: 0.9em; margin-top: 20px;}
    .button-primary {background-color: #3498db !important; color: white !important;}
    .input-box {border-radius: 8px;}
    """

    # Define Gradio interface
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
        gr.Markdown(
            """
            <h1 class='title'>Edge TTS Text-to-Speech</h1>
            <p style='text-align: center;'>Convert text to speech with customizable voice, rate, and pitch.</p>
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=5,
                    elem_classes="input-box"
                )
                voice_dropdown = gr.Dropdown(
                    choices=voices,
                    label="Voice Model",
                    value=voices[0] if voices else None,
                    allow_custom_value=False
                )
                rate_slider = gr.Slider(
                    minimum=-50,
                    maximum=50,
                    value=0,
                    step=1,
                    label="Speech Rate (%)",
                    info="Adjust the speed of the speech (±50%)"
                )
                pitch_slider = gr.Slider(
                    minimum=-200,
                    maximum=200,
                    value=0,
                    step=10,
                    label="Pitch (Hz)",
                    info="Adjust the pitch of the voice (±200Hz)"
                )
                generate_button = gr.Button("Generate Audio", variant="primary", elem_classes="button-primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Audio", interactive=False)
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    placeholder="Status messages will appear here..."
                )

        # Button click event. Gradio awaits async handlers on its own loop.
        async def on_generate(text, voice, rate, pitch):
            audio, status = await text_to_speech(text, voice, rate, pitch)
            return audio, status

        generate_button.click(
            fn=on_generate,
            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
            outputs=[audio_output, status_output]
        )

        gr.Markdown(
            """
            <p class='footer'>
                Powered by Edge TTS and Gradio | Deployed on Hugging Face Spaces
            </p>
            """
        )

    return interface

# Launch the interface
if __name__ == "__main__":
    app = create_gradio_interface()
    # Bind to all interfaces on port 7860 — the Hugging Face Spaces default.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)