import asyncio
import os
from datetime import datetime

import edge_tts
import gradio as gr


# Function to get available voices
async def get_voices():
    try:
        voices = await edge_tts.list_voices()
        return sorted([f"{voice['ShortName']} ({voice['Gender']})" for voice in voices])
    except Exception as e:
        return [f"Error fetching voices: {str(e)}"]


# Function to convert text to speech
async def text_to_speech(text, voice, rate, pitch):
    try:
        if not text or not voice:
            return None, "Error: Text and voice selection are required."

        # Extract the voice ShortName (e.g., "en-US-AvaNeural (Female)" -> "en-US-AvaNeural")
        voice_short_name = voice.split(" (")[0]

        # Convert rate to edge-tts format (e.g., 10 -> "+10%", -10 -> "-10%")
        rate_str = f"+{int(rate)}%" if rate >= 0 else f"{int(rate)}%"

        # Convert pitch to edge-tts format (e.g., 100 -> "+100Hz", -100 -> "-100Hz")
        pitch_str = f"+{int(pitch)}Hz" if pitch >= 0 else f"{int(pitch)}Hz"

        # Generate a unique output filename with a timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"output_{timestamp}.mp3"

        # Initialize edge-tts communication
        communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

        # Save the audio
        await communicate.save(output_file)

        # Check that the file was actually created
        if os.path.exists(output_file):
            return output_file, "Audio generated successfully!"
        else:
            return None, "Error: Audio file was not generated."
    except Exception as e:
        return None, f"Error: {str(e)}"


# Gradio interface function
def create_gradio_interface():
    # Fetch the voice list once at startup; asyncio.run() avoids the deprecated
    # get_event_loop() pattern when no event loop is running yet
    voices = asyncio.run(get_voices())

    # Custom CSS for a polished look
    css = """
    .gradio-container {background-color: #f5f7fa;}
    .title {text-align: center; color: #2c3e50;}
    .footer {text-align: center; color: #7f8c8d; font-size: 0.9em; margin-top: 20px;}
    .button-primary {background-color: #3498db !important; color: white !important;}
    .input-box {border-radius: 8px;}
    """

    # Define the Gradio interface
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
        gr.Markdown(
            """
            Convert text to speech with customizable voice, rate, and pitch.
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=5,
                    elem_classes="input-box"
                )
                voice_dropdown = gr.Dropdown(
                    choices=voices,
                    label="Voice Model",
                    value=voices[0] if voices else None,
                    allow_custom_value=False
                )
                rate_slider = gr.Slider(
                    minimum=-50,
                    maximum=50,
                    value=0,
                    step=1,
                    label="Speech Rate (%)",
                    info="Adjust the speed of the speech (±50%)"
                )
                pitch_slider = gr.Slider(
                    minimum=-200,
                    maximum=200,
                    value=0,
                    step=10,
                    label="Pitch (Hz)",
                    info="Adjust the pitch of the voice (±200Hz)"
                )
                generate_button = gr.Button("Generate Audio", variant="primary", elem_classes="button-primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Audio", interactive=False)
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    placeholder="Status messages will appear here..."
                )

        # Button click event
        async def on_generate(text, voice, rate, pitch):
            audio, status = await text_to_speech(text, voice, rate, pitch)
            return audio, status

        generate_button.click(
            fn=on_generate,
            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
            outputs=[audio_output, status_output]
        )

        # Footer
        gr.Markdown(
            """
            """
        )

    return interface


# Launch the interface
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch(server_name="0.0.0.0", server_port=7860, share=False)