File size: 3,367 Bytes
1b41e6d
7877a4f
a257b37
25681f6
1b41e6d
7877a4f
 
 
 
25681f6
7877a4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25681f6
7877a4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b41e6d
7877a4f
1b41e6d
7877a4f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import asyncio
import os
import tempfile

import edge_tts
import gradio as gr

# Function to get available voices
async def get_voices():
    """Fetch the Edge TTS voice list and format it for display.

    Returns:
        A list of labels of the form ``"<ShortName> (<Gender>)"``, one per
        entry returned by ``edge_tts.list_voices()``, in the same order.
    """
    labels = []
    for entry in await edge_tts.list_voices():
        short_name, gender = entry["ShortName"], entry["Gender"]
        labels.append(f"{short_name} ({gender})")
    return labels

# Function to convert text to speech
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with Edge TTS and save the result as an MP3 file.

    Args:
        text: The text to speak.
        voice: Dropdown label such as ``"en-US-AvaNeural (Female)"``; only the
            part before ``" ("`` (the voice ShortName) is passed to edge-tts.
        rate: Speaking-rate adjustment in percent; truncated to an integer.
        pitch: Pitch adjustment in Hz; truncated to an integer.

    Returns:
        The path of the generated MP3 file on success, otherwise a message
        starting with ``"Error"``. Failures are reported through the return
        value instead of being raised, so the caller can display them.
    """
    output_file = None
    succeeded = False
    try:
        # "en-US-AvaNeural (Female)" -> "en-US-AvaNeural"
        voice_short_name = voice.split(" (")[0]

        # The "+d" format always emits a sign ("+10%", "-5Hz", "+0%"), even
        # for inputs such as -0.5 that truncate to zero.
        rate_str = f"{int(rate):+d}%"
        pitch_str = f"{int(pitch):+d}Hz"

        # Write to a per-request temporary file: a fixed name would let
        # concurrent requests overwrite each other's audio, and a stale file
        # from an earlier run could be mistaken for fresh output.
        fd, output_file = tempfile.mkstemp(prefix="edge_tts_", suffix=".mp3")
        os.close(fd)

        communicate = edge_tts.Communicate(
            text, voice_short_name, rate=rate_str, pitch=pitch_str
        )
        await communicate.save(output_file)

        # mkstemp already created the file, so "exists" proves nothing;
        # require that audio data was actually written.
        if os.path.getsize(output_file) > 0:
            succeeded = True
            return output_file
        return "Error: Audio file was not generated."
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Do not leave empty or partial files behind on any failure path
        # (including cancellation, which bypasses ``except Exception``).
        if not succeeded and output_file is not None:
            try:
                os.remove(output_file)
            except OSError:
                pass

# Gradio interface function
def create_gradio_interface():
    """Build the Gradio Blocks UI for Edge TTS text-to-speech.

    The voice list is fetched once, synchronously, while the interface is
    being built; any exception raised by ``get_voices()`` propagates to the
    caller.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    # asyncio.run() creates and tears down its own event loop. Calling
    # asyncio.get_event_loop() with no running loop is deprecated and fails
    # on newer Python versions and in non-main threads.
    voices = asyncio.run(get_voices())

    with gr.Blocks(title="Edge TTS Text-to-Speech") as interface:
        gr.Markdown("# Edge TTS Text-to-Speech")
        gr.Markdown("Enter text, select a voice, adjust rate and pitch, and generate audio.")

        # Input components
        text_input = gr.Textbox(label="Input Text", placeholder="Type your text here...")
        voice_dropdown = gr.Dropdown(choices=voices, label="Voice", value=voices[0] if voices else None)
        rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Rate (%)")
        pitch_slider = gr.Slider(minimum=-200, maximum=200, value=0, step=10, label="Pitch (Hz)")

        generate_button = gr.Button("Generate Audio")

        # Outputs: the audio player plus a read-only status line
        audio_output = gr.Audio(label="Generated Audio")
        error_output = gr.Textbox(label="Status", interactive=False)

        async def on_generate(text, voice, rate, pitch):
            """Validate the inputs, run synthesis, and return (audio, status)."""
            # Whitespace-only text contains nothing to speak, so reject it
            # together with empty input.
            if not text or not text.strip():
                return None, "Error: Please enter some text."
            if not voice:
                return None, "Error: Please select a voice."

            result = await text_to_speech(text, voice, rate, pitch)
            # text_to_speech reports failures as strings starting with "Error".
            if result.startswith("Error"):
                return None, result
            return result, "Audio generated successfully!"

        generate_button.click(
            fn=on_generate,
            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
            outputs=[audio_output, error_output],
        )

    return interface

# Launch the interface
if __name__ == "__main__":
    # Build the UI, then serve it on all network interfaces at port 7860.
    interface = create_gradio_interface()
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    interface.launch(**launch_options)