File size: 5,040 Bytes
1b41e6d
7877a4f
a257b37
25681f6
37a2817
1b41e6d
7877a4f
 
37a2817
 
 
 
 
25681f6
7877a4f
 
 
37a2817
 
 
 
7877a4f
 
37a2817
7877a4f
 
37a2817
7877a4f
 
37a2817
 
 
7877a4f
 
 
 
 
 
 
 
 
37a2817
7877a4f
37a2817
7877a4f
37a2817
25681f6
7877a4f
 
 
 
 
 
37a2817
 
 
 
 
 
 
 
 
7877a4f
37a2817
 
 
 
 
 
 
7877a4f
37a2817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7877a4f
 
 
37a2817
 
7877a4f
 
 
 
37a2817
 
 
 
 
 
 
 
 
7877a4f
 
 
1b41e6d
7877a4f
1b41e6d
7877a4f
37a2817
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import asyncio
import os
import uuid
from datetime import datetime

import edge_tts
import gradio as gr

# Function to get available voices
async def get_voices():
    """Return the Edge TTS voice catalog as sorted "ShortName (Gender)" labels.

    On any failure (e.g. network error) a single-element list containing an
    error message is returned instead, so the dropdown always has content.
    """
    try:
        catalog = await edge_tts.list_voices()
        labels = [f"{entry['ShortName']} ({entry['Gender']})" for entry in catalog]
        labels.sort()
        return labels
    except Exception as exc:
        return [f"Error fetching voices: {str(exc)}"]

# Function to convert text to speech
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file with edge-tts.

    Args:
        text: The text to speak. Must be non-empty.
        voice: A dropdown label of the form "en-US-AvaNeural (Female)".
        rate: Speech rate adjustment in percent (e.g. 10 -> "+10%").
        pitch: Pitch adjustment in Hz (e.g. -100 -> "-100Hz").

    Returns:
        (output_path, status_message). output_path is None on any error.
    """
    try:
        if not text or not voice:
            return None, "Error: Text and voice selection are required."

        # Extract voice ShortName (e.g., "en-US-AvaNeural (Female)" -> "en-US-AvaNeural")
        voice_short_name = voice.split(" (")[0]

        # edge-tts requires an explicit leading sign. "{:+d}" always emits
        # one ("+10%", "-10%", "+0%"), unlike the naive ">= 0" check, which
        # produced an unsigned "0%" for fractional values in (-1, 0).
        rate_str = f"{int(rate):+d}%"
        pitch_str = f"{int(pitch):+d}Hz"

        # Unique output filename: timestamp for readability plus a random
        # suffix so concurrent requests within the same second cannot
        # overwrite each other's files.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"output_{timestamp}_{uuid.uuid4().hex[:8]}.mp3"

        # Initialize edge-tts communication
        communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

        # Save the audio
        await communicate.save(output_file)

        # Check if file was created
        if os.path.exists(output_file):
            return output_file, "Audio generated successfully!"
        else:
            return None, "Error: Audio file was not generated."
    except Exception as e:
        return None, f"Error: {str(e)}"

# Gradio interface function
def create_gradio_interface():
    """Build and return the Gradio Blocks UI for the TTS app.

    Fetches the voice list once at build time; generation is wired to the
    async `text_to_speech` coroutine via the button's click handler.
    """
    # Fetch voices synchronously. asyncio.run() creates and tears down a
    # fresh event loop; asyncio.get_event_loop() without a running loop is
    # deprecated since Python 3.10 and raises in 3.12+.
    voices = asyncio.run(get_voices())

    # Custom CSS for a polished look
    css = """
    .gradio-container {background-color: #f5f7fa;}
    .title {text-align: center; color: #2c3e50;}
    .footer {text-align: center; color: #7f8c8d; font-size: 0.9em; margin-top: 20px;}
    .button-primary {background-color: #3498db !important; color: white !important;}
    .input-box {border-radius: 8px;}
    """

    # Define Gradio interface
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as interface:
        gr.Markdown(
            """
            <h1 class='title'>Edge TTS Text-to-Speech</h1>
            <p style='text-align: center;'>Convert text to speech with customizable voice, rate, and pitch.</p>
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=5,
                    elem_classes="input-box"
                )
                voice_dropdown = gr.Dropdown(
                    choices=voices,
                    label="Voice Model",
                    value=voices[0] if voices else None,
                    allow_custom_value=False
                )
                rate_slider = gr.Slider(
                    minimum=-50,
                    maximum=50,
                    value=0,
                    step=1,
                    label="Speech Rate (%)",
                    info="Adjust the speed of the speech (±50%)"
                )
                pitch_slider = gr.Slider(
                    minimum=-200,
                    maximum=200,
                    value=0,
                    step=10,
                    label="Pitch (Hz)",
                    info="Adjust the pitch of the voice (±200Hz)"
                )
                generate_button = gr.Button("Generate Audio", variant="primary", elem_classes="button-primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Audio", interactive=False)
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    placeholder="Status messages will appear here..."
                )

        # Button click event. Gradio awaits async handlers on its own loop.
        async def on_generate(text, voice, rate, pitch):
            audio, status = await text_to_speech(text, voice, rate, pitch)
            return audio, status

        generate_button.click(
            fn=on_generate,
            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
            outputs=[audio_output, status_output]
        )

        gr.Markdown(
            """
            <p class='footer'>
                Powered by Edge TTS and Gradio | Deployed on Hugging Face Spaces
            </p>
            """
        )

    return interface

# Launch the interface
if __name__ == "__main__":
    app = create_gradio_interface()
    # Bind to all interfaces on port 7860 — the Hugging Face Spaces default.
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)