File size: 6,863 Bytes
f870ddf
 
 
 
 
49b2f3e
 
f870ddf
 
 
 
 
 
 
 
 
 
49b2f3e
 
 
 
 
 
 
 
 
 
 
f870ddf
49b2f3e
f870ddf
49b2f3e
f870ddf
 
 
 
 
 
 
49b2f3e
 
f870ddf
49b2f3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d5faea
 
 
 
 
49b2f3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f870ddf
 
49b2f3e
 
f870ddf
49b2f3e
 
f870ddf
 
 
 
 
 
 
49b2f3e
f870ddf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49b2f3e
f870ddf
 
 
49b2f3e
f870ddf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import json
import datetime


async def get_voices():
    """Fetch the available voices and map display labels to short names.

    Returns:
        dict: keys are labels of the form
        ``"<ShortName> - <Locale> (<Gender>)"`` and values are the matching
        ``ShortName`` strings, in the order ``edge_tts.list_voices()``
        reports them.
    """
    labelled = {}
    for entry in await edge_tts.list_voices():
        short_name = entry["ShortName"]
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled


def format_time(milliseconds):
    """Convert milliseconds to SRT time format (HH:MM:SS,mmm).

    Args:
        milliseconds: Elapsed time in milliseconds. Floats are truncated
            toward zero; negative values are clamped to 0, because floor
            division on a negative number would otherwise produce a
            malformed timestamp such as ``-1:59:59,995``.

    Returns:
        str: The timestamp, with the hours field zero-padded to at least two
        digits (it grows beyond two digits for durations of 100 hours or
        more).
    """
    total_ms = max(int(milliseconds), 0)
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


# Leading/trailing characters that attach to the previous word or end a phrase.
_PHRASE_PUNCTUATION = ('.', ',', '!', '?', ':', ';')
# Boundary "offset"/"duration" values are divided by this to get milliseconds.
_OFFSET_UNITS_PER_MS = 10000


def _reserve_temp_file(suffix):
    """Create an empty, persistent temp file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
        return handle.name


def _discard_file(path):
    """Remove ``path`` if it is set and still exists; never raises OSError."""
    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: the original failure is what the caller needs.
        pass


def _group_phrases(word_boundaries, max_words=5, max_pause_ms=300):
    """Group word-boundary chunks into subtitle phrases.

    A phrase ends when the word ends with punctuation, when it is the final
    word, when the phrase holds ``max_words`` words, or when the gap before
    the next word exceeds ``max_pause_ms``.

    Returns:
        list[dict]: entries with ``"text"``, ``"start"`` and ``"end"``
        (times in milliseconds).
    """
    phrases = []
    pending_words = 0
    pending_text = ""
    phrase_start = 0
    last_index = len(word_boundaries) - 1

    for index, boundary in enumerate(word_boundaries):
        word = boundary["text"]
        start_ms = boundary["offset"] / _OFFSET_UNITS_PER_MS
        end_ms = start_ms + boundary["duration"] / _OFFSET_UNITS_PER_MS

        if pending_words == 0:
            phrase_start = start_ms
        pending_words += 1

        # Punctuation-led tokens are glued to the preceding word.
        if word.startswith(_PHRASE_PUNCTUATION):
            pending_text = pending_text.rstrip() + word + " "
        else:
            pending_text += word + " "

        if index == last_index or word.endswith(_PHRASE_PUNCTUATION):
            should_break = True
        elif pending_words >= max_words:
            should_break = True
        else:
            next_start = word_boundaries[index + 1]["offset"] / _OFFSET_UNITS_PER_MS
            should_break = next_start - end_ms > max_pause_ms

        if should_break:
            phrases.append({
                "text": pending_text.strip(),
                "start": phrase_start,
                "end": (boundary["offset"] + boundary["duration"]) / _OFFSET_UNITS_PER_MS,
            })
            pending_words = 0
            pending_text = ""

    return phrases


def _write_srt(phrases, subtitle_path):
    """Write ``phrases`` to ``subtitle_path`` as numbered SRT entries."""
    with open(subtitle_path, "w", encoding="utf-8") as srt_file:
        for number, phrase in enumerate(phrases, start=1):
            srt_file.write(f"{number}\n")
            srt_file.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
            srt_file.write(f"{phrase['text']}\n\n")


async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False):
    """Synthesize ``text`` to an MP3 file, optionally with an SRT subtitle file.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only text is
            rejected with a message instead of raising.
        voice: Voice label; everything before the first ``" - "`` is used as
            the voice short name.
        rate: Speech-rate adjustment in percent. Rounded to an integer so a
            float value (e.g. ``5.0``) does not break the ``+d`` formatting.
        pitch: Pitch adjustment in Hz, rounded the same way as ``rate``.
        generate_subtitles: When true, stream the audio, collect the
            ``"WordBoundary"`` chunks and write an ``.srt`` file.

    Returns:
        tuple: ``(audio_path, subtitle_path, message)``. On a validation
        failure the paths are ``None`` and ``message`` explains why; on
        success ``message`` is ``None`` and ``subtitle_path`` is ``None``
        unless subtitles were requested.

    Raises:
        Any exception raised while synthesizing or writing files propagates
        after the temp files created by this call have been removed.
    """
    if not text or not text.strip():
        return None, None, "Please enter text to convert."
    if not voice:
        return None, None, "Please select a voice."

    voice_short_name = voice.split(" - ")[0]
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    audio_path = _reserve_temp_file(".mp3")
    subtitle_path = _reserve_temp_file(".srt") if generate_subtitles else None

    try:
        if generate_subtitles:
            word_boundaries = []
            # Open the audio file once instead of re-opening it per chunk.
            with open(audio_path, "wb") as audio_file:
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_file.write(chunk["data"])
                    elif chunk["type"] == "WordBoundary":
                        # NOTE(review): only "WordBoundary" chunks are used;
                        # confirm the installed edge-tts emits them by default.
                        word_boundaries.append(chunk)
            _write_srt(_group_phrases(word_boundaries), subtitle_path)
        else:
            # Just generate audio
            await communicate.save(audio_path)
    except BaseException:
        # The temp files were created with delete=False, so nothing else
        # removes them if synthesis fails or the task is cancelled.
        _discard_file(audio_path)
        _discard_file(subtitle_path)
        raise

    return audio_path, subtitle_path, None


async def tts_interface(text, voice, rate, pitch, generate_subtitles):
    """Gradio callback: run ``text_to_speech`` and surface any warning.

    Returns:
        tuple: ``(audio, subtitle, notice)``. ``notice`` is ``None`` when
        ``text_to_speech`` reported no message; otherwise it is the result of
        calling ``gr.Warning`` with that message.
    """
    audio, subtitle, warning = await text_to_speech(
        text, voice, rate, pitch, generate_subtitles
    )
    # NOTE(review): gr.Warning presumably shows the message in the UI as a
    # side effect of being called — confirm against the Gradio version in use.
    notice = gr.Warning(warning) if warning else None
    return audio, subtitle, notice


async def create_demo():
    """Build the Gradio interface for the text-to-speech demo.

    Fetches the voice list via ``get_voices()`` and wires ``tts_interface``
    to five inputs (text, voice, rate, pitch, subtitle toggle) and three
    outputs (audio file, subtitle file, hidden warning area).

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voice_map = await get_voices()
    voice_labels = list(voice_map.keys())

    intro = """
    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
    You can also generate subtitle files (.srt) along with the audio.
    
    **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""

    # Inputs, in the positional order tts_interface expects them.
    text_box = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
    voice_picker = gr.Dropdown(
        # The leading empty choice lets the user deselect a voice.
        choices=[""] + voice_labels,
        label="Select Voice",
        value=voice_labels[0] if voice_labels else "",
    )
    rate_slider = gr.Slider(
        minimum=-50,
        maximum=50,
        value=0,
        label="Speech Rate Adjustment (%)",
        step=1,
    )
    pitch_slider = gr.Slider(
        minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
    )
    subtitle_toggle = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)

    # Outputs, matching the 3-tuple returned by tts_interface.
    audio_out = gr.Audio(label="Generated Audio", type="filepath")
    subtitle_out = gr.File(label="Generated Subtitles")
    warning_out = gr.Markdown(label="Warning", visible=False)

    return gr.Interface(
        fn=tts_interface,
        inputs=[text_box, voice_picker, rate_slider, pitch_slider, subtitle_toggle],
        outputs=[audio_out, subtitle_out, warning_out],
        title="Edge TTS Text-to-Speech",
        description=intro,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        flagging_mode="manual",
        api_name="predict",
    )


async def main():
    """Create the demo, enable its queue, and launch it.

    The queue is configured with ``default_concurrency_limit=50``; the app is
    launched with the API page and error display enabled.
    """
    app = await create_demo()
    app.queue(default_concurrency_limit=50)
    app.launch(show_api=True, show_error=True)


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())