walidadebayo committed on
Commit
49b2f3e
·
1 Parent(s): 17f4c83

Add subtitle generation feature to text-to-speech functionality

Browse files
Files changed (1) hide show
  1. app.py +103 -10
app.py CHANGED
@@ -3,6 +3,8 @@ import edge_tts
3
  import asyncio
4
  import tempfile
5
  import os
 
 
6
 
7
 
8
  async def get_voices():
@@ -13,11 +15,21 @@ async def get_voices():
13
  }
14
 
15
 
16
- async def text_to_speech(text, voice, rate, pitch):
 
 
 
 
 
 
 
 
 
 
17
  if not text.strip():
18
- return None, "Please enter text to convert."
19
  if not voice:
20
- return None, "Please select a voice."
21
 
22
  voice_short_name = voice.split(" - ")[0]
23
  rate_str = f"{rate:+d}%"
@@ -25,17 +37,95 @@ async def text_to_speech(text, voice, rate, pitch):
25
  communicate = edge_tts.Communicate(
26
  text, voice_short_name, rate=rate_str, pitch=pitch_str
27
  )
 
 
28
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
29
- tmp_path = tmp_file.name
30
- await communicate.save(tmp_path)
31
- return tmp_path, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
 
34
- async def tts_interface(text, voice, rate, pitch):
35
- audio, warning = await text_to_speech(text, voice, rate, pitch)
36
  if warning:
37
- return audio, gr.Warning(warning)
38
- return audio, None
39
 
40
 
41
  async def create_demo():
@@ -43,6 +133,7 @@ async def create_demo():
43
 
44
  description = """
45
  Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
 
46
 
47
  **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""
48
 
@@ -65,9 +156,11 @@ async def create_demo():
65
  gr.Slider(
66
  minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
67
  ),
 
68
  ],
69
  outputs=[
70
  gr.Audio(label="Generated Audio", type="filepath"),
 
71
  gr.Markdown(label="Warning", visible=False),
72
  ],
73
  title="Edge TTS Text-to-Speech",
 
3
  import asyncio
4
  import tempfile
5
  import os
6
+ import json
7
+ import datetime
8
 
9
 
10
  async def get_voices():
 
15
  }
16
 
17
 
18
def format_time(milliseconds):
    """Convert milliseconds to SRT time format (HH:MM:SS,mmm).

    The value is truncated to a whole number of milliseconds. Negative
    values are clamped to zero: an SRT timestamp cannot be negative, and
    floor-based ``divmod`` would otherwise render -1 as ``-1:59:59,999``.
    """
    total_ms = max(int(milliseconds), 0)
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
26
+
27
+
28
+ async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False):
29
  if not text.strip():
30
+ return None, None, "Please enter text to convert."
31
  if not voice:
32
+ return None, None, "Please select a voice."
33
 
34
  voice_short_name = voice.split(" - ")[0]
35
  rate_str = f"{rate:+d}%"
 
37
  communicate = edge_tts.Communicate(
38
  text, voice_short_name, rate=rate_str, pitch=pitch_str
39
  )
40
+
41
+ # Create temporary file for audio
42
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
43
+ audio_path = tmp_file.name
44
+
45
+ subtitle_path = None
46
+ if generate_subtitles:
47
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
48
+ subtitle_path = srt_file.name
49
+
50
+ # Generate audio and collect word boundary data
51
async def process_audio():
    """Stream the synthesized speech, saving audio and collecting word timings.

    Chunks whose type is "audio" have their data appended to ``audio_path``
    in arrival order; chunks whose type is "WordBoundary" are gathered and
    returned as a list. Other chunk types are ignored.
    """
    word_boundaries = []
    # Open the output once for the whole stream rather than re-opening it
    # for every audio chunk; the bytes written are the same.
    with open(audio_path, "ab") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                word_boundaries.append(chunk)
    return word_boundaries
60
+
61
+ word_boundaries = await process_audio()
62
+
63
+ # Group words into sensible phrases/sentences for subtitles
64
+ phrases = []
65
+ current_phrase = []
66
+ current_text = ""
67
+ phrase_start = 0
68
+
69
+ for i, boundary in enumerate(word_boundaries):
70
+ word = boundary["text"]
71
+ start_time = boundary["offset"] / 10000
72
+ duration = boundary["duration"] / 10000
73
+ end_time = start_time + duration
74
+
75
+ if not current_phrase:
76
+ phrase_start = start_time
77
+
78
+ current_phrase.append(boundary)
79
+ current_text += word + " "
80
+
81
+ # Determine if we should end this phrase and start a new one
82
+ should_break = False
83
+
84
+ # Break on punctuation
85
+ if word.endswith(('.', '!', '?', ':', ';', ',')) or i == len(word_boundaries) - 1:
86
+ should_break = True
87
+
88
+ # Break after a certain number of words (4-5 is typical for subtitles)
89
+ elif len(current_phrase) >= 5:
90
+ should_break = True
91
+
92
+ # Break on long pause (more than 300ms between words)
93
+ elif i < len(word_boundaries) - 1:
94
+ next_start = word_boundaries[i + 1]["offset"] / 10000
95
+ if next_start - end_time > 300:
96
+ should_break = True
97
+
98
+ if should_break or i == len(word_boundaries) - 1:
99
+ if current_phrase:
100
+ last_boundary = current_phrase[-1]
101
+ phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
102
+ phrases.append({
103
+ "text": current_text.strip(),
104
+ "start": phrase_start,
105
+ "end": phrase_end
106
+ })
107
+ current_phrase = []
108
+ current_text = ""
109
+
110
+ # Write phrases to SRT file
111
+ with open(subtitle_path, "w", encoding="utf-8") as srt_file:
112
+ for i, phrase in enumerate(phrases):
113
+ # Write SRT entry
114
+ srt_file.write(f"{i+1}\n")
115
+ srt_file.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
116
+ srt_file.write(f"{phrase['text']}\n\n")
117
+ else:
118
+ # Just generate audio
119
+ await communicate.save(audio_path)
120
+
121
+ return audio_path, subtitle_path, None
122
 
123
 
124
async def tts_interface(text, voice, rate, pitch, generate_subtitles):
    """Run text_to_speech and shape its result for the three UI outputs.

    Returns the audio and subtitle values from text_to_speech unchanged.
    The third item is the result of ``gr.Warning(warning)`` when
    text_to_speech reported a warning message, otherwise ``None``.
    """
    result = await text_to_speech(text, voice, rate, pitch, generate_subtitles)
    audio, subtitle, warning = result
    notice = gr.Warning(warning) if warning else None
    return audio, subtitle, notice
129
 
130
 
131
  async def create_demo():
 
133
 
134
  description = """
135
  Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
136
+ You can also generate subtitle files (.srt) along with the audio.
137
 
138
  **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""
139
 
 
156
  gr.Slider(
157
  minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
158
  ),
159
+ gr.Checkbox(label="Generate Subtitles (.srt)", value=False),
160
  ],
161
  outputs=[
162
  gr.Audio(label="Generated Audio", type="filepath"),
163
+ gr.File(label="Generated Subtitles"),
164
  gr.Markdown(label="Warning", visible=False),
165
  ],
166
  title="Edge TTS Text-to-Speech",