walidadebayo committed
Commit fa758b4 · Parent(s): 9c4e2f2

Add SRT support and file upload functionality to text_to_speech

Files changed (1):
  1. app.py +314 -107
app.py CHANGED
@@ -5,6 +5,8 @@ import tempfile
 import os
 import json
 import datetime
+import re
+import io
 
 
 async def get_voices():
@@ -25,108 +27,284 @@ def format_time(milliseconds):
     return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
 
 
-async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False):
-    if not text.strip():
-        return None, None, "Please enter text to convert."
+def time_to_ms(time_str):
+    """Convert SRT time format (HH:MM:SS,mmm) to milliseconds"""
+    hours, minutes, rest = time_str.split(':')
+    seconds, milliseconds = rest.split(',')
+    return int(hours) * 3600000 + int(minutes) * 60000 + int(seconds) * 1000 + int(milliseconds)
+
+
+def parse_srt_content(content):
+    """Parse SRT file content and extract text and timing data"""
+    lines = content.split('\n')
+    timing_data = []
+    text_only = []
+
+    i = 0
+    while i < len(lines):
+        if not lines[i].strip():
+            i += 1
+            continue
+
+        # Check if this is a subtitle number line
+        if lines[i].strip().isdigit():
+            subtitle_num = int(lines[i].strip())
+            i += 1
+            if i >= len(lines):
+                break
+
+            # Parse timestamp line
+            timestamp_match = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})', lines[i])
+            if timestamp_match:
+                start_time = timestamp_match.group(1)
+                end_time = timestamp_match.group(2)
+
+                # Convert to milliseconds
+                start_ms = time_to_ms(start_time)
+                end_ms = time_to_ms(end_time)
+
+                i += 1
+                subtitle_text = ""
+
+                # Collect all text lines until empty line or end of file
+                while i < len(lines) and lines[i].strip():
+                    subtitle_text += lines[i] + " "
+                    i += 1
+
+                subtitle_text = subtitle_text.strip()
+                text_only.append(subtitle_text)
+                timing_data.append({
+                    'text': subtitle_text,
+                    'start': start_ms,
+                    'end': end_ms
+                })
+        else:
+            i += 1
+
+    return " ".join(text_only), timing_data
+
+
+async def process_uploaded_file(file):
+    """Process uploaded file and detect if it's SRT or plain text"""
+    if file is None:
+        return None, None, False, None
+
+    try:
+        file_path = file.name if hasattr(file, 'name') else file
+        file_extension = os.path.splitext(file_path)[1].lower()
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+
+        # Check if it's an SRT file
+        is_subtitle = False
+        timing_data = None
+
+        if file_extension == '.srt' or re.search(r'^\d+\s*\n\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}', content, re.MULTILINE):
+            is_subtitle = True
+            text_content, timing_data = parse_srt_content(content)
+            # Return original content for display
+            return text_content, timing_data, is_subtitle, content
+        else:
+            # Treat as plain text
+            text_content = content
+
+        return text_content, timing_data, is_subtitle, content
+    except Exception as e:
+        return f"Error processing file: {str(e)}", None, False, None
+
+
+async def update_text_from_file(file):
+    """Callback function to update text area when file is uploaded"""
+    if file is None:
+        return "", None
+
+    text_content, timing_data, is_subtitle, original_content = await process_uploaded_file(file)
+    if original_content is not None:
+        # Return the original content to preserve formatting
+        return original_content, None
+    return "", gr.Warning("Failed to process the file")
+
+
+async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False, uploaded_file=None):
+    """Convert text to speech, handling both direct text input and uploaded files"""
+    if not text.strip() and uploaded_file is None:
+        return None, None, "Please enter text or upload a file to convert."
     if not voice:
         return None, None, "Please select a voice."
 
+    # First, determine if the text is SRT format
+    is_srt_format = bool(re.search(r'^\d+\s*\n\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}', text, re.MULTILINE))
+
+    # If the text is in SRT format, parse it directly
+    if is_srt_format:
+        text_content, timing_data = parse_srt_content(text)
+        is_subtitle = True
+    else:
+        # Process uploaded file if provided
+        timing_data = None
+        is_subtitle = False
+
+        if uploaded_file is not None:
+            file_text, file_timing_data, file_is_subtitle, _ = await process_uploaded_file(uploaded_file)
+            if isinstance(file_text, str) and file_text.strip():
+                if file_is_subtitle:
+                    text = file_text
+                    timing_data = file_timing_data
+                    is_subtitle = file_is_subtitle
+
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
-    communicate = edge_tts.Communicate(
-        text, voice_short_name, rate=rate_str, pitch=pitch_str
-    )
 
     # Create temporary file for audio
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         audio_path = tmp_file.name
 
     subtitle_path = None
-    if generate_subtitles:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
-            subtitle_path = srt_file.name
+
+    # Handle SRT-formatted text or subtitle files differently for audio generation
+    if is_srt_format or (is_subtitle and timing_data):
+        # Create separate audio files for each subtitle entry and then combine them
+        with tempfile.TemporaryDirectory() as temp_dir:
+            audio_segments = []
+            max_end_time = 0
 
-        # Generate audio and collect word boundary data
-        async def process_audio():
-            word_boundaries = []
-            async for chunk in communicate.stream():
-                if chunk["type"] == "audio":
-                    with open(audio_path, "ab") as audio_file:
-                        audio_file.write(chunk["data"])
-                elif chunk["type"] == "WordBoundary":
-                    word_boundaries.append(chunk)
-            return word_boundaries
-
-        word_boundaries = await process_audio()
-
-        # Group words into sensible phrases/sentences for subtitles
-        phrases = []
-        current_phrase = []
-        current_text = ""
-        phrase_start = 0
-
-        for i, boundary in enumerate(word_boundaries):
-            word = boundary["text"]
-            start_time = boundary["offset"] / 10000
-            duration = boundary["duration"] / 10000
-            end_time = start_time + duration
+            # If we don't have timing data but have SRT format text, parse it
+            if not timing_data and is_srt_format:
+                _, timing_data = parse_srt_content(text)
+
+            # Process each subtitle entry separately
+            for i, entry in enumerate(timing_data):
+                segment_text = entry['text']
+                start_time = entry['start']
+                end_time = entry['end']
+                max_end_time = max(max_end_time, end_time)
+
+                # Create temporary file for this segment
+                segment_file = os.path.join(temp_dir, f"segment_{i}.mp3")
+
+                # Generate audio for this segment
+                communicate = edge_tts.Communicate(segment_text, voice_short_name, rate=rate_str, pitch=pitch_str)
+                await communicate.save(segment_file)
+
+                audio_segments.append({
+                    'file': segment_file,
+                    'start': start_time,
+                    'end': end_time,
+                    'text': segment_text
+                })
+
+            # Combine audio segments with proper timing
+            import wave
+            import audioop
+            from pydub import AudioSegment
 
-            if not current_phrase:
-                phrase_start = start_time
+            # Initialize final audio
+            final_audio = AudioSegment.silent(duration=max_end_time + 1000)  # Add 1 second buffer
+
+            # Add each segment at its proper time
+            for segment in audio_segments:
+                segment_audio = AudioSegment.from_file(segment['file'])
+                final_audio = final_audio.overlay(segment_audio, position=segment['start'])
+
+            # Export the combined audio
+            final_audio.export(audio_path, format="mp3")
+
+            # Generate subtitles if requested
+            if generate_subtitles:
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
+                    subtitle_path = srt_file.name
+                with open(subtitle_path, "w", encoding="utf-8") as f:
+                    for i, entry in enumerate(timing_data):
+                        f.write(f"{i+1}\n")
+                        f.write(f"{format_time(entry['start'])} --> {format_time(entry['end'])}\n")
+                        f.write(f"{entry['text']}\n\n")
+    else:
+        # Use the existing approach for regular text
+        communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
+
+        if generate_subtitles:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
+                subtitle_path = srt_file.name
 
-            current_phrase.append(boundary)
+            # Generate audio and collect word boundary data
+            async def process_audio():
+                word_boundaries = []
+                async for chunk in communicate.stream():
+                    if chunk["type"] == "audio":
+                        with open(audio_path, "ab") as audio_file:
+                            audio_file.write(chunk["data"])
+                    elif chunk["type"] == "WordBoundary":
+                        word_boundaries.append(chunk)
+                return word_boundaries
 
-            if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
-                current_text = current_text.rstrip() + word + " "
-            else:
-                current_text += word + " "
+            word_boundaries = await process_audio()
 
-            # Determine if we should end this phrase and start a new one
-            should_break = False
+            # Group words into sensible phrases/sentences for subtitles
+            phrases = []
+            current_phrase = []
+            current_text = ""
+            phrase_start = 0
 
-            # Break on punctuation
-            if word.endswith(('.', '!', '?', ':', ';', ',')) or i == len(word_boundaries) - 1:
-                should_break = True
+            for i, boundary in enumerate(word_boundaries):
+                word = boundary["text"]
+                start_time = boundary["offset"] / 10000
+                duration = boundary["duration"] / 10000
+                end_time = start_time + duration
+
+                if not current_phrase:
+                    phrase_start = start_time
+
+                current_phrase.append(boundary)
+
+                if word in ['.', ',', '!', '?', ':', ';'] or word.startswith(('.', ',', '!', '?', ':', ';')):
+                    current_text = current_text.rstrip() + word + " "
+                else:
+                    current_text += word + " "
 
-            # Break after a certain number of words (4-5 is typical for subtitles)
-            elif len(current_phrase) >= 5:
-                should_break = True
+                # Determine if we should end this phrase and start a new one
+                should_break = False
 
-            # Break on long pause (more than 300ms between words)
-            elif i < len(word_boundaries) - 1:
-                next_start = word_boundaries[i + 1]["offset"] / 10000
-                if next_start - end_time > 300:
+                # Break on punctuation
+                if word.endswith(('.', '!', '?', ':', ';', ',')) or i == len(word_boundaries) - 1:
                     should_break = True
+
+                # Break after a certain number of words (4-5 is typical for subtitles)
+                elif len(current_phrase) >= 5:
+                    should_break = True
+
+                # Break on long pause (more than 300ms between words)
+                elif i < len(word_boundaries) - 1:
+                    next_start = word_boundaries[i + 1]["offset"] / 10000
+                    if next_start - end_time > 300:
+                        should_break = True
 
-            if should_break or i == len(word_boundaries) - 1:
-                if current_phrase:
-                    last_boundary = current_phrase[-1]
-                    phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
-                    phrases.append({
-                        "text": current_text.strip(),
-                        "start": phrase_start,
-                        "end": phrase_end
-                    })
-                    current_phrase = []
-                    current_text = ""
-
-        # Write phrases to SRT file
-        with open(subtitle_path, "w", encoding="utf-8") as srt_file:
-            for i, phrase in enumerate(phrases):
-                # Write SRT entry
-                srt_file.write(f"{i+1}\n")
-                srt_file.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
-                srt_file.write(f"{phrase['text']}\n\n")
-    else:
-        # Just generate audio
-        await communicate.save(audio_path)
+                if should_break or i == len(word_boundaries) - 1:
+                    if current_phrase:
+                        last_boundary = current_phrase[-1]
+                        phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
+                        phrases.append({
+                            "text": current_text.strip(),
+                            "start": phrase_start,
+                            "end": phrase_end
+                        })
+                        current_phrase = []
+                        current_text = ""
+
+            # Write phrases to SRT file
+            with open(subtitle_path, "w", encoding="utf-8") as srt_file:
+                for i, phrase in enumerate(phrases):
+                    # Write SRT entry
+                    srt_file.write(f"{i+1}\n")
+                    srt_file.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
+                    srt_file.write(f"{phrase['text']}\n\n")
 
     return audio_path, subtitle_path, None
 
 
-async def tts_interface(text, voice, rate, pitch, generate_subtitles):
-    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, generate_subtitles)
+async def tts_interface(text, voice, rate, pitch, generate_subtitles, uploaded_file=None):
+    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, generate_subtitles, uploaded_file)
     if warning:
         return audio, subtitle, gr.Warning(warning)
     return audio, subtitle, None
@@ -141,39 +319,68 @@ async def create_demo():
 
     **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""
 
-    demo = gr.Interface(
-        fn=tts_interface,
-        inputs=[
-            gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!"),
-            gr.Dropdown(
-                choices=[""] + list(voices.keys()),
-                label="Select Voice",
-                value=list(voices.keys())[0] if voices else "",
-            ),
-            gr.Slider(
-                minimum=-50,
-                maximum=50,
-                value=0,
-                label="Speech Rate Adjustment (%)",
-                step=1,
-            ),
-            gr.Slider(
-                minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
-            ),
-            gr.Checkbox(label="Generate Subtitles (.srt)", value=False),
-        ],
-        outputs=[
+    features = """
+    ## ✨ Latest Features
+    - **SRT Subtitle Support**: Upload SRT files or input SRT format text to generate perfectly synchronized speech
+    - **SRT Generation**: Create subtitle files alongside your audio for perfect timing
+    - **File Upload**: Easily upload TXT or SRT files for conversion
+    - **Smart Format Detection**: Automatically detects plain text or SRT subtitle format
+    """
+
+    with gr.Blocks(title="Edge TTS Text-to-Speech", analytics_enabled=False) as demo:
+        gr.Markdown("# Edge TTS Text-to-Speech Converter")
+        gr.Markdown(description)
+        gr.Markdown(features)
+
+        with gr.Row():
+            with gr.Column(scale=3):
+                text_input = gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!")
+                file_input = gr.File(label="Or upload a TXT/SRT file", file_types=[".txt", ".srt"])
+
+            with gr.Column(scale=2):
+                voice_dropdown = gr.Dropdown(
+                    choices=[""] + list(voices.keys()),
+                    label="Select Voice",
+                    value=list(voices.keys())[0] if voices else "",
+                )
+                rate_slider = gr.Slider(
+                    minimum=-50,
+                    maximum=50,
+                    value=0,
+                    label="Speech Rate Adjustment (%)",
+                    step=1,
+                )
+                pitch_slider = gr.Slider(
+                    minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
+                )
+                subtitle_checkbox = gr.Checkbox(label="Generate Subtitles (.srt)", value=False)
+
+        submit_btn = gr.Button("Convert to Speech", variant="primary")
+        warning_md = gr.Markdown(visible=False)
+
+        outputs = [
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.File(label="Generated Subtitles"),
-            gr.Markdown(label="Warning", visible=False),
-        ],
-        title="Edge TTS Text-to-Speech",
-        description=description,
-        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
-        analytics_enabled=False,
-        flagging_mode="manual",
-        api_name="predict",
-    )
+            warning_md
+        ]
+
+        # Handle file upload to update text
+        file_input.change(
+            fn=update_text_from_file,
+            inputs=[file_input],
+            outputs=[text_input, warning_md]
+        )
+
+        # Handle submit button
+        submit_btn.click(
+            fn=tts_interface,
+            api_name="predict",
+            inputs=[text_input, voice_dropdown, rate_slider, pitch_slider, subtitle_checkbox, file_input],
+            outputs=outputs
+        )
+
+        gr.Markdown("Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!")
+
     return demo
 
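For a quick check of the SRT helpers this commit introduces, here is a minimal sketch that feeds a small SRT snippet through time_to_ms and parse_srt_content. It assumes the two functions can be imported from app.py (or pasted into a scratch file) without launching the Gradio app:

    # Minimal sketch exercising the new SRT helpers (assumed importable from app.py).
    from app import time_to_ms, parse_srt_content

    sample_srt = """1
    00:00:01,000 --> 00:00:02,500
    Hello there

    2
    00:00:03,000 --> 00:00:04,000
    How are you?
    """

    print(time_to_ms("00:00:02,500"))   # 2500 (milliseconds)

    text, timing = parse_srt_content(sample_srt)
    print(text)       # "Hello there How are you?"
    print(timing[0])  # {'text': 'Hello there', 'start': 1000, 'end': 2500}

For SRT input, the new text_to_speech branch assembles the final track with pydub by overlaying each per-entry clip onto a silent timeline at its SRT start offset. A stripped-down sketch of that idea, with hypothetical segment files and timings (pydub needs ffmpeg available on PATH):

    # Sketch of the overlay-on-silence timing approach used in the SRT branch.
    # The segment file names and timings below are hypothetical stand-ins.
    from pydub import AudioSegment

    entries = [
        {"file": "segment_0.mp3", "start": 1000},  # start offsets in milliseconds
        {"file": "segment_1.mp3", "start": 3000},
    ]
    last_end_ms = 4000  # end time of the final SRT entry

    timeline = AudioSegment.silent(duration=last_end_ms + 1000)  # 1 s tail buffer, as in the diff
    for entry in entries:
        clip = AudioSegment.from_file(entry["file"])
        timeline = timeline.overlay(clip, position=entry["start"])

    timeline.export("combined.mp3", format="mp3")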