cnph001 committed on
Commit
555abcf
·
verified ·
1 Parent(s): b184cb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -32
app.py CHANGED
@@ -112,32 +112,6 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
112
  return audio_path
113
  return None
114
 
115
- async def process_transcript_line(line, default_voice, rate, pitch):
116
- """Processes a single transcript line with timestamp and quoted text segments."""
117
- match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', line)
118
- if match:
119
- minutes, seconds, milliseconds_str, text_parts = match.groups()
120
- start_time_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(milliseconds_str) * 10 if milliseconds_str else 0)
121
- audio_segments = []
122
- split_parts = re.split(r'(")', text_parts) # Split by quote marks, keeping the quotes
123
-
124
- process_next = False
125
- for part in split_parts:
126
- if part == '"':
127
- process_next = not process_next
128
- continue
129
- if process_next and part.strip():
130
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
131
- if audio_path:
132
- audio_segments.append(audio_path)
133
- elif not process_next and part.strip():
134
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch) # Process unquoted text with default voice
135
- if audio_path:
136
- audio_segments.append(audio_path)
137
-
138
- return start_time_ms, audio_segments
139
- return None, None
140
-
141
  async def process_transcript_line(line, default_voice, rate, pitch):
142
  """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
143
  match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
@@ -169,6 +143,49 @@ async def process_transcript_line(line, default_voice, rate, pitch):
169
  return start_time_ms, audio_segments
170
  return None, None
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  @spaces.GPU
173
  def tts_interface(transcript, voice, rate, pitch):
174
  audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
@@ -178,18 +195,18 @@ async def create_demo():
178
  voices = await get_voices()
179
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
180
  description = """
181
- Process timestamped text with voice changes within quotes.
182
- Format: `minutes:seconds[.milliseconds] "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
183
  Example:
184
  ```
185
- 0:00 "This is the default voice." more default. "1F Now a female voice." and back to default.
186
- 0:05 "1C Yes," said the child, "it is fun!"
187
  ```
188
  """
189
  demo = gr.Interface(
190
  fn=tts_interface,
191
  inputs=[
192
- gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='0:00 "Text" more text "1F Different Voice"'),
193
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
194
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
195
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -198,7 +215,7 @@ async def create_demo():
198
  gr.Audio(label="Generated Audio", type="filepath"),
199
  gr.Markdown(label="Warning", visible=False)
200
  ],
201
- title="TTS with Timestamp and In-Quote Voice Switching",
202
  description=description,
203
  analytics_enabled=False,
204
  allow_flagging=False
 
112
  return audio_path
113
  return None
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  async def process_transcript_line(line, default_voice, rate, pitch):
116
  """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
117
  match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
 
143
  return start_time_ms, audio_segments
144
  return None, None
145
 
146
def _remove_quietly(path):
    """Delete *path*, ignoring the case where the file is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


async def transcript_to_speech(transcript_text, voice, rate, pitch):
    """Render a timestamped transcript into a single MP3 file.

    Each line of ``transcript_text`` is handed to ``process_transcript_line``,
    which yields a start offset in milliseconds and the paths of the audio
    clips generated for that line. The clips of a line are concatenated and
    the result is overlaid onto a silent track at the line's start offset.

    Args:
        transcript_text: Multi-line transcript; one timestamped entry per line.
        voice: Default voice passed through to ``process_transcript_line``.
        rate: Speech-rate adjustment passed through unchanged.
        pitch: Pitch adjustment passed through unchanged.

    Returns:
        A ``(audio_path, warning)`` tuple. ``audio_path`` is the path of the
        exported MP3, or ``None`` when nothing could be produced; ``warning``
        is ``None`` on success.
    """
    # NOTE(review): the two guards below return whatever gr.Warning(...)
    # evaluates to, while the "no segments" case returns a plain string;
    # kept as-is because the caller's handling of `warning` is not visible here.
    if not transcript_text.strip():
        return None, gr.Warning("Please enter transcript text.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    timed_audio_segments = []
    max_end_time_ms = 0

    for line in transcript_text.strip().split('\n'):
        start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)

        if start_time is not None and audio_paths:
            combined_line_audio = AudioSegment.empty()
            try:
                for path in audio_paths:
                    try:
                        combined_line_audio += AudioSegment.from_mp3(path)
                    except FileNotFoundError:
                        print(f"Warning: Audio file not found: {path}")
            finally:
                # Always discard the per-segment files, even when decoding
                # one of them raises, so failed runs do not leak temp files.
                for path in audio_paths:
                    _remove_quietly(path)

            # len() of an AudioSegment is its duration in milliseconds.
            if len(combined_line_audio) > 0:
                timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
        elif audio_paths:
            # Clips were produced but there is no usable timestamp: clean up only.
            for path in audio_paths:
                _remove_quietly(path)

    if not timed_audio_segments:
        return None, "No processable audio segments found."

    # Silent base track long enough to hold the latest-ending line.
    final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
    for segment in timed_audio_segments:
        final_audio = final_audio.overlay(segment['audio'], position=segment['start'])

    # tempfile.mktemp() only invents a name and is open to a race; create the
    # file atomically instead and close it before the exporter writes to it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
        combined_audio_path = tmp_file.name
    final_audio.export(combined_audio_path, format="mp3")
    return combined_audio_path, None
188
+
189
  @spaces.GPU
190
  def tts_interface(transcript, voice, rate, pitch):
191
  audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
 
195
  voices = await get_voices()
196
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
197
  description = """
198
+ Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
199
+ Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
200
  Example:
201
  ```
202
+ 00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
203
+ 00:00:05.000 "1C Yes," said the child, "it is fun!"
204
  ```
205
  """
206
  demo = gr.Interface(
207
  fn=tts_interface,
208
  inputs=[
209
+ gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
210
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
211
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
212
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
 
215
  gr.Audio(label="Generated Audio", type="filepath"),
216
  gr.Markdown(label="Warning", visible=False)
217
  ],
218
+ title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
219
  description=description,
220
  analytics_enabled=False,
221
  allow_flagging=False