cnph001 committed on
Commit
0910ca5
·
verified ·
1 Parent(s): e94f4f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -50
app.py CHANGED
@@ -202,7 +202,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
202
  timed_audio_segments = []
203
  max_end_time_ms = 0
204
  previous_end_time_ms = 0
205
- next_start_time_ms = None
 
206
 
207
  for i, line in enumerate(lines):
208
  start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
@@ -220,56 +221,13 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
220
  current_audio_duration = len(combined_line_audio)
221
  intended_start_time = start_time
222
 
223
- # Get next start time for comparison
224
- if i + 1 < len(lines):
225
- next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
226
- if next_line_match:
227
- next_h, next_m, next_s, next_ms = next_line_match.groups()
228
- next_start_time_ms = (
229
- int(next_h) * 3600000 +
230
- int(next_m) * 60000 +
231
- int(next_s) * 1000 +
232
- int(next_ms)
233
- )
234
- else:
235
- next_start_time_ms = None
236
- else:
237
- next_start_time_ms = None
238
-
239
- # Combine audio segments if current audio is longer than the time difference
240
- while next_start_time_ms and current_audio_duration > (next_start_time_ms - start_time):
241
- if i + 1 < len(lines):
242
- next_start_time, next_audio_paths = await process_transcript_line(lines[i + 1], voice, rate, pitch)
243
- if next_start_time is not None and next_audio_paths:
244
- for next_path in next_audio_paths:
245
- try:
246
- next_audio = AudioSegment.from_mp3(next_path)
247
- combined_line_audio += next_audio
248
- os.remove(next_path)
249
- except FileNotFoundError:
250
- print(f"Warning: Audio file not found: {next_path}")
251
- current_audio_duration = len(combined_line_audio)
252
- i += 1 # Move to the next line
253
- if i + 1 < len(lines):
254
- next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
255
- if next_line_match:
256
- next_h, next_m, next_s, next_ms = next_line_match.groups()
257
- next_start_time_ms = (
258
- int(next_h) * 3600000 +
259
- int(next_m) * 60000 +
260
- int(next_s) * 1000 +
261
- int(next_ms)
262
- )
263
- else:
264
- next_start_time_ms = None
265
- else:
266
- next_start_time_ms = None
267
- else:
268
- break # Exit the loop if there are no more processable lines
269
- else:
270
- break
271
 
272
  timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
 
273
  previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
274
  max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
275
  elif audio_paths:
@@ -300,7 +258,7 @@ async def create_demo():
300
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
301
  description = """
302
  Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
303
- Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
304
  Example:
305
  ```
306
  00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
 
202
  timed_audio_segments = []
203
  max_end_time_ms = 0
204
  previous_end_time_ms = 0
205
+ next_start_time_ms = None # Keep track of the start time of the *next* segment
206
+ previous_start_time_ms = 0
207
 
208
  for i, line in enumerate(lines):
209
  start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
 
221
  current_audio_duration = len(combined_line_audio)
222
  intended_start_time = start_time
223
 
224
+ if i > 0:
225
+ time_difference = start_time - previous_start_time_ms
226
+ if current_audio_duration > time_difference:
227
+ intended_start_time = previous_end_time_ms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
230
+ previous_start_time_ms = start_time
231
  previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
232
  max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
233
  elif audio_paths:
 
258
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
259
  description = """
260
  Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
261
+ Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "1F Different Voice"
262
  Example:
263
  ```
264
  00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.