Update app.py
app.py CHANGED
@@ -202,7 +202,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     timed_audio_segments = []
     max_end_time_ms = 0
     previous_end_time_ms = 0
-    next_start_time_ms = None
+    next_start_time_ms = None  # Keep track of the start time of the *next* segment
+    previous_start_time_ms = 0
 
     for i, line in enumerate(lines):
         start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
@@ -220,56 +221,13 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
             current_audio_duration = len(combined_line_audio)
             intended_start_time = start_time
 
-
-            if i + 1 < len(lines):
-                next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
-                if next_line_match:
-                    next_h, next_m, next_s, next_ms = next_line_match.groups()
-                    next_start_time_ms = (
-                        int(next_h) * 3600000 +
-                        int(next_m) * 60000 +
-                        int(next_s) * 1000 +
-                        int(next_ms)
-                    )
-                else:
-                    next_start_time_ms = None
-            else:
-                next_start_time_ms = None
-
-            # Combine audio segments if current audio is longer than the time difference
-            while next_start_time_ms and current_audio_duration > (next_start_time_ms - start_time):
-                if i + 1 < len(lines):
-                    next_start_time, next_audio_paths = await process_transcript_line(lines[i + 1], voice, rate, pitch)
-                    if next_start_time is not None and next_audio_paths:
-                        for next_path in next_audio_paths:
-                            try:
-                                next_audio = AudioSegment.from_mp3(next_path)
-                                combined_line_audio += next_audio
-                                os.remove(next_path)
-                            except FileNotFoundError:
-                                print(f"Warning: Audio file not found: {next_path}")
-                        current_audio_duration = len(combined_line_audio)
-                        i += 1  # Move to the next line
-                        if i + 1 < len(lines):
-                            next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
-                            if next_line_match:
-                                next_h, next_m, next_s, next_ms = next_line_match.groups()
-                                next_start_time_ms = (
-                                    int(next_h) * 3600000 +
-                                    int(next_m) * 60000 +
-                                    int(next_s) * 1000 +
-                                    int(next_ms)
-                                )
-                            else:
-                                next_start_time_ms = None
-                        else:
-                            next_start_time_ms = None
-                    else:
-                        break  # Exit the loop if there are no more processable lines
-                else:
-                    break
+            if i > 0:
+                time_difference = start_time - previous_start_time_ms
+                if current_audio_duration > time_difference:
+                    intended_start_time = previous_end_time_ms
 
             timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
+            previous_start_time_ms = start_time
             previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
             max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
         elif audio_paths:
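The removed lines were a lookahead-and-merge loop; the added lines replace it with a simpler rule: if the current line's audio runs longer than the gap since the previous line started, the segment is pushed back to begin where the previous audio ended. A minimal standalone sketch of that rule (illustration only, not code from app.py; the segment names and durations below are made up):

```python
# Standalone sketch of the new placement rule (names and numbers are illustrative).
segments = [
    {"start_ms": 0,    "duration_ms": 4000},   # line 1
    {"start_ms": 3000, "duration_ms": 3500},   # line 2: longer than the 3000 ms gap
    {"start_ms": 9000, "duration_ms": 1000},   # line 3: fits without shifting
]

previous_start_ms = 0
previous_end_ms = 0
placed = []
for i, seg in enumerate(segments):
    intended_start = seg["start_ms"]
    # Same check as the added lines: compare the duration to the gap since the previous start.
    if i > 0 and seg["duration_ms"] > (seg["start_ms"] - previous_start_ms):
        intended_start = previous_end_ms  # start after the previous audio instead
    placed.append({"start": intended_start, "duration": seg["duration_ms"]})
    previous_start_ms = seg["start_ms"]
    previous_end_ms = max(previous_end_ms, intended_start + seg["duration_ms"])

print(placed)
# [{'start': 0, 'duration': 4000}, {'start': 4000, 'duration': 3500},
#  {'start': 9000, 'duration': 1000}]
```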
@@ -300,7 +258,7 @@ async def create_demo():
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
-    Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "
+    Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "1F Different Voice"
     Example:
     ```
     00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
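For reference, a line in the documented format can be pulled apart with the same HH:MM:SS,milliseconds pattern that appears in the diff; the quoted spans (optionally starting with a voice prefix such as 1F) are what switch voices. This is a quick standalone sketch, not the parsing actually done by process_transcript_line in app.py:

```python
import re

line = '00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.'

# HH:MM:SS,milliseconds prefix, same pattern used in the diff above.
match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
h, m, s, ms, text = match.groups()
start_ms = int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)

# Quoted spans switch voices; anything outside quotes stays on the default voice.
quoted_segments = re.findall(r'"([^"]*)"', text)

print(start_ms)         # 0
print(quoted_segments)  # ['This is the default voice.', '1F Now a female voice.']
```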