Edge_TTS_NGHIA_transcript

Sleeping

App Files Files Community

cnph001 committed on May 13

Commit

e94f4f0

verified ·

1 Parent(s): 946571e

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -9

app.py CHANGED Viewed

@@ -202,8 +202,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     timed_audio_segments = []
     max_end_time_ms = 0
     previous_end_time_ms = 0
-    previous_start_time_ms = 0 # Keep track of the *start* time of the previous segment
-    next_start_time_ms = None # Keep track of the *start* time of the next segment
     for i, line in enumerate(lines):
         start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
@@ -221,15 +220,56 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
                 current_audio_duration = len(combined_line_audio)
                 intended_start_time = start_time
-                if i > 0:
-                    time_difference = start_time - previous_start_time_ms
-                    if current_audio_duration > time_difference:
-                        intended_start_time = previous_end_time_ms
                 timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
-                previous_start_time_ms = start_time #update previous start time
                 previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
                 max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
         elif audio_paths:
@@ -305,4 +345,3 @@ async def create_demo():
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
     demo.launch()

     timed_audio_segments = []
     max_end_time_ms = 0
     previous_end_time_ms = 0
+    next_start_time_ms = None
     for i, line in enumerate(lines):
         start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
                 current_audio_duration = len(combined_line_audio)
                 intended_start_time = start_time
+                # Get next start time for comparison
+                if i + 1 < len(lines):
+                    next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
+                    if next_line_match:
+                        next_h, next_m, next_s, next_ms = next_line_match.groups()
+                        next_start_time_ms = (
+                            int(next_h) * 3600000 +
+                            int(next_m) * 60000 +
+                            int(next_s) * 1000 +
+                            int(next_ms)
+                        )
+                    else:
+                        next_start_time_ms = None
+                else:
+                    next_start_time_ms = None
+                # Combine audio segments if current audio is longer than the time difference
+                while next_start_time_ms and current_audio_duration > (next_start_time_ms - start_time):
+                    if i + 1 < len(lines):
+                        next_start_time, next_audio_paths = await process_transcript_line(lines[i + 1], voice, rate, pitch)
+                        if next_start_time is not None and next_audio_paths:
+                            for next_path in next_audio_paths:
+                                try:
+                                    next_audio = AudioSegment.from_mp3(next_path)
+                                    combined_line_audio += next_audio
+                                    os.remove(next_path)
+                                except FileNotFoundError:
+                                    print(f"Warning: Audio file not found: {next_path}")
+                            current_audio_duration = len(combined_line_audio)
+                            i += 1  # Move to the next line
+                            if i + 1 < len(lines):
+                                next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
+                                if next_line_match:
+                                    next_h, next_m, next_s, next_ms = next_line_match.groups()
+                                    next_start_time_ms = (
+                                        int(next_h) * 3600000 +
+                                        int(next_m) * 60000 +
+                                        int(next_s) * 1000 +
+                                        int(next_ms)
+                                    )
+                                else:
+                                    next_start_time_ms = None
+                            else:
+                                next_start_time_ms = None
+                        else:
+                            break  # Exit the loop if there are no more processable lines
+                    else:
+                        break
                 timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
                 previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
                 max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
         elif audio_paths:
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
     demo.launch()