NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on May 11

Commit

e4c6d2d

verified ·

1 Parent(s): c411b7a

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -33

app.py CHANGED Viewed

@@ -102,29 +102,20 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 audio_path = tmp_file.name
                 await communicate.save(audio_path)
-            if target_duration_ms is not None and os.path.exists(audio_path):
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
                 #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
-                if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
                     speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
                     #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
                     if speed_factor > 0:
                         if speed_factor < 1.0:
                             speed_factor = 1.0
-                        #y, sr = librosa.load(audio_path, sr=None)
-                        # Load audio file
                         audio = AudioSegment.from_file(audio_path)
-                        # Apply time-stretching
                         audio_stretched = audio.speedup(playback_speed=speed_factor)
-                        # Save the stretched audio
                         audio_stretched.export(audio_path, format="mp3")
-                        #y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
-                        #sf.write(audio_path, y_stretched, sr)
                 else:
                     print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
             return audio_path
@@ -133,24 +124,21 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             return None
     return None
-async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
-    """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
-    match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
-        start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms, text_parts = match.groups()
         start_time_ms = (
             int(start_h) * 3600000 +
             int(start_m) * 60000 +
             int(start_s) * 1000 +
             int(start_ms)
         )
-        end_time_ms = (
-            int(end_h) * 3600000 +
-            int(end_m) * 60000 +
-            int(end_s) * 1000 +
-            int(end_ms)
-        )
-        duration_ms = end_time_ms - start_time_ms
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
         process_next = False
@@ -177,12 +165,22 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
-    for line in lines:
-        start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
-            current_time_ms = start_time
-            segment_duration = duration / len(audio_paths) if audio_paths else 0
             for path in audio_paths:
                 if path:  # Only process if audio_path is not None (meaning TTS was successful)
                     try:
@@ -201,11 +199,14 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
                         os.remove(path)
                     except FileNotFoundError:
                         pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None
@@ -219,14 +220,16 @@ async def create_demo():
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
-    Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
-    The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
     You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
-    Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
-    00:00:00,000 - 00:00:05,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
-    00:00:05,500 - 00:00:10,250 "1C Yes," said the child, "it is fun!"
     ```
     ***************************************************************************************************
     1M = en-AU-WilliamNeural - en-AU (Male)
@@ -248,7 +251,7 @@ async def create_demo():
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
-            gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
@@ -258,7 +261,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="TTS with Duration-Aware Speed Adjustment and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False

             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 audio_path = tmp_file.name
                 await communicate.save(audio_path)
+            if target_duration_ms is not None and os.path.exists(audio_path) and target_duration_ms > 0:
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
                 #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
+                if audio_duration_ms > target_duration_ms:
                     speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
                     #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
                     if speed_factor > 0:
                         if speed_factor < 1.0:
                             speed_factor = 1.0
                         audio = AudioSegment.from_file(audio_path)
                         audio_stretched = audio.speedup(playback_speed=speed_factor)
                         audio_stretched.export(audio_path, format="mp3")
                 else:
                     print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
             return audio_path
             return None
     return None
+async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, speed_adjustment_factor):
+    """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
+    match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
+        start_h, start_m, start_s, start_ms, text_parts = match.groups()
         start_time_ms = (
             int(start_h) * 3600000 +
             int(start_m) * 60000 +
             int(start_s) * 1000 +
             int(start_ms)
         )
+        duration_ms = None
+        if next_line_start_time is not None:
+            duration_ms = next_line_start_time - start_time_ms
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
         process_next = False
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
+    for i, line in enumerate(lines):
+        next_line_start_time = None
+        if i < len(lines) - 1:
+            next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i+1])
+            if next_line_match:
+                nh, nm, ns, nms = next_line_match.groups()
+                next_line_start_time = (
+                    int(nh) * 3600000 +
+                    int(nm) * 60000 +
+                    int(ns) * 1000 +
+                    int(nms)
+                )
+        start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             for path in audio_paths:
                 if path:  # Only process if audio_path is not None (meaning TTS was successful)
                     try:
                         os.remove(path)
                     except FileNotFoundError:
                         pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
+    Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
+    The duration for each segment is determined by the timestamp of the following line.
+    The speed of the generated audio will be adjusted to fit within this duration.
+    If there is no subsequent timestamp, the speed adjustment will be skipped.
     You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
+    Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
+    00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
+    00:00:05,500 "1C Yes," said the child, "it is fun!"
     ```
     ***************************************************************************************************
     1M = en-AU-WilliamNeural - en-AU (Male)
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
+            gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="TTS with Dynamic Duration and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False