NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on May 1

Commit

555abcf

verified ·

1 Parent(s): b184cb6

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -32

app.py CHANGED Viewed

@@ -112,32 +112,6 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
         return audio_path
     return None
-async def process_transcript_line(line, default_voice, rate, pitch):
-    """Processes a single transcript line with timestamp and quoted text segments."""
-    match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', line)
-    if match:
-        minutes, seconds, milliseconds_str, text_parts = match.groups()
-        start_time_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(milliseconds_str) * 10 if milliseconds_str else 0)
-        audio_segments = []
-        split_parts = re.split(r'(")', text_parts)  # Split by quote marks, keeping the quotes
-        process_next = False
-        for part in split_parts:
-            if part == '"':
-                process_next = not process_next
-                continue
-            if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
-                if audio_path:
-                    audio_segments.append(audio_path)
-            elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch) # Process unquoted text with default voice
-                if audio_path:
-                    audio_segments.append(audio_path)
-        return start_time_ms, audio_segments
-    return None, None
 async def process_transcript_line(line, default_voice, rate, pitch):
     """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
@@ -169,6 +143,49 @@ async def process_transcript_line(line, default_voice, rate, pitch):
         return start_time_ms, audio_segments
     return None, None
 @spaces.GPU
 def tts_interface(transcript, voice, rate, pitch):
     audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
@@ -178,18 +195,18 @@ async def create_demo():
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
-    Process timestamped text with voice changes within quotes.
-    Format: `minutes:seconds[.milliseconds] "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
-    0:00 "This is the default voice." more default. "1F Now a female voice." and back to default.
-    0:05 "1C Yes," said the child, "it is fun!"
     ```
     """
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
-            gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='0:00 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -198,7 +215,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="TTS with Timestamp and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False

         return audio_path
     return None
 async def process_transcript_line(line, default_voice, rate, pitch):
     """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
         return start_time_ms, audio_segments
     return None, None
+async def transcript_to_speech(transcript_text, voice, rate, pitch):
+    if not transcript_text.strip():
+        return None, gr.Warning("Please enter transcript text.")
+    if not voice:
+        return None, gr.Warning("Please select a voice.")
+    lines = transcript_text.strip().split('\n')
+    timed_audio_segments = []
+    max_end_time_ms = 0
+    for line in lines:
+        start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
+        if start_time is not None and audio_paths:
+            combined_line_audio = AudioSegment.empty()
+            for path in audio_paths:
+                try:
+                    audio = AudioSegment.from_mp3(path)
+                    combined_line_audio += audio
+                    os.remove(path)
+                except FileNotFoundError:
+                    print(f"Warning: Audio file not found: {path}")
+            if combined_line_audio:
+                timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
+                max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
+        elif audio_paths:
+            for path in audio_paths:
+                try:
+                    os.remove(path)
+                except FileNotFoundError:
+                    pass # Clean up even if no timestamp
+    if not timed_audio_segments:
+        return None, "No processable audio segments found."
+    final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
+    for segment in timed_audio_segments:
+        final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
+    combined_audio_path = tempfile.mktemp(suffix=".mp3")
+    final_audio.export(combined_audio_path, format="mp3")
+    return combined_audio_path, None
 @spaces.GPU
 def tts_interface(transcript, voice, rate, pitch):
     audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
+    Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
+    Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
+    00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
+    00:00:05.000 "1C Yes," said the child, "it is fun!"
     ```
     """
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
+            gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False