Edge_TTS_NGHIA_transcript

Sleeping

App Files Files Community

cnph001 committed on May 13

Commit

cdec9da

verified ·

1 Parent(s): f18fa5d

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -56

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
-## fix overlap, remove silence, leave a tiny bit of silence
 ## Simplified
-## Permanent voice change implemented
 import spaces
 import gradio as gr
@@ -13,6 +12,9 @@ from pathlib import Path
 from pydub.silence import detect_nonsilent
 from pydub import AudioSegment
 def strip_silence(audio: AudioSegment, silence_thresh=-40, min_silence_len=100, silence_padding_ms=100):
     from pydub.silence import detect_nonsilent
     # Detect non-silent regions
@@ -62,12 +64,9 @@ async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-## EDIT
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
-    """Generates audio for a text segment, handling permanent and temporary voice changes."""
-    # Define the voice map for reference
     voice_map = {
         "1F": ("en-GB-SoniaNeural", 25, 0),
         "2F": ("en-US-JennyNeural", 0, 0),
@@ -77,69 +76,52 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
         "2M": ("en-GB-RyanNeural", 0, 0),
         "3M": ("en-US-BrianMultilingualNeural", 0, 0),
         "4M": ("en-GB-ThomasNeural", 0, 0),
-        "1O": ("en-GB-RyanNeural", -20, -10),  # Old man
-        "1C": ("en-GB-MaisieNeural", 0, 0),   # Child
         "1V": ("vi-VN-HoaiMyNeural", 0, 0),
         "2V": ("vi-VN-NamMinhNeural", 0, 0),
         "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
         "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
     }
-    # Initialize current voice and processing variables
-    current_voice_full = default_voice
-    current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
-    # Track permanent voice and temporary changes
-    permanent_voice = current_voice_short
-    temp_voice = None
-    # We'll process the text and adjust voices accordingly
-    result = []
-    idx = 0
-    while idx < len(processed_text):
-        # Detect potential voice change
-        match = re.match(r"(1F|2F|3F|4F|1M|2M|3M|4M|1O|1C|1V|2V|3V|4V)(P?)(-?\d+)?", processed_text[idx:])
-        if match:
-            prefix = match.group(1)
-            permanent_flag = match.group(2) == 'P'  # Check if it's a permanent change
-            pitch_modifier = match.group(3)  # This will be None or a number
-            if permanent_flag:
-                # Permanent voice change (e.g., "4VP")
-                permanent_voice, pitch_adj, rate_adj = voice_map[prefix]
-                current_pitch += pitch_adj
-                current_rate += rate_adj
-                result.append(f"<perm>{prefix}P")  # Mark as permanent change
-                temp_voice = None  # Clear temporary voice changes
-            elif pitch_modifier:
-                # Temporary pitch adjustment (e.g., "4V-10" or "4V+5")
-                pitch_adjustment = int(pitch_modifier)
-                current_pitch += pitch_adjustment
-                result.append(f"<temp>{prefix}{pitch_modifier}")  # Mark as temporary change
-            # Move index forward past the match
-            idx += len(match.group(0))
-            continue
-        # If no match, just add the normal text character
-        result.append(processed_text[idx])
-        idx += 1
-    # Rebuild the text with permanent and temporary voice marks
-    final_processed_text = ''.join(result).strip()
-    if final_processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
-        # Retry logic for TTS
         for attempt in range(3):
             try:
-                communicate = edge_tts.Communicate(final_processed_text, permanent_voice, rate=rate_str, pitch=pitch_str)
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                     audio_path = tmp_file.name
                     await communicate.save(audio_path)
@@ -151,16 +133,16 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
                 audio.export(stripped_path, format="mp3")
                 return stripped_path
             except Exception as e:
                 if attempt == 2:
                     # Final failure: return 500ms of silence
                     silent_audio = AudioSegment.silent(duration=500)
                     fallback_path = tempfile.mktemp(suffix=".mp3")
                     silent_audio.export(fallback_path, format="mp3")
                     return fallback_path
-                await asyncio.sleep(0.5)  # Retry after brief pause
     return None
-### END EDIT
 async def process_transcript_line(line, default_voice, rate, pitch):
     """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
@@ -318,6 +300,7 @@ async def create_demo():
     2V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
     3V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
     4V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
     ****************************************************************************************************
     """
     demo = gr.Interface(

+##fix overlap, remove silence, leave a tiny bit of silence
 ## Simplified
 import spaces
 import gradio as gr
 from pydub.silence import detect_nonsilent
 from pydub import AudioSegment
+flagpermanent = False
+default_voice_short= ""
 def strip_silence(audio: AudioSegment, silence_thresh=-40, min_silence_len=100, silence_padding_ms=100):
     from pydub.silence import detect_nonsilent
     # Detect non-silent regions
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
+    """Generates audio for a text segment, handling voice prefixes, retries, and fallback."""
+    print(f"Text: {text_segment}")  #Debug
     voice_map = {
         "1F": ("en-GB-SoniaNeural", 25, 0),
         "2F": ("en-US-JennyNeural", 0, 0),
         "2M": ("en-GB-RyanNeural", 0, 0),
         "3M": ("en-US-BrianMultilingualNeural", 0, 0),
         "4M": ("en-GB-ThomasNeural", 0, 0),
+        "1O": ("en-GB-RyanNeural", -20, -10),
+        "1C": ("en-GB-MaisieNeural", 0, 0),
         "1V": ("vi-VN-HoaiMyNeural", 0, 0),
         "2V": ("vi-VN-NamMinhNeural", 0, 0),
         "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
         "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
     }
+    if default_voice_short == "":
+        current_voice_full = default_voice
+        current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
+    else:
+        current_voice_short = default_voice_short
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
+    detect = False
+    prefix = processed_text[:2]
+    if prefix in voice_map:
+        current_voice_short, pitch_adj, rate_adj = voice_map[prefix]
+        current_pitch += pitch_adj
+        current_rate += rate_adj
+        detect = True
+    match = re.search(r'[A-Za-z]+\-?\d+', processed_text)
+    if match:
+        group = match.group()
+        prefix_only = ''.join(filter(str.isalpha, group))
+        number = int(''.join(ch for ch in group if ch.isdigit() or ch == '-'))
+        if number=0:
+            default_voice_short= current_voice_short
+        current_pitch += number
+        processed_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip()
+        processed_text = processed_text[len(prefix_only):].strip()
+    elif detect:
+        processed_text = processed_text[2:].strip()
+    if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
+        # Retry logic
         for attempt in range(3):
             try:
+                communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                     audio_path = tmp_file.name
                     await communicate.save(audio_path)
                 audio.export(stripped_path, format="mp3")
                 return stripped_path
             except Exception as e:
+                print(f"Edge TTS Failed# {attempt}:: {e}")  #Debug
                 if attempt == 2:
                     # Final failure: return 500ms of silence
                     silent_audio = AudioSegment.silent(duration=500)
                     fallback_path = tempfile.mktemp(suffix=".mp3")
                     silent_audio.export(fallback_path, format="mp3")
                     return fallback_path
+                await asyncio.sleep(0.5)  # brief wait before retry
     return None
 async def process_transcript_line(line, default_voice, rate, pitch):
     """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
     2V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
     3V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
     4V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
+    Add 0 after Prefix to make it permanent voice
     ****************************************************************************************************
     """
     demo = gr.Interface(