Edge_TTS_NGHIA_transcript

Sleeping

App Files Community

cnph001 committed on May 13

Commit

74db9d2

verified ·

1 Parent(s): c919734

Update app.py

Browse files

Add permanent voice change

Files changed (1) hide show

app.py +49 -28

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
-##fix overlap, remove silence, leave a tiny bit of silence
 ## Simplified
 import spaces
 import gradio as gr
@@ -62,8 +63,9 @@ async def get_voices():
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
-    """Generates audio for a text segment, handling voice prefixes, retries, and fallback."""
-    print(f"Text: {text_segment}")  #Debug
     voice_map = {
         "1F": ("en-GB-SoniaNeural", 25, 0),
         "2F": ("en-US-JennyNeural", 0, 0),
@@ -73,48 +75,68 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
         "2M": ("en-GB-RyanNeural", 0, 0),
         "3M": ("en-US-BrianMultilingualNeural", 0, 0),
         "4M": ("en-GB-ThomasNeural", 0, 0),
-        "1O": ("en-GB-RyanNeural", -20, -10),
-        "1C": ("en-GB-MaisieNeural", 0, 0),
         "1V": ("vi-VN-HoaiMyNeural", 0, 0),
         "2V": ("vi-VN-NamMinhNeural", 0, 0),
         "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
         "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
     }
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
-    detect = False
-    prefix = processed_text[:2]
-    if prefix in voice_map:
-        current_voice_short, pitch_adj, rate_adj = voice_map[prefix]
-        current_pitch += pitch_adj
-        current_rate += rate_adj
-        detect = True
-    match = re.search(r'[A-Za-z]+\-?\d+', processed_text)
-    if match:
-        group = match.group()
-        prefix_only = ''.join(filter(str.isalpha, group))
-        number = int(''.join(ch for ch in group if ch.isdigit() or ch == '-'))
-        current_pitch += number
-        processed_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip()
-        processed_text = processed_text[len(prefix_only):].strip()
-    elif detect:
-        processed_text = processed_text[2:].strip()
-    if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
-        # Retry logic
         for attempt in range(3):
             try:
-                communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                     audio_path = tmp_file.name
                     await communicate.save(audio_path)
@@ -126,14 +148,13 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
                 audio.export(stripped_path, format="mp3")
                 return stripped_path
             except Exception as e:
-                print(f"Edge TTS Failed# {attempt}:: {e}")  #Debug
                 if attempt == 2:
                     # Final failure: return 500ms of silence
                     silent_audio = AudioSegment.silent(duration=500)
                     fallback_path = tempfile.mktemp(suffix=".mp3")
                     silent_audio.export(fallback_path, format="mp3")
                     return fallback_path
-                await asyncio.sleep(0.5)  # brief wait before retry
     return None

+## fix overlap, remove silence, leave a tiny bit of silence
 ## Simplified
+## Permanent voice change implemented
 import spaces
 import gradio as gr
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
+    """Generates audio for a text segment, handling permanent and temporary voice changes with new rules."""
+    # Define the voice map for reference
     voice_map = {
         "1F": ("en-GB-SoniaNeural", 25, 0),
         "2F": ("en-US-JennyNeural", 0, 0),
         "2M": ("en-GB-RyanNeural", 0, 0),
         "3M": ("en-US-BrianMultilingualNeural", 0, 0),
         "4M": ("en-GB-ThomasNeural", 0, 0),
+        "1O": ("en-GB-RyanNeural", -20, -10),  # Old man
+        "1C": ("en-GB-MaisieNeural", 0, 0),   # Child
         "1V": ("vi-VN-HoaiMyNeural", 0, 0),
         "2V": ("vi-VN-NamMinhNeural", 0, 0),
         "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
         "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
     }
+    # Initialize current voice and processing variables
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
+    # Track permanent voice and temporary changes
+    permanent_voice = current_voice_short
+    temp_voice = None
+    # We'll process the text and adjust voices accordingly
+    result = []
+    idx = 0
+    while idx < len(processed_text):
+        # Detect potential voice change
+        match = re.match(r"(1F|2F|3F|4F|1M|2M|3M|4M|1O|1C|1V|2V|3V|4V)(P?)(-?\d+)?", processed_text[idx:])
+        if match:
+            prefix = match.group(1)
+            permanent_flag = match.group(2) == 'P'  # Check if it's a permanent change
+            pitch_modifier = match.group(3)  # This will be None or a number
+            if permanent_flag:
+                # Permanent voice change (e.g., "4VP")
+                permanent_voice, pitch_adj, rate_adj = voice_map[prefix]
+                current_pitch += pitch_adj
+                current_rate += rate_adj
+                result.append(f"<perm>{prefix}P")  # Mark as permanent change
+            elif pitch_modifier:
+                # Temporary pitch adjustment (e.g., "4V-10" or "4V+5")
+                pitch_adjustment = int(pitch_modifier)
+                current_pitch += pitch_adjustment
+                result.append(f"<temp>{prefix}{pitch_modifier}")  # Mark as temporary change
+            # Move index forward past the match
+            idx += len(match.group(0))
+            continue
+        # If no match, just add the normal text character
+        result.append(processed_text[idx])
+        idx += 1
+    # Rebuild the text with permanent and temporary voice marks
+    final_processed_text = ''.join(result).strip()
+    if final_processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
+        # Retry logic for TTS
         for attempt in range(3):
             try:
+                communicate = edge_tts.Communicate(final_processed_text, permanent_voice, rate=rate_str, pitch=pitch_str)
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                     audio_path = tmp_file.name
                     await communicate.save(audio_path)
                 audio.export(stripped_path, format="mp3")
                 return stripped_path
             except Exception as e:
                 if attempt == 2:
                     # Final failure: return 500ms of silence
                     silent_audio = AudioSegment.silent(duration=500)
                     fallback_path = tempfile.mktemp(suffix=".mp3")
                     silent_audio.export(fallback_path, format="mp3")
                     return fallback_path
+                await asyncio.sleep(0.5)  # Retry after brief pause
     return None