NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on Apr 28

Commit

b0718f9

verified ·

1 Parent(s): 4f3af59

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -25

app.py CHANGED Viewed

@@ -41,25 +41,24 @@ async def get_voices():
 # Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
-    voice3 ="en-US-BrianMultilingualNeural - en-US (Male)"  #good for reading
-    #voice1F ="en-US-EmmaNeural - en-US (Female)"
     voice1F ="en-GB-SoniaNeural - en-GB (Female)"
-    #voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
     voice2 = "en-GB-RyanNeural - en-GB (Male)"
     voice2F = "en-US-JennyNeural - en-US (Female)"
-    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
-    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  #Child
     voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
     if not text.strip():
         return None, []  # Return None for audio path and empty list for silence
     audio_segments = []
     silence_durations = []
-    parts = re.split(r'(SS\d+\.?\d*)', text)
-    for part in parts:
         if re.match(r'SS\d+\.?\d*', part):  # Check whether this part is a silence tag
             # At the top of your file:
             #SILENCE_PATH = Path(__file__).parent.absolute() / "Silence.mp3"
@@ -76,37 +75,44 @@ async def paragraph_to_speech(text, voice, rate, pitch):
             silence_file_path = get_silence(silence_duration)  # Store the returned filename
             audio_segments.append(silence_file_path)  # Use the stored filename
         elif part.strip():
             processed_text = part
             current_voice = voice
             current_rate = rate
             current_pitch = pitch
             if part.startswith("1F"):
-                #processed_text = part[2:]
                 current_voice = voice1F.split(" - ")[0]
                 current_pitch = 25
             elif part.startswith("2F"):
-                #processed_text = part[2:]
                 current_voice = voice2F.split(" - ")[0]
             elif part.startswith("3F"):
-                #processed_text = part[2:]
                 current_voice = voice3F.split(" - ")[0]
             elif part.startswith("1M"):
-                #processed_text = part[2:]
                 current_voice = voice1.split(" - ")[0]
             elif part.startswith("2M"):
-                #processed_text = part[2:]
                 current_voice = voice2.split(" - ")[0]
             elif part.startswith("3M"):
-                #processed_text = part[2:]
                 current_voice = voice3.split(" - ")[0]
-            elif part.startswith("1C"):
-                #processed_text = part[2:]
-                current_voice = voice4.split(" - ")[0]
-            elif part.startswith("1O"):
-                #processed_text = part[2:]
                 current_voice = voice5.split(" - ")[0]
                 current_pitch = -20
                 current_rate = -10
             else:
                 # Use selected voice, or fallback to default
                 #voice_short_name = (voice or default_voice).split(" - ")[0]
@@ -125,7 +131,8 @@ async def paragraph_to_speech(text, voice, rate, pitch):
                 #processed_text = new_text[2:]  #cut out the prefix like 1F, 3M etc
                 processed_text = new_text[len(prefix):]  # Dynamically remove the prefix part
             else:
-                processed_text = part[2:]
             rate_str = f"{current_rate:+d}%"
             #if  part[2:4].isdigit():
             #    processed_text = part[4:]
@@ -192,10 +199,12 @@ async def create_demo():
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
     description = """
-    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
-    You can insert silence using the marker 'SS' (This will insert a Silence period from the Silence.mp3 file).
-    Enter your text, select a voice, and adjust the speech rate and pitch. Can also set like 1F-20  or 1M24
-    The application will process your text paragraph by paragraph (separated by two blank lines).
     """
     demo = gr.Interface(
@@ -210,7 +219,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
         description=description,
         article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,

 # Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
+    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
     voice1F ="en-GB-SoniaNeural - en-GB (Female)"
     voice2 = "en-GB-RyanNeural - en-GB (Male)"
     voice2F = "en-US-JennyNeural - en-US (Female)"
+    voice3 ="en-US-BrianMultilingualNeural - en-US (Male)"  #good for reading
     voice3F = "en-HK-YanNeural - en-HK (Female)"
+    voice4 = "en-GB-ThomasNeural - en-GB (Male)"
+    voice4F ="en-US-EmmaNeural - en-US (Female)"
     voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
+    voice6 = "en-GB-MaisieNeural - en-GB (Female)"  #Child
     if not text.strip():
         return None, []  # Return None for audio path and empty list for silence
     audio_segments = []
     silence_durations = []
+    parts = re.split(r'(SS\d+\.?\d*)', text)  # Split out any SS## silence tags; the capture group keeps them in the list.
+    for part in parts:
         if re.match(r'SS\d+\.?\d*', part):  # Check whether this part is a silence tag
             # At the top of your file:
             #SILENCE_PATH = Path(__file__).parent.absolute() / "Silence.mp3"
             silence_file_path = get_silence(silence_duration)  # Store the returned filename
             audio_segments.append(silence_file_path)  # Use the stored filename
         elif part.strip():
+            detect=0
             processed_text = part
             current_voice = voice
             current_rate = rate
             current_pitch = pitch
             if part.startswith("1F"):
+                detect=1
                 current_voice = voice1F.split(" - ")[0]
                 current_pitch = 25
             elif part.startswith("2F"):
+                detect=1
                 current_voice = voice2F.split(" - ")[0]
             elif part.startswith("3F"):
+                detect=1
                 current_voice = voice3F.split(" - ")[0]
+            elif part.startswith("4F"):
+                #detect=1
+                current_voice = voice4F.split(" - ")[0]
             elif part.startswith("1M"):
+                detect=1
                 current_voice = voice1.split(" - ")[0]
             elif part.startswith("2M"):
+                detect=1
                 current_voice = voice2.split(" - ")[0]
             elif part.startswith("3M"):
+                detect=1
                 current_voice = voice3.split(" - ")[0]
+            elif part.startswith("4M"):
+                detect=1
+                current_voice = voice4.split(" - ")[0]
+            elif part.startswith("1O"):  # Old man voice
+                detect=1
                 current_voice = voice5.split(" - ")[0]
                 current_pitch = -20
                 current_rate = -10
+            elif part.startswith("1C"):  #Child voice
+                detect=1
+                current_voice = voice6.split(" - ")[0]
             else:
                 # Use selected voice, or fallback to default
                 #voice_short_name = (voice or default_voice).split(" - ")[0]
                 #processed_text = new_text[2:]  #cut out the prefix like 1F, 3M etc
                 processed_text = new_text[len(prefix):]  # Dynamically remove the prefix part
             else:
+                if detect:
+                   processed_text = part[2:]
             rate_str = f"{current_rate:+d}%"
             #if  part[2:4].isdigit():
             #    processed_text = part[4:]
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
     description = """
+    Default = <b>"en-US-AndrewMultilingualNeural - en-US (Male),
+    other voices 1F:en-GB-SoniaNeural,    2F:en-US-JennyNeural,  3F:en-HK-YanNeural,  4F:en-US-EmmaNeural
+                 1M:en-AU-WilliamNeural,  2M:en-GB-RyanNeural,   3M:en-US-BrianMultilingualNeural,  4M:en-GB-ThomasNeural
+                 1C: en-GB-MaisieNeural (Childvoice), 1O = en-GB-RyanNeural (OldMan)"</b>
+    You can insert silence using the marker 'SS##' example "SS2.0"
+    Enter your text, select a voice, and adjust the speech rate and pitch. Can also set like 1F-20  or 1M24.
     """
     demo = gr.Interface(
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="TTS using Edge Engine.. ENGLISH!",
         description=description,
         article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,