Edge_TTS_NGHIA_transcript

Sleeping

App Files Files Community

cnph001 committed on Apr 26

Commit

2f93aef

verified ·

1 Parent(s): 92f530c

adding silence by marker "SS##"

Browse files

Try adding silence by marker "SS##"

Files changed (1) hide show

app.py +98 -68

app.py CHANGED Viewed

@@ -12,88 +12,117 @@ async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-# Text-to-speech function for a single paragraph
 async def paragraph_to_speech(text, voice, rate, pitch):
     voice3 ="en-US-BrianMultilingualNeural - en-US (Male)"  #good for reading
     voice1F ="en-US-EmmaNeural - en-US (Female)"
     voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
-    voice2F = "en-US-JennyNeural - en-US (Female)"
-    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
     voice4 = "en-GB-MaisieNeural - en-GB (Female)"  #Child
     voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
     if not text.strip():
-        return None
-    if text.startswith("1F"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice1F.split(" - ")[0]
-    elif text.startswith("2F"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice2F.split(" - ")[0]
-    elif text.startswith("3F"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice3F.split(" - ")[0]
-    elif text.startswith("1M"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice1.split(" - ")[0]
-    elif text.startswith("2M"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice2.split(" - ")[0]
-    elif text.startswith("3M"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice3.split(" - ")[0]
-    elif text.startswith("1C"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice4.split(" - ")[0]
-    elif text.startswith("1O"):
-        text2 = text[2:]  # Remove the first two characters ("FF")
-        voice_short_name =voice5.split(" - ")[0]
-        pitch = -30
-        rate = -20
-    else:
-        # Use selected voice, or fallback to default
-        voice_short_name = (voice or default_voice).split(" - ")[0]
-        text2=text
-    rate_str = f"{rate:+d}%"
-    pitch_str = f"{pitch:+d}Hz"
-    communicate = edge_tts.Communicate(text2, voice_short_name, rate=rate_str, pitch=pitch_str)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tmp_path = tmp_file.name
-        await communicate.save(tmp_path)
-    return tmp_path
-# Main text-to-speech function that processes paragraphs
 async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
-    # Split by two or more newline characters, optionally preceded by carriage returns
-    #paragraphs = [p for p in re.split(r'\r?\n\r?\n+', text) if p.strip()]
     paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
-    audio_files = []
     for paragraph in paragraphs:
-        audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
-        if audio_path:
-            audio_files.append(audio_path)
-    if not audio_files:
-        return None, None  # No audio generated
-    # Combine audio files if there are multiple paragraphs
-    if len(audio_files) == 1:
-        return audio_files[0], None
-    else:
-        # Simple concatenation for now - consider using a proper audio editing library for smoother transitions
-        combined_audio_path = tempfile.mktemp(suffix=".mp3")
-        with open(combined_audio_path, 'wb') as outfile:
-            for filename in audio_files:
-                with open(filename, 'rb') as infile:
-                    outfile.write(infile.read())
-                os.remove(filename)  # Clean up individual files
-        return combined_audio_path, None
 # Gradio interface function
 @spaces.GPU
@@ -109,6 +138,7 @@ async def create_demo():
     default_voice = "en-US-AndrewNeural - en-US (Male)"  # 👈 Pick one of the available voices
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
     Enter your text, select a voice, and adjust the speech rate and pitch.
     The application will process your text paragraph by paragraph (separated by two blank lines).
     """
@@ -116,7 +146,7 @@ async def create_demo():
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
-            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -125,9 +155,9 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
         description=description,
-        article="Process text paragraph by paragraph for smoother output.",
         analytics_enabled=False,
         allow_flagging=False
     )

     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+# Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
     """Synthesize one paragraph, honouring ``SS<seconds>`` silence markers.

     The paragraph is split on markers such as ``SS2`` or ``SS1.5``. Every
     piece of text between markers is rendered to its own temporary MP3 file
     through ``edge_tts.Communicate``. A piece may begin with a two-character
     voice tag (``1F``, ``2F``, ``3F``, ``1M``, ``2M``, ``3M``, ``1C``, ``1O``)
     that overrides the selected voice for that piece only; ``1O`` also
     forces rate -20 and pitch -30.

     Args:
         text: Paragraph text, possibly containing silence markers.
         voice: Selected voice, either a bare short name or a label of the
             form ``"<ShortName> - <Locale> (<Gender>)"``.
         rate: Speech rate adjustment in percent.
         pitch: Pitch adjustment in Hz.

     Returns:
         ``(audio_segments, silence_durations)``. ``audio_segments`` holds one
         entry per piece produced by the split, in order: a temp-file path
         for spoken text, ``None`` for a silence marker or an empty piece.
         Because the split pattern has one capturing group, markers always
         sit at the odd indexes. ``silence_durations`` lists the marker
         durations (seconds) in order of appearance. Blank ``text`` yields
         ``(None, [])``.
     """
     # Two-character tag -> (voice label, forced rate, forced pitch).
     tagged_voices = {
         "1F": ("en-US-EmmaNeural - en-US (Female)", None, None),
         "2F": ("en-US-JennyNeural - en-US (Female)", None, None),
         "3F": ("en-HK-YanNeural - en-HK (Female)", None, None),
         "1M": ("en-AU-WilliamNeural - en-AU (Male)", None, None),
         "2M": ("it-IT-GiuseppeMultilingualNeural - it-IT (Male)", None, None),
         "3M": ("en-US-BrianMultilingualNeural - en-US (Male)", None, None),  # good for reading
         "1C": ("en-GB-MaisieNeural - en-GB (Female)", None, None),  # child
         "1O": ("en-GB-RyanNeural - en-GB (Male)", -20, -30),  # old man
     }
     if not text.strip():
         return None, []  # no audio segments and no silence markers
     # Reduce a "<ShortName> - <Locale> (<Gender>)" label to its short name,
     # exactly as is done for the tagged voices below; a bare short name
     # passes through unchanged.
     selected_voice = voice.split(" - ")[0] if voice else voice
     audio_segments = []
     silence_durations = []
     for part in re.split(r'(SS\d+\.?\d*)', text):
         if re.fullmatch(r'SS\d+\.?\d*', part):
             # The pattern guarantees a parseable number after "SS".
             silence_durations.append(float(part[2:]))
             audio_segments.append(None)  # placeholder keeps positions aligned
             continue
         # Strip first so a tag that follows a marker ("SS2 1F Hello") is seen.
         spoken = part.strip()
         current_voice = selected_voice
         current_rate = rate
         current_pitch = pitch
         override = tagged_voices.get(spoken[:2])
         if override is not None:
             label, forced_rate, forced_pitch = override
             spoken = spoken[2:].strip()
             current_voice = label.split(" - ")[0]
             if forced_rate is not None:
                 current_rate = forced_rate
             if forced_pitch is not None:
                 current_pitch = forced_pitch
         if not spoken:
             # Empty piece (or a tag with nothing after it): nothing to say.
             audio_segments.append(None)
             continue
         # int() keeps the ":+d" format valid if a float value is supplied.
         rate_str = f"{int(current_rate):+d}%"
         pitch_str = f"{int(current_pitch):+d}Hz"
         communicate = edge_tts.Communicate(spoken, current_voice, rate=rate_str, pitch=pitch_str)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
             tmp_path = tmp_file.name
         # Write only after our own handle is closed so the file is not held
         # open twice while it is being saved.
         await communicate.save(tmp_path)
         audio_segments.append(tmp_path)
     return audio_segments, silence_durations
+# Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
     """Convert ``text`` to one combined audio file, inserting requested pauses.

     The input is cut into paragraphs at every double-quote character. Each
     paragraph is rendered by ``paragraph_to_speech`` and the resulting clips
     and silence durations are written, in their original order, to a single
     temporary ``.mp3`` file. The per-piece temp files are deleted once copied.

     Args:
         text: Full input text.
         voice: Selected voice; a falsy value produces a warning.
         rate: Speech rate adjustment in percent.
         pitch: Pitch adjustment in Hz.

     Returns:
         ``(path, None)`` on success, ``(None, warning)`` when the text or the
         voice is missing, and ``(None, None)`` when no speech was generated
         (for example when the text holds only silence markers).
     """
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
     paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
     # Ordered playlist: str = path of a temp clip, float = seconds of silence.
     timeline = []
     for paragraph in paragraphs:
         audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
         if not audio_paths:
             continue
         for index, path in enumerate(audio_paths):
             if isinstance(path, str):
                 timeline.append(path)
             elif index % 2 == 1:
                 # paragraph_to_speech splits on a capturing group, so the
                 # k-th marker's placeholder sits at index 2k + 1. Mapping by
                 # position keeps every pause where it was written.
                 marker_number = index // 2
                 if marker_number < len(silence_times):
                     timeline.append(silence_times[marker_number])
     if not any(isinstance(item, str) for item in timeline):
         return None, None  # No actual audio generated
     # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
     # returns a name and is deprecated because of the race that leaves open.
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
         combined_audio_path = outfile.name
         for segment in timeline:
             if isinstance(segment, str):
                 try:
                     with open(segment, 'rb') as infile:
                         outfile.write(infile.read())
                     os.remove(segment)  # Clean up individual files
                 except FileNotFoundError:
                     print(f"Warning: Audio file not found: {segment}")
             elif isinstance(segment, (int, float)):
                 # Basic silence insertion (approximate).
                 # NOTE(review): these are raw zero bytes sized as 16-bit mono
                 # PCM at 44.1 kHz, written into a file saved as .mp3. They are
                 # not encoded MP3 frames, so players may skip them instead of
                 # pausing -- confirm, and consider generating the pause with
                 # an audio library.
                 silence = b'\x00' * int(segment * 44100 * 2)
                 outfile.write(silence)
     return combined_audio_path, None
 # Gradio interface function
 @spaces.GPU
     default_voice = "en-US-AndrewNeural - en-US (Male)"  # 👈 Pick one of the available voices
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
+    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
     Enter your text, select a voice, and adjust the speech rate and pitch.
     The application will process your text paragraph by paragraph (separated by two blank lines).
     """
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
+            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
         description=description,
+        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,
         allow_flagging=False
     )