Edge_TTS_NGHIA_transcript

Sleeping

App Files Files Community

cnph001 committed on Apr 24

Commit

5021a0c

verified ·

1 Parent(s): 4337b98

Restored to previous working version

Browse files

Restored to previous working version
Changed to detect quotes instead of paragraphs

Files changed (1) hide show

app.py +92 -107

app.py CHANGED Viewed

@@ -1,145 +1,130 @@
 import asyncio
-import os
-import re
 import tempfile
-import edge_tts
-import gradio as gr
-# Default voice
-default_voice = "en-US-AndrewNeural - en-US (Male)"
-# Text-to-speech function for a single segment
-async def process_speech_segment(text, voice, rate, pitch):
-    """
-    Processes a single segment of text (either a quote or regular text)
-    and generates speech using edge-tts.
-    A two-character code at the very start of the segment overrides the voice:
-    1F -> Emma, 2F -> Jenny, 3F -> Yan, 1M -> Brian, 2M -> William, 1C -> Maisie.
-    The code is removed (and the remainder stripped) before synthesis.
-    Args:
-        text (str): The text to be converted to speech.
-        voice (str): The voice to use (can be overridden by prefixes), given as a
-            "<ShortName> - <Locale> (<Gender>)" label; only the part before " - " is used.
-        rate (int): The speech rate, formatted as a signed percentage (e.g. "+0%").
-        pitch (int): The speech pitch, formatted as a signed Hz offset (e.g. "+0Hz").
-    Returns:
-        str: The path to the generated audio file, or None when the text is
-        blank or synthesis raised an exception.
-    """
-    # NOTE(review): voice1 is assigned but not referenced anywhere in this function.
-    voice1 = "en-US-AndrewNeural - en-US (Male)"  # good for reading
-    voice1F = "en-US-EmmaNeural - en-US (Female)"
     voice2 = "en-US-BrianNeural - en-US (Male)"
-    voice2F = "en-US-JennyNeural - en-US (Female)"
-    voice3 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
-    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # Child
     if not text.strip():
         return None
-    voice_short_name = voice.split(" - ")[0] #default
-    # Prefix dispatch: each branch drops the two-character code and swaps in a fixed voice.
     if text.startswith("1F"):
-        text2 = text[2:].strip()
-        voice_short_name = voice1F.split(" - ")[0]
     elif text.startswith("2F"):
-        text2 = text[2:].strip()
-        voice_short_name = voice2F.split(" - ")[0]
     elif text.startswith("3F"):
-        text2 = text[2:].strip()
-        voice_short_name = voice3F.split(" - ")[0]
     elif text.startswith("1M"):
-        text2 = text[2:].strip()
-        voice_short_name = voice2.split(" - ")[0]
     elif text.startswith("2M"):
-        text2 = text[2:].strip()
-        voice_short_name = voice3.split(" - ")[0]
     elif text.startswith("1C"):
-        text2 = text[2:].strip()
-        voice_short_name = voice4.split(" - ")[0]
     else:
-        text2 = text
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
-    # Any exception raised during synthesis is printed and reported as None instead of propagating.
-    try:
-        communicate = edge_tts.Communicate(text2, voice=voice_short_name, rate=rate_str, pitch=pitch_str) #removed async
-        # delete=False: the temporary file is left on disk after the handle closes so its path can be returned.
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-            tmp_path = tmp_file.name
-            await communicate.save(tmp_path) #added await
-        return tmp_path
-    except Exception as e:
-        print(f"Error processing segment: {e}")  # Log the error
-        return None
-# Main text-to-speech function
 async def text_to_speech(text, voice, rate, pitch):
-    """
-    Processes the input text, identifying quoted sections for different voices,
-    and generates combined audio.
-    Args:
-        text (str): The input text.
-        voice (str): The default voice.
-        rate (int): The speech rate.
-        pitch (int): The speech pitch.
-    Returns:
-        tuple: (audio_path, error_message) where audio_path is the path to the
-               combined audio file, and error_message is any error encountered.
-    """
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
-    audio_files = []
-    segments = []
-    # Single pass over the text: cut it at each double quote into ("quote", ...) and
-    # ("text", ...) segments, in their original order.
-    i = 0
-    while i < len(text):
-        if text[i] == '"':
-            # Find the closing quote
-            j = i + 1
-            while j < len(text) and text[j] != '"':
-                j += 1
-            if j < len(text):
-                # Closed quote: keep only the text between the two quote characters.
-                segments.append(("quote", text[i + 1:j]))
-                i = j + 1
-            else:
-                # The remainder (including the opening quote character) becomes plain text.
-                segments.append(("text", text[i:]))  # Handle unclosed quote
-                i = j
-        else:
-            # Find the end of the non-quote text
-            j = i + 1
-            while j < len(text) and text[j] != '"':
-                j += 1
-            segments.append(("text", text[i:j]))
-            i = j
-    for segment_type, segment_text in segments:
-        if segment_type == "quote":
-            # Determine the voice based on the prefix within the quote.
-            # NOTE(review): voice_prefix is assigned but never read; both branches make the
-            # same call, and the prefix is interpreted inside process_speech_segment.
-            voice_prefix = ""
-            if segment_text.startswith("1F") or segment_text.startswith("2F") or segment_text.startswith("3F") or segment_text.startswith("1M") or segment_text.startswith("2M") or segment_text.startswith("1C"):
-                voice_prefix = segment_text[:2]
-            audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
-        else:
-            audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
         if audio_path:
             audio_files.append(audio_path)
     if not audio_files:
-        return None, None
     if len(audio_files) == 1:
         return audio_files[0], None
     else:
-        # Combine audio files
-        # NOTE(review): tempfile.mktemp only returns a name and is documented as deprecated;
-        # the file itself is created by the open() call below.
         combined_audio_path = tempfile.mktemp(suffix=".mp3")
         with open(combined_audio_path, 'wb') as outfile:
             for filename in audio_files:
-                try:
-                    # Append the raw bytes of each segment file, then delete that file.
-                    with open(filename, 'rb') as infile:
-                        outfile.write(infile.read())
-                    os.remove(filename)  # Clean up individual files
-                except Exception as e:
-                    print(f"Error combining audio files: {e}")
-                    return None, gr.Error(f"Error combining audio files: {e}")
         return combined_audio_path, None

+import spaces
+import gradio as gr
+import edge_tts
 import asyncio
 import tempfile
+import os
+import re  # Import the regular expression module
+# Get all available voices
+async def get_voices():
+    """Fetch the edge-tts voice list as a label -> ShortName mapping.
+
+    Returns:
+        dict: keys are display labels of the form
+            "<ShortName> - <Locale> (<Gender>)"; values are the matching
+            ShortName strings from edge_tts.list_voices().
+    """
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+# Text-to-speech function for a single text segment
+async def paragraph_to_speech(text, voice, rate, pitch):
+    """Synthesize one text segment to a temporary MP3 file with edge-tts.
+
+    A two-character code at the very start of ``text`` selects a fixed voice
+    and is cut off before synthesis:
+    1F -> Emma, 2F -> Jenny, 3F -> Yan, 1M -> Brian, 2M -> William, 1C -> Maisie.
+    Without a code, ``voice`` is used.
+
+    Args:
+        text: The segment to speak.
+        voice: A "<ShortName> - <Locale> (<Gender>)" label; only the part
+            before " - " is passed to edge-tts.
+        rate: Integer rate adjustment, formatted as a signed percentage (e.g. "+0%").
+        pitch: Integer pitch adjustment, formatted as a signed Hz offset (e.g. "+0Hz").
+
+    Returns:
+        The path of the generated .mp3 file, or None when ``text`` is blank.
+        Exceptions raised by edge-tts are not caught here.
+    """
+    # NOTE(review): voice1 is assigned but not referenced anywhere in this function.
+    voice1 ="en-US-AndrewNeural - en-US (Male)"  #good for reading
+    voice1F ="en-US-EmmaNeural - en-US (Female)"
     voice2 = "en-US-BrianNeural - en-US (Male)"
+    voice2F = "en-US-JennyNeural - en-US (Female)"
+    voice3 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
+    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  #Child
     if not text.strip():
         return None
     if text.startswith("1F"):
+        text2 = text[2:]  # Drop the two-character "1F" prefix
+        voice_short_name =voice1F.split(" - ")[0]
     elif text.startswith("2F"):
+        text2 = text[2:]  # Drop the two-character "2F" prefix
+        voice_short_name =voice2F.split(" - ")[0]
     elif text.startswith("3F"):
+        text2 = text[2:]  # Drop the two-character "3F" prefix
+        voice_short_name =voice3F.split(" - ")[0]
     elif text.startswith("1M"):
+        text2 = text[2:]  # Drop the two-character "1M" prefix
+        voice_short_name =voice2.split(" - ")[0]
     elif text.startswith("2M"):
+        text2 = text[2:]  # Drop the two-character "2M" prefix
+        voice_short_name =voice3.split(" - ")[0]
     elif text.startswith("1C"):
+        text2 = text[2:]  # Drop the two-character "1C" prefix
+        voice_short_name =voice4.split(" - ")[0]
     else:
+        # Use selected voice, or fallback to default
+        # NOTE(review): default_voice is not defined in this function; the only assignment
+        # visible in this file is a local variable inside create_demo. If `voice` were ever
+        # falsy here this would presumably raise NameError — confirm. (text_to_speech
+        # returns early on a falsy voice, so that caller does not reach this case.)
+        voice_short_name = (voice or default_voice).split(" - ")[0]
+        text2=text
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
+    communicate = edge_tts.Communicate(text2, voice_short_name, rate=rate_str, pitch=pitch_str)
+    # delete=False: the temporary file is left on disk after the handle closes so its path can be returned.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+        await communicate.save(tmp_path)
+    return tmp_path
+# Main text-to-speech function: splits the text on double quotes and joins the audio
 async def text_to_speech(text, voice, rate, pitch):
+    """Convert ``text`` to speech, one segment per quote-delimited piece.
+
+    The text is split at every double-quote character; each non-blank piece is
+    synthesized by paragraph_to_speech and the resulting files are joined.
+
+    Returns:
+        tuple: (audio_path, second). For blank text or a missing voice,
+        audio_path is None and second is the value of gr.Warning(...).
+        Otherwise second is None and audio_path is the single segment file,
+        the concatenated file, or None when no audio was produced.
+    """
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
+    # Earlier behaviour, kept for reference: split on blank lines (paragraphs).
+    #paragraphs = [p for p in re.split(r'\r?\n\r?\n+', text) if p.strip()]
+    # Current behaviour: split on every double-quote character, so quoted and unquoted runs
+    # become separate segments; each piece is stripped and blank pieces are dropped.
+    paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
+    audio_files = []
+    for paragraph in paragraphs:
+        audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
         if audio_path:
             audio_files.append(audio_path)
     if not audio_files:
+        return None, None  # No audio generated
+    # Combine audio files if there are multiple segments
     if len(audio_files) == 1:
         return audio_files[0], None
     else:
+        # Simple concatenation for now - consider using a proper audio editing library for smoother transitions
+        # NOTE(review): tempfile.mktemp only returns a name and is documented as deprecated;
+        # the file itself is created by the open() call below.
         combined_audio_path = tempfile.mktemp(suffix=".mp3")
         with open(combined_audio_path, 'wb') as outfile:
             for filename in audio_files:
+                # Append the raw bytes of each segment file in order.
+                with open(filename, 'rb') as infile:
+                    outfile.write(infile.read())
+                os.remove(filename)  # Clean up individual files
         return combined_audio_path, None
+# Gradio interface function
+# NOTE(review): spaces.GPU is presumably the Hugging Face Spaces GPU decorator; nothing in
+# this wrapper visibly uses a GPU — confirm whether it is required.
+@spaces.GPU
+def tts_interface(text, voice, rate, pitch):
+    """Synchronous wrapper: run text_to_speech with asyncio.run and return its (audio, warning) pair unchanged."""
+    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
+    return audio, warning
+# Create Gradio application
+import gradio as gr
+async def create_demo():
+    """Build the gr.Interface for the app and return it (without launching).
+
+    Awaits get_voices() to populate the voice dropdown, then wires
+    tts_interface to a text box, the dropdown and two sliders as inputs,
+    and an audio player plus a hidden Markdown component as outputs.
+    """
+    voices = await get_voices()
+    # Local to this function; used below as the dropdown's initial value.
+    default_voice = "en-US-AndrewNeural - en-US (Male)"  # 👈 Pick one of the available voices
+    # NOTE(review): this user-facing text looks stale relative to the code in this file:
+    # the "1M" prefix maps to en-US-BrianNeural (not Guy), "3F" to en-HK-YanNeural
+    # (not Jan), and text_to_speech splits on double quotes rather than blank lines.
+    description = """
+    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Jan, 1M:US_Guy, 2M:AU_William, 1C: Childvoice
+    Enter your text, select a voice, and adjust the speech rate and pitch.
+    The application will process your text paragraph by paragraph (separated by two blank lines).
+    """
+    demo = gr.Interface(
+        fn=tts_interface,
+        inputs=[
+            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines."),
+            # The leading "" choice lets the user clear the selection; text_to_speech then
+            # returns its "Please select a voice." warning.
+            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
+            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
+            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
+        ],
+        outputs=[
+            gr.Audio(label="Generated Audio", type="filepath"),
+            # Receives the second element returned by tts_interface; created hidden.
+            gr.Markdown(label="Warning", visible=False)
+        ],
+        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
+        description=description,
+        article="Process text paragraph by paragraph for smoother output.",
+        analytics_enabled=False,
+        allow_flagging=False
+    )
+    return demo
+# Run the application
+if __name__ == "__main__":
+    # create_demo is a coroutine (it awaits get_voices), so the interface is built via
+    # asyncio.run; launch() is called only after that call has returned.
+    demo = asyncio.run(create_demo())
+    demo.launch()