NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Files Community

cnph001 committed on Apr 24

Commit

d3fce98

verified ·

1 Parent(s): 7987224

Parse text

Browse files

Parse the input text: recognize paragraphs separated by a double ENTER (a blank line)
and process one paragraph at a time.

Files changed (1) hide show

app.py +41 -12

app.py CHANGED Viewed

@@ -10,13 +10,10 @@ async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-# Text-to-speech function
-async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
-        return None, gr.Warning("Please enter text to convert.")
-    if not voice:
-        return None, gr.Warning("Please select a voice.")
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
@@ -24,7 +21,37 @@ async def text_to_speech(text, voice, rate, pitch):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
-    return tmp_path, None
 # Gradio interface function
 @spaces.GPU
@@ -37,15 +64,17 @@ import gradio as gr
 async def create_demo():
     voices = await get_voices()
     description = """
     Experience the power of Voicecloning.be for text-to-speech conversion.
     """
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
-            gr.Textbox(label="Input Text", lines=5),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -54,9 +83,9 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech",
         description=description,
-        article="Experience the power of Voicecloning.be for text-to-speech conversion.",
         analytics_enabled=False,
         allow_flagging=False
     )

     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+# Text-to-speech function for a single paragraph
+async def paragraph_to_speech(text, voice, rate, pitch):
     if not text.strip():
+        return None
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
+    return tmp_path
+# Main text-to-speech function that processes paragraphs
+async def text_to_speech(text, voice, rate, pitch):
+    """Convert text to speech one paragraph at a time and return a single MP3.
+
+    Paragraphs are separated by at least one blank line (double ENTER). Each
+    non-empty paragraph is synthesized with ``paragraph_to_speech`` and the
+    resulting files are joined into one output file.
+
+    Args:
+        text: Input text, possibly containing several paragraphs.
+        voice: Selected voice; passed through to ``paragraph_to_speech``.
+        rate: Speech rate adjustment; passed through unchanged.
+        pitch: Pitch adjustment; passed through unchanged.
+
+    Returns:
+        A ``(audio_path, warning)`` tuple. ``audio_path`` is the path of the
+        generated MP3, or ``None`` when nothing was generated. ``warning`` is
+        the result of ``gr.Warning(...)`` when text or voice is missing,
+        otherwise ``None``.
+    """
+    import re
+    import shutil
+
+    if not text.strip():
+        return None, gr.Warning("Please enter text to convert.")
+    if not voice:
+        return None, gr.Warning("Please select a voice.")
+    # Split on blank lines. Normalizing line endings and allowing
+    # whitespace-only separator lines catches input that a plain
+    # split("\n\n") would treat as one paragraph.
+    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
+    paragraphs = [p for p in re.split(r"\n\s*\n", normalized) if p.strip()]
+    audio_files = []
+    try:
+        for paragraph in paragraphs:
+            audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
+            if audio_path:
+                audio_files.append(audio_path)
+        if not audio_files:
+            return None, None  # No audio generated
+        if len(audio_files) == 1:
+            # pop() hands the only file to the caller, so the cleanup in
+            # `finally` has nothing left to delete.
+            return audio_files.pop(), None
+        # Simple concatenation for now - consider using a proper audio editing library for smoother transitions
+        # NamedTemporaryFile creates the file atomically; tempfile.mktemp is
+        # deprecated because another process can claim the name first.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
+            combined_audio_path = outfile.name
+            for filename in audio_files:
+                with open(filename, "rb") as infile:
+                    # Stream in chunks instead of loading each file whole.
+                    shutil.copyfileobj(infile, outfile)
+        return combined_audio_path, None
+    finally:
+        # Clean up individual files, including when synthesis or
+        # concatenation fails part-way through.
+        for filename in audio_files:
+            try:
+                os.remove(filename)
+            except OSError:
+                # Best-effort cleanup: a file that is already gone must not
+                # mask the real result or the original exception.
+                pass
 # Gradio interface function
 @spaces.GPU
 async def create_demo():
     voices = await get_voices()
     description = """
     Experience the power of Voicecloning.be for text-to-speech conversion.
+    Enter your text, select a voice, and adjust the speech rate and pitch.
+    The application will process your text paragraph by paragraph (separated by two blank lines).
     """
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
+            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
         description=description,
+        article="Process text paragraph by paragraph for smoother output.",
         analytics_enabled=False,
         allow_flagging=False
     )