Spaces:

walidadebayo
/

text-to-speech-clone

Running

App Files Community

walidadebayo committed on May 10

Commit

49b2f3e

1 Parent(s): 17f4c83

Add subtitle generation feature to text-to-speech functionality

Browse files

Files changed (1) hide show

app.py +103 -10

app.py CHANGED Viewed

@@ -3,6 +3,8 @@ import edge_tts
 import asyncio
 import tempfile
 import os
 async def get_voices():
@@ -13,11 +15,21 @@ async def get_voices():
     }
-async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
-        return None, "Please enter text to convert."
     if not voice:
-        return None, "Please select a voice."
     voice_short_name = voice.split(" - ")[0]
     rate_str = f"{rate:+d}%"
@@ -25,17 +37,95 @@ async def text_to_speech(text, voice, rate, pitch):
     communicate = edge_tts.Communicate(
         text, voice_short_name, rate=rate_str, pitch=pitch_str
     )
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tmp_path = tmp_file.name
-        await communicate.save(tmp_path)
-    return tmp_path, None
-async def tts_interface(text, voice, rate, pitch):
-    audio, warning = await text_to_speech(text, voice, rate, pitch)
     if warning:
-        return audio, gr.Warning(warning)
-    return audio, None
 async def create_demo():
@@ -43,6 +133,7 @@ async def create_demo():
     description = """
     Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
     **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""
@@ -65,9 +156,11 @@ async def create_demo():
             gr.Slider(
                 minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
             ),
         ],
         outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False),
         ],
         title="Edge TTS Text-to-Speech",

 import asyncio
 import tempfile
 import os
+import json
+import datetime
 async def get_voices():
     }
+def format_time(milliseconds):
+    """Convert milliseconds to SRT time format (HH:MM:SS,mmm)"""
     # Accepts any value int() can convert; a float is truncated toward zero,
     # not rounded. The hours field is zero-padded to two digits but never
     # wrapped, so 100 hours or more simply widens it.
+    # Ensure milliseconds is an integer
+    milliseconds = int(milliseconds)
+    seconds, milliseconds = divmod(milliseconds, 1000)  # peel off the sub-second remainder
+    minutes, seconds = divmod(seconds, 60)  # then seconds within the minute
+    hours, minutes = divmod(minutes, 60)  # then minutes within the hour
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False):
     # Synthesise `text` with edge_tts and return a 3-tuple
     # (audio_path, subtitle_path, error_message).
     #   - On a validation failure both paths are None and the third item is
     #     the message to show.
     #   - On success the third item is None; subtitle_path stays None unless
     #     generate_subtitles is truthy.
     # NOTE(review): lines between the diff hunks are not shown here, so
     # `pitch_str` (used below) has no visible assignment. Presumably it is
     # built from `pitch` in an elided line; confirm against the full file.
     if not text.strip():
+        return None, None, "Please enter text to convert."
     if not voice:
+        return None, None, "Please select a voice."
     # Only the part of the voice label before the first " - " is passed on.
     voice_short_name = voice.split(" - ")[0]
     # The `+d` format always emits a sign and only accepts integers, so
     # `rate` must be an int at this point.
     rate_str = f"{rate:+d}%"
     communicate = edge_tts.Communicate(
         text, voice_short_name, rate=rate_str, pitch=pitch_str
     )
+    # Create temporary file for audio
     # delete=False leaves the (empty) file on disk once the `with` closes it;
     # only its path is used afterwards. Nothing in the visible lines removes
     # the file later.
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        audio_path = tmp_file.name
+    subtitle_path = None
+    if generate_subtitles:
         # Same reserve-a-path approach for the subtitle file.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
+            subtitle_path = srt_file.name
+        # Generate audio and collect word boundary data
         # One pass over the stream: audio chunks are appended to audio_path
         # (the file is reopened in append mode for every chunk) and each
         # WordBoundary chunk is kept for subtitle timing. Any other chunk
         # type is ignored.
+        async def process_audio():
+            word_boundaries = []
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    with open(audio_path, "ab") as audio_file:
+                        audio_file.write(chunk["data"])
+                elif chunk["type"] == "WordBoundary":
+                    word_boundaries.append(chunk)
+            return word_boundaries
+        word_boundaries = await process_audio()
+        # Group words into sensible phrases/sentences for subtitles
+        phrases = []
+        current_phrase = []
+        current_text = ""
+        phrase_start = 0
+        for i, boundary in enumerate(word_boundaries):
+            word = boundary["text"]
             # These values are later passed to format_time, which expects
             # milliseconds. Presumably the raw offset/duration are
             # 100-nanosecond ticks; TODO confirm against edge_tts.
+            start_time = boundary["offset"] / 10000
+            duration = boundary["duration"] / 10000
+            end_time = start_time + duration
             # An empty current_phrase means this word opens a new phrase.
+            if not current_phrase:
+                phrase_start = start_time
+            current_phrase.append(boundary)
+            current_text += word + " "
+            # Determine if we should end this phrase and start a new one
             # The three rules below form one if/elif chain, so at most one of
             # them is evaluated as the reason for a break.
+            should_break = False
+            # Break on punctuation
             # The last word of the input also always closes the phrase.
+            if word.endswith(('.', '!', '?', ':', ';', ',')) or i == len(word_boundaries) - 1:
+                should_break = True
+            # Break after a certain number of words (4-5 is typical for subtitles)
+            elif len(current_phrase) >= 5:
+                should_break = True
+            # Break on long pause (more than 300ms between words)
             # This condition is always true when reached: the last index is
             # already caught by the first branch.
+            elif i < len(word_boundaries) - 1:
+                next_start = word_boundaries[i + 1]["offset"] / 10000
+                if next_start - end_time > 300:
+                    should_break = True
             # The last-index test is already folded into should_break above,
             # so repeating it here does not change the outcome.
+            if should_break or i == len(word_boundaries) - 1:
+                if current_phrase:
                     # The phrase ends where its final word ends.
+                    last_boundary = current_phrase[-1]
+                    phrase_end = (last_boundary["offset"] + last_boundary["duration"]) / 10000
+                    phrases.append({
+                        "text": current_text.strip(),
+                        "start": phrase_start,
+                        "end": phrase_end
+                    })
+                    current_phrase = []
+                    current_text = ""
+        # Write phrases to SRT file
         # If the stream produced no WordBoundary chunks, `phrases` is empty
         # and the .srt file is written with no entries.
+        with open(subtitle_path, "w", encoding="utf-8") as srt_file:
+            for i, phrase in enumerate(phrases):
+                # Write SRT entry
                 # Entry layout: 1-based index, "start --> end" line, text,
                 # then a blank separator line.
+                srt_file.write(f"{i+1}\n")
+                srt_file.write(f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n")
+                srt_file.write(f"{phrase['text']}\n\n")
+    else:
+        # Just generate audio
+        await communicate.save(audio_path)
+    return audio_path, subtitle_path, None
+async def tts_interface(text, voice, rate, pitch, generate_subtitles):
     # Thin adapter around text_to_speech: forwards the five inputs unchanged
     # and returns three values (audio, subtitle, third output).
+    audio, subtitle, warning = await text_to_speech(text, voice, rate, pitch, generate_subtitles)
     if warning:
         # text_to_speech returned a validation message, so audio and subtitle
         # are both None here and the message is wrapped in gr.Warning.
         # NOTE(review): `gr` is not imported in the visible lines; presumably
         # it is gradio. Confirm against the full file.
+        return audio, subtitle, gr.Warning(warning)
     # Success: no message for the third output.
+    return audio, subtitle, None
 async def create_demo():
     description = """
     Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
+    You can also generate subtitle files (.srt) along with the audio.
     **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""
             gr.Slider(
                 minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
             ),
+            gr.Checkbox(label="Generate Subtitles (.srt)", value=False),
         ],
         outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
+            gr.File(label="Generated Subtitles"),
             gr.Markdown(label="Warning", visible=False),
         ],
         title="Edge TTS Text-to-Speech",