Spaces:

walidadebayo
/

text-to-speech-clone

Running

File size: 6,863 Bytes

import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import json
import datetime


async def get_voices():
    """Return a mapping of display label -> voice short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and is
    built from the entries returned by ``edge_tts.list_voices()``.
    """
    label_to_short_name = {}
    for entry in await edge_tts.list_voices():
        short_name = entry["ShortName"]
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        label_to_short_name[label] = short_name
    return label_to_short_name


def format_time(milliseconds):
    """Render a millisecond count as an SRT timestamp (HH:MM:SS,mmm).

    The input is passed through ``int()`` first, so fractional
    milliseconds are truncated toward zero before formatting.
    """
    total_ms = int(milliseconds)
    total_seconds = total_ms // 1000
    ms_part = total_ms % 1000

    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60

    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, seconds, ms_part)


# Leading/trailing characters that mark a phrase boundary in the subtitles.
_SUBTITLE_PUNCTUATION = ('.', ',', '!', '?', ':', ';')


def _group_word_boundaries(word_boundaries, max_words=5, max_gap_ms=300):
    """Group word-boundary events into subtitle phrases.

    Args:
        word_boundaries: Sequence of dicts with ``"text"``, ``"offset"`` and
            ``"duration"`` keys. Offsets and durations are divided by 10000
            to obtain the millisecond values that ``format_time`` expects.
        max_words: A phrase is closed once it holds this many words.
        max_gap_ms: A phrase is closed when the silence before the next
            word exceeds this many milliseconds.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts, times in milliseconds.
    """
    phrases = []
    current_phrase = []
    current_text = ""
    phrase_start = 0
    last_index = len(word_boundaries) - 1

    for i, boundary in enumerate(word_boundaries):
        word = boundary["text"]
        start_time = boundary["offset"] / 10000
        end_time = start_time + boundary["duration"] / 10000

        if not current_phrase:
            phrase_start = start_time
        current_phrase.append(boundary)

        # Punctuation tokens are glued to the preceding word instead of
        # being separated from it by a space.
        if word.startswith(_SUBTITLE_PUNCTUATION):
            current_text = current_text.rstrip() + word + " "
        else:
            current_text += word + " "

        # Close the phrase on the final word, on punctuation, or once it is
        # long enough; otherwise only when a long pause follows this word.
        if (
            i == last_index
            or word.endswith(_SUBTITLE_PUNCTUATION)
            or len(current_phrase) >= max_words
        ):
            should_break = True
        else:
            next_start = word_boundaries[i + 1]["offset"] / 10000
            should_break = next_start - end_time > max_gap_ms

        if should_break:
            phrases.append({
                "text": current_text.strip(),
                "start": phrase_start,
                "end": (boundary["offset"] + boundary["duration"]) / 10000,
            })
            current_phrase = []
            current_text = ""

    return phrases


async def text_to_speech(text, voice, rate, pitch, generate_subtitles=False):
    """Synthesize ``text`` to an MP3 file, optionally with an SRT subtitle file.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only input is
            rejected with a message.
        voice: Display label whose part before the first ``" - "`` is the
            voice short name passed to edge-tts.
        rate: Speech-rate adjustment in percent; rounded to an integer.
        pitch: Pitch adjustment in Hz; rounded to an integer.
        generate_subtitles: When true, also write an ``.srt`` file built
            from the streamed word-boundary events.

    Returns:
        ``(audio_path, subtitle_path, message)``. On invalid input the paths
        are ``None`` and ``message`` explains why; on success ``message`` is
        ``None`` and ``subtitle_path`` is ``None`` unless subtitles were
        requested.

    Raises:
        Any exception raised by edge-tts or file I/O is propagated after the
        temporary files created by this call have been removed.
    """
    if not text or not text.strip():
        return None, None, "Please enter text to convert."
    if not voice:
        return None, None, "Please select a voice."

    voice_short_name = voice.split(" - ")[0]
    # Slider values may arrive as floats; the "d" format code rejects floats,
    # so coerce to int before formatting.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # delete=False: the files must outlive this function so the caller can
    # serve them; only their names are needed here.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        audio_path = tmp_file.name
    subtitle_path = None

    try:
        if generate_subtitles:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as srt_file:
                subtitle_path = srt_file.name

            # Stream once, writing audio through a single open handle and
            # collecting the word-boundary events for the subtitles.
            # NOTE(review): some edge-tts releases may emit "SentenceBoundary"
            # events instead, which would leave the SRT empty — confirm
            # against the installed version.
            word_boundaries = []
            with open(audio_path, "wb") as audio_file:
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_file.write(chunk["data"])
                    elif chunk["type"] == "WordBoundary":
                        word_boundaries.append(chunk)

            phrases = _group_word_boundaries(word_boundaries)

            with open(subtitle_path, "w", encoding="utf-8") as srt_file:
                for index, phrase in enumerate(phrases, start=1):
                    srt_file.write(f"{index}\n")
                    srt_file.write(
                        f"{format_time(phrase['start'])} --> {format_time(phrase['end'])}\n"
                    )
                    srt_file.write(f"{phrase['text']}\n\n")
        else:
            # Just generate audio
            await communicate.save(audio_path)
    except BaseException:
        # Do not leak the delete=False temp files when synthesis fails or
        # the task is cancelled; removal is best-effort.
        for path in (audio_path, subtitle_path):
            if path is not None:
                try:
                    os.remove(path)
                except OSError:
                    pass
        raise

    return audio_path, subtitle_path, None


async def tts_interface(text, voice, rate, pitch, generate_subtitles):
    """Gradio callback: run ``text_to_speech`` and adapt its result.

    Returns the audio path, the subtitle path and, as third output, the
    result of ``gr.Warning(message)`` when ``text_to_speech`` reported a
    message, otherwise ``None``.
    """
    audio_path, srt_path, message = await text_to_speech(
        text, voice, rate, pitch, generate_subtitles
    )
    notice = gr.Warning(message) if message else None
    return audio_path, srt_path, notice


async def create_demo():
    """Build the Gradio interface for the text-to-speech demo.

    Fetches the voice list via ``get_voices()`` and wires ``tts_interface``
    to five inputs (text, voice, rate, pitch, subtitle toggle) and three
    outputs (audio, subtitle file, hidden warning area).
    """
    voices = await get_voices()
    voice_labels = list(voices.keys())

    description = """
    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
    You can also generate subtitle files (.srt) along with the audio.
    
    **Note:** Edge TTS is a cloud-based service and requires an active internet connection."""

    # Input components, in the order tts_interface receives them.
    input_components = [
        gr.Textbox(label="Input Text", lines=5, value="Hello, how are you doing!"),
        gr.Dropdown(
            choices=[""] + voice_labels,
            label="Select Voice",
            value=voice_labels[0] if voice_labels else "",
        ),
        gr.Slider(
            minimum=-50,
            maximum=50,
            value=0,
            label="Speech Rate Adjustment (%)",
            step=1,
        ),
        gr.Slider(
            minimum=-20,
            maximum=20,
            value=0,
            label="Pitch Adjustment (Hz)",
            step=1,
        ),
        gr.Checkbox(label="Generate Subtitles (.srt)", value=False),
    ]

    # Output components, in the order tts_interface returns them.
    output_components = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.File(label="Generated Subtitles"),
        gr.Markdown(label="Warning", visible=False),
    ]

    return gr.Interface(
        fn=tts_interface,
        inputs=input_components,
        outputs=output_components,
        title="Edge TTS Text-to-Speech",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        flagging_mode="manual",
        api_name="predict",
    )


async def main():
    """Build the demo, enable its queue and launch the app."""
    app = await create_demo()
    app.queue(default_concurrency_limit=50)
    app.launch(show_api=True, show_error=True)


# Only start the app when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())