Edge_TTS_NGHIA_transcript

Sleeping

File size: 5,289 Bytes

2aef491
63f1d6d
 
 
 
 
13280d7
63f1d6d
0596274
63f1d6d
 
 
 
 
d3fce98
 
5f7c847
0596274
5f7c847
 
 
 
 
63f1d6d
d3fce98
5f7c847
e25395e
5f7c847
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25395e
5f7c847
 
6afb2ee
63f1d6d
 
6afb2ee
63f1d6d
 
 
d3fce98
 
 
 
 
 
 
 
 
027d5d3
 
 
d3fce98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63f1d6d
 
7cba29e
63f1d6d
 
 
 
 
d81bde6
 
63f1d6d
 
0596274
d81bde6
5f7c847
d3fce98
 
d81bde6
d3fce98
63f1d6d
 
 
d3fce98
0596274
63f1d6d
 
 
 
 
 
 
d3fce98
d81bde6
d3fce98
63f1d6d

import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re  # Import the regular expression module


# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice catalogue as a label -> short-name mapping.

    Each key has the form ``"<ShortName> - <Locale> (<Gender>)"`` and maps to
    the voice's ``ShortName``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled

# Voice used when the caller passes no voice selection at all.
_DEFAULT_VOICE = "en-US-AndrewNeural"

# Two-character paragraph prefixes that override the selected voice.
# Values are plain edge-tts short names (no " - locale (gender)" suffix), so
# they can be handed to edge_tts.Communicate unchanged.
_PREFIX_VOICES = {
    "1F": "en-US-EmmaNeural",
    "2F": "en-US-JennyNeural",
    "3F": "en-HK-YanNeural",
    "1M": "en-US-GuyNeural",
    "2M": "en-AU-WilliamNeural",
    "1C": "en-GB-MaisieNeural",  # child voice
}


def _resolve_voice(text, voice):
    """Return ``(voice_short_name, spoken_text)`` for one paragraph."""
    prefix_voice = _PREFIX_VOICES.get(text[:2])
    if prefix_voice is not None:
        # Drop the two-character voice tag so it is not read aloud.
        return prefix_voice, text[2:]
    # Dropdown labels look like "<ShortName> - <Locale> (<Gender>)".
    return (voice or _DEFAULT_VOICE).split(" - ")[0], text


# Text-to-speech function for a single paragraph
async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one paragraph to a temporary MP3 file.

    A paragraph starting with one of the tags ``1F``, ``2F``, ``3F``, ``1M``,
    ``2M`` or ``1C`` is spoken with the voice mapped to that tag and the tag is
    removed from the spoken text. Otherwise the short name is taken from
    ``voice`` (the part before ``" - "``), falling back to a default voice
    when ``voice`` is empty.

    Args:
        text: Paragraph text, optionally starting with a voice tag.
        voice: Voice label or short name selected by the user.
        rate: Speech-rate adjustment in percent; formatted as e.g. ``+10%``.
        pitch: Pitch adjustment in Hz; formatted as e.g. ``-5Hz``.

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` when there is nothing
        to speak (blank text, or only a voice tag).
    """
    if not text or not text.strip():
        return None

    voice_short_name, spoken_text = _resolve_voice(text, voice)
    if not spoken_text.strip():
        # Only a voice tag was given; there is nothing to synthesize.
        return None

    # int() keeps the "+d" format valid when the UI delivers floats.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(spoken_text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then close the handle before edge-tts writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Includes task cancellation: do not leave an orphaned temp file behind.
        os.remove(tmp_path)
        raise
    return tmp_path

# Main text-to-speech function that processes paragraphs
async def text_to_speech(text, voice, rate, pitch):
    """Convert text to speech paragraph by paragraph and return one MP3 path.

    The text is split on blank lines; each non-blank paragraph is synthesized
    with ``paragraph_to_speech`` and the resulting MP3 files are concatenated
    byte-wise into a single file.

    Args:
        text: Full input text.
        voice: Selected voice label; must be non-empty.
        rate: Speech-rate adjustment passed through to each paragraph.
        pitch: Pitch adjustment passed through to each paragraph.

    Returns:
        A ``(audio_path, warning)`` pair. ``audio_path`` is ``None`` when no
        audio was produced. ``warning`` is the result of ``gr.Warning(...)``
        when input validation fails, otherwise ``None``.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Paragraphs are separated by at least one blank line (LF or CRLF endings).
    paragraphs = [p for p in re.split(r'\r?\n\r?\n+', text) if p.strip()]

    audio_files = []
    try:
        for paragraph in paragraphs:
            audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
            if audio_path:
                audio_files.append(audio_path)
    except BaseException:
        # A later paragraph failed: remove the files already rendered.
        for leftover in audio_files:
            os.remove(leftover)
        raise

    if not audio_files:
        return None, None  # No audio generated

    if len(audio_files) == 1:
        return audio_files[0], None

    # Simple concatenation for now - consider using a proper audio editing
    # library for smoother transitions.
    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # tempfile.mktemp, which only returns a name that may be taken before use.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
        combined_audio_path = outfile.name
        for filename in audio_files:
            with open(filename, 'rb') as infile:
                outfile.write(infile.read())
            os.remove(filename)  # Clean up individual files
    return combined_audio_path, None

# Gradio interface function
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Synchronous entry point for Gradio.

    Runs the ``text_to_speech`` coroutine to completion and hands back its
    ``(audio, warning)`` pair.
    """
    outcome = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, notice = outcome
    return audio_path, notice

# Create Gradio application
import gradio as gr

async def create_demo():
    """Build the Gradio interface for the text-to-speech app.

    Fetches the available voices via ``get_voices`` to populate the voice
    dropdown, then wires ``tts_interface`` to a text box, the dropdown and two
    sliders (rate and pitch).

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewNeural - en-US (Male)"  # 👈 Pick one of the available voices
    # Paragraphs are split on a single blank line, so the help text says so.
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:US_Guy, 2M:AU_William, 1C: Childvoice
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (separated by a blank line).
    """

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with a blank line."),
            # The leading "" choice lets the user clear the selection.
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output.",
        analytics_enabled=False,
        # Gradio expects the string mode here; the boolean form is deprecated.
        allow_flagging="never",
    )
    return demo

# Run the application: build the interface, then start the Gradio server.
# Only executes when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    # create_demo is a coroutine function, so it is driven with asyncio.run.
    demo = asyncio.run(create_demo())
    demo.launch()