Spaces:
Sleeping
Sleeping
File size: 10,652 Bytes
5021a0c 63f1d6d 4337b98 5021a0c 552e1db e42e13d a4e47b1 0596274 44c7b6f 284179e 7042e46 552e1db 5021a0c b0718f9 3e534e0 aa5ea31 552e1db b0718f9 552e1db b0718f9 552e1db b0718f9 2f93aef 63f1d6d 2f93aef 284179e b0718f9 aa5ea31 e4e3d3e 9733186 7a3f365 e4e3d3e 9733186 7a3f365 fdb31c0 7a3f365 2f93aef b0718f9 2f93aef b0718f9 552e1db 3e534e0 2f93aef b0718f9 552e1db 2f93aef b0718f9 552e1db b0718f9 2f93aef b0718f9 552e1db 2f93aef b0718f9 552e1db 2f93aef b0718f9 552e1db b0718f9 552e1db 9995337 b0718f9 ef4c8b8 284179e 552e1db 9995337 4f3af59 9995337 4f3af59 8462870 b0718f9 2f93aef 0b4c9e5 2f93aef 284179e 2f93aef 552e1db 2f93aef 284179e 2f93aef d3fce98 f067030 2f93aef 4337b98 5021a0c 284179e 2f93aef 284179e 2f93aef 284179e 2f93aef 284179e 2f93aef 284179e 2f93aef 5021a0c 552e1db 284179e 5021a0c 552e1db 5021a0c b0718f9 5021a0c 552e1db 2f93aef 5021a0c 92f530c 5021a0c 552e1db 5021a0c b0718f9 5021a0c 2f93aef 5021a0c 552e1db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re # Import the regular expression module
from pathlib import Path
from pydub import AudioSegment
def get_silence(duration_ms=1000):
    """Synthesize a silent MP3 clip and return the path of the temp file holding it.

    Args:
        duration_ms: length of the silence in milliseconds (default 1000).

    Returns:
        Path to a NamedTemporaryFile (delete=False) containing mono, 24 kHz,
        32-bit silence encoded with libmp3lame at 48 kbps. The caller owns the
        file and is responsible for deleting it.
    """
    # Build the silent segment and normalize its parameters in one chain.
    clip = (
        AudioSegment.silent(duration=duration_ms, frame_rate=24000)  # 24 kHz
        .set_channels(1)       # mono
        .set_sample_width(4)   # 32-bit (4 bytes per sample)
    )
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    try:
        # Export with explicit codec flags so the clip matches the files that
        # edge-tts produces and raw byte concatenation stays coherent.
        clip.export(
            tmp.name,
            format="mp3",
            bitrate="48k",
            parameters=[
                "-ac", "1",                # mono
                "-ar", "24000",            # sample rate
                "-sample_fmt", "s32",      # 32-bit samples
                "-codec:a", "libmp3lame",  # MP3 codec
            ],
        )
    finally:
        tmp.close()
    return tmp.name
# Fetch the catalogue of available edge-tts voices.
async def get_voices():
    """Return a mapping of display label -> voice ShortName.

    Labels look like "en-GB-SoniaNeural - en-GB (Female)" and are used to
    populate the Gradio dropdown; the ShortName is what edge-tts consumes.
    """
    catalogue = await edge_tts.list_voices()
    labels = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labels[label] = entry['ShortName']
    return labels
# Text-to-speech function for a single paragraph with SS handling
async def paragraph_to_speech(text, voice, rate, pitch):
    """Render one paragraph to one or more MP3 temp files.

    The paragraph may contain:
      * silence markers "SS<seconds>" (e.g. "SS2.0"), rendered via get_silence();
      * speaker prefixes ("1F".."4F", "1M".."4M", "1O" old man, "1C" child),
        optionally followed by a pitch override (e.g. "1F-20", "1M24").

    Args:
        text:  paragraph text (possibly prefixed as above).
        voice: dropdown label "ShortName - Locale (Gender)" or empty.
        rate:  rate adjustment in percent (must be an int — formatted with :+d).
        pitch: pitch adjustment in Hz (must be an int — formatted with :+d).

    Returns:
        (audio_segments, silence_durations) where audio_segments is a list of
        temp-file paths (None for empty split parts).
        NOTE(review): silence_durations is never appended to anywhere in this
        function — it is always returned empty, so the caller's interleaving
        logic that consumes it is effectively dead.
    """
    # Hard-coded voice roster, as "<ShortName> - <Locale> (<Gender>)" labels;
    # only the ShortName before " - " is passed to edge-tts.
    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
    voice1F ="en-GB-SoniaNeural - en-GB (Female)"
    voice2 = "en-GB-RyanNeural - en-GB (Male)"
    voice2F = "en-US-JennyNeural - en-US (Female)"
    voice3 ="en-US-BrianMultilingualNeural - en-US (Male)" # good for reading
    voice3F = "en-HK-YanNeural - en-HK (Female)"
    voice4 = "en-GB-ThomasNeural - en-GB (Male)"
    voice4F ="en-US-EmmaNeural - en-US (Female)"
    voice5 = "en-GB-RyanNeural - en-GB (Male)" # Old Man (pitched/slowed below)
    voice6 = "en-GB-MaisieNeural - en-GB (Female)" # Child
    if not text.strip():
        return None, [] # Return None for audio path and empty list for silence
    audio_segments = []
    silence_durations = []
    # Split on SS markers; the capturing group keeps the markers in the parts list.
    parts = re.split(r'(SS\d+\.?\d*)', text)
    for part in parts:
        if re.match(r'SS\d+\.?\d*', part): # This part is a silence marker.
            # Earlier revisions inserted a pre-rendered Silence.mp3 from disk
            # (Path(__file__).parent / "Silence.mp3"); silence is now always
            # synthesized on the fly instead.
            silence_duration = float(part[2:]) * 1000 # Convert to milliseconds
            print(f"Silence.mp3 file NOT FOUND")
            silence_file_path = get_silence(silence_duration) # Store the returned filename
            audio_segments.append(silence_file_path) # Use the stored filename
        elif part.strip():
            # detect=1 means a recognized 2-char speaker prefix was found and
            # (absent a pitch override) must be stripped with part[2:] below.
            detect=0
            processed_text = part
            current_voice = voice
            current_rate = rate
            current_pitch = pitch
            if part.startswith("1F"):
                detect=1
                current_voice = voice1F.split(" - ")[0]
                current_pitch = 25  # brighten this voice by default
            elif part.startswith("2F"):
                detect=1
                current_voice = voice2F.split(" - ")[0]
            elif part.startswith("3F"):
                detect=1
                current_voice = voice3F.split(" - ")[0]
            elif part.startswith("4F"):
                # NOTE(review): detect is NOT set here (commented out), so "4F"
                # without a pitch override is never stripped by the else branch
                # below — confirm whether this is intentional.
                #detect=1
                current_voice = voice4F.split(" - ")[0]
            elif part.startswith("1M"):
                detect=1
                current_voice = voice1.split(" - ")[0]
            elif part.startswith("2M"):
                detect=1
                current_voice = voice2.split(" - ")[0]
            elif part.startswith("3M"):
                detect=1
                current_voice = voice3.split(" - ")[0]
            elif part.startswith("4M"):
                detect=1
                current_voice = voice4.split(" - ")[0]
            elif part.startswith("1O"): # Old man voice
                detect=1
                current_voice = voice5.split(" - ")[0]
                current_pitch = -20
                current_rate = -10
            elif part.startswith("1C"): # Child voice
                detect=1
                current_voice = voice6.split(" - ")[0]
            else:
                # No recognized prefix: use the selected voice or the default.
                # NOTE(review): default_voice is not defined in this scope (it
                # is local to create_demo), so this line raises NameError when
                # voice is falsy. Unreachable today only because text_to_speech
                # rejects empty voice before calling here — confirm and fix.
                current_voice = (voice or default_voice).split(" - ")[0]
                processed_text=part[:]
            # Step 1: look for "<letters><optional -><digits>" anywhere in the
            # part, e.g. "1F-20" matches "F-20" -> prefix "F", pitch -20.
            match = re.search(r'[A-Za-z]+\-?\d+', part)
            if match:
                # Extract the letters (prefix) and the signed number (pitch).
                prefix = ''.join([ch for ch in match.group() if ch.isalpha()])
                number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-']))
                current_pitch = number
                # Step 2: remove the matched prefix+number from the text, then
                # drop any leftover leading prefix characters.
                new_text = re.sub(r'[A-Za-z]+\-?\d+', '', part, count=1).strip()
                processed_text = new_text[len(prefix):]
            else:
                if detect:
                    # Prefix present but no pitch override: strip the 2-char tag.
                    processed_text = part[2:]
            # edge-tts expects signed strings like "+5%" / "-20Hz"; the :+d
            # format requires current_rate / current_pitch to be ints.
            rate_str = f"{current_rate:+d}%"
            pitch_str = f"{current_pitch:+d}Hz"
            communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
                await communicate.save(tmp_path)
            audio_segments.append(tmp_path)
        else:
            audio_segments.append(None) # Empty split part — placeholder only.
    return audio_segments, silence_durations
# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert full input text to a single MP3 file.

    Splits the text on double quotes (straight and curly), renders each piece
    via paragraph_to_speech, and concatenates the resulting MP3 files.

    Args:
        text:  full input text; quote characters delimit paragraphs.
        voice: dropdown label; must be non-empty.
        rate:  speech-rate adjustment in percent.
        pitch: pitch adjustment in Hz.

    Returns:
        (combined_audio_path, warning) — warning is a gr.Warning / message on
        validation failure, otherwise None.
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    # Split on straight (") and curly (\u201c \u201d) double quotes. The curly
    # quotes are written as escapes: the previous literal had been corrupted
    # by an encoding round-trip (mojibake), breaking curly-quote splitting.
    paragraphs = [p.strip() for p in re.split('[\u201c\u201d"]', text) if p.strip()]
    final_audio_segments = []
    for paragraph in paragraphs:
        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            for i, path in enumerate(audio_paths):
                final_audio_segments.append(path)
                # NOTE(review): paragraph_to_speech currently always returns an
                # empty silence_times list, so this interleave never fires.
                if i < len(silence_times):
                    final_audio_segments.append(silence_times[i])
    # Paths are str; placeholders are None. If nothing real was produced, bail.
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, None  # No actual audio generated
    # (The old `all(not isinstance(...))` check was unreachable here: it is the
    # exact negation of the `any` above, so it could never be True past it.)
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp.
    fd, combined_audio_path = tempfile.mkstemp(suffix=".mp3")
    with os.fdopen(fd, 'wb') as outfile:
        for segment in final_audio_segments:
            if isinstance(segment, str):
                try:
                    # Raw MP3 byte concatenation; players tolerate the joined
                    # frame streams since all parts share the same parameters.
                    with open(segment, 'rb') as infile:
                        outfile.write(infile.read())
                    os.remove(segment)  # Clean up individual files
                except FileNotFoundError:
                    print(f"Warning: Audio file not found: {segment}")
    return combined_audio_path, None
# Gradio interface function: synchronous bridge into the async TTS pipeline.
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Run text_to_speech on a fresh event loop and return (audio_path, warning)."""
    return asyncio.run(text_to_speech(text, voice, rate, pitch))
# Create Gradio application
import gradio as gr
async def create_demo():
    """Build and return the Gradio Interface (async: the voice list is fetched via edge-tts)."""
    voices = await get_voices()
    # Preselected dropdown entry; must match one of the labels from get_voices().
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
    Default = <b>"en-US-AndrewMultilingualNeural - en-US (Male),
    other voices 1F:en-GB-SoniaNeural, 2F:en-US-JennyNeural, 3F:en-HK-YanNeural, 4F:en-US-EmmaNeural
    1M:en-AU-WilliamNeural, 2M:en-GB-RyanNeural, 3M:en-US-BrianMultilingualNeural, 4M:en-GB-ThomasNeural
    1C: en-GB-MaisieNeural (Childvoice), 1O = en-GB-RyanNeural (OldMan)"</b>
    You can insert silence using the marker 'SS##' example "SS2.0"
    Enter your text, select a voice, and adjust the speech rate and pitch. Can also set like 1F-20 or 1M24.
    """
    # Empty first choice lets the user deselect; validation happens downstream.
    voice_choices = [""] + list(voices.keys())
    return gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
            gr.Dropdown(choices=voice_choices, label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="TTS using Edge Engine.. ENGLISH!",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging=False,
    )
# Run the application
if __name__ == "__main__":
    # Building the UI needs an event loop (create_demo awaits the voice list);
    # launch() then blocks serving the Gradio app.
    demo = asyncio.run(create_demo())
    demo.launch()