File size: 8,845 Bytes
5021a0c
 
 
63f1d6d
4337b98
5021a0c
2928892
7042e46
2928892
 
 
 
 
 
 
 
 
7042e46
0596274
2928892
 
 
 
 
 
 
7042e46
2928892
5021a0c
2928892
 
 
 
 
 
 
 
 
 
2f93aef
63f1d6d
2f93aef
 
 
7042e46
2f93aef
 
 
 
 
 
2928892
 
2f93aef
 
 
 
 
 
 
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
2928892
2f93aef
 
ef4c8b8
2928892
 
 
2f93aef
 
 
2928892
 
 
2f93aef
 
 
 
2928892
2f93aef
2928892
2f93aef
 
d3fce98
 
 
 
 
 
2928892
2f93aef
4337b98
5021a0c
2928892
2f93aef
2928892
2f93aef
 
 
 
 
 
 
2928892
 
 
 
 
 
 
 
 
2f93aef
2928892
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f93aef
2928892
2f93aef
 
5021a0c
2928892
 
 
5021a0c
2928892
5021a0c
40b007d
2928892
5021a0c
92f530c
2f93aef
5021a0c
 
 
 
 
2928892
 
2f93aef
5021a0c
 
92f530c
5021a0c
2928892
5021a0c
 
 
2928892
5021a0c
2f93aef
5021a0c
 
 
 
 
 
 
 
2928892
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
import struct
import wave

# Helper: materialize a stretch of silence as a temporary WAV file
def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
    """Write a WAV file of pure silence into *temp_dir* and return its path.

    Args:
        duration: Length of the silence in seconds; must be positive.
        temp_dir: Directory in which the file is created.
        sample_rate: Frames per second of the output WAV.
        num_channels: Number of audio channels.
        sample_width: Bytes per sample.

    Returns:
        Path of the generated WAV file.

    Raises:
        ValueError: If duration is not positive.
    """
    if duration <= 0:
        raise ValueError("Duration must be positive.")

    frame_count = int(duration * sample_rate)
    out_path = os.path.join(temp_dir, f"silent_{duration}.wav")
    with wave.open(out_path, 'w') as handle:
        handle.setnchannels(num_channels)
        handle.setframerate(sample_rate)
        handle.setsampwidth(sample_width)
        # bytes(n) yields n zero bytes, i.e. digital silence for PCM audio
        handle.writeframes(bytes(frame_count * num_channels * sample_width))
    return out_path

# Function to process text and generate audio for a single paragraph
async def paragraph_to_speech(text, voice, rate, pitch):
    """Generate speech and silence segments for one paragraph of text.

    The paragraph may contain silence markers ('SS' followed by a duration in
    seconds, e.g. 'SS1.5') and two-character voice prefixes ('1F', '2M', '1O',
    ...) that switch the voice for the remainder of that part.

    Args:
        text: Paragraph text, possibly containing markers and prefixes.
        voice: Display name of the default voice ("ShortName - locale (Gender)").
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        Tuple (segments, []) where segments is a list of audio file paths
        (with None placeholders for empty parts), or (None, []) if the
        input is blank.
    """
    # Prefix -> (display name, rate override, pitch override); None keeps the
    # caller-supplied rate/pitch.
    prefix_voices = {
        "1F": ("en-US-EmmaNeural - en-US (Female)", None, None),
        "2F": ("en-US-JennyNeural - en-US (Female)", None, None),
        "3F": ("en-HK-YanNeural - en-HK (Female)", None, None),
        "1M": ("en-AU-WilliamNeural - en-AU (Male)", None, None),
        "2M": ("it-IT-GiuseppeMultilingualNeural - it-IT (Male)", None, None),
        "3M": ("en-US-BrianMultilingualNeural - en-US (Male)", None, None),
        "1C": ("en-GB-MaisieNeural - en-GB (Female)", None, None),  # Child
        "1O": ("en-GB-RyanNeural - en-GB (Male)", -20, -30),  # Old man: slower, lower
    }

    if not text.strip():
        return None, []  # Return None for audio path and empty list for silence

    audio_segments = []
    temp_dir = tempfile.gettempdir()
    # Split while keeping the silence markers as their own parts (capturing group).
    parts = re.split(r'(SS\d+\.?\d*)', text)

    for part in parts:
        if re.match(r'SS\d+\.?\d*', part):
            try:
                silence_duration = float(part[2:])
                # create_silent_wav raises ValueError for non-positive durations,
                # which is caught here alongside float() parse failures.
                audio_segments.append(create_silent_wav(silence_duration, temp_dir))
            except ValueError:
                print(f"Warning: Invalid silence duration format: {part}")
        elif part.strip():
            processed_text = part
            current_rate = rate
            current_pitch = pitch

            override = prefix_voices.get(part[:2])
            if override is not None:
                display_name, rate_override, pitch_override = override
                processed_text = part[2:]
                current_voice = display_name
                if rate_override is not None:
                    current_rate = rate_override
                if pitch_override is not None:
                    current_pitch = pitch_override
            else:
                current_voice = voice or prefix_voices["1M"][0]

            # BUG FIX: the original passed the full display string
            # ("Name - locale (Gender)") to edge-tts for prefixed parts; only
            # the default branch stripped it. edge-tts needs the short name.
            short_voice = current_voice.split(" - ")[0]
            # Coerce to int: Gradio sliders may deliver floats, and '%+d'
            # formatting raises on floats.
            rate_str = f"{int(current_rate):+d}%"
            pitch_str = f"{int(current_pitch):+d}Hz"
            communicate = edge_tts.Communicate(processed_text, short_voice, rate=rate_str, pitch=pitch_str)

            # NOTE(review): edge-tts emits MP3 data by default; the '.wav'
            # suffix here may not match the actual encoding — confirm before
            # concatenating these files with the wave module downstream.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            audio_segments.append(tmp_path)
        else:
            audio_segments.append(None)  # Placeholder for an empty part

    return audio_segments, []  # Silence is already materialized as WAV files

# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert multi-paragraph text into a single combined WAV file.

    Args:
        text: Full input text; paragraphs are separated by blank lines.
        voice: Selected display voice name.
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        Tuple (audio_path, warning): path to the combined WAV (or None when
        nothing was generated) and a Gradio warning / message (or None).
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
    final_audio_segments = []

    for paragraph in paragraphs:
        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            final_audio_segments.extend(audio_paths)

    # Segments are file paths (str) or None placeholders for empty parts.
    # BUG FIX: the original had a second check ("all not str") that was
    # logically identical to this one and therefore unreachable dead code.
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, None  # No actual audio generated

    # BUG FIX: tempfile.mktemp() is deprecated and race-prone; mkstemp()
    # creates the file atomically.
    fd, combined_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    with wave.open(combined_audio_path, 'w') as outfile:
        first_audio = True
        sample_rate = None
        num_channels = None
        sample_width = None

        for segment_path in final_audio_segments:
            if not isinstance(segment_path, str):
                continue  # None placeholder for an empty text part
            try:
                with wave.open(segment_path, 'rb') as infile:
                    current_num_channels = infile.getnchannels()
                    current_sample_rate = infile.getframerate()
                    current_sample_width = infile.getsampwidth()
                    frames = infile.readframes(infile.getnframes())

                if first_audio:
                    # The first readable segment fixes the output format.
                    num_channels = current_num_channels
                    sample_rate = current_sample_rate
                    sample_width = current_sample_width
                    outfile.setnchannels(num_channels)
                    outfile.setframerate(sample_rate)
                    outfile.setsampwidth(sample_width)
                    first_audio = False
                elif (current_num_channels != num_channels or
                      current_sample_rate != sample_rate or
                      current_sample_width != sample_width):
                    print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
                    os.remove(segment_path)  # BUG FIX: original leaked skipped files
                    continue

                outfile.writeframes(frames)
                os.remove(segment_path)  # Clean up individual segment files
            except wave.Error as e:
                print(f"Warning: Error reading WAV file {segment_path}: {e}")
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment_path}")

    return combined_audio_path, None

# Gradio interface function (wrapper to run async code)
def tts_interface_sync(text, voice, rate, pitch):
    """Synchronous Gradio entry point that drives the async TTS pipeline.

    BUG FIX: the original called `tts_interface`, a name defined nowhere in
    this file, which raised NameError on every invocation. The async pipeline
    is `text_to_speech`.
    """
    return asyncio.run(text_to_speech(text, voice, rate, pitch))

# Gradio interface
async def create_demo():
    """Assemble and return the Gradio Interface for the TTS application.

    NOTE(review): relies on a module-level coroutine `get_voices()` that is
    not defined in this file — confirm it is provided elsewhere.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (separated by two blank lines).
    """

    # Build each component up front so the Interface call stays readable.
    text_box = gr.Textbox(
        label="Input Text",
        lines=5,
        placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence.",
    )
    voice_dropdown = gr.Dropdown(
        choices=[""] + list(voices.keys()),
        label="Select Voice",
        value=default_voice,
    )
    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
    pitch_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
    audio_out = gr.Audio(label="Generated Audio", type="filepath")
    warning_out = gr.Markdown(label="Warning", visible=False)

    return gr.Interface(
        fn=tts_interface_sync,
        inputs=[text_box, voice_dropdown, rate_slider, pitch_slider],
        outputs=[audio_out, warning_out],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging=False,
    )

# Run the application
if __name__ == "__main__":
    # create_demo is a coroutine, so the interface is built via asyncio.run
    demo = asyncio.run(create_demo())
    demo.launch()