import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re  # regular expressions, used for the SS silence and speaker-prefix markers
import wave
# Function to create a temporary silent WAV file
def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
    """Creates a temporary WAV file containing silence.

    Args:
        duration (float): Duration of silence in seconds.
        temp_dir (str): Directory to save the temporary file.
        sample_rate (int): Sample rate of the audio (samples per second).
        num_channels (int): Number of audio channels (1 for mono, 2 for stereo).
        sample_width (int): Sample width in bytes (e.g., 2 for 16-bit).

    Returns:
        str: Path to the temporary silent WAV file.
    """
    num_frames = int(duration * sample_rate)
    silent_data = b'\x00' * (num_frames * num_channels * sample_width)
    temp_wav_path = os.path.join(temp_dir, f"silent_{duration}.wav")
    with wave.open(temp_wav_path, 'w') as wf:
        wf.setnchannels(num_channels)
        wf.setframerate(sample_rate)
        wf.setsampwidth(sample_width)
        wf.writeframes(silent_data)
    return temp_wav_path
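
# Example (illustrative): create_silent_wav(1.2, tempfile.gettempdir()) writes
# about 1.2 s of 16-bit mono silence at 44.1 kHz and returns the file's path.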
# Fallback voice used when the dropdown is left blank. ShortNames are what
# edge_tts.Communicate expects.
DEFAULT_VOICE = "en-US-AndrewMultilingualNeural"

# Text-to-speech function for a single paragraph, with SS silence handling
async def paragraph_to_speech(text, voice, rate, pitch):
    # Hard-coded voices, selectable with two-character prefixes in the text.
    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
    voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
    voice3 = "en-US-BrianMultilingualNeural - en-US (Male)"  # good for reading
    voice1F = "en-US-EmmaNeural - en-US (Female)"
    voice2F = "en-US-JennyNeural - en-US (Female)"
    voice3F = "en-HK-YanNeural - en-HK (Female)"
    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # child
    voice5 = "en-GB-RyanNeural - en-GB (Male)"  # old man
    if not text.strip():
        return None, []  # no audio paths, no silence times
    audio_segments = []
    temp_dir = tempfile.gettempdir()
    # Split on silence markers such as "SS1.2"; the capture group keeps the
    # markers in the list so they can be materialized as silent WAV segments.
    parts = re.split(r'(SS\d+\.?\d*)', text)
    # Map two-character speaker prefixes to their voices.
    prefix_voices = {
        "1F": voice1F, "2F": voice2F, "3F": voice3F,
        "1M": voice1, "2M": voice2, "3M": voice3,
        "1C": voice4, "1O": voice5,
    }
    for part in parts:
        if re.match(r'SS\d+\.?\d*', part):
            try:
                silence_duration = float(part[2:])
                # Assuming default WAV parameters for the silence
                silent_wav_path = create_silent_wav(silence_duration, temp_dir)
                audio_segments.append(silent_wav_path)
            except ValueError:
                print(f"Warning: Invalid silence duration format: {part}")
        elif part.strip():
            processed_text = part
            current_rate = rate
            current_pitch = pitch
            prefix = part[:2]
            if prefix in prefix_voices:
                processed_text = part[2:]
                current_voice = prefix_voices[prefix].split(" - ")[0]
                if prefix == "1O":  # old-man preset: lower and slower
                    current_pitch = -30
                    current_rate = -20
            else:
                current_voice = (voice or DEFAULT_VOICE).split(" - ")[0]
            # Sliders may deliver floats; edge-tts expects strings like "+5%" and "-30Hz".
            rate_str = f"{int(current_rate):+d}%"
            pitch_str = f"{int(current_pitch):+d}Hz"
            communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            audio_segments.append(tmp_path)
        else:
            audio_segments.append(None)  # whitespace-only part; keep a placeholder
    return audio_segments, []  # silence is already materialized as WAV segments
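
# Caveat (assumption): edge-tts streams MP3 frames by default, so the ".wav"
# files saved above may not contain RIFF/WAV data despite their suffix, and the
# wave-based concatenation in text_to_speech() would then skip them with a
# warning. A minimal normalization sketch, assuming pydub and ffmpeg were added
# to this Space (neither is part of the original file):
#
#     from pydub import AudioSegment
#
#     def ensure_wav(path):
#         """Re-encode any audio file to 44.1 kHz, 16-bit, mono WAV."""
#         wav_path = path + ".norm.wav"
#         seg = AudioSegment.from_file(path)
#         seg = seg.set_frame_rate(44100).set_channels(1).set_sample_width(2)
#         seg.export(wav_path, format="wav")
#         return wav_path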
# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    # Paragraphs are separated by blank lines (see the UI placeholder and description).
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
    final_audio_segments = []
    for paragraph in paragraphs:
        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            final_audio_segments.extend(audio_paths)
    # Keep only real file paths; None entries are whitespace-only placeholders.
    final_audio_segments = [p for p in final_audio_segments if isinstance(p, str)]
    if not final_audio_segments:
        return None, None  # no actual audio generated
    # tempfile.mktemp is deprecated and racy; create the output file safely instead.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        combined_audio_path = tmp.name
    with wave.open(combined_audio_path, 'w') as outfile:
        first_audio = True
        sample_rate = None
        num_channels = None
        sample_width = None
        for segment_path in final_audio_segments:
            try:
                with wave.open(segment_path, 'rb') as infile:
                    current_num_channels = infile.getnchannels()
                    current_sample_rate = infile.getframerate()
                    current_sample_width = infile.getsampwidth()
                    frames = infile.readframes(infile.getnframes())
                if first_audio:
                    # The first readable segment fixes the output format.
                    num_channels = current_num_channels
                    sample_rate = current_sample_rate
                    sample_width = current_sample_width
                    outfile.setnchannels(num_channels)
                    outfile.setframerate(sample_rate)
                    outfile.setsampwidth(sample_width)
                    first_audio = False
                elif (current_num_channels != num_channels or
                      current_sample_rate != sample_rate or
                      current_sample_width != sample_width):
                    print(f"Warning: Audio segment {segment_path} has a different format. Skipping.")
                    continue
                outfile.writeframes(frames)
                os.remove(segment_path)  # clean up the individual segment file
            except wave.Error as e:
                print(f"Warning: Error reading WAV file {segment_path}: {e}")
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment_path}")
    return combined_audio_path, None
# Gradio interface function
def tts_interface(text, voice, rate, pitch):
    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    return audio, warning
async def get_voices():
    voices_list = await edge_tts.list_voices()
    # edge-tts voice entries expose "Locale" (e.g. "en-US"), not "LocaleName".
    voices_dict = {v["ShortName"]: f"{v['Name']} - {v['Locale']} ({v['Gender']})" for v in voices_list}
    return voices_dict
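
# Illustrative shape of the mapping (actual strings come from the live voice list):
#   {"en-US-AndrewMultilingualNeural":
#    "Microsoft Server Speech Text to Speech Voice (en-US, AndrewMultilingualNeural) - en-US (Male)", ...}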
# Create Gradio application
async def create_demo():
    voices = await get_voices()
    # The dropdown lists voice ShortNames, so the default must be a ShortName too.
    default_voice = "en-US-AndrewMultilingualNeural"  # 👈 pick one of the available voices
    description = """
    Default = male. Other voices: 1F: US Emma, 2F: US Jenny, 3F: HK Yan, 1M: AU William, 2M: IT Giuseppe, 3M: US Brian, 1C: child voice, 1O: old man.
    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (paragraphs are separated by blank lines).
    """
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with blank lines. Use 'SS[duration]' for silence."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging="never"  # gr.Interface expects "never"/"auto"/"manual" here
    )
    return demo
# Run the application
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()
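
# Suggested requirements.txt for this Space (inferred from the imports above,
# not part of the original file):
#   gradio
#   edge-tts
#   spaces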