Spaces:
Sleeping
Sleeping
import spaces | |
import gradio as gr | |
import edge_tts | |
import asyncio | |
import tempfile | |
import os | |
import re # Import the regular expression module | |
from pathlib import Path | |
# Bundled silence clip, expected next to this file; it is spliced into the output wherever the text contains an SS marker.
SILENCE_PATH = Path(__file__).parent.absolute() / "Silence.mp3" | |
# Get all available voices | |
async def get_voices():
    """Fetch the edge-tts voice catalogue as a label -> short-name mapping.

    Each key is a display label of the form ``"<ShortName> - <Locale> (<Gender>)"``
    and each value is the voice's ``ShortName``.
    """
    catalogue = {}
    for entry in await edge_tts.list_voices():
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        catalogue[label] = short_name
    return catalogue
# Text-to-speech function for a single paragraph with SS handling | |
# Two-character tag at the start of a text chunk ->
# (edge-tts voice short name, rate override in %, pitch override in Hz).
# ``None`` means "keep the value passed in by the caller".
_TAGGED_VOICES = {
    "1F": ("en-US-EmmaNeural", None, None),
    "2F": ("en-US-JennyNeural", None, None),
    "3F": ("en-HK-YanNeural", None, None),
    "1M": ("en-AU-WilliamNeural", None, None),
    "2M": ("it-IT-GiuseppeMultilingualNeural", None, None),
    "3M": ("en-US-BrianMultilingualNeural", None, None),  # good for reading
    "1C": ("en-GB-MaisieNeural", None, None),  # child voice
    "1O": ("en-GB-RyanNeural", -20, -30),  # old man: slower and lower
}

# Used when the caller passes no voice; same label format as the UI dropdown.
_FALLBACK_VOICE = "en-US-AndrewMultilingualNeural - en-US (Male)"


def _select_voice(part, voice, rate, pitch):
    """Return ``(spoken_text, voice_short_name, rate, pitch)`` for one text chunk."""
    # Chunks that follow a silence marker usually start with whitespace, so the
    # tag is looked up on the left-stripped text.
    candidate = part.lstrip()
    preset = _TAGGED_VOICES.get(candidate[:2])
    if preset is None:
        # Dropdown labels look like "<ShortName> - <Locale> (<Gender>)".
        return part, (voice or _FALLBACK_VOICE).split(" - ")[0], rate, pitch
    voice_name, rate_override, pitch_override = preset
    return (
        candidate[2:],
        voice_name,
        rate if rate_override is None else rate_override,
        pitch if pitch_override is None else pitch_override,
    )


async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one paragraph into an ordered list of audio file paths.

    The paragraph is split on silence markers (``SS`` followed by a number,
    e.g. ``SS2`` or ``SS1.5``). Every marker contributes the path of the
    bundled silence clip (``SILENCE_PATH``); the number itself is not used.
    Every text chunk is synthesized with edge-tts into its own temporary MP3.
    A chunk may start with a two-character voice tag (see ``_TAGGED_VOICES``)
    that overrides the selected voice for that chunk only.

    Args:
        text: Paragraph text, possibly containing silence markers and voice tags.
        voice: Dropdown label (``"<ShortName> - ..."``) of the default voice.
        rate: Speech-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        ``(None, [])`` when ``text`` is blank. Otherwise a tuple
        ``(audio_segments, silence_durations)`` where ``audio_segments`` holds
        file paths (``str``) in playback order, with ``None`` placeholders for
        blank chunks, and ``silence_durations`` is always an empty list (kept
        for compatibility with the caller).
    """
    if not text.strip():
        return None, []

    audio_segments = []
    silence_durations = []

    # The capturing group makes re.split keep the markers: even indices are
    # text chunks, odd indices are the silence markers themselves.
    for index, part in enumerate(re.split(r'(SS\d+\.?\d*)', text)):
        if index % 2 == 1:
            if SILENCE_PATH.exists():
                audio_segments.append(str(SILENCE_PATH))
                print(f"Silence added at {SILENCE_PATH}")
            else:
                print(f"Silence.mp3 file NOT FOUND")
            continue

        if not part.strip():
            audio_segments.append(None)
            continue

        spoken_text, voice_name, chunk_rate, chunk_pitch = _select_voice(
            part, voice, rate, pitch
        )
        if not spoken_text.strip():
            # A chunk that held only a voice tag has nothing to synthesize.
            audio_segments.append(None)
            continue

        # Sliders may deliver floats; the ``+d`` format only accepts integers.
        rate_str = f"{int(chunk_rate):+d}%"
        pitch_str = f"{int(chunk_pitch):+d}Hz"
        communicate = edge_tts.Communicate(
            spoken_text, voice_name, rate=rate_str, pitch=pitch_str
        )

        # Reserve a file name, then let edge-tts write to it once it is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_path = tmp_file.name
        try:
            await communicate.save(tmp_path)
        except Exception:
            # Do not leave an orphaned temp file behind when synthesis fails.
            os.remove(tmp_path)
            raise
        audio_segments.append(tmp_path)

    return audio_segments, silence_durations
# Main text-to-speech function that processes paragraphs and silence | |
async def text_to_speech(text, voice, rate, pitch):
    """Convert the full input text into a single combined MP3 file.

    The text is split on double-quote characters, so quoted passages and the
    text between them are synthesized as separate paragraphs. Each paragraph
    goes through ``paragraph_to_speech`` and the resulting files are
    concatenated byte-for-byte in order.

    Args:
        text: Raw input text.
        voice: Dropdown label of the selected voice; must be non-empty.
        rate: Speech-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        ``(combined_path, None)`` on success, ``(None, None)`` when no audio
        was produced, or ``(None, <result of gr.Warning(...)>)`` when the text
        is blank or no voice is selected.
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]

    final_audio_segments = []
    for paragraph in paragraphs:
        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            for i, path in enumerate(audio_paths):
                final_audio_segments.append(path)
                if i < len(silence_times):
                    final_audio_segments.append(silence_times[i])

    # Only string entries are file paths; anything else is a placeholder.
    segment_paths = [item for item in final_audio_segments if isinstance(item, str)]
    if not segment_paths:
        return None, None  # No actual audio generated

    # The bundled silence clip is reused by every marker and every request,
    # so it must never be deleted during cleanup.
    silence_clip = str(SILENCE_PATH)

    # NamedTemporaryFile creates the file atomically, unlike tempfile.mktemp.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
        combined_audio_path = outfile.name
        for segment in segment_paths:
            try:
                with open(segment, 'rb') as infile:
                    outfile.write(infile.read())
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment}")
                continue
            if segment != silence_clip:
                os.remove(segment)  # Clean up per-chunk temp files only

    return combined_audio_path, None
# Gradio interface function | |
def tts_interface(text, voice, rate, pitch):
    """Synchronous callback for the Gradio interface.

    Runs the async ``text_to_speech`` pipeline to completion on a fresh event
    loop and hands back its ``(audio, warning)`` pair unchanged.
    """
    outcome = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, message = outcome
    return audio_path, message
# Create Gradio application | |
import gradio as gr | |
async def create_demo():
    """Build the Gradio interface for the text-to-speech app.

    Fetches the available voices first (hence ``async``), then wires
    ``tts_interface`` to a text box, a voice dropdown, and rate/pitch sliders.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
    # Only preselect the default when the service actually offers it; a value
    # outside the dropdown's choices is not a valid selection.
    initial_voice = default_voice if default_voice in voices else ""

    # NOTE(review): the wording below says paragraphs are separated by blank
    # lines and that SS takes a duration, while the processing code splits on
    # double quotes and ignores the number after SS — confirm the intended text.
    description = """
Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian, 1C: Childvoice, 1O = OldMan
You can insert silence using the marker 'SS' (This will insert a Silence period from the Silence.mp3 file).
Enter your text, select a voice, and adjust the speech rate and pitch.
The application will process your text paragraph by paragraph (separated by two blank lines).
"""
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=initial_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        # Gradio documents string modes here; "never" is the equivalent of the
        # deprecated boolean False.
        allow_flagging="never",
    )
    return demo
# Run the application | |
if __name__ == "__main__":
    # Building the interface is async because the voice list is fetched first.
    demo = asyncio.run(create_demo())
    # Start the Gradio server with its default launch settings.
    demo.launch()