Spaces:
Sleeping
Sleeping
import spaces | |
import gradio as gr | |
import edge_tts | |
import asyncio | |
import tempfile | |
import os | |
import re # Import the regular expression module | |
from pathlib import Path | |
from pydub import AudioSegment | |
def get_silence(duration_ms=1000):
    """Render a stretch of silence to a temporary MP3 file.

    Args:
        duration_ms: Length of the silence in milliseconds.

    Returns:
        Path of the newly created ``.mp3`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Whatever ``AudioSegment.export`` raises (for example when the
        encoder is unavailable); the temporary file is removed first.
    """
    # 24 kHz, mono, 4 bytes (32 bits) per sample.
    silent_audio = AudioSegment.silent(duration=duration_ms, frame_rate=24000)
    silent_audio = silent_audio.set_channels(1)
    silent_audio = silent_audio.set_sample_width(4)

    # Only reserve a unique path here; the export below reopens it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        output_path = tmp_file.name

    try:
        exported = silent_audio.export(
            output_path,
            format="mp3",
            bitrate="48k",
            parameters=[
                "-ac", "1",               # Mono
                "-ar", "24000",           # Sample rate
                "-sample_fmt", "s32",     # 32-bit samples
                "-codec:a", "libmp3lame"  # MP3 codec
            ]
        )
    except Exception:
        # Do not leave an empty orphan file behind when encoding fails.
        os.remove(output_path)
        raise

    # export() hands back the file object it opened for the output path;
    # close it explicitly instead of relying on garbage collection.
    exported.close()
    return output_path
# Fetch every voice offered by the Edge TTS service
async def get_voices():
    """Return a dict mapping a display label to the voice's short name.

    Each label has the form ``"<ShortName> - <Locale> (<Gender>)"`` and maps
    to the bare ``ShortName`` of that voice.
    """
    label_to_short_name = {}
    for entry in await edge_tts.list_voices():
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        label_to_short_name[label] = entry['ShortName']
    return label_to_short_name
# Text-to-speech for a single paragraph, with voice-prefix and SS (silence) handling

# Voice used when the caller supplies no voice for an un-prefixed text part.
_FALLBACK_VOICE = "en-US-AndrewMultilingualNeural"

# Two-character prefix at the very start of a text part ->
# (voice short name, pitch override in Hz or None, rate override in % or None).
_VOICE_PRESETS = {
    "1F": ("en-GB-SoniaNeural", 25, None),
    "2F": ("en-US-JennyNeural", None, None),
    "3F": ("en-HK-YanNeural", None, None),
    "4F": ("en-US-EmmaNeural", None, None),
    "1M": ("en-AU-WilliamNeural", None, None),
    "2M": ("en-GB-RyanNeural", None, None),
    "3M": ("en-US-BrianMultilingualNeural", None, None),  # good for reading
    "4M": ("en-GB-ThomasNeural", None, None),
    "1O": ("en-GB-RyanNeural", -20, -10),  # old man
    "1C": ("en-GB-MaisieNeural", None, None),  # child
}

# "SS" followed by a number of seconds, e.g. SS2 or SS1.5.
_SILENCE_TAG = re.compile(r'SS\d+\.?\d*')
_SILENCE_SPLIT = re.compile(r'(SS\d+\.?\d*)')

# A preset prefix, optionally followed directly by a signed pitch such as
# "1F-20" or "1M24". Used with .match(), so it only applies at the start.
_PRESET_DIRECTIVE = re.compile(
    "(" + "|".join(re.escape(key) for key in _VOICE_PRESETS) + r")(-?\d+)?"
)


def _resolve_voice_directive(part, voice, rate, pitch):
    """Work out voice, rate, pitch and spoken text for one text part.

    Returns a ``(voice_short_name, rate, pitch, spoken_text)`` tuple. Only a
    directive at the very beginning of ``part`` is honoured, so ordinary
    words that happen to contain letters followed by digits are left alone.
    """
    directive = _PRESET_DIRECTIVE.match(part)
    if directive is None:
        # No preset: use the selected voice, given as "<ShortName> - ...".
        return (voice or _FALLBACK_VOICE).split(" - ")[0], rate, pitch, part

    preset_voice, preset_pitch, preset_rate = _VOICE_PRESETS[directive.group(1)]
    if preset_pitch is not None:
        pitch = preset_pitch
    if preset_rate is not None:
        rate = preset_rate
    if directive.group(2) is not None:
        # An explicit number after the prefix wins over the preset pitch.
        pitch = int(directive.group(2))
    return preset_voice, rate, pitch, part[directive.end():]


async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesise one paragraph into a sequence of temporary MP3 files.

    The paragraph is split on silence tags (``SS<seconds>``). Each tag
    becomes a silent clip; each remaining piece of text is spoken with
    edge-tts. A text piece may start with a preset prefix (``1F``, ``2M``,
    ``1O``, ``1C``, ...) optionally followed by a pitch, e.g. ``1F-20``.

    Args:
        text: The paragraph to process.
        voice: Selected voice label (``"<ShortName> - <Locale> (<Gender>)"``),
            used for pieces without a preset prefix.
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        ``(None, [])`` when ``text`` is blank. Otherwise
        ``(audio_segments, silence_durations)`` where ``audio_segments`` has
        one entry per split piece: a file path, or ``None`` for a piece with
        nothing to speak. ``silence_durations`` is always an empty list
        (silence is already rendered into ``audio_segments``).
    """
    if not text.strip():
        return None, []

    audio_segments = []
    silence_durations = []

    for part in _SILENCE_SPLIT.split(text):
        if _SILENCE_TAG.match(part):
            silence_duration = float(part[2:]) * 1000  # seconds -> milliseconds
            print(f"Inserting {silence_duration:.0f} ms of silence")
            audio_segments.append(get_silence(silence_duration))
            continue

        if not part.strip():
            audio_segments.append(None)  # blank piece between tags
            continue

        current_voice, current_rate, current_pitch, processed_text = (
            _resolve_voice_directive(part, voice, rate, pitch)
        )
        if not processed_text.strip():
            # The piece was only a directive; there is nothing to speak.
            audio_segments.append(None)
            continue

        # int() keeps the ":+d" format valid if a slider delivers a float.
        rate_str = f"{int(current_rate):+d}%"
        pitch_str = f"{int(current_pitch):+d}Hz"
        communicate = edge_tts.Communicate(
            processed_text, current_voice, rate=rate_str, pitch=pitch_str
        )

        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_path = tmp_file.name
        try:
            await communicate.save(tmp_path)
        except Exception:
            # Do not leave an orphan temp file behind when synthesis fails.
            os.remove(tmp_path)
            raise
        audio_segments.append(tmp_path)

    return audio_segments, silence_durations
# Main text-to-speech function: splits the input into paragraphs, synthesises each and joins the clips
async def text_to_speech(text, voice, rate, pitch):
    """Convert ``text`` to a single MP3 file.

    The input is split into paragraphs at double-quote characters (straight
    or curly). Each paragraph is passed to ``paragraph_to_speech`` and the
    resulting clips are concatenated in order into one temporary file; the
    individual clips are deleted as they are consumed.

    Args:
        text: The text to convert.
        voice: Selected voice label.
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        ``(combined_audio_path, None)`` on success, ``(None, None)`` when no
        audio clip was produced, or ``(None, <gr.Warning result>)`` when
        ``text`` is blank or no voice is selected.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Split on straight (") and curly (\u201c, \u201d) double quotes. The
    # escapes keep the pattern correct regardless of the file's encoding.
    paragraphs = [p.strip() for p in re.split('[\u201c\u201d"]', text) if p.strip()]

    final_audio_segments = []
    for paragraph in paragraphs:
        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if not audio_paths:
            continue
        for i, path in enumerate(audio_paths):
            final_audio_segments.append(path)
            if i < len(silence_times):
                final_audio_segments.append(silence_times[i])

    # Only string entries are file paths; anything else is a placeholder.
    clip_paths = [item for item in final_audio_segments if isinstance(item, str)]
    if not clip_paths:
        return None, None  # No actual audio generated

    # NamedTemporaryFile creates the file atomically, unlike tempfile.mktemp.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
        combined_audio_path = outfile.name
        for segment in clip_paths:
            try:
                with open(segment, 'rb') as infile:
                    outfile.write(infile.read())
                os.remove(segment)  # Clean up individual files
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment}")

    return combined_audio_path, None
# Synchronous entry point that Gradio calls for each request
def tts_interface(text, voice, rate, pitch):
    """Run the async ``text_to_speech`` pipeline to completion.

    Returns the same ``(audio, warning)`` pair that ``text_to_speech``
    produces for the given text, voice, rate and pitch.
    """
    pending = text_to_speech(text, voice, rate, pitch)
    audio, warning = asyncio.run(pending)
    return audio, warning
# Create Gradio application | |
import gradio as gr | |
async def create_demo():
    """Build the Gradio interface for the text-to-speech app.

    Fetches the available voices via ``get_voices`` to populate the voice
    dropdown, then wires ``tts_interface`` to a text box, the dropdown and
    two sliders (rate and pitch).

    Returns:
        The configured ``gr.Interface``; the caller is expected to launch it.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # Pick one of the available voices
    description = """
Default = <b>"en-US-AndrewMultilingualNeural - en-US (Male),
other voices 1F:en-GB-SoniaNeural, 2F:en-US-JennyNeural, 3F:en-HK-YanNeural, 4F:en-US-EmmaNeural
1M:en-AU-WilliamNeural, 2M:en-GB-RyanNeural, 3M:en-US-BrianMultilingualNeural, 4M:en-GB-ThomasNeural
1C: en-GB-MaisieNeural (Childvoice), 1O = en-GB-RyanNeural (OldMan)"</b>
You can insert silence using the marker 'SS##' example "SS2.0"
Enter your text, select a voice, and adjust the speech rate and pitch. Can also set like 1F-20 or 1M24.
"""
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
            # The leading "" entry lets the user clear the selection.
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="TTS using Edge Engine.. ENGLISH!",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        # Gradio documents this option as one of the strings "never", "auto"
        # or "manual"; a bare boolean is not a documented value.
        allow_flagging="never"
    )
    return demo
# Run the application
if __name__ == "__main__":
    # create_demo is a coroutine function, so it has to be driven to
    # completion with asyncio.run before the interface object is available.
    demo = asyncio.run(create_demo())
    # Start serving the interface returned by create_demo.
    demo.launch()