Spaces:
Sleeping
Sleeping
import spaces | |
import gradio as gr | |
import edge_tts | |
import asyncio | |
import tempfile | |
import os | |
import re # Import the regular expression module | |
# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice catalogue as a label -> short-name mapping.

    Each key is a display label of the form
    ``"<ShortName> - <Locale> (<Gender>)"`` and each value is the voice's
    ``ShortName`` as reported by ``edge_tts.list_voices()``.
    """
    catalogue = {}
    for entry in await edge_tts.list_voices():
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        catalogue[label] = entry['ShortName']
    return catalogue
# Two-character paragraph prefixes that force a specific voice, mapped to the
# edge-tts short name to use. Short names are stored directly so no label
# parsing is needed for them.
_PREFIX_VOICES = {
    "1F": "en-US-EmmaNeural",
    "2F": "en-US-JennyNeural",
    "3F": "en-HK-YanNeural",
    "1M": "en-US-GuyNeural",
    "2M": "en-AU-WilliamNeural",
    "1C": "en-GB-MaisieNeural",  # child voice
}

# Used when the paragraph has no prefix and no voice was selected.
_FALLBACK_VOICE = "en-US-AndrewNeural"


def _select_voice(text, voice):
    """Return ``(text_to_speak, voice_short_name)`` for one paragraph.

    A recognised two-character prefix is removed from the text and decides
    the voice. Otherwise the text is returned unchanged and the short name
    is taken from ``voice`` (the part before ``" - "`` of a dropdown label),
    falling back to ``_FALLBACK_VOICE`` when ``voice`` is empty.
    """
    prefix = text[:2]
    if prefix in _PREFIX_VOICES:
        return text[2:], _PREFIX_VOICES[prefix]
    return text, (voice or _FALLBACK_VOICE).split(" - ")[0]


# Text-to-speech function for a single paragraph
async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one paragraph into a temporary MP3 file.

    Args:
        text: Paragraph to speak. It may start with one of the prefixes in
            ``_PREFIX_VOICES`` (``1F``, ``2F``, ``3F``, ``1M``, ``2M``,
            ``1C``) to override the voice for this paragraph only.
        voice: Voice label (``"<ShortName> - ..."``) or bare short name used
            when the paragraph has no prefix. May be empty.
        rate: Speech-rate adjustment in percent; converted to an integer.
        pitch: Pitch adjustment in Hz; converted to an integer.

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` when there is
        nothing to speak (blank text, or a prefix with no text after it).
        The caller owns the returned file and must delete it.

    Raises:
        Whatever ``edge_tts`` raises; the temporary file is removed first.
    """
    if not text or not text.strip():
        return None

    spoken_text, voice_short_name = _select_voice(text, voice)
    if not spoken_text.strip():
        # The paragraph consisted of a voice prefix only.
        return None

    # The ``+d`` format only accepts integers, while UI sliders may hand
    # over floats such as 5.0.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(
        spoken_text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Reserve a unique file name, then close the handle before writing to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Do not leave an orphaned temp file behind when synthesis fails
        # or the task is cancelled.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    return tmp_path
# Main text-to-speech function that processes paragraphs
async def text_to_speech(text, voice, rate, pitch):
    """Convert text to speech paragraph by paragraph and merge the result.

    Paragraphs are separated by one or more blank lines. Each paragraph is
    synthesized with ``paragraph_to_speech`` (so it may carry its own voice
    prefix) and the resulting MP3 files are concatenated byte-wise.

    Args:
        text: Full input text.
        voice: Selected voice label; required.
        rate: Speech-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        A ``(audio_path, warning)`` pair. ``audio_path`` is the path of the
        resulting ``.mp3`` file or ``None``. ``warning`` is the result of
        ``gr.Warning(...)`` when input validation fails, otherwise ``None``.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Consume the whole run of line breaks (LF or CRLF) so that no stray
    # "\r\n" is left at the start of the next paragraph, and strip each
    # paragraph so a voice prefix is still found at its very beginning.
    paragraphs = [
        p.strip() for p in re.split(r'(?:\r?\n){2,}', text) if p.strip()
    ]

    audio_files = []
    try:
        for paragraph in paragraphs:
            audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
            if audio_path:
                audio_files.append(audio_path)
    except BaseException:
        # A later paragraph failed: remove the files already produced.
        for filename in audio_files:
            if os.path.exists(filename):
                os.remove(filename)
        raise

    if not audio_files:
        return None, None  # No audio generated
    if len(audio_files) == 1:
        return audio_files[0], None

    # Simple concatenation for now - consider using a proper audio editing
    # library for smoother transitions.
    # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
    # returns a name and is deprecated because of the resulting race.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
            combined_audio_path = outfile.name
            for filename in audio_files:
                with open(filename, 'rb') as infile:
                    outfile.write(infile.read())
    finally:
        # Clean up individual files whether or not merging succeeded.
        for filename in audio_files:
            if os.path.exists(filename):
                os.remove(filename)
    return combined_audio_path, None
# Gradio interface function
def tts_interface(text, voice, rate, pitch):
    """Synchronous entry point handed to Gradio.

    Runs the ``text_to_speech`` coroutine to completion on a fresh event
    loop and passes its ``(audio, warning)`` pair through unchanged.
    """
    pending = text_to_speech(text, voice, rate, pitch)
    audio_path, notice = asyncio.run(pending)
    return audio_path, notice
# Create Gradio application | |
import gradio as gr | |
async def create_demo():
    """Build the Gradio interface for the text-to-speech application.

    The voice list is fetched with ``get_voices()`` and offered in a
    dropdown together with an empty entry. Rate and pitch are exposed as
    integer-step sliders.

    Returns:
        The configured ``gr.Interface``; the caller is expected to launch it.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewNeural - en-US (Male)"  # 👈 Pick one of the available voices
    if default_voice not in voices:
        # The preferred voice is missing from the fetched list; start with
        # the empty choice instead of a value the dropdown does not offer.
        default_voice = ""

    # Built from single-line pieces so the rendered text does not depend on
    # source indentation. The wording matches the actual splitting rule
    # (paragraphs are separated by a blank line).
    description = (
        "\n"
        "Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, "
        "1M:US_Guy, 2M:AU_William, 1C: Childvoice\n"
        "Enter your text, select a voice, and adjust the speech rate and pitch.\n"
        "The application will process your text paragraph by paragraph "
        "(separated by a blank line).\n"
    )

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with a blank line."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output.",
        analytics_enabled=False,
        # Flagging mode is documented as a string; "never" disables it.
        allow_flagging="never",
    )
    return demo
# Run the application
if __name__ == "__main__":
    # Building the interface is asynchronous (it fetches the voice list),
    # so it is driven to completion here before the server is started.
    demo = asyncio.run(
        create_demo()
    )
    demo.launch()