Spaces:
Sleeping
Sleeping
File size: 5,289 Bytes
2aef491 63f1d6d 13280d7 63f1d6d 0596274 63f1d6d d3fce98 5f7c847 0596274 5f7c847 63f1d6d d3fce98 5f7c847 e25395e 5f7c847 e25395e 5f7c847 6afb2ee 63f1d6d 6afb2ee 63f1d6d d3fce98 027d5d3 d3fce98 63f1d6d 7cba29e 63f1d6d d81bde6 63f1d6d 0596274 d81bde6 5f7c847 d3fce98 d81bde6 d3fce98 63f1d6d d3fce98 0596274 63f1d6d d3fce98 d81bde6 d3fce98 63f1d6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re # Import the regular expression module
# Get all available voices
async def get_voices():
    """Fetch the edge-tts voice catalogue and index it by display label.

    Returns:
        A dict mapping ``"<ShortName> - <Locale> (<Gender>)"`` labels to the
        voice's ``ShortName``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled
# Text-to-speech function for a single paragraph
# Two-character paragraph prefixes that override the voice chosen in the UI.
# Values are edge-tts short names, ready to pass to edge_tts.Communicate.
_PREFIX_VOICES = {
    "1F": "en-US-EmmaNeural",
    "2F": "en-US-JennyNeural",
    "3F": "en-HK-YanNeural",
    "1M": "en-US-GuyNeural",
    "2M": "en-AU-WilliamNeural",
    "1C": "en-GB-MaisieNeural",  # child voice
}

# Used when no prefix matches and the caller supplied no voice.
_DEFAULT_VOICE = "en-US-AndrewNeural"


async def paragraph_to_speech(text, voice, rate, pitch):
    """Synthesize one paragraph to a temporary MP3 file.

    If the paragraph starts with one of the two-character markers in
    ``_PREFIX_VOICES`` (leading whitespace ignored), the marker is removed
    and the associated voice is used; otherwise ``voice`` is used.

    Args:
        text: Paragraph text, optionally starting with a voice marker.
        voice: Either a dropdown label of the form
            ``"<ShortName> - <Locale> (<Gender>)"`` or a bare short name.
            Falls back to ``_DEFAULT_VOICE`` when empty.
        rate: Speech-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in Hz (sent as e.g. ``"-5Hz"``).

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` when there is
        nothing to speak (blank text, or a marker with no text after it).
        The caller is responsible for deleting the file.
    """
    if not text or not text.strip():
        return None

    # Markers are matched after leading whitespace so that a stray newline
    # or space left over from paragraph splitting does not hide them.
    candidate = text.lstrip()
    override = _PREFIX_VOICES.get(candidate[:2])
    if override is not None:
        voice_short_name = override
        spoken_text = candidate[2:]
        if not spoken_text.strip():
            # Marker only: do not send an empty request to the service.
            return None
    else:
        # Dropdown labels look like "<ShortName> - <Locale> (<Gender>)";
        # keep only the short name.
        voice_short_name = (voice or _DEFAULT_VOICE).split(" - ")[0]
        spoken_text = text

    # Sliders may deliver floats; the "d" format code only accepts ints.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    communicate = edge_tts.Communicate(
        spoken_text, voice_short_name, rate=rate_str, pitch=pitch_str
    )
    # delete=False: the file has to outlive this function so the caller can
    # read or serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
# Main text-to-speech function that processes paragraphs
async def text_to_speech(text, voice, rate, pitch):
    """Convert text to speech paragraph by paragraph and return one MP3.

    The text is split on runs of two or more line breaks (LF or CRLF), each
    non-blank paragraph is synthesized with ``paragraph_to_speech``, and the
    resulting files are concatenated byte-wise into a single file.

    Args:
        text: Full input text.
        voice: Voice label selected in the UI; must be non-empty.
        rate: Speech-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        A ``(audio_path, warning)`` tuple. ``audio_path`` is the path of the
        generated MP3 or ``None`` if nothing was produced; ``warning`` is the
        result of ``gr.Warning(...)`` on invalid input, otherwise ``None``.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    # Each line break is matched as a unit, so CRLF input with three or more
    # breaks does not leave a stray "\r\n" at the start of the next paragraph
    # (which would hide a leading voice marker). Paragraphs are stripped for
    # the same reason.
    paragraphs = [
        chunk.strip()
        for chunk in re.split(r'(?:\r?\n){2,}', text)
        if chunk.strip()
    ]

    audio_files = []
    try:
        for paragraph in paragraphs:
            audio_path = await paragraph_to_speech(paragraph, voice, rate, pitch)
            if audio_path:
                audio_files.append(audio_path)
    except BaseException:
        # Do not leave per-paragraph temp files behind when synthesis fails
        # or is cancelled part-way through; the error is re-raised unchanged.
        for leftover in audio_files:
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise

    if not audio_files:
        return None, None  # No audio generated
    if len(audio_files) == 1:
        return audio_files[0], None

    # Simple concatenation for now - consider using a proper audio editing
    # library for smoother transitions.
    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # and race-prone tempfile.mktemp.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as outfile:
        combined_audio_path = outfile.name
        for filename in audio_files:
            with open(filename, 'rb') as infile:
                outfile.write(infile.read())
            os.remove(filename)  # Clean up individual files
    return combined_audio_path, None
# Gradio interface function
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Synchronous entry point for Gradio: run the async pipeline to completion.

    Returns the ``(audio, warning)`` pair produced by ``text_to_speech``.
    """
    conversion = text_to_speech(text, voice, rate, pitch)
    audio_path, notice = asyncio.run(conversion)
    return audio_path, notice
# Create Gradio application
import gradio as gr
async def create_demo():
    """Build the Gradio interface for the paragraph-by-paragraph TTS app.

    Fetches the available voices via ``get_voices`` to populate the voice
    dropdown, then wires ``tts_interface`` to text, voice, rate and pitch
    inputs and to audio and (hidden) warning outputs.

    Returns:
        The configured ``gr.Interface``; the caller is expected to launch it.
    """
    voices = await get_voices()
    default_voice = "en-US-AndrewNeural - en-US (Male)"  # 👈 Pick one of the available voices
    if default_voice not in voices:
        # Preselect the empty entry rather than a label that is not among
        # the dropdown choices.
        default_voice = ""

    # Kept at column 0 so the text is not rendered as an indented code block.
    description = """
Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:US_Guy, 2M:AU_William, 1C: Childvoice
Enter your text, select a voice, and adjust the speech rate and pitch.
The application will process your text paragraph by paragraph (separated by a blank line).
"""

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with a blank line."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
        description=description,
        article="Process text paragraph by paragraph for smoother output.",
        analytics_enabled=False,
        # Gradio expects one of "never" / "auto" / "manual" here, not a bool.
        allow_flagging="never",
    )
    return demo
# Run the application
if __name__ == "__main__":
    # create_demo is async because it fetches the voice list; build the
    # interface first, then start the (blocking) Gradio server.
    demo = asyncio.run(create_demo())
    demo.launch()