Spaces:
Sleeping
Sleeping
File size: 10,652 Bytes
5021a0c 63f1d6d 4337b98 5021a0c 552e1db e42e13d a4e47b1 0596274 44c7b6f 284179e 7042e46 552e1db 5021a0c b0718f9 3e534e0 aa5ea31 552e1db b0718f9 552e1db b0718f9 552e1db b0718f9 2f93aef 63f1d6d 2f93aef 284179e b0718f9 aa5ea31 e4e3d3e 9733186 7a3f365 e4e3d3e 9733186 7a3f365 fdb31c0 7a3f365 2f93aef b0718f9 2f93aef b0718f9 552e1db 3e534e0 2f93aef b0718f9 552e1db 2f93aef b0718f9 552e1db b0718f9 2f93aef b0718f9 552e1db 2f93aef b0718f9 552e1db 2f93aef b0718f9 552e1db b0718f9 552e1db 9995337 b0718f9 ef4c8b8 284179e 552e1db 9995337 4f3af59 9995337 4f3af59 8462870 b0718f9 2f93aef 0b4c9e5 2f93aef 284179e 2f93aef 552e1db 2f93aef 284179e 2f93aef d3fce98 f067030 2f93aef 4337b98 5021a0c 284179e 2f93aef 284179e 2f93aef 284179e 2f93aef 284179e 2f93aef 284179e 2f93aef 5021a0c 552e1db 284179e 5021a0c 552e1db 5021a0c b0718f9 5021a0c 552e1db 2f93aef 5021a0c 92f530c 5021a0c 552e1db 5021a0c b0718f9 5021a0c 2f93aef 5021a0c 552e1db |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re # Import the regular expression module
from pathlib import Path
from pydub import AudioSegment
def get_silence(duration_ms=1000):
    """Synthesize a silent MP3 clip and return the path of the temp file holding it.

    Args:
        duration_ms: length of the silence in milliseconds (default 1000).

    Returns:
        Path to a NamedTemporaryFile (delete=False) containing mono, 24 kHz,
        32-bit silence encoded with libmp3lame at 48 kbps. The caller owns the
        file and is responsible for deleting it.
    """
    # Build the silent segment and normalize its parameters in one chain.
    clip = (
        AudioSegment.silent(duration=duration_ms, frame_rate=24000)  # 24 kHz
        .set_channels(1)       # mono
        .set_sample_width(4)   # 32-bit (4 bytes per sample)
    )
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    try:
        # Export with explicit codec flags so the clip matches the files that
        # edge-tts produces and raw byte concatenation stays coherent.
        clip.export(
            tmp.name,
            format="mp3",
            bitrate="48k",
            parameters=[
                "-ac", "1",                # mono
                "-ar", "24000",            # sample rate
                "-sample_fmt", "s32",      # 32-bit samples
                "-codec:a", "libmp3lame",  # MP3 codec
            ],
        )
    finally:
        tmp.close()
    return tmp.name
# Fetch the catalogue of available edge-tts voices.
async def get_voices():
    """Return a mapping of display label -> voice ShortName.

    Labels look like "en-GB-SoniaNeural - en-GB (Female)" and are used to
    populate the Gradio dropdown; the ShortName is what edge-tts consumes.
    """
    catalogue = await edge_tts.list_voices()
    labels = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labels[label] = entry['ShortName']
    return labels
# Text-to-speech function for a single paragraph with SS handling
async def paragraph_to_speech(text, voice, rate, pitch):
    """Render one paragraph to one or more MP3 temp files.

    The paragraph may contain:
      * silence markers "SS<seconds>" (e.g. "SS2.0"), rendered via get_silence();
      * speaker prefixes ("1F".."4F", "1M".."4M", "1O" old man, "1C" child),
        optionally followed by a pitch override (e.g. "1F-20", "1M24").

    Args:
        text:  paragraph text (possibly prefixed as above).
        voice: dropdown label "ShortName - Locale (Gender)" or empty.
        rate:  rate adjustment in percent (must be an int — formatted with :+d).
        pitch: pitch adjustment in Hz (must be an int — formatted with :+d).

    Returns:
        (audio_segments, silence_durations) where audio_segments is a list of
        temp-file paths (None for empty split parts).
        NOTE(review): silence_durations is never appended to anywhere in this
        function — it is always returned empty, so the caller's interleaving
        logic that consumes it is effectively dead.
    """
    # Hard-coded voice roster, as "<ShortName> - <Locale> (<Gender>)" labels;
    # only the ShortName before " - " is passed to edge-tts.
    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
    voice1F ="en-GB-SoniaNeural - en-GB (Female)"
    voice2 = "en-GB-RyanNeural - en-GB (Male)"
    voice2F = "en-US-JennyNeural - en-US (Female)"
    voice3 ="en-US-BrianMultilingualNeural - en-US (Male)" # good for reading
    voice3F = "en-HK-YanNeural - en-HK (Female)"
    voice4 = "en-GB-ThomasNeural - en-GB (Male)"
    voice4F ="en-US-EmmaNeural - en-US (Female)"
    voice5 = "en-GB-RyanNeural - en-GB (Male)" # Old Man (pitched/slowed below)
    voice6 = "en-GB-MaisieNeural - en-GB (Female)" # Child
    if not text.strip():
        return None, [] # Return None for audio path and empty list for silence
    audio_segments = []
    silence_durations = []
    # Split on SS markers; the capturing group keeps the markers in the parts list.
    parts = re.split(r'(SS\d+\.?\d*)', text)
    for part in parts:
        if re.match(r'SS\d+\.?\d*', part): # This part is a silence marker.
            # Earlier revisions inserted a pre-rendered Silence.mp3 from disk
            # (Path(__file__).parent / "Silence.mp3"); silence is now always
            # synthesized on the fly instead.
            silence_duration = float(part[2:]) * 1000 # Convert to milliseconds
            print(f"Silence.mp3 file NOT FOUND")
            silence_file_path = get_silence(silence_duration) # Store the returned filename
            audio_segments.append(silence_file_path) # Use the stored filename
        elif part.strip():
            # detect=1 means a recognized 2-char speaker prefix was found and
            # (absent a pitch override) must be stripped with part[2:] below.
            detect=0
            processed_text = part
            current_voice = voice
            current_rate = rate
            current_pitch = pitch
            if part.startswith("1F"):
                detect=1
                current_voice = voice1F.split(" - ")[0]
                current_pitch = 25  # brighten this voice by default
            elif part.startswith("2F"):
                detect=1
                current_voice = voice2F.split(" - ")[0]
            elif part.startswith("3F"):
                detect=1
                current_voice = voice3F.split(" - ")[0]
            elif part.startswith("4F"):
                # NOTE(review): detect is NOT set here (commented out), so "4F"
                # without a pitch override is never stripped by the else branch
                # below — confirm whether this is intentional.
                #detect=1
                current_voice = voice4F.split(" - ")[0]
            elif part.startswith("1M"):
                detect=1
                current_voice = voice1.split(" - ")[0]
            elif part.startswith("2M"):
                detect=1
                current_voice = voice2.split(" - ")[0]
            elif part.startswith("3M"):
                detect=1
                current_voice = voice3.split(" - ")[0]
            elif part.startswith("4M"):
                detect=1
                current_voice = voice4.split(" - ")[0]
            elif part.startswith("1O"): # Old man voice
                detect=1
                current_voice = voice5.split(" - ")[0]
                current_pitch = -20
                current_rate = -10
            elif part.startswith("1C"): # Child voice
                detect=1
                current_voice = voice6.split(" - ")[0]
            else:
                # No recognized prefix: use the selected voice or the default.
                # NOTE(review): default_voice is not defined in this scope (it
                # is local to create_demo), so this line raises NameError when
                # voice is falsy. Unreachable today only because text_to_speech
                # rejects empty voice before calling here — confirm and fix.
                current_voice = (voice or default_voice).split(" - ")[0]
                processed_text=part[:]
            # Step 1: look for "<letters><optional -><digits>" anywhere in the
            # part, e.g. "1F-20" matches "F-20" -> prefix "F", pitch -20.
            match = re.search(r'[A-Za-z]+\-?\d+', part)
            if match:
                # Extract the letters (prefix) and the signed number (pitch).
                prefix = ''.join([ch for ch in match.group() if ch.isalpha()])
                number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-']))
                current_pitch = number
                # Step 2: remove the matched prefix+number from the text, then
                # drop any leftover leading prefix characters.
                new_text = re.sub(r'[A-Za-z]+\-?\d+', '', part, count=1).strip()
                processed_text = new_text[len(prefix):]
            else:
                if detect:
                    # Prefix present but no pitch override: strip the 2-char tag.
                    processed_text = part[2:]
            # edge-tts expects signed strings like "+5%" / "-20Hz"; the :+d
            # format requires current_rate / current_pitch to be ints.
            rate_str = f"{current_rate:+d}%"
            pitch_str = f"{current_pitch:+d}Hz"
            communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
                await communicate.save(tmp_path)
            audio_segments.append(tmp_path)
        else:
            audio_segments.append(None) # Empty split part — placeholder only.
    return audio_segments, silence_durations
# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert full input text to a single MP3 file.

    Splits the text on double quotes (straight and curly), renders each piece
    via paragraph_to_speech, and concatenates the resulting MP3 files.

    Args:
        text:  full input text; quote characters delimit paragraphs.
        voice: dropdown label; must be non-empty.
        rate:  speech-rate adjustment in percent.
        pitch: pitch adjustment in Hz.

    Returns:
        (combined_audio_path, warning) — warning is a gr.Warning / message on
        validation failure, otherwise None.
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    # Split on straight (") and curly (\u201c \u201d) double quotes. The curly
    # quotes are written as escapes: the previous literal had been corrupted
    # by an encoding round-trip (mojibake), breaking curly-quote splitting.
    paragraphs = [p.strip() for p in re.split('[\u201c\u201d"]', text) if p.strip()]
    final_audio_segments = []
    for paragraph in paragraphs:
        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            for i, path in enumerate(audio_paths):
                final_audio_segments.append(path)
                # NOTE(review): paragraph_to_speech currently always returns an
                # empty silence_times list, so this interleave never fires.
                if i < len(silence_times):
                    final_audio_segments.append(silence_times[i])
    # Paths are str; placeholders are None. If nothing real was produced, bail.
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, None  # No actual audio generated
    # (The old `all(not isinstance(...))` check was unreachable here: it is the
    # exact negation of the `any` above, so it could never be True past it.)
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp.
    fd, combined_audio_path = tempfile.mkstemp(suffix=".mp3")
    with os.fdopen(fd, 'wb') as outfile:
        for segment in final_audio_segments:
            if isinstance(segment, str):
                try:
                    # Raw MP3 byte concatenation; players tolerate the joined
                    # frame streams since all parts share the same parameters.
                    with open(segment, 'rb') as infile:
                        outfile.write(infile.read())
                    os.remove(segment)  # Clean up individual files
                except FileNotFoundError:
                    print(f"Warning: Audio file not found: {segment}")
    return combined_audio_path, None
# Gradio interface function: synchronous bridge into the async TTS pipeline.
@spaces.GPU
def tts_interface(text, voice, rate, pitch):
    """Run text_to_speech on a fresh event loop and return (audio_path, warning)."""
    return asyncio.run(text_to_speech(text, voice, rate, pitch))
# Create Gradio application
import gradio as gr
async def create_demo():
    """Build and return the Gradio Interface (async: the voice list is fetched via edge-tts)."""
    voices = await get_voices()
    # Preselected dropdown entry; must match one of the labels from get_voices().
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
    Default = <b>"en-US-AndrewMultilingualNeural - en-US (Male),
    other voices 1F:en-GB-SoniaNeural, 2F:en-US-JennyNeural, 3F:en-HK-YanNeural, 4F:en-US-EmmaNeural
    1M:en-AU-WilliamNeural, 2M:en-GB-RyanNeural, 3M:en-US-BrianMultilingualNeural, 4M:en-GB-ThomasNeural
    1C: en-GB-MaisieNeural (Childvoice), 1O = en-GB-RyanNeural (OldMan)"</b>
    You can insert silence using the marker 'SS##' example "SS2.0"
    Enter your text, select a voice, and adjust the speech rate and pitch. Can also set like 1F-20 or 1M24.
    """
    # Empty first choice lets the user deselect; validation happens downstream.
    voice_choices = [""] + list(voices.keys())
    return gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
            gr.Dropdown(choices=voice_choices, label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="TTS using Edge Engine.. ENGLISH!",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging=False,
    )
# Run the application
if __name__ == "__main__":
    # Building the UI needs an event loop (create_demo awaits the voice list);
    # launch() then blocks serving the Gradio app.
    demo = asyncio.run(create_demo())
    demo.launch()