import spaces
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
import struct
import wave
# Helper: write a temporary WAV file consisting entirely of silence
def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
    """Write a WAV file of pure silence into *temp_dir* and return its path.

    Args:
        duration: Silence length in seconds; must be > 0.
        temp_dir: Directory in which the file is created.
        sample_rate: Frames per second (default 44100).
        num_channels: Channel count (default mono).
        sample_width: Bytes per sample (default 2, i.e. 16-bit PCM).

    Returns:
        Path of the generated WAV file.

    Raises:
        ValueError: If duration is not positive.
    """
    if duration <= 0:
        raise ValueError("Duration must be positive.")
    frame_count = int(duration * sample_rate)
    out_path = os.path.join(temp_dir, f"silent_{duration}.wav")
    with wave.open(out_path, 'w') as handle:
        handle.setnchannels(num_channels)
        handle.setframerate(sample_rate)
        handle.setsampwidth(sample_width)
        # bytes(n) yields n zero bytes == digital silence for PCM
        handle.writeframes(bytes(frame_count * num_channels * sample_width))
    return out_path
# Function to process text and generate audio for a single paragraph
async def paragraph_to_speech(text, voice, rate, pitch):
    """Render one paragraph to a list of audio segment files.

    The paragraph may contain 'SS<seconds>' silence markers (e.g. 'SS1.2')
    and two-character voice prefixes ('1F', '2M', '1O', ...) at the start of
    a spoken part.

    Args:
        text: Paragraph text; blank text short-circuits to (None, []).
        voice: Default voice display string ("ShortName - locale (Gender)").
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        (audio_segments, []) where audio_segments holds temp-file paths (or
        None placeholders for empty split parts); (None, []) for blank input.
    """
    # Display strings for the narrator voices; the edge-tts short name is
    # the portion before " - ".
    voices = {
        "voice1F": "en-US-EmmaNeural - en-US (Female)",
        "voice2F": "en-US-JennyNeural - en-US (Female)",
        "voice3F": "en-HK-YanNeural - en-HK (Female)",
        "voice1": "en-AU-WilliamNeural - en-AU (Male)",
        "voice2": "it-IT-GiuseppeMultilingualNeural - it-IT (Male)",
        "voice3": "en-US-BrianMultilingualNeural - en-US (Male)",
        "voice4": "en-GB-MaisieNeural - en-GB (Female)",  # Child
        "voice5": "en-GB-RyanNeural - en-GB (Male)"  # Old Man
    }
    # Two-character prefix -> (voices key, pitch override, rate override)
    prefixes = {
        "1F": ("voice1F", None, None),
        "2F": ("voice2F", None, None),
        "3F": ("voice3F", None, None),
        "1M": ("voice1", None, None),
        "2M": ("voice2", None, None),
        "3M": ("voice3", None, None),
        "1C": ("voice4", None, None),
        "1O": ("voice5", -30, -20),  # Old-man voice: lower pitch, slower rate
    }
    if not text.strip():
        return None, []  # Return None for audio path and empty list for silence
    audio_segments = []
    temp_dir = tempfile.gettempdir()
    # Capturing group keeps the 'SS...' markers as their own list entries
    parts = re.split(r'(SS\d+\.?\d*)', text)
    for part in parts:
        if re.match(r'SS\d+\.?\d*', part):
            try:
                silence_duration = float(part[2:])
                audio_segments.append(create_silent_wav(silence_duration, temp_dir))
            except ValueError:
                print(f"Warning: Invalid silence duration format: {part}")
        elif part.strip():
            current_rate = rate
            current_pitch = pitch
            override = prefixes.get(part[:2])
            if override is not None:
                key, pitch_override, rate_override = override
                processed_text = part[2:]
                current_voice = voices[key]
                if pitch_override is not None:
                    current_pitch = pitch_override
                if rate_override is not None:
                    current_rate = rate_override
            else:
                processed_text = part
                current_voice = voice or voices["voice1"]
            # BUG FIX: edge-tts requires the bare short name; previously only
            # the default branch stripped the " - locale (Gender)" suffix, so
            # every prefixed part was sent an invalid voice string.
            current_voice = current_voice.split(" - ")[0]
            # BUG FIX: Gradio sliders deliver floats and %+d requires an int.
            rate_str = f"{int(current_rate):+d}%"
            pitch_str = f"{int(current_pitch):+d}Hz"
            communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
            # Reserve a temp filename, then write after the handle is closed
            # (writing to a still-open NamedTemporaryFile fails on Windows).
            # NOTE(review): edge-tts emits MP3 data even with a .wav suffix;
            # downstream wave-based concatenation will skip it — confirm the
            # intended output format.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            audio_segments.append(tmp_path)
        else:
            audio_segments.append(None)  # Placeholder for an empty split part
    return audio_segments, []  # Silence list unused: silent WAVs are created directly
# Main text-to-speech function that processes paragraphs and silence
async def text_to_speech(text, voice, rate, pitch):
    """Convert the full input text into one combined WAV file.

    Text is split into paragraphs on blank lines; each paragraph is rendered
    by paragraph_to_speech() and the resulting WAV segments are concatenated.

    Args:
        text: Full input text (blank -> Gradio warning).
        voice: Selected voice display string (empty -> Gradio warning).
        rate: Speech rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        (combined_wav_path, None) on success, otherwise (None, warning-or-None).
    """
    if not text.strip():
        return None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    # Paragraphs are separated by one or more blank lines.
    paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
    final_audio_segments = []
    for paragraph in paragraphs:
        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
        if audio_paths:
            final_audio_segments.extend(audio_paths)
    # Segments are file paths (str) or None placeholders; with no real path
    # there is nothing to write. (The original's second check — "only silence
    # markers found" — tested the logically identical condition and was
    # unreachable; silence markers also produce file paths.)
    if not any(isinstance(item, str) for item in final_audio_segments):
        return None, None  # No actual audio generated
    # BUG FIX: tempfile.mktemp() is deprecated and race-prone; create the
    # output file atomically and reuse its name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        combined_audio_path = tmp.name
    with wave.open(combined_audio_path, 'w') as outfile:
        first_audio = True  # Output format is locked to the first readable segment
        sample_rate = None
        num_channels = None
        sample_width = None
        for segment_path in final_audio_segments:
            if not isinstance(segment_path, str):
                continue  # Skip None placeholders
            try:
                with wave.open(segment_path, 'rb') as infile:
                    current_num_channels = infile.getnchannels()
                    current_sample_rate = infile.getframerate()
                    current_sample_width = infile.getsampwidth()
                    frames = infile.readframes(infile.getnframes())
                    if first_audio:
                        num_channels = current_num_channels
                        sample_rate = current_sample_rate
                        sample_width = current_sample_width
                        outfile.setnchannels(num_channels)
                        outfile.setframerate(sample_rate)
                        outfile.setsampwidth(sample_width)
                        first_audio = False
                    elif (current_num_channels != num_channels or
                          current_sample_rate != sample_rate or
                          current_sample_width != sample_width):
                        print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
                        continue
                    outfile.writeframes(frames)
                # BUG FIX: delete after the segment is closed (deleting an
                # open file fails on Windows).
                os.remove(segment_path)  # Clean up individual files
            except wave.Error as e:
                print(f"Warning: Error reading WAV file {segment_path}: {e}")
            except FileNotFoundError:
                print(f"Warning: Audio file not found: {segment_path}")
    return combined_audio_path, None
# Gradio interface function (wrapper to run async code)
def tts_interface_sync(text, voice, rate, pitch):
    """Synchronous Gradio entry point wrapping the async pipeline.

    BUG FIX: the original called the undefined name `tts_interface`, which
    raised NameError on every request; the async worker defined in this file
    is `text_to_speech`.
    """
    return asyncio.run(text_to_speech(text, voice, rate, pitch))
# Gradio interface
async def create_demo():
    """Build and return the Gradio Interface.

    BUG FIX: the original awaited `get_voices()`, which is not defined
    anywhere in this file (despite the comment claiming otherwise) and would
    raise NameError at startup. The voice catalogue is fetched directly from
    edge-tts and formatted as "ShortName - Locale (Gender)", matching the
    display-string convention used by paragraph_to_speech().
    """
    voices_list = await edge_tts.list_voices()
    voices = {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName']
        for v in voices_list
    }
    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
    description = """
    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian, 1C: Childvoice, 1O = OldMan
    You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
    Enter your text, select a voice, and adjust the speech rate and pitch.
    The application will process your text paragraph by paragraph (separated by two blank lines).
    """
    demo = gr.Interface(
        fn=tts_interface_sync,
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
        description=description,
        article="Process text paragraph by paragraph for smoother output and insert silence markers.",
        analytics_enabled=False,
        allow_flagging=False
    )
    return demo
# Run the application
if __name__ == "__main__":
    # create_demo() is async (it awaits the edge-tts voice listing), so it
    # must be driven with asyncio.run before launching the blocking UI loop.
    demo = asyncio.run(create_demo())
    demo.launch()