Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
import torchaudio
|
4 |
+
import numpy as np
|
5 |
+
import tempfile
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import librosa
|
9 |
+
import soundfile as sf
|
10 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
11 |
+
from datasets import load_dataset
|
12 |
+
import warnings
|
13 |
+
import gc
|
14 |
+
# Silence every Python warning process-wide.  No category or module filter is
# given, so this also hides deprecation warnings raised by the libraries
# imported above.
warnings.filterwarnings("ignore")
|
15 |
+
|
16 |
+
class VoiceCloningTTS:
    """SpeechT5 text-to-speech with a feature-based "voice profile".

    The constructor loads the SpeechT5 processor, acoustic model and HiFi-GAN
    vocoder on CPU together with one default speaker embedding.  A user voice
    profile is derived by nudging that default embedding with summary
    statistics (MFCC, spectral, pitch and chroma) computed from an uploaded
    sample; it is not produced by a speaker-verification model.
    """

    # Longest text (in characters) passed to the model; longer input is cut.
    MAX_TEXT_CHARS = 500
    # Longest voice sample (in seconds) analysed; longer audio is cut.
    MAX_AUDIO_SECONDS = 30
    # Weight of the feature-derived offset added to the default embedding.
    MODIFICATION_SCALE = 0.05

    def __init__(self):
        """Load the SpeechT5 models and the default speaker embedding.

        Raises:
            Exception: Any error raised while loading a model or the
                embeddings dataset is printed and then re-raised unchanged.
        """
        # Use CPU for HF Spaces to avoid memory issues
        self.device = torch.device("cpu")
        print(f"Using device: {self.device}")

        try:
            print("Loading SpeechT5 processor...")
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

            print("Loading SpeechT5 TTS model...")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            self.model.to(self.device)
            self.model.eval()  # inference only

            print("Loading SpeechT5 vocoder...")
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
            self.vocoder.to(self.device)
            self.vocoder.eval()

            # Default voice: one fixed x-vector from the CMU ARCTIC set,
            # given a leading batch dimension.
            print("Loading speaker embeddings...")
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            self.default_speaker_embeddings = (
                torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(self.device)
            )

            self.user_speaker_embeddings = None
            self.sample_rate = 16000

            print("✅ TTS system initialized successfully!")

        except Exception as e:
            print(f"❌ Error initializing TTS system: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def extract_speaker_embedding(self, audio_path):
        """Build a speaker embedding from the audio file at *audio_path*.

        Args:
            audio_path: Path of an audio file readable by ``torchaudio.load``.

        Returns:
            tuple: ``(embedding, message)``.  ``embedding`` is a tensor with
            the same shape as ``self.default_speaker_embeddings``, or ``None``
            when anything fails; ``message`` is a user-facing status string.
            Errors are reported through the message, never raised.
        """
        try:
            print(f"Processing audio file: {audio_path}")
            audio_numpy = self._load_waveform(audio_path)

            print("Extracting audio features...")
            features = self._extract_features(audio_numpy)

            speaker_embedding = self._features_to_embedding(features)

            print("✅ Speaker embedding created successfully!")
            return speaker_embedding, "✅ Voice profile extracted successfully! You can now generate speech in this voice."

        except Exception as e:
            # UI boundary: turn any failure into a status message.
            print(f"❌ Error in extract_speaker_embedding: {str(e)}")
            return None, f"❌ Error processing audio: {str(e)}"

    def _load_waveform(self, audio_path):
        """Return the file as a peak-normalised mono array at ``self.sample_rate``."""
        waveform, sample_rate = torchaudio.load(audio_path)
        print(f"Original audio shape: {waveform.shape}, sample rate: {sample_rate}")

        # Resample if necessary
        if sample_rate != self.sample_rate:
            print(f"Resampling from {sample_rate} to {self.sample_rate}")
            resampler = torchaudio.transforms.Resample(sample_rate, self.sample_rate)
            waveform = resampler(waveform)

        # Convert to mono by averaging the channels
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
            print("Converted to mono")

        # Zero-pad up to one second so the feature extractors get enough frames
        min_length = self.sample_rate
        if waveform.shape[1] < min_length:
            waveform = torch.nn.functional.pad(waveform, (0, min_length - waveform.shape[1]))
            print("Padded audio to minimum length")

        # Cap the length for memory efficiency
        max_length = self.MAX_AUDIO_SECONDS * self.sample_rate
        if waveform.shape[1] > max_length:
            waveform = waveform[:, :max_length]
            print(f"Truncated audio to {self.MAX_AUDIO_SECONDS} seconds")

        # Peak-normalise; the epsilon avoids dividing by zero on silent input
        waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
        return waveform.squeeze().numpy()

    def _extract_features(self, audio_numpy):
        """Return a 1-D array of summary audio features for *audio_numpy*.

        Falls back to four basic waveform statistics when the librosa-based
        extraction raises.
        """
        try:
            # MFCC features (mel-frequency cepstral coefficients)
            mfccs = librosa.feature.mfcc(y=audio_numpy, sr=self.sample_rate, n_mfcc=13)
            mfcc_mean = np.mean(mfccs, axis=1)
            mfcc_std = np.std(mfccs, axis=1)

            # Spectral features
            spectral_centroids = librosa.feature.spectral_centroid(y=audio_numpy, sr=self.sample_rate)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_numpy, sr=self.sample_rate)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_numpy, sr=self.sample_rate)
            zero_crossing_rate = librosa.feature.zero_crossing_rate(audio_numpy)

            # Pitch: mean over the bins where piptrack reported a pitch
            pitches, _magnitudes = librosa.piptrack(y=audio_numpy, sr=self.sample_rate)
            voiced = pitches > 0
            pitch_mean = np.mean(pitches[voiced]) if np.any(voiced) else 0

            # Chroma features
            chroma = librosa.feature.chroma_stft(y=audio_numpy, sr=self.sample_rate)
            chroma_mean = np.mean(chroma, axis=1)

            features = np.concatenate([
                mfcc_mean,
                mfcc_std,
                [np.mean(spectral_centroids)],
                [np.mean(spectral_rolloff)],
                [np.mean(spectral_bandwidth)],
                [np.mean(zero_crossing_rate)],
                [pitch_mean],
                chroma_mean,
            ])
            print(f"Extracted {len(features)} audio features")
            return features

        except Exception as e:
            print(f"Error extracting features: {e}")
            # Simple fallback feature extraction
            return np.array([
                np.mean(audio_numpy),
                np.std(audio_numpy),
                np.max(audio_numpy),
                np.min(audio_numpy),
            ])

    def _features_to_embedding(self, features):
        """Turn a feature vector into an L2-normalised speaker embedding."""
        base_embedding = self.default_speaker_embeddings.clone()

        # A single NaN/inf would otherwise propagate to every element of the
        # embedding through the mean/std below, so zero such entries first.
        features = np.asarray(features, dtype=np.float64)
        features = np.where(np.isfinite(features), features, 0.0)

        # Standardise; the epsilon guards against a zero standard deviation.
        features_normalized = (features - np.mean(features)) / (np.std(features) + 1e-8)

        # Zero-pad or truncate the features to the embedding width.
        embedding_size = base_embedding.shape[1]
        modification_vector = np.zeros(embedding_size, dtype=np.float32)
        count = min(len(features_normalized), embedding_size)
        modification_vector[:count] = features_normalized[:count]

        modification_tensor = torch.from_numpy(modification_vector).to(self.device)

        # A small weight keeps the result close to the default embedding.
        speaker_embedding = base_embedding + self.MODIFICATION_SCALE * modification_tensor.unsqueeze(0)
        return torch.nn.functional.normalize(speaker_embedding, p=2, dim=1)

    def synthesize_speech(self, text, use_cloned_voice=True):
        """Convert *text* to speech and write it to a temporary WAV file.

        Args:
            text: Text to speak.  ``None``, empty or whitespace-only text is
                rejected; text longer than ``MAX_TEXT_CHARS`` is truncated.
            use_cloned_voice: Use ``self.user_speaker_embeddings`` when it is
                set; otherwise (or when False) the default voice is used.

        Returns:
            tuple: ``(path, message)``.  ``path`` is the WAV file name, or
            ``None`` on failure.  The file is created with ``delete=False``
            and is not removed here.  Errors are reported through the
            message, never raised.
        """
        try:
            if not text or not text.strip():
                return None, "❌ Please enter some text to convert."

            # Limit text length for memory efficiency
            if len(text) > self.MAX_TEXT_CHARS:
                text = text[:self.MAX_TEXT_CHARS]
                print(f"Text truncated to {self.MAX_TEXT_CHARS} characters for memory efficiency")

            print(f"Synthesizing speech for text: '{text[:50]}...'")

            # Choose speaker embedding
            if use_cloned_voice and self.user_speaker_embeddings is not None:
                speaker_embeddings = self.user_speaker_embeddings
                voice_type = "your cloned voice"
                print("Using cloned voice")
            else:
                speaker_embeddings = self.default_speaker_embeddings
                voice_type = "default voice"
                print("Using default voice")

            # Tokenize text
            inputs = self.processor(text=text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(self.device)

            print("Generating speech...")

            with torch.no_grad():
                # Clear cache before generation
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                speech = self.model.generate_speech(
                    input_ids,
                    speaker_embeddings,
                    vocoder=self.vocoder,
                )

            speech_numpy = speech.cpu().numpy()
            print(f"Generated audio shape: {speech_numpy.shape}")

            # Reserve a unique path, close the handle, then write by name:
            # reopening a still-open NamedTemporaryFile fails on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                output_path = tmp_file.name
            sf.write(output_path, speech_numpy, self.sample_rate)
            print(f"Audio saved to: {output_path}")

            # Clean up memory
            del speech, input_ids
            gc.collect()

            return output_path, f"✅ Speech generated successfully using {voice_type}!"

        except Exception as e:
            # UI boundary: turn any failure into a status message.
            print(f"❌ Error in synthesize_speech: {str(e)}")
            return None, f"❌ Error generating speech: {str(e)}"
|
230 |
+
|
231 |
+
# Initialize the TTS system
# One VoiceCloningTTS instance is built when this module is imported and is
# the object every handler function in this file reads and writes.  Its
# constructor loads the SpeechT5 models, so the import blocks until that is done.
print("π Initializing Voice Cloning TTS System...")
tts_system = VoiceCloningTTS()
|
234 |
+
|
235 |
+
def process_voice_upload(audio_file):
    """Extract a voice profile from an uploaded sample and store it.

    Args:
        audio_file: Path of the uploaded/recorded audio, or ``None`` when the
            audio component is empty.

    Returns:
        tuple: ``(status_message, checkbox_update, button_update)``.  Both
        updates make their component interactive only when a profile was
        extracted successfully.
    """
    if audio_file is None:
        # Drop any earlier profile so a later generation cannot silently
        # reuse a voice the UI no longer shows as loaded.
        tts_system.user_speaker_embeddings = None
        return "❌ Please upload an audio file first.", gr.update(interactive=False), gr.update(interactive=False)

    try:
        speaker_embedding, message = tts_system.extract_speaker_embedding(audio_file)
    except Exception as e:
        tts_system.user_speaker_embeddings = None
        error_msg = f"❌ Error processing audio: {str(e)}"
        return error_msg, gr.update(interactive=False), gr.update(interactive=False)

    # Store the result unconditionally: it is None on failure, which clears
    # a profile left over from a previous upload.
    tts_system.user_speaker_embeddings = speaker_embedding

    if speaker_embedding is None:
        return message, gr.update(interactive=False), gr.update(interactive=False)
    return message, gr.update(interactive=True), gr.update(interactive=True)
|
251 |
+
|
252 |
+
def generate_speech(text, use_cloned_voice):
    """Generate speech for *text* and return it with a status message.

    Args:
        text: Text to speak; ``None``, empty or whitespace-only text is
            rejected without calling the TTS system.
        use_cloned_voice: Whether to prefer the stored voice profile.

    Returns:
        tuple: ``(audio_path, message)``; ``audio_path`` is ``None`` whenever
        no audio was produced.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise before the try block below could report it.
    if not text or not text.strip():
        return None, "❌ Please enter some text to convert."

    try:
        audio_file, message = tts_system.synthesize_speech(text, use_cloned_voice)
        return audio_file, message
    except Exception as e:
        error_msg = f"❌ Error generating speech: {str(e)}"
        return None, error_msg
|
263 |
+
|
264 |
+
def clear_voice_profile():
    """Forget the stored voice profile and lock the dependent controls.

    Returns:
        tuple: ``(status_message, checkbox_update, button_update)`` where both
        updates set ``interactive=False``.
    """
    tts_system.user_speaker_embeddings = None

    status = "π Voice profile cleared. Upload a new audio file to clone a voice."
    checkbox_update = gr.update(interactive=False)
    button_update = gr.update(interactive=False)
    return status, checkbox_update, button_update
|
270 |
+
|
271 |
+
def update_generate_button(text, use_cloned):
    """Enable the generate button only when text and a usable voice exist.

    Args:
        text: Current content of the text box; ``None`` counts as empty.
        use_cloned: Whether the cloned voice is requested.

    Returns:
        A ``gr.update`` whose ``interactive`` flag is True when *text* is not
        blank and either the default voice is selected or a voice profile
        has been stored.
    """
    # `text and ...` keeps a None value from raising on .strip().
    text_ready = bool(text and text.strip())
    voice_ready = (not use_cloned) or (tts_system.user_speaker_embeddings is not None)
    return gr.update(interactive=text_ready and voice_ready)
|
276 |
+
|
277 |
+
# Create Gradio interface optimized for HF Spaces
|
278 |
+
# Build the whole UI inside one Blocks context.  Components are laid out in
# the order they are created: two side-by-side columns (voice upload, text
# entry), then the output area, then a static tips panel.
# NOTE(review): every handler wired below reads/writes the module-level
# `tts_system`, so the stored voice profile looks shared across all visitors
# of the app - confirm whether per-session state is wanted.
with gr.Blocks(
    title="π€ Voice Cloning TTS System",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1000px !important;
        margin: auto !important;
    }
    .header {
        text-align: center;
        margin-bottom: 30px;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 15px;
        color: white;
    }
    .step-box {
        border: 2px solid #e1e5e9;
        border-radius: 12px;
        padding: 20px;
        margin: 15px 0;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .tips-box {
        background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
        border-radius: 12px;
        padding: 20px;
        margin: 20px 0;
        border-left: 5px solid #ff6b6b;
    }
    """
) as demo:

    # Page banner
    gr.HTML("""
    <div class="header">
        <h1>π€ AI Voice Cloning TTS System</h1>
        <p>π Upload your voice sample and convert any text to speech in YOUR voice!</p>
        <p>β¨ Powered by Microsoft SpeechT5 & Advanced Voice Analysis</p>
    </div>
    """)

    with gr.Row():
        # Left column: voice sample upload and profile status
        with gr.Column(scale=1):
            gr.HTML('<div class="step-box"><h3>ποΈ Step 1: Upload Your Voice Sample</h3><p>Record or upload 10-30 seconds of clear English speech</p></div>')

            # type="filepath": the handler receives a path string, not raw samples
            voice_upload = gr.Audio(
                label="π€ Voice Sample (English)",
                type="filepath",
                sources=["upload", "microphone"],
                format="wav"
            )

            upload_status = gr.Textbox(
                label="π Voice Analysis Status",
                interactive=False,
                value="β³ Please upload an audio file to extract your voice profile.",
                lines=2
            )

            clear_btn = gr.Button("ποΈ Clear Voice Profile", variant="secondary", size="sm")

        # Right column: text entry and generation controls
        with gr.Column(scale=1):
            gr.HTML('<div class="step-box"><h3>βοΈ Step 2: Enter Your Text</h3><p>Type the text you want to convert to speech</p></div>')

            text_input = gr.Textbox(
                label="π Text to Convert (Max 500 characters)",
                placeholder="Enter the text you want to convert to speech using your cloned voice...",
                lines=5,
                max_lines=8
            )

            # Starts checked but locked; process_voice_upload unlocks it after
            # a successful upload and clear_voice_profile locks it again.
            use_cloned_voice = gr.Checkbox(
                label="π Use My Cloned Voice",
                value=True,
                interactive=False,
                info="Uncheck to use default voice"
            )

            # Starts disabled; enabled by the handlers wired below.
            generate_btn = gr.Button(
                "π΅ Generate Speech",
                variant="primary",
                interactive=False,
                size="lg"
            )

    gr.HTML('<div class="step-box"><h3>π Step 3: Your Generated Speech</h3></div>')

    with gr.Row():
        with gr.Column():
            # Read-only player fed with the WAV path returned by generate_speech
            output_audio = gr.Audio(
                label="π§ Generated Speech Audio",
                type="filepath",
                interactive=False
            )

            generation_status = gr.Textbox(
                label="β‘ Generation Status",
                interactive=False,
                lines=2
            )

    # Tips and information section
    gr.HTML("""
    <div class="tips-box">
        <h3>π‘ Pro Tips for Best Results:</h3>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-top: 15px;">
            <div>
                <h4>π€ Voice Sample Quality:</h4>
                <ul>
                    <li>Use clear, natural English speech</li>
                    <li>10-30 seconds duration is optimal</li>
                    <li>Minimize background noise</li>
                    <li>Speak at normal pace and volume</li>
                </ul>
            </div>
            <div>
                <h4>π Text Guidelines:</h4>
                <ul>
                    <li>English text works best</li>
                    <li>Keep sentences natural and clear</li>
                    <li>Avoid very long paragraphs</li>
                    <li>Punctuation helps with intonation</li>
                </ul>
            </div>
        </div>
        <div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.7); border-radius: 8px;">
            <strong>π¬ How it works:</strong> The system analyzes your voice's unique characteristics (pitch, tone, formants)
            and creates a personalized voice profile that's used to generate speech that sounds like you!
        </div>
    </div>
    """)

    # Event handlers with proper state management

    # A new (or removed) voice sample re-runs profile extraction and sets the
    # interactivity of the checkbox and the generate button.
    voice_upload.change(
        fn=process_voice_upload,
        inputs=[voice_upload],
        outputs=[upload_status, use_cloned_voice, generate_btn]
    )

    # Editing the text or toggling the checkbox re-evaluates whether the
    # generate button may be pressed.
    text_input.change(
        fn=update_generate_button,
        inputs=[text_input, use_cloned_voice],
        outputs=[generate_btn]
    )

    use_cloned_voice.change(
        fn=update_generate_button,
        inputs=[text_input, use_cloned_voice],
        outputs=[generate_btn]
    )

    # Run synthesis and show the resulting audio plus a status line.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, use_cloned_voice],
        outputs=[output_audio, generation_status]
    )

    # Takes no inputs; resets the stored profile and locks the controls.
    clear_btn.click(
        fn=clear_voice_profile,
        outputs=[upload_status, use_cloned_voice, generate_btn]
    )
|
440 |
+
|
441 |
+
# Launch configuration for Hugging Face Spaces
|
442 |
+
if __name__ == "__main__":
    print("π Starting Voice Cloning TTS System on Hugging Face Spaces...")
    # No share=True: Hugging Face Spaces already serves the app publicly and
    # does not support Gradio share links, while on any other machine the
    # flag would open a public tunnel to this app.
    demo.launch()
|