Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,10 @@ import gradio as gr
|
|
12 |
from transformers import AutoModel, logging as trf_logging
|
13 |
from huggingface_hub import login, hf_hub_download, scan_cache_dir
|
14 |
|
|
|
|
|
|
|
|
|
15 |
# Enable verbose logging for transformers
|
16 |
trf_logging.set_verbosity_info()
|
17 |
|
@@ -33,34 +37,73 @@ model = None
|
|
33 |
# Define the repository ID
|
34 |
repo_id = "ai4bharat/IndicF5"
|
35 |
|
36 |
-
# Improved model loading with error handling
|
37 |
-
|
38 |
-
|
39 |
-
# Try direct loading first
|
40 |
-
model = AutoModel.from_pretrained(
|
41 |
-
repo_id,
|
42 |
-
trust_remote_code=True,
|
43 |
-
revision="main"
|
44 |
-
).to(device)
|
45 |
-
print(f"Model loaded successfully! Type: {type(model)}")
|
46 |
|
47 |
-
#
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
try:
|
55 |
-
|
56 |
model = AutoModel.from_pretrained(
|
57 |
repo_id,
|
58 |
trust_remote_code=True,
|
59 |
-
|
|
|
|
|
|
|
|
|
60 |
).to(device)
|
61 |
-
print("Model loaded
|
62 |
except Exception as e2:
|
63 |
print(f"❌ All attempts to load model failed: {e2}")
|
|
|
|
|
|
|
|
|
64 |
|
65 |
# Advanced audio processing functions
|
66 |
def remove_noise(audio_data, threshold=0.01):
|
@@ -147,54 +190,79 @@ def enhance_audio(audio_data):
|
|
147 |
|
148 |
return audio_data
|
149 |
|
150 |
-
# Load audio from URL with improved error handling
|
151 |
-
def load_audio_from_url(url):
|
152 |
print(f"Downloading reference audio from {url}")
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
temp_file.close()
|
161 |
-
print(f"Saved reference audio to temp file: {temp_file.name}")
|
162 |
-
|
163 |
-
# Try different methods to read the audio file
|
164 |
-
audio_data = None
|
165 |
-
sample_rate = None
|
166 |
-
|
167 |
-
# Try SoundFile first
|
168 |
try:
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
-
# Try
|
175 |
try:
|
176 |
-
audio_data, sample_rate =
|
177 |
-
print(f"Audio loaded with
|
178 |
-
except Exception as
|
179 |
-
print(f"
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
return sample_rate, audio_data
|
188 |
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
print("⚠️ Returning default silence as reference audio")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
return 24000, np.zeros(int(24000)) # 1 second of silence at 24kHz
|
199 |
|
200 |
# Split text into chunks for streaming
|
@@ -241,7 +309,7 @@ def split_into_chunks(text, max_length=30):
|
|
241 |
print(f"Split text into {len(final_chunks)} chunks")
|
242 |
return final_chunks
|
243 |
|
244 |
-
# Improved model wrapper
|
245 |
class ModelWrapper:
|
246 |
def __init__(self, model):
|
247 |
self.model = model
|
@@ -274,9 +342,14 @@ class ModelWrapper:
|
|
274 |
def generate(self, text, ref_audio_path, ref_text, **kwargs):
|
275 |
"""Generate speech with improved error handling and preprocessing"""
|
276 |
print(f"\n==== MODEL INFERENCE ====")
|
277 |
-
print(f"Text
|
278 |
print(f"Reference audio path: {ref_audio_path}")
|
279 |
|
|
|
|
|
|
|
|
|
|
|
280 |
# Check if files exist
|
281 |
if not os.path.exists(ref_audio_path):
|
282 |
print(f"⚠️ Reference audio file not found")
|
@@ -292,25 +365,31 @@ class ModelWrapper:
|
|
292 |
{"text": text, "ref_audio_path": ref_audio_path, "ref_text": ref_text},
|
293 |
# Second try: alternative parameter names
|
294 |
{"text": text, "reference_audio": ref_audio_path, "speaker_text": ref_text},
|
295 |
-
# Third try:
|
|
|
|
|
296 |
{"text": text, "reference_audio": ref_audio_path},
|
297 |
-
#
|
298 |
{"text": text},
|
299 |
-
#
|
300 |
{} # Will use positional below
|
301 |
]
|
302 |
|
303 |
-
# Try each parameter combination
|
304 |
for i, params in enumerate(param_combinations):
|
305 |
try:
|
306 |
method = getattr(self.model, method_name)
|
307 |
print(f"Attempt {i+1}: Calling model.{method_name} with {list(params.keys())} parameters")
|
308 |
|
309 |
-
#
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
|
|
|
|
314 |
|
315 |
print(f"✓ Call succeeded with parameters: {list(params.keys())}")
|
316 |
break # Exit loop if successful
|
@@ -344,7 +423,7 @@ class ModelWrapper:
|
|
344 |
# Create model wrapper
|
345 |
model_wrapper = ModelWrapper(model) if model is not None else None
|
346 |
|
347 |
-
# Streaming TTS class with improved audio quality
|
348 |
class StreamingTTS:
|
349 |
def __init__(self):
|
350 |
self.is_generating = False
|
@@ -354,10 +433,15 @@ class StreamingTTS:
|
|
354 |
self.output_file = None
|
355 |
self.all_chunks = []
|
356 |
self.sample_rate = 24000 # Default sample rate
|
|
|
357 |
|
358 |
# Create temp directory
|
359 |
-
|
360 |
-
|
|
|
|
|
|
|
|
|
361 |
|
362 |
def prepare_ref_audio(self, ref_audio, ref_sr):
|
363 |
"""Prepare reference audio with enhanced quality"""
|
@@ -400,13 +484,17 @@ class StreamingTTS:
|
|
400 |
print(f"Error cleaning up: {e}")
|
401 |
|
402 |
def generate(self, text, ref_audio, ref_sr, ref_text):
|
403 |
-
"""Start generation in a new thread"""
|
404 |
if self.is_generating:
|
405 |
print("Already generating speech, please wait")
|
406 |
return
|
|
|
|
|
|
|
|
|
407 |
|
408 |
# Check model is loaded
|
409 |
-
if model_wrapper is None:
|
410 |
print("⚠️ Model is not loaded. Cannot generate speech.")
|
411 |
return
|
412 |
|
@@ -424,9 +512,18 @@ class StreamingTTS:
|
|
424 |
def _process_streaming(self, text, ref_audio, ref_sr, ref_text):
|
425 |
"""Process text in chunks with high-quality audio generation"""
|
426 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
# Prepare reference audio
|
428 |
self.prepare_ref_audio(ref_audio, ref_sr)
|
429 |
|
|
|
|
|
|
|
430 |
# Split text into smaller chunks for faster processing
|
431 |
chunks = split_into_chunks(text)
|
432 |
print(f"Processing {len(chunks)} chunks")
|
@@ -441,15 +538,19 @@ class StreamingTTS:
|
|
441 |
break
|
442 |
|
443 |
chunk_start = time.time()
|
444 |
-
print(f"Processing chunk {i+1}/{len(chunks)}: {chunk}")
|
445 |
|
446 |
# Generate speech for this chunk
|
447 |
try:
|
|
|
|
|
|
|
448 |
with torch.inference_mode():
|
|
|
449 |
chunk_audio = model_wrapper.generate(
|
450 |
-
chunk,
|
451 |
-
self.ref_audio_path,
|
452 |
-
ref_text
|
453 |
)
|
454 |
|
455 |
if chunk_audio is None or (hasattr(chunk_audio, 'size') and chunk_audio.size == 0):
|
@@ -489,7 +590,15 @@ class StreamingTTS:
|
|
489 |
print(f"Total generation time: {total_time:.2f}s")
|
490 |
|
491 |
except Exception as e:
|
492 |
-
print(f"Error in streaming TTS: {str(e)[:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
493 |
finally:
|
494 |
self.is_generating = False
|
495 |
print("Generation complete")
|
@@ -511,7 +620,7 @@ class StreamingTTS:
|
|
511 |
EXAMPLES = [{
|
512 |
"audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
|
513 |
"ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
|
514 |
-
"synth_text": "
|
515 |
}]
|
516 |
|
517 |
print("\nPreloading reference audio...")
|
@@ -530,7 +639,7 @@ def stop_generation():
|
|
530 |
streaming_tts.stop()
|
531 |
return "Generation stopped"
|
532 |
|
533 |
-
# Gradio interface
|
534 |
with gr.Blocks() as iface:
|
535 |
gr.Markdown("## 🚀 IndicF5 Malayalam TTS")
|
536 |
|
@@ -574,21 +683,33 @@ with gr.Blocks() as iface:
|
|
574 |
if ref_audio is None:
|
575 |
return None, "⚠️ Reference audio not loaded. Cannot generate speech.", "Error: Reference audio not loaded"
|
576 |
|
|
|
|
|
|
|
577 |
# Capture stdout for debug purposes
|
578 |
import io
|
579 |
from contextlib import redirect_stdout
|
580 |
f = io.StringIO()
|
581 |
with redirect_stdout(f):
|
582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
|
584 |
debug_log = f.getvalue()
|
585 |
|
586 |
# Add a delay to ensure file is created
|
587 |
-
time.sleep(
|
588 |
|
589 |
audio_path = streaming_tts.get_current_audio()
|
590 |
if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
|
591 |
-
return audio_path, "
|
592 |
else:
|
593 |
return None, "Starting generation... please wait", debug_log
|
594 |
|
@@ -602,5 +723,7 @@ def exit_handler():
|
|
602 |
import atexit
|
603 |
atexit.register(exit_handler)
|
604 |
|
|
|
605 |
print("Starting Gradio interface...")
|
|
|
606 |
iface.launch()
|
|
|
12 |
from transformers import AutoModel, logging as trf_logging
|
13 |
from huggingface_hub import login, hf_hub_download, scan_cache_dir
|
14 |
|
15 |
+
# Increase timeout for transformers HTTP requests
|
16 |
+
import os
|
17 |
+
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "300" # 5 minutes timeout
|
18 |
+
|
19 |
# Enable verbose logging for transformers
|
20 |
trf_logging.set_verbosity_info()
|
21 |
|
|
|
37 |
# Define the repository ID
|
38 |
repo_id = "ai4bharat/IndicF5"
|
39 |
|
40 |
+
# Improved model loading with error handling and cache checking
|
41 |
+
def _load_indicf5(**load_kwargs):
    """Instantiate the model for ``repo_id`` and move it to ``device``.

    Every loading strategy shares ``repo_id`` and ``trust_remote_code=True``;
    only the extra keyword arguments differ, so they are forwarded verbatim
    to ``AutoModel.from_pretrained``.
    """
    return AutoModel.from_pretrained(
        repo_id,
        trust_remote_code=True,
        **load_kwargs,
    ).to(device)


def load_model_with_retry(max_retries=3, retry_delay=5):
    """Load the model into the module-level ``model`` variable.

    Strategies are tried in order and the function returns as soon as one
    succeeds:

    1. If the Hugging Face cache already holds ``repo_id``, load it with
       ``local_files_only=True``.
    2. Up to ``max_retries`` regular loads, sleeping between failed attempts.
       The wait starts at ``retry_delay`` seconds and grows by a factor of
       1.5 after each failure.
    3. One final load with the fallback keyword arguments.

    The function never raises: every failure is printed, and if all
    strategies fail ``model`` is left untouched so the app can continue
    without a model.

    Args:
        max_retries: Number of regular load attempts (step 2).
        retry_delay: Initial wait in seconds between regular attempts.

    Returns:
        None. The result is published through the global ``model``.
    """
    global model

    # First, check if model is already in cache
    print("Checking if model is in cache...")
    try:
        cache_info = scan_cache_dir()
        # Exact comparison: a substring test would also match other cached
        # repos whose id merely contains this one.
        model_in_cache = any(repo.repo_id == repo_id for repo in cache_info.repos)
        if model_in_cache:
            print(f"Model {repo_id} found in cache, loading locally...")
            model = _load_indicf5(local_files_only=True)
            print("Model loaded from cache successfully!")
            return
    except Exception as e:
        # Covers both a failed cache scan and a failed local load; either
        # way we fall through to the network attempts below.
        print(f"Cache check failed: {e}")

    # If not in cache or cache check failed, try loading with retries
    delay = retry_delay  # local copy so the parameter itself is not mutated
    for attempt in range(max_retries):
        try:
            print(f"Loading {repo_id} model (attempt {attempt+1}/{max_retries})...")
            # NOTE(review): `use_auth_token` is reported as deprecated by
            # recent transformers releases in favour of `token`; kept as-is
            # here - confirm against the pinned transformers version.
            model = _load_indicf5(
                revision="main",
                use_auth_token=hf_token,  # Use token if available
                low_cpu_mem_usage=True,  # Reduce memory usage
            )
        except Exception as e:
            print(f"⚠️ Attempt {attempt+1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Waiting {delay} seconds before retrying...")
                time.sleep(delay)
                delay *= 1.5  # Grow the wait geometrically
            continue

        print(f"Model loaded successfully! Type: {type(model)}")

        # Purely diagnostic introspection. It is kept outside the load's
        # try-block so that an attribute raising on access cannot discard a
        # model that was loaded successfully and trigger a needless reload.
        try:
            model_methods = [
                name
                for name in dir(model)
                if not name.startswith('_') and callable(getattr(model, name))
            ]
            print(f"Available model methods: {model_methods[:10]}...")
        except Exception as e:
            print(f"Could not list model methods: {e}")

        return  # Success, exit function

    # If all attempts failed, try one last time with fallback options
    try:
        print("Trying with fallback options...")
        model = _load_indicf5(
            revision="main",
            local_files_only=False,
            use_auth_token=hf_token,
            force_download=False,
            resume_download=True,
        )
        print("Model loaded with fallback options!")
    except Exception as e2:
        print(f"❌ All attempts to load model failed: {e2}")
        print("Will continue without model loaded.")
|
104 |
+
|
105 |
+
# Call the improved loading function
|
106 |
+
load_model_with_retry()
|
107 |
|
108 |
# Advanced audio processing functions
|
109 |
def remove_noise(audio_data, threshold=0.01):
|
|
|
190 |
|
191 |
return audio_data
|
192 |
|
193 |
+
# Load audio from URL with improved error handling and retries
|
194 |
+
def _decode_audio_bytes(content):
    """Decode raw audio bytes by writing them to a temporary ``.wav`` file.

    SoundFile is tried first and librosa is used as a fallback. The
    temporary file is always removed, including when writing or decoding
    raises.

    Args:
        content: Raw bytes of the downloaded audio file.

    Returns:
        ``(audio_data, sample_rate)`` on success, ``(None, None)`` when both
        readers fail.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
    temp_path = temp_file.name
    try:
        # Close the handle before the readers open the file by name.
        with temp_file:
            temp_file.write(content)
        print(f"Saved reference audio to temp file: {temp_path}")

        # Try SoundFile first
        try:
            audio_data, sample_rate = sf.read(temp_path)
            print(f"Audio loaded with SoundFile: {sample_rate}Hz, {len(audio_data)} samples")
            return audio_data, sample_rate
        except Exception as sf_error:
            print(f"SoundFile failed: {sf_error}")

        # Try librosa as fallback
        try:
            audio_data, sample_rate = librosa.load(temp_path, sr=None)
            print(f"Audio loaded with librosa: {sample_rate}Hz, shape={audio_data.shape}")
            return audio_data, sample_rate
        except Exception as lr_error:
            print(f"Librosa also failed: {lr_error}")

        return None, None
    finally:
        # Clean up temp file on every path so failed attempts do not leak it
        try:
            os.unlink(temp_path)
        except OSError as cleanup_error:
            print(f"Failed to remove temp file {temp_path}: {cleanup_error}")


def load_audio_from_url(url, max_retries=3):
    """Download reference audio from ``url`` and return it as samples.

    The download is attempted up to ``max_retries`` times with a 60 second
    request timeout. A successful download is decoded and passed through
    ``enhance_audio``. If no attempt yields audio, the local file
    ``backup_reference.wav`` is used when it exists; otherwise one second of
    silence at 24 kHz is returned.

    Args:
        url: Location of the reference audio file.
        max_retries: Maximum number of download attempts.

    Returns:
        A ``(sample_rate, audio_data)`` tuple. This function does not raise
        for download or decode failures; they are printed instead.
    """
    print(f"Downloading reference audio from {url}")

    for attempt in range(max_retries):
        has_attempts_left = attempt < max_retries - 1
        try:
            # Use a longer timeout
            response = requests.get(url, timeout=60)  # 60 second timeout

            if response.status_code == 200:
                try:
                    audio_data, sample_rate = _decode_audio_bytes(response.content)
                    if audio_data is not None:
                        # Apply audio enhancement to the reference
                        audio_data = enhance_audio(audio_data)
                        return sample_rate, audio_data
                except Exception as e:
                    print(f"Failed to process audio data: {e}")
            else:
                print(f"Failed to download audio: status code {response.status_code}")

        except requests.exceptions.Timeout:
            if has_attempts_left:
                wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, ...
                print(f"Request timed out. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("All retry attempts failed due to timeout.")
        except Exception as e:
            print(f"Error downloading audio: {e}")
            if has_attempts_left:
                time.sleep(5)

    # If we reach here, all download attempts failed.
    # Try to load a local backup audio if provided
    backup_path = "backup_reference.wav"
    if os.path.exists(backup_path):
        try:
            audio_data, sample_rate = sf.read(backup_path)
            print(f"Loaded backup reference audio: {sample_rate}Hz")
            return sample_rate, audio_data
        except Exception as e:
            print(f"Failed to load backup audio: {e}")

    # Announce the silence fallback only once it is really what is returned.
    print("⚠️ Returning default silence as reference audio")
    return 24000, np.zeros(int(24000))  # 1 second of silence at 24kHz
|
267 |
|
268 |
# Split text into chunks for streaming
|
|
|
309 |
print(f"Split text into {len(final_chunks)} chunks")
|
310 |
return final_chunks
|
311 |
|
312 |
+
# Improved model wrapper with timeout handling
|
313 |
class ModelWrapper:
|
314 |
def __init__(self, model):
|
315 |
self.model = model
|
|
|
342 |
def generate(self, text, ref_audio_path, ref_text, **kwargs):
|
343 |
"""Generate speech with improved error handling and preprocessing"""
|
344 |
print(f"\n==== MODEL INFERENCE ====")
|
345 |
+
print(f"Text to generate: '{text}'") # Make sure this is the text we want to generate
|
346 |
print(f"Reference audio path: {ref_audio_path}")
|
347 |
|
348 |
+
# Check if model is loaded
|
349 |
+
if self.model is None:
|
350 |
+
print("⚠️ Model is not loaded. Cannot generate speech.")
|
351 |
+
return np.zeros(int(24000)) # Return silence
|
352 |
+
|
353 |
# Check if files exist
|
354 |
if not os.path.exists(ref_audio_path):
|
355 |
print(f"⚠️ Reference audio file not found")
|
|
|
365 |
{"text": text, "ref_audio_path": ref_audio_path, "ref_text": ref_text},
|
366 |
# Second try: alternative parameter names
|
367 |
{"text": text, "reference_audio": ref_audio_path, "speaker_text": ref_text},
|
368 |
+
# Third try: alternative parameter names 2
|
369 |
+
{"text": text, "reference_audio": ref_audio_path, "reference_text": ref_text},
|
370 |
+
# Fourth try: just text and audio
|
371 |
{"text": text, "reference_audio": ref_audio_path},
|
372 |
+
# Fifth try: just text
|
373 |
{"text": text},
|
374 |
+
# Sixth try: positional arguments
|
375 |
{} # Will use positional below
|
376 |
]
|
377 |
|
378 |
+
# Try each parameter combination with timeout
|
379 |
for i, params in enumerate(param_combinations):
|
380 |
try:
|
381 |
method = getattr(self.model, method_name)
|
382 |
print(f"Attempt {i+1}: Calling model.{method_name} with {list(params.keys())} parameters")
|
383 |
|
384 |
+
# Set a timeout for inference
|
385 |
+
with torch.inference_mode():
|
386 |
+
# For the positional arguments case
|
387 |
+
if not params:
|
388 |
+
print(f"Using positional args with text='{text}'")
|
389 |
+
result = method(text, ref_audio_path, ref_text, **kwargs)
|
390 |
+
else:
|
391 |
+
print(f"Using keyword args with text='{params.get('text')}'")
|
392 |
+
result = method(**params, **kwargs)
|
393 |
|
394 |
print(f"✓ Call succeeded with parameters: {list(params.keys())}")
|
395 |
break # Exit loop if successful
|
|
|
423 |
# Create model wrapper
|
424 |
model_wrapper = ModelWrapper(model) if model is not None else None
|
425 |
|
426 |
+
# Streaming TTS class with improved audio quality and error handling
|
427 |
class StreamingTTS:
|
428 |
def __init__(self):
|
429 |
self.is_generating = False
|
|
|
433 |
self.output_file = None
|
434 |
self.all_chunks = []
|
435 |
self.sample_rate = 24000 # Default sample rate
|
436 |
+
self.current_text = "" # Track current text being processed
|
437 |
|
438 |
# Create temp directory
|
439 |
+
try:
|
440 |
+
self.temp_dir = tempfile.mkdtemp()
|
441 |
+
print(f"Created temp directory: {self.temp_dir}")
|
442 |
+
except Exception as e:
|
443 |
+
print(f"Error creating temp directory: {e}")
|
444 |
+
self.temp_dir = "." # Use current directory as fallback
|
445 |
|
446 |
def prepare_ref_audio(self, ref_audio, ref_sr):
|
447 |
"""Prepare reference audio with enhanced quality"""
|
|
|
484 |
print(f"Error cleaning up: {e}")
|
485 |
|
486 |
def generate(self, text, ref_audio, ref_sr, ref_text):
|
487 |
+
"""Start generation in a new thread with validation"""
|
488 |
if self.is_generating:
|
489 |
print("Already generating speech, please wait")
|
490 |
return
|
491 |
+
|
492 |
+
# Store the text for verification
|
493 |
+
self.current_text = text
|
494 |
+
print(f"Setting current text to: '{self.current_text}'")
|
495 |
|
496 |
# Check model is loaded
|
497 |
+
if model_wrapper is None or model is None:
|
498 |
print("⚠️ Model is not loaded. Cannot generate speech.")
|
499 |
return
|
500 |
|
|
|
512 |
def _process_streaming(self, text, ref_audio, ref_sr, ref_text):
|
513 |
"""Process text in chunks with high-quality audio generation"""
|
514 |
try:
|
515 |
+
# Double check text matches what we expect
|
516 |
+
if text != self.current_text:
|
517 |
+
print(f"⚠️ Text mismatch detected! Expected: '{self.current_text}', Got: '{text}'")
|
518 |
+
# Use the stored text to be safe
|
519 |
+
text = self.current_text
|
520 |
+
|
521 |
# Prepare reference audio
|
522 |
self.prepare_ref_audio(ref_audio, ref_sr)
|
523 |
|
524 |
+
# Print the text we're actually going to process
|
525 |
+
print(f"Processing text: '{text}'")
|
526 |
+
|
527 |
# Split text into smaller chunks for faster processing
|
528 |
chunks = split_into_chunks(text)
|
529 |
print(f"Processing {len(chunks)} chunks")
|
|
|
538 |
break
|
539 |
|
540 |
chunk_start = time.time()
|
541 |
+
print(f"Processing chunk {i+1}/{len(chunks)}: '{chunk}'")
|
542 |
|
543 |
# Generate speech for this chunk
|
544 |
try:
|
545 |
+
# Set timeout for inference
|
546 |
+
chunk_timeout = 30 # 30 seconds timeout per chunk
|
547 |
+
|
548 |
with torch.inference_mode():
|
549 |
+
# Explicitly pass the chunk text
|
550 |
chunk_audio = model_wrapper.generate(
|
551 |
+
text=chunk, # Make sure we're using the current chunk
|
552 |
+
ref_audio_path=self.ref_audio_path,
|
553 |
+
ref_text=ref_text
|
554 |
)
|
555 |
|
556 |
if chunk_audio is None or (hasattr(chunk_audio, 'size') and chunk_audio.size == 0):
|
|
|
590 |
print(f"Total generation time: {total_time:.2f}s")
|
591 |
|
592 |
except Exception as e:
|
593 |
+
print(f"Error in streaming TTS: {str(e)[:200]}")
|
594 |
+
# Try to write whatever we have so far
|
595 |
+
if len(self.all_chunks) > 0:
|
596 |
+
try:
|
597 |
+
combined = np.concatenate(self.all_chunks)
|
598 |
+
sf.write(self.output_file, combined, 24000, format='WAV', subtype='FLOAT')
|
599 |
+
print("Saved partial output")
|
600 |
+
except Exception as e2:
|
601 |
+
print(f"Failed to save partial output: {e2}")
|
602 |
finally:
|
603 |
self.is_generating = False
|
604 |
print("Generation complete")
|
|
|
620 |
EXAMPLES = [{
|
621 |
"audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
|
622 |
"ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
|
623 |
+
"synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
|
624 |
}]
|
625 |
|
626 |
print("\nPreloading reference audio...")
|
|
|
639 |
streaming_tts.stop()
|
640 |
return "Generation stopped"
|
641 |
|
642 |
+
# Gradio interface with offline mode
|
643 |
with gr.Blocks() as iface:
|
644 |
gr.Markdown("## 🚀 IndicF5 Malayalam TTS")
|
645 |
|
|
|
683 |
if ref_audio is None:
|
684 |
return None, "⚠️ Reference audio not loaded. Cannot generate speech.", "Error: Reference audio not loaded"
|
685 |
|
686 |
+
# Print the text being processed
|
687 |
+
print(f"🔍 User input text: '{text}'")
|
688 |
+
|
689 |
# Capture stdout for debug purposes
|
690 |
import io
|
691 |
from contextlib import redirect_stdout
|
692 |
f = io.StringIO()
|
693 |
with redirect_stdout(f):
|
694 |
+
try:
|
695 |
+
# Make sure the text is explicitly passed as the first parameter
|
696 |
+
streaming_tts.generate(
|
697 |
+
text=text, # Explicitly name parameter
|
698 |
+
ref_audio=ref_audio,
|
699 |
+
ref_sr=ref_sr,
|
700 |
+
ref_text=EXAMPLES[0]["ref_text"] if EXAMPLES else ""
|
701 |
+
)
|
702 |
+
except Exception as e:
|
703 |
+
print(f"Error starting generation: {e}")
|
704 |
|
705 |
debug_log = f.getvalue()
|
706 |
|
707 |
# Add a delay to ensure file is created
|
708 |
+
time.sleep(2.0)
|
709 |
|
710 |
audio_path = streaming_tts.get_current_audio()
|
711 |
if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
|
712 |
+
return audio_path, f"Generated speech for: {text[:30]}...", debug_log
|
713 |
else:
|
714 |
return None, "Starting generation... please wait", debug_log
|
715 |
|
|
|
723 |
import atexit
|
724 |
atexit.register(exit_handler)
|
725 |
|
726 |
+
# Start the interface with flexible port selection
|
727 |
print("Starting Gradio interface...")
|
728 |
+
# Try a range of ports if 7860 is busy
|
729 |
iface.launch()
|