Create app.py
app.py
ADDED
import os
import io
import time
import torch
import librosa
import requests
import tempfile
import threading
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import AutoModel, logging as trf_logging
from huggingface_hub import login, hf_hub_download, scan_cache_dir

# Enable verbose logging for transformers
trf_logging.set_verbosity_info()

# Login (optional)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    print("🔐 Logging into Hugging Face with token...")
    login(token=hf_token)
else:
    print("⚠️ HF_TOKEN not found. Proceeding without login...")

# Load the model on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Using device: {device}")

# Initialize model variable
model = None

# Define the repository ID
repo_id = "ai4bharat/IndicF5"

# Model loading with error handling
try:
    print(f"Loading {repo_id} model...")
    # Try direct loading first
    model = AutoModel.from_pretrained(
        repo_id,
        trust_remote_code=True,
        revision="main"
    ).to(device)
    print(f"Model loaded successfully! Type: {type(model)}")

    # Check model attributes
    model_methods = [method for method in dir(model) if not method.startswith('_') and callable(getattr(model, method))]
    print(f"Available model methods: {model_methods[:10]}...")

except Exception as e:
    print(f"⚠️ Error loading model directly: {e}")

    try:
        # Try loading with local_files_only if the model is already cached
        model = AutoModel.from_pretrained(
            repo_id,
            trust_remote_code=True,
            local_files_only=True
        ).to(device)
        print("Model loaded from cache!")
    except Exception as e2:
        print(f"❌ All attempts to load model failed: {e2}")
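
# If both load attempts fail, the huggingface_hub helpers imported above can
# help diagnose the local cache. A minimal sketch (illustrative, not executed):
#
#     cache_info = scan_cache_dir()
#     print([repo.repo_id for repo in cache_info.repos])
#
# scan_cache_dir() lists locally cached repos; hf_hub_download can fetch
# individual files if a full snapshot is missing.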
# Advanced audio processing functions
def remove_noise(audio_data, threshold=0.01):
    """Apply a simple noise gate to remove low-level noise."""
    if audio_data is None:
        return np.zeros(1000)

    # Convert to numpy if needed
    if isinstance(audio_data, torch.Tensor):
        audio_data = audio_data.detach().cpu().numpy()
    if isinstance(audio_data, list):
        audio_data = np.array(audio_data)

    # Apply noise gate: zero out samples below the threshold
    noise_mask = np.abs(audio_data) < threshold
    clean_audio = audio_data.copy()
    clean_audio[noise_mask] = 0

    return clean_audio

def apply_smoothing(audio_data, window_size=5):
    """Apply gentle smoothing to reduce artifacts."""
    if audio_data is None or len(audio_data) < window_size * 2:
        return audio_data

    # Simple moving-average filter
    kernel = np.ones(window_size) / window_size
    smoothed = np.convolve(audio_data, kernel, mode='same')

    # Keep the original samples at the edges
    smoothed[:window_size] = audio_data[:window_size]
    smoothed[-window_size:] = audio_data[-window_size:]

    return smoothed

def enhance_audio(audio_data):
    """Process audio to improve quality and reduce noise."""
    if audio_data is None:
        return np.zeros(1000)

    # Ensure numpy array
    if isinstance(audio_data, torch.Tensor):
        audio_data = audio_data.detach().cpu().numpy()
    if isinstance(audio_data, list):
        audio_data = np.array(audio_data)

    # Ensure correct shape and dtype
    if len(audio_data.shape) > 1:
        audio_data = audio_data.flatten()
    if audio_data.dtype != np.float32:
        audio_data = audio_data.astype(np.float32)

    # Skip processing if the audio is empty or too short
    if audio_data.size < 100:
        return audio_data

    # Check whether the audio has reasonable amplitude
    rms = np.sqrt(np.mean(audio_data**2))
    print(f"Initial RMS: {rms}")

    # Apply gain if needed
    if rms < 0.05:  # Very quiet
        target_rms = 0.2
        gain = target_rms / max(rms, 0.0001)
        print(f"Applying gain factor: {gain}")
        audio_data = audio_data * gain

    # Remove DC offset
    audio_data = audio_data - np.mean(audio_data)

    # Apply noise gate to remove low-level noise
    audio_data = remove_noise(audio_data, threshold=0.01)

    # Apply gentle smoothing to reduce artifacts
    audio_data = apply_smoothing(audio_data, window_size=3)

    # Apply soft limiting to prevent clipping
    max_amp = np.max(np.abs(audio_data))
    if max_amp > 0.95:
        audio_data = 0.95 * audio_data / max_amp

    # Apply subtle tanh compression for better audibility
    audio_data = np.tanh(audio_data * 1.1) * 0.9

    return audio_data
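
# Illustrative example of the enhancement chain above (shown as a comment so it
# does not run at import time). A quiet 440 Hz tone has RMS ≈ 0.007, below the
# 0.05 threshold, so it is boosted toward the 0.2 RMS target before gating,
# smoothing, and the tanh stage:
#
#     t = np.linspace(0, 1, 24000)
#     quiet = (0.01 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
#     louder = enhance_audio(quiet)   # peak rises from 0.01 to ≈ 0.27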
# Load audio from a URL with improved error handling
def load_audio_from_url(url):
    print(f"Downloading reference audio from {url}")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            try:
                # Save the content to a temp file
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
                temp_file.write(response.content)
                temp_file.close()
                print(f"Saved reference audio to temp file: {temp_file.name}")

                # Try different methods to read the audio file
                audio_data = None
                sample_rate = None

                # Try SoundFile first
                try:
                    audio_data, sample_rate = sf.read(temp_file.name)
                    print(f"Audio loaded with SoundFile: {sample_rate}Hz, {len(audio_data)} samples")
                except Exception as sf_error:
                    print(f"SoundFile failed: {sf_error}")

                    # Fall back to librosa
                    try:
                        audio_data, sample_rate = librosa.load(temp_file.name, sr=None)
                        print(f"Audio loaded with librosa: {sample_rate}Hz, shape={audio_data.shape}")
                    except Exception as lr_error:
                        print(f"Librosa also failed: {lr_error}")

                # Clean up the temp file
                os.unlink(temp_file.name)

                if audio_data is not None:
                    # Apply audio enhancement to the reference
                    audio_data = enhance_audio(audio_data)
                    return sample_rate, audio_data

            except Exception as e:
                print(f"Failed to process audio data: {e}")
        else:
            print(f"Failed to download audio: status code {response.status_code}")
    except Exception as e:
        print(f"Error downloading audio: {e}")

    # Return default values as a fallback
    print("⚠️ Returning default silence as reference audio")
    return 24000, np.zeros(int(24000))  # 1 second of silence at 24kHz
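
# Contract note: load_audio_from_url always returns a (sample_rate, ndarray)
# pair; on any failure it falls back to one second of silence at 24 kHz.
# Example with a placeholder URL:
#
#     sr, wav = load_audio_from_url("https://example.com/ref.wav")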
# Split text into chunks for streaming
def split_into_chunks(text, max_length=30):
    """Split text into smaller chunks based on punctuation and length."""
    # First split by sentences
    sentence_markers = ['.', '?', '!', ';', ':', '।', '॥']
    chunks = []
    current = ""

    # Initial coarse splitting by sentence markers
    for char in text:
        current += char
        if char in sentence_markers and current.strip():
            chunks.append(current.strip())
            current = ""

    if current.strip():
        chunks.append(current.strip())

    # Further break down long sentences
    final_chunks = []
    for chunk in chunks:
        if len(chunk) <= max_length:
            final_chunks.append(chunk)
        else:
            # Try splitting long sentences at commas
            comma_splits = chunk.split(',')
            current_part = ""

            for part in comma_splits:
                if len(current_part) + len(part) <= max_length:
                    if current_part:
                        current_part += ","
                    current_part += part
                else:
                    if current_part:
                        final_chunks.append(current_part.strip())
                    current_part = part

            if current_part:
                final_chunks.append(current_part.strip())

    print(f"Split text into {len(final_chunks)} chunks")
    return final_chunks
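
# Worked example (as a comment): sentences are split first at sentence markers,
# then anything longer than max_length is broken at commas (the splitting comma
# itself is dropped):
#
#     split_into_chunks("Hello world. This is a test, with a comma clause.")
#     # -> ['Hello world.', 'This is a test', 'with a comma clause.']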
# Model wrapper that probes for a usable generation method
class ModelWrapper:
    def __init__(self, model):
        self.model = model
        print(f"Model wrapper initialized with model type: {type(model)}")

        # Discover the appropriate generation method
        self.generation_method = self._find_generation_method()

    def _find_generation_method(self):
        """Find the appropriate method to generate speech."""
        if self.model is None:
            return None

        # Look for plausible generation methods
        candidates = [
            "generate_speech", "tts", "generate_audio", "synthesize",
            "generate", "forward", "__call__"
        ]

        # Check for methods containing these keywords
        for name in dir(self.model):
            if any(candidate in name.lower() for candidate in candidates):
                print(f"Found potential generation method: {name}")
                return name

        # If nothing specific is found, default to __call__
        print("No specific generation method found, will use __call__")
        return "__call__"

    def generate(self, text, ref_audio_path, ref_text, **kwargs):
        """Generate speech with improved error handling and preprocessing."""
        print("\n==== MODEL INFERENCE ====")
        print(f"Text input: '{text}'")
        print(f"Reference audio path: {ref_audio_path}")

        # Check that the reference file exists
        if not os.path.exists(ref_audio_path):
            print("⚠️ Reference audio file not found")
            return None

        # Try different calling approaches
        result = None
        method_name = self.generation_method if self.generation_method else "__call__"

        # Set up different parameter combinations to try
        param_combinations = [
            # First try: standard keyword parameters
            {"text": text, "ref_audio_path": ref_audio_path, "ref_text": ref_text},
            # Second try: alternative parameter names
            {"text": text, "reference_audio": ref_audio_path, "speaker_text": ref_text},
            # Third try: just text and audio
            {"text": text, "reference_audio": ref_audio_path},
            # Fourth try: just text
            {"text": text},
            # Fifth try: positional arguments
            {}  # Will use positional arguments below
        ]

        # Try each parameter combination
        for i, params in enumerate(param_combinations):
            try:
                method = getattr(self.model, method_name)
                print(f"Attempt {i+1}: Calling model.{method_name} with {list(params.keys())} parameters")

                # For the positional-arguments case
                if not params:
                    result = method(text, ref_audio_path, ref_text, **kwargs)
                else:
                    result = method(**params, **kwargs)

                print(f"✓ Call succeeded with parameters: {list(params.keys())}")
                break  # Exit the loop on success

            except Exception as e:
                print(f"✗ Attempt {i+1} failed: {str(e)[:100]}...")
                continue

        # Process the result
        if result is not None:
            # Handle tuple results (might be audio, sample_rate)
            if isinstance(result, tuple):
                result = result[0]  # Extract the first element, assuming it's audio

            # Convert a torch tensor to numpy if needed
            if isinstance(result, torch.Tensor):
                result = result.detach().cpu().numpy()

            # Ensure the array is 1D
            if hasattr(result, 'shape') and len(result.shape) > 1:
                result = result.flatten()

            # Apply advanced audio processing to improve quality
            result = enhance_audio(result)

            return result
        else:
            print("❌ All inference attempts failed")
            return np.zeros(int(24000))  # Return 1 second of silence as a fallback

# Create the model wrapper
model_wrapper = ModelWrapper(model) if model is not None else None
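
# Sketch of calling the wrapper directly (illustrative; the file path is a
# placeholder). The parameter probing above exists because the remote-code API
# surface of IndicF5 is not guaranteed, so the accepted signature is discovered
# at call time:
#
#     wav = model_wrapper.generate(
#         "ഹലോ",                 # text to synthesize
#         "/tmp/ref_audio.wav",   # hypothetical reference clip
#         "reference transcript",
#     )
#     # wav is a 1-D float32 numpy array, 24 kHz assumed downstream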
# Streaming TTS class with improved audio quality
class StreamingTTS:
    def __init__(self):
        self.is_generating = False
        self.should_stop = False
        self.temp_dir = None
        self.ref_audio_path = None
        self.output_file = None
        self.all_chunks = []
        self.sample_rate = 24000  # Default sample rate

        # Create a temp directory
        self.temp_dir = tempfile.mkdtemp()
        print(f"Created temp directory: {self.temp_dir}")

    def prepare_ref_audio(self, ref_audio, ref_sr):
        """Prepare reference audio with enhanced quality."""
        try:
            if self.ref_audio_path is None:
                self.ref_audio_path = os.path.join(self.temp_dir, "ref_audio.wav")

            # Process the reference audio to ensure clean quality
            ref_audio = enhance_audio(ref_audio)

            # Save the reference audio
            sf.write(self.ref_audio_path, ref_audio, ref_sr, format='WAV', subtype='FLOAT')
            print(f"Saved reference audio to: {self.ref_audio_path}")

            # Verify that the file was created
            if os.path.exists(self.ref_audio_path):
                print(f"Reference audio saved successfully: {os.path.getsize(self.ref_audio_path)} bytes")
            else:
                print("⚠️ Failed to create reference audio file!")

            # Create the output file
            if self.output_file is None:
                self.output_file = os.path.join(self.temp_dir, "output.wav")
                print(f"Output will be saved to: {self.output_file}")
        except Exception as e:
            print(f"Error preparing reference audio: {e}")

    def cleanup(self):
        """Clean up temporary files."""
        if self.temp_dir:
            try:
                # Guard against paths that were never created
                if self.ref_audio_path and os.path.exists(self.ref_audio_path):
                    os.remove(self.ref_audio_path)
                if self.output_file and os.path.exists(self.output_file):
                    os.remove(self.output_file)
                os.rmdir(self.temp_dir)
                self.temp_dir = None
                print("Cleaned up temporary files")
            except Exception as e:
                print(f"Error cleaning up: {e}")

    def generate(self, text, ref_audio, ref_sr, ref_text):
        """Start generation in a new thread."""
        if self.is_generating:
            print("Already generating speech, please wait")
            return

        # Check that the model is loaded
        if model_wrapper is None:
            print("⚠️ Model is not loaded. Cannot generate speech.")
            return

        self.is_generating = True
        self.should_stop = False
        self.all_chunks = []

        # Start in a new thread
        threading.Thread(
            target=self._process_streaming,
            args=(text, ref_audio, ref_sr, ref_text),
            daemon=True
        ).start()

    def _process_streaming(self, text, ref_audio, ref_sr, ref_text):
        """Process text in chunks with high-quality audio generation."""
        try:
            # Prepare the reference audio
            self.prepare_ref_audio(ref_audio, ref_sr)

            # Split text into smaller chunks for faster processing
            chunks = split_into_chunks(text)
            print(f"Processing {len(chunks)} chunks")

            combined_audio = None
            total_start_time = time.time()

            # Process each chunk
            for i, chunk in enumerate(chunks):
                if self.should_stop:
                    print("Stopping generation as requested")
                    break

                chunk_start = time.time()
                print(f"Processing chunk {i+1}/{len(chunks)}: {chunk}")

                # Generate speech for this chunk
                try:
                    with torch.inference_mode():
                        chunk_audio = model_wrapper.generate(
                            chunk,
                            self.ref_audio_path,
                            ref_text
                        )

                    if chunk_audio is None or (hasattr(chunk_audio, 'size') and chunk_audio.size == 0):
                        print("⚠️ Empty audio returned for this chunk")
                        chunk_audio = np.zeros(int(24000 * 0.5))  # 0.5s silence

                    # Process the audio to improve quality
                    chunk_audio = enhance_audio(chunk_audio)

                    chunk_time = time.time() - chunk_start
                    print(f"✓ Chunk {i+1} processed in {chunk_time:.2f}s")

                    # Add a small silence between chunks
                    silence = np.zeros(int(24000 * 0.1))  # 0.1s silence
                    chunk_audio = np.concatenate([chunk_audio, silence])

                    # Add to our collection
                    self.all_chunks.append(chunk_audio)

                    # Combine all chunks so far
                    if combined_audio is None:
                        combined_audio = chunk_audio
                    else:
                        combined_audio = np.concatenate([combined_audio, chunk_audio])

                    # Process the combined audio for consistent quality
                    processed_audio = enhance_audio(combined_audio)

                    # Write intermediate output
                    sf.write(self.output_file, processed_audio, 24000, format='WAV', subtype='FLOAT')

                except Exception as e:
                    print(f"Error processing chunk {i+1}: {str(e)[:100]}")
                    continue

            total_time = time.time() - total_start_time
            print(f"Total generation time: {total_time:.2f}s")

        except Exception as e:
            print(f"Error in streaming TTS: {str(e)[:100]}")
        finally:
            self.is_generating = False
            print("Generation complete")

    def get_current_audio(self):
        """Get the current audio file path for Gradio."""
        if self.output_file and os.path.exists(self.output_file):
            file_size = os.path.getsize(self.output_file)
            if file_size > 0:
                return self.output_file
        return None

    def stop(self):
        """Stop generation."""
        self.should_stop = True
        print("Stop request received")
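
# Minimal usage sketch (illustrative): generate() returns immediately and the
# output file grows in the background, so callers poll get_current_audio(), as
# the Gradio handler below does.
#
#     tts = StreamingTTS()
#     tts.generate("ഹലോ", ref_audio, 24000, "reference transcript")
#     time.sleep(2.0)                 # give the first chunk time to land
#     path = tts.get_current_audio()  # None until output.wav has data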
# Load the reference example (Malayalam)
EXAMPLES = [{
    "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
    # Reference transcript, roughly: "Hello, isn't this Aparna? It's Jagadeep calling. Are you free to talk now?"
    "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
    # Text to synthesize: "I can speak Malayalam."
    "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
}]

print("\nPreloading reference audio...")
ref_sr, ref_audio = load_audio_from_url(EXAMPLES[0]["audio_url"])

if ref_audio is None:
    print("⚠️ Failed to load reference audio. Using silence instead.")
    ref_audio = np.zeros(int(24000))
    ref_sr = 24000

# Initialize the streaming TTS
streaming_tts = StreamingTTS()
# Stop-button functionality
def stop_generation():
    streaming_tts.stop()
    return "Generation stopped"

# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("## 🚀 IndicF5 Malayalam TTS")

    with gr.Row():
        gr.Markdown("### System Status:")
        system_status = gr.Markdown(
            f"- Device: {device}\n"
            f"- Model loaded: {'Yes' if model is not None else 'No'}\n"
            f"- Reference audio: {'Loaded' if ref_audio is not None else 'Not loaded'}"
        )

    with gr.Row():
        text_input = gr.Textbox(
            label="Malayalam Text",
            placeholder="Enter text here...",
            lines=3,
            value=EXAMPLES[0]["synth_text"] if EXAMPLES else "ഹലോ, എന്തൊക്കെ ഉണ്ട് വിശേഷം?"
        )

    with gr.Row():
        generate_btn = gr.Button("🎤 Generate Speech", variant="primary")
        stop_btn = gr.Button("🛑 Stop Generation", variant="secondary")

    # Status indicator
    status_text = gr.Textbox(label="Status", value="Ready", interactive=False)

    # Audio output
    output_audio = gr.Audio(
        label="Generated Speech",
        type="filepath",
        autoplay=True
    )

    # Debug information (hidden by default)
    with gr.Accordion("Advanced", open=False):
        debug_output = gr.Textbox(label="Debug Log", value="", lines=5)

    def start_generation(text):
        if not text.strip():
            return None, "Please enter some text", "Error: Empty text input"

        if model is None:
            return None, "⚠️ Model not loaded. Cannot generate speech.", "Error: Model not loaded"

        if ref_audio is None:
            return None, "⚠️ Reference audio not loaded. Cannot generate speech.", "Error: Reference audio not loaded"

        # Capture stdout for the debug log. Only the startup messages are
        # captured, since generation continues in a background thread after
        # this block exits.
        from contextlib import redirect_stdout
        f = io.StringIO()
        with redirect_stdout(f):
            streaming_tts.generate(text, ref_audio, ref_sr, EXAMPLES[0]["ref_text"] if EXAMPLES else "")

        debug_log = f.getvalue()

        # Add a delay to give the first chunk time to be written
        time.sleep(1.5)

        audio_path = streaming_tts.get_current_audio()
        if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
            return audio_path, "Generation started - audio playing", debug_log
        else:
            return None, "Starting generation... please wait", debug_log

    generate_btn.click(start_generation, inputs=text_input, outputs=[output_audio, status_text, debug_output])
    stop_btn.click(stop_generation, inputs=None, outputs=status_text)

# Clean up on exit
def exit_handler():
    streaming_tts.cleanup()

import atexit
atexit.register(exit_handler)

print("Starting Gradio interface...")
iface.launch()