Update app.py
Browse files
app.py
CHANGED
@@ -1751,7 +1751,7 @@ gc.collect()
|
|
1751 |
model_bark = None
|
1752 |
processor_bark = None
|
1753 |
whisper_model = None
|
1754 |
-
bark_voice_preset = "v2/
|
1755 |
|
1756 |
# Thread pool for async operations
|
1757 |
executor = ThreadPoolExecutor(max_workers=2)
|
@@ -1788,31 +1788,31 @@ def load_models_lazy():
|
|
1788 |
print(f"β
Whisper model loaded on {device}")
|
1789 |
|
1790 |
def bark_tts_async(text):
|
1791 |
-
"""
|
1792 |
def _generate():
|
1793 |
-
load_models_lazy()
|
1794 |
-
print(f"π Synthesizing TTS for: {text}")
|
1795 |
-
|
1796 |
-
# Ensure we're using the correct device
|
1797 |
device = next(model_bark.parameters()).device
|
1798 |
-
print(f"π Bark model
|
1799 |
-
|
|
|
|
|
1800 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
1801 |
-
|
1802 |
-
|
1803 |
-
|
1804 |
with torch.no_grad():
|
1805 |
-
speech_values = model_bark.generate(
|
1806 |
-
|
1807 |
-
|
1808 |
speech = speech_values.cpu().numpy().squeeze()
|
1809 |
speech = (speech * 32767).astype(np.int16)
|
1810 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
1811 |
wavfile.write(temp_wav.name, 22050, speech)
|
1812 |
return temp_wav.name
|
1813 |
-
|
1814 |
return executor.submit(_generate)
|
1815 |
|
|
|
1816 |
def whisper_stt(audio_path):
|
1817 |
"""Lazy loading whisper STT"""
|
1818 |
if not audio_path or not os.path.exists(audio_path):
|
|
|
1751 |
model_bark = None
|
1752 |
processor_bark = None
|
1753 |
whisper_model = None
|
1754 |
+
bark_voice_preset = "v2/en_speaker_6"
|
1755 |
|
1756 |
# Thread pool for async operations
|
1757 |
executor = ThreadPoolExecutor(max_workers=2)
|
|
|
1788 |
print(f"β
Whisper model loaded on {device}")
|
1789 |
|
1790 |
def bark_tts_async(text):
    """Synthesize ``text`` to a WAV file with Bark on the shared thread pool.

    The work is submitted to the module-level ``executor`` so the caller is
    not blocked while the model generates audio.

    Args:
        text: The text to speak.

    Returns:
        The future returned by ``executor.submit``. Its result is the path of
        a 16-bit PCM ``.wav`` file. The file is created with ``delete=False``,
        so removing it is the caller's responsibility.
    """
    def _generate():
        # Models are loaded on first use; this call populates the module-level
        # ``model_bark`` / ``processor_bark`` globals used below.
        load_models_lazy()

        # Run on whatever device the model's weights currently live on.
        device = next(model_bark.parameters()).device
        print(f"π Bark model on: {device}")
        print(f"ποΈ Speaking: {text}")

        # Use the processor's full output (not only ``input_ids``) so every
        # tensor produced for the voice preset is forwarded to ``generate``.
        inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
        # Move each entry to the model's device; anything that is not
        # tensor-like (has no ``.to``) is passed through untouched.
        inputs = {
            key: (value.to(device) if hasattr(value, "to") else value)
            for key, value in inputs.items()
        }

        with torch.no_grad():
            speech_values = model_bark.generate(**inputs)

        # Convert to 16-bit PCM. Clip first: a sample outside [-1, 1] would
        # otherwise wrap around when cast to int16 and produce loud clicks.
        speech = speech_values.cpu().numpy().squeeze()
        speech = (np.clip(speech, -1.0, 1.0) * 32767).astype(np.int16)

        # Write at the rate the model actually generates at instead of a
        # hard-coded 22050 Hz (Hugging Face Bark exposes it on
        # ``generation_config.sample_rate``). Falls back to the previous
        # constant if the model does not advertise a rate.
        # NOTE(review): assumes ``model_bark`` is a transformers Bark model -- confirm.
        generation_config = getattr(model_bark, "generation_config", None)
        sample_rate = int(getattr(generation_config, "sample_rate", None) or 22050)

        # Reserve a unique path, then close the handle before writing by name
        # so no file descriptor is leaked (and the write works on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
            wav_path = temp_wav.name
        wavfile.write(wav_path, sample_rate, speech)
        return wav_path

    return executor.submit(_generate)
|
1814 |
|
1815 |
+
|
1816 |
def whisper_stt(audio_path):
|
1817 |
"""Lazy loading whisper STT"""
|
1818 |
if not audio_path or not os.path.exists(audio_path):
|