Commit
·
ae625e2
1
Parent(s):
1a5a90b
added bark tts for faster loading
Browse files- app.py +14 -33
- requirements.txt +2 -1
app.py
CHANGED
@@ -1695,7 +1695,7 @@ import numpy as np
|
|
1695 |
import scipy.io.wavfile as wavfile
|
1696 |
import os
|
1697 |
import json
|
1698 |
-
from
|
1699 |
import torch, gc
|
1700 |
from faster_whisper import WhisperModel
|
1701 |
import asyncio
|
@@ -1707,8 +1707,7 @@ torch.cuda.empty_cache()
|
|
1707 |
gc.collect()
|
1708 |
|
1709 |
# Global variables for lazy loading
|
1710 |
-
|
1711 |
-
processor_bark = None
|
1712 |
faster_whisper_model = None
|
1713 |
bark_voice_preset = "v2/en_speaker_6"
|
1714 |
|
@@ -1726,20 +1725,16 @@ else:
|
|
1726 |
|
1727 |
def load_models_lazy():
|
1728 |
"""Load models only when needed"""
|
1729 |
-
global
|
1730 |
|
1731 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
1732 |
print(f"π Using device: {device}")
|
1733 |
-
|
1734 |
-
if
|
1735 |
-
print("π Loading
|
1736 |
-
|
1737 |
-
|
1738 |
-
|
1739 |
-
if processor_bark is None:
|
1740 |
-
print("π Loading Bark processor...")
|
1741 |
-
processor_bark = AutoProcessor.from_pretrained("suno/bark")
|
1742 |
-
print("✅ Bark processor loaded")
|
1743 |
|
1744 |
if faster_whisper_model is None:
|
1745 |
print("π Loading Faster-Whisper model...")
|
@@ -1748,32 +1743,18 @@ def load_models_lazy():
|
|
1748 |
print(f"✅ Faster-Whisper model loaded on {device}")
|
1749 |
|
1750 |
|
1751 |
-
def
|
1752 |
-
"""Fully correct async TTS generation with Bark"""
|
1753 |
def _generate():
|
1754 |
load_models_lazy()
|
1755 |
-
device = next(model_bark.parameters()).device
|
1756 |
-
print(f"π Bark model on: {device}")
|
1757 |
print(f"ποΈ Speaking: {text}")
|
1758 |
-
|
1759 |
-
# π§ Prepare full input using processor (not just input_ids)
|
1760 |
-
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
1761 |
-
inputs = {k: v.to(device) for k, v in inputs.items()}
|
1762 |
-
|
1763 |
-
# ✅ Generate using unpacked args — this includes all required prompt tensors
|
1764 |
-
with torch.no_grad():
|
1765 |
-
speech_values = model_bark.generate(**inputs)
|
1766 |
-
|
1767 |
-
# ✅ Convert to audio
|
1768 |
-
speech = speech_values.cpu().numpy().squeeze()
|
1769 |
-
speech = (speech * 32767).astype(np.int16)
|
1770 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
1771 |
-
|
1772 |
return temp_wav.name
|
1773 |
|
1774 |
return executor.submit(_generate)
|
1775 |
|
1776 |
|
|
|
1777 |
def whisper_stt(audio_path):
|
1778 |
"""STT using Faster-Whisper"""
|
1779 |
if not audio_path or not os.path.exists(audio_path):
|
@@ -1918,7 +1899,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
1918 |
# Generate audio with Bark (wait for it)
|
1919 |
start = time.perf_counter()
|
1920 |
cleaned_text = first_q.strip().replace("\n", " ")
|
1921 |
-
audio_future =
|
1922 |
audio_path = audio_future.result()
|
1923 |
print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
|
1924 |
|
@@ -2014,7 +1995,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
2014 |
state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
|
2015 |
|
2016 |
# Generate TTS asynchronously for next question too
|
2017 |
-
audio_future =
|
2018 |
# For now, we'll wait for it (you can make this async too)
|
2019 |
audio_path = audio_future.result()
|
2020 |
|
|
|
1695 |
import scipy.io.wavfile as wavfile
|
1696 |
import os
|
1697 |
import json
|
1698 |
+
from TTS.api import TTS
|
1699 |
import torch, gc
|
1700 |
from faster_whisper import WhisperModel
|
1701 |
import asyncio
|
|
|
1707 |
gc.collect()
|
1708 |
|
1709 |
# Global variables for lazy loading
|
1710 |
+
tts_model = None
|
|
|
1711 |
faster_whisper_model = None
|
1712 |
bark_voice_preset = "v2/en_speaker_6"
|
1713 |
|
|
|
1725 |
|
1726 |
def load_models_lazy():
|
1727 |
"""Load models only when needed"""
|
1728 |
+
global tts_model, faster_whisper_model
|
1729 |
|
1730 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
1731 |
print(f"π Using device: {device}")
|
1732 |
+
|
1733 |
+
if tts_model is None:
|
1734 |
+
print("π Loading Coqui TTS model...")
|
1735 |
+
tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False,
|
1736 |
+
gpu=torch.cuda.is_available())
|
1737 |
+
print("✅ Coqui TTS model loaded")
|
|
|
|
|
|
|
|
|
1738 |
|
1739 |
if faster_whisper_model is None:
|
1740 |
print("π Loading Faster-Whisper model...")
|
|
|
1743 |
print(f"✅ Faster-Whisper model loaded on {device}")
|
1744 |
|
1745 |
|
1746 |
+
def tts_async(text):
|
|
|
1747 |
def _generate():
|
1748 |
load_models_lazy()
|
|
|
|
|
1749 |
print(f"ποΈ Speaking: {text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1750 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
1751 |
+
tts_model.tts_to_file(text=text, file_path=temp_wav.name)
|
1752 |
return temp_wav.name
|
1753 |
|
1754 |
return executor.submit(_generate)
|
1755 |
|
1756 |
|
1757 |
+
|
1758 |
def whisper_stt(audio_path):
|
1759 |
"""STT using Faster-Whisper"""
|
1760 |
if not audio_path or not os.path.exists(audio_path):
|
|
|
1899 |
# Generate audio with Bark (wait for it)
|
1900 |
start = time.perf_counter()
|
1901 |
cleaned_text = first_q.strip().replace("\n", " ")
|
1902 |
+
audio_future = tts_async(cleaned_text)
|
1903 |
audio_path = audio_future.result()
|
1904 |
print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
|
1905 |
|
|
|
1995 |
state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
|
1996 |
|
1997 |
# Generate TTS asynchronously for next question too
|
1998 |
+
audio_future = tts_async(next_q)
|
1999 |
# For now, we'll wait for it (you can make this async too)
|
2000 |
audio_path = audio_future.result()
|
2001 |
|
requirements.txt
CHANGED
@@ -38,4 +38,5 @@ accelerate==0.29.3
|
|
38 |
huggingface_hub==0.20.3
|
39 |
textract==1.6.3
|
40 |
bitsandbytes
|
41 |
-
faster-whisper==0.10.0
|
|
|
|
38 |
huggingface_hub==0.20.3
|
39 |
textract==1.6.3
|
40 |
bitsandbytes
|
41 |
+
faster-whisper==0.10.0
|
42 |
+
TTS==0.22.0
|