Spaces:

husseinelsaadi
/

ai-interviewer-demo

Paused

App Files Community

husseinelsaadi committed on Jul 17

Commit

ae625e2

1 Parent(s): 1a5a90b

replaced Bark TTS with Coqui TTS for faster loading

Browse files

Files changed (2) hide show

app.py +14 -33
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1695,7 +1695,7 @@ import numpy as np
 import scipy.io.wavfile as wavfile
 import os
 import json
-from transformers import BarkModel, AutoProcessor
 import torch, gc
 from faster_whisper import WhisperModel
 import asyncio
@@ -1707,8 +1707,7 @@ torch.cuda.empty_cache()
 gc.collect()
 # Global variables for lazy loading
-model_bark = None
-processor_bark = None
 faster_whisper_model = None
 bark_voice_preset = "v2/en_speaker_6"
@@ -1726,20 +1725,16 @@ else:
 def load_models_lazy():
     """Load models only when needed"""
-    global model_bark, processor_bark, faster_whisper_model
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"🔁 Using device: {device}")
-    if model_bark is None:
-        print("🔁 Loading Bark model...")
-        model_bark = BarkModel.from_pretrained("suno/bark").to(device)
-        print(f"✅ Bark model loaded on {device}")
-    if processor_bark is None:
-        print("🔁 Loading Bark processor...")
-        processor_bark = AutoProcessor.from_pretrained("suno/bark")
-        print("✅ Bark processor loaded")
     if faster_whisper_model is None:
         print("🔁 Loading Faster-Whisper model...")
@@ -1748,32 +1743,18 @@ def load_models_lazy():
         print(f"✅ Faster-Whisper model loaded on {device}")
-def bark_tts_async(text):
-    """Fully correct async TTS generation with Bark"""
     def _generate():
         load_models_lazy()
-        device = next(model_bark.parameters()).device
-        print(f"🔁 Bark model on: {device}")
         print(f"🎙️ Speaking: {text}")
-        # 🧠 Prepare full input using processor (not just input_ids)
-        inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # ✅ Generate using unpacked args — this includes all required prompt tensors
-        with torch.no_grad():
-            speech_values = model_bark.generate(**inputs)
-        # ✅ Convert to audio
-        speech = speech_values.cpu().numpy().squeeze()
-        speech = (speech * 32767).astype(np.int16)
         temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        wavfile.write(temp_wav.name, 22050, speech)
         return temp_wav.name
     return executor.submit(_generate)
 def whisper_stt(audio_path):
     """STT using Faster-Whisper"""
     if not audio_path or not os.path.exists(audio_path):
@@ -1918,7 +1899,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         # Generate audio with Bark (wait for it)
         start = time.perf_counter()
         cleaned_text = first_q.strip().replace("\n", " ")
-        audio_future = bark_tts_async(cleaned_text)
         audio_path = audio_future.result()
         print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
@@ -2014,7 +1995,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
             # Generate TTS asynchronously for next question too
-            audio_future = bark_tts_async(next_q)
             # For now, we'll wait for it (you can make this async too)
             audio_path = audio_future.result()

 import scipy.io.wavfile as wavfile
 import os
 import json
+from TTS.api import TTS
 import torch, gc
 from faster_whisper import WhisperModel
 import asyncio
 gc.collect()
 # Global variables for lazy loading
+tts_model = None
 faster_whisper_model = None
 bark_voice_preset = "v2/en_speaker_6"
 def load_models_lazy():
     """Load models only when needed"""
+    global tts_model, faster_whisper_model
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"🔁 Using device: {device}")
+    if tts_model is None:
+        print("🔁 Loading Coqui TTS model...")
+        tts_model = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False,
+                        gpu=torch.cuda.is_available())
+        print("✅ Coqui TTS model loaded")
     if faster_whisper_model is None:
         print("🔁 Loading Faster-Whisper model...")
         print(f"✅ Faster-Whisper model loaded on {device}")
+def tts_async(text):
     def _generate():
         load_models_lazy()
         print(f"🎙️ Speaking: {text}")
         temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        tts_model.tts_to_file(text=text, file_path=temp_wav.name)
         return temp_wav.name
     return executor.submit(_generate)
 def whisper_stt(audio_path):
     """STT using Faster-Whisper"""
     if not audio_path or not os.path.exists(audio_path):
         # Generate audio with Bark (wait for it)
         start = time.perf_counter()
         cleaned_text = first_q.strip().replace("\n", " ")
+        audio_future = tts_async(cleaned_text)
         audio_path = audio_future.result()
         print("⏱️ Bark TTS took", round(time.perf_counter() - start, 2), "seconds")
             state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
             # Generate TTS asynchronously for next question too
+            audio_future = tts_async(next_q)
             # For now, we'll wait for it (you can make this async too)
             audio_path = audio_future.result()

requirements.txt CHANGED Viewed

@@ -38,4 +38,5 @@ accelerate==0.29.3
 huggingface_hub==0.20.3
 textract==1.6.3
 bitsandbytes
-faster-whisper==0.10.0

 huggingface_hub==0.20.3
 textract==1.6.3
 bitsandbytes
+faster-whisper==0.10.0
+TTS==0.22.0