Update app.py
Browse files
app.py
CHANGED
|
@@ -1729,7 +1729,6 @@ def extract_candidate_details(file_path):
|
|
| 1729 |
# )
|
| 1730 |
|
| 1731 |
# demo.launch(debug=True)
|
| 1732 |
-
|
| 1733 |
import gradio as gr
|
| 1734 |
import time
|
| 1735 |
import tempfile
|
|
@@ -1757,14 +1756,26 @@ bark_voice_preset = "v2/en_speaker_9"
|
|
| 1757 |
# Thread pool for async operations
|
| 1758 |
executor = ThreadPoolExecutor(max_workers=2)
|
| 1759 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1760 |
def load_models_lazy():
|
| 1761 |
"""Load models only when needed"""
|
| 1762 |
global model_bark, processor_bark, whisper_model
|
| 1763 |
|
|
|
|
|
|
|
|
|
|
| 1764 |
if model_bark is None:
|
| 1765 |
print("π Loading Bark model...")
|
| 1766 |
-
model_bark = BarkModel.from_pretrained("suno/bark").to(
|
| 1767 |
-
print("β
Bark model loaded")
|
| 1768 |
|
| 1769 |
if processor_bark is None:
|
| 1770 |
print("π Loading Bark processor...")
|
|
@@ -1773,16 +1784,22 @@ def load_models_lazy():
|
|
| 1773 |
|
| 1774 |
if whisper_model is None:
|
| 1775 |
print("π Loading Whisper model...")
|
| 1776 |
-
whisper_model = whisper.load_model("base", device=
|
| 1777 |
-
print("β
Whisper model loaded")
|
| 1778 |
|
| 1779 |
def bark_tts_async(text):
|
| 1780 |
"""Async TTS generation"""
|
| 1781 |
def _generate():
|
| 1782 |
load_models_lazy() # Load only when needed
|
| 1783 |
print(f"π Synthesizing TTS for: {text}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1784 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
| 1785 |
-
input_ids = inputs["input_ids"].to(
|
|
|
|
| 1786 |
start = time.time()
|
| 1787 |
with torch.no_grad():
|
| 1788 |
speech_values = model_bark.generate(
|
|
@@ -1791,7 +1808,8 @@ def bark_tts_async(text):
|
|
| 1791 |
fine_temperature=0.4,
|
| 1792 |
coarse_temperature=0.8
|
| 1793 |
)
|
| 1794 |
-
print(f"β
Bark finished in {round(time.time() - start, 2)}s")
|
|
|
|
| 1795 |
speech = speech_values.cpu().numpy().squeeze()
|
| 1796 |
speech = (speech * 32767).astype(np.int16)
|
| 1797 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
@@ -1806,6 +1824,11 @@ def whisper_stt(audio_path):
|
|
| 1806 |
return ""
|
| 1807 |
|
| 1808 |
load_models_lazy() # Load only when needed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1809 |
result = whisper_model.transcribe(audio_path)
|
| 1810 |
return result["text"]
|
| 1811 |
|
|
@@ -1942,9 +1965,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 1942 |
return gr.update(), gr.update(), tts_future_obj
|
| 1943 |
|
| 1944 |
start_interview_final_btn.click(
|
| 1945 |
-
start_interview_immediate,
|
| 1946 |
-
[user_data],
|
| 1947 |
-
[interview_state, interview_pre_section, interview_section, question_audio, question_text]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1948 |
)
|
| 1949 |
|
| 1950 |
def transcribe(audio_path):
|
|
@@ -2030,5 +2059,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 2030 |
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
|
| 2031 |
)
|
| 2032 |
|
| 2033 |
-
demo.launch(debug=True)
|
| 2034 |
-
|
|
|
|
| 1729 |
# )
|
| 1730 |
|
| 1731 |
# demo.launch(debug=True)
|
|
|
|
| 1732 |
import gradio as gr
|
| 1733 |
import time
|
| 1734 |
import tempfile
|
|
|
|
| 1756 |
# Thread pool for async operations
|
| 1757 |
executor = ThreadPoolExecutor(max_workers=2)
|
| 1758 |
|
| 1759 |
+
# Startup GPU probe: report CUDA availability once at import time and pin
# device 0 when a GPU is present. (Reconstructed from a diff-view paste;
# indentation re-applied.)
# NOTE(review): the emoji literals below appear mojibake'd in this paste —
# they are kept byte-for-byte here; verify the originals against the repo.
_cuda_ok = torch.cuda.is_available()
if not _cuda_ok:
    print("β οΈ CUDA not available, using CPU")
else:
    print(f"π₯ CUDA Available: {torch.cuda.get_device_name(0)}")
    print(f"π₯ CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    # Make GPU 0 the default device for all subsequent CUDA work.
    torch.cuda.set_device(0)
|
| 1767 |
+
|
| 1768 |
def load_models_lazy():
|
| 1769 |
"""Load models only when needed"""
|
| 1770 |
global model_bark, processor_bark, whisper_model
|
| 1771 |
|
| 1772 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 1773 |
+
print(f"π Using device: {device}")
|
| 1774 |
+
|
| 1775 |
if model_bark is None:
|
| 1776 |
print("π Loading Bark model...")
|
| 1777 |
+
model_bark = BarkModel.from_pretrained("suno/bark").to(device)
|
| 1778 |
+
print(f"β
Bark model loaded on {device}")
|
| 1779 |
|
| 1780 |
if processor_bark is None:
|
| 1781 |
print("π Loading Bark processor...")
|
|
|
|
| 1784 |
|
| 1785 |
if whisper_model is None:
|
| 1786 |
print("π Loading Whisper model...")
|
| 1787 |
+
whisper_model = whisper.load_model("base", device=device)
|
| 1788 |
+
print(f"β
Whisper model loaded on {device}")
|
| 1789 |
|
| 1790 |
def bark_tts_async(text):
|
| 1791 |
"""Async TTS generation"""
|
| 1792 |
def _generate():
|
| 1793 |
load_models_lazy() # Load only when needed
|
| 1794 |
print(f"π Synthesizing TTS for: {text}")
|
| 1795 |
+
|
| 1796 |
+
# Ensure we're using the correct device
|
| 1797 |
+
device = next(model_bark.parameters()).device
|
| 1798 |
+
print(f"π Bark model is on device: {device}")
|
| 1799 |
+
|
| 1800 |
inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
|
| 1801 |
+
input_ids = inputs["input_ids"].to(device) # Move to same device as model
|
| 1802 |
+
|
| 1803 |
start = time.time()
|
| 1804 |
with torch.no_grad():
|
| 1805 |
speech_values = model_bark.generate(
|
|
|
|
| 1808 |
fine_temperature=0.4,
|
| 1809 |
coarse_temperature=0.8
|
| 1810 |
)
|
| 1811 |
+
print(f"β
Bark finished in {round(time.time() - start, 2)}s on {device}")
|
| 1812 |
+
|
| 1813 |
speech = speech_values.cpu().numpy().squeeze()
|
| 1814 |
speech = (speech * 32767).astype(np.int16)
|
| 1815 |
temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
|
|
| 1824 |
return ""
|
| 1825 |
|
| 1826 |
load_models_lazy() # Load only when needed
|
| 1827 |
+
|
| 1828 |
+
# Check what device Whisper is actually using
|
| 1829 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 1830 |
+
print(f"π Whisper transcribing on {device}")
|
| 1831 |
+
|
| 1832 |
result = whisper_model.transcribe(audio_path)
|
| 1833 |
return result["text"]
|
| 1834 |
|
|
|
|
| 1965 |
return gr.update(), gr.update(), tts_future_obj
|
| 1966 |
|
| 1967 |
start_interview_final_btn.click(
|
| 1968 |
+
start_interview_immediate,
|
| 1969 |
+
[user_data],
|
| 1970 |
+
[interview_state, tts_future, interview_pre_section, interview_section, loading_status, question_audio, question_text]
|
| 1971 |
+
).then(
|
| 1972 |
+
# Check TTS status every 500ms
|
| 1973 |
+
check_tts_ready,
|
| 1974 |
+
[interview_state, tts_future],
|
| 1975 |
+
[question_audio, loading_status, tts_future],
|
| 1976 |
+
every=0.5
|
| 1977 |
)
|
| 1978 |
|
| 1979 |
def transcribe(audio_path):
|
|
|
|
| 2059 |
lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, stt_transcript]
|
| 2060 |
)
|
| 2061 |
|
| 2062 |
+
demo.launch(debug=True)
|
|
|