Spaces:

husseinelsaadi
/

ai-interviewer-demo

Paused

husseinelsaadi committed 2 days ago

Commit

269d410

verified ·

1 Parent(s): b1f8948

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1481,21 +1481,25 @@ bark_voice_preset = "v2/en_speaker_5"
 def bark_tts(text):
     print(f"🔁 Synthesizing TTS for: {text}")
-    processed = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
     input_ids = processed["input_ids"].to(model_bark.device)
     attention_mask = processed.get("attention_mask", None)
     if attention_mask is not None:
         attention_mask = attention_mask.to(model_bark.device)
     start = time.time()
     speech_values = model_bark.generate(
         input_ids=input_ids,
         attention_mask=attention_mask,
-        pad_token_id=10000,          # Optional safety
-        max_new_tokens=100           # ✅ Passed once only here
     )
     print(f"✅ Bark finished in {round(time.time() - start, 2)}s")
     speech = speech_values.cpu().numpy().squeeze()
     speech = (speech * 32767).astype(np.int16)
     temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
@@ -1504,6 +1508,7 @@ def bark_tts(text):
 # Whisper STT
 print("🔁 Loading Whisper model...")
 whisper_model = whisper.load_model("base", device="cuda")

 def bark_tts(text):
     print(f"🔁 Synthesizing TTS for: {text}")
+    # DON'T pass voice_preset here — it will inject generation kwargs internally!
+    processed = processor_bark(text, return_tensors="pt")
     input_ids = processed["input_ids"].to(model_bark.device)
     attention_mask = processed.get("attention_mask", None)
     if attention_mask is not None:
         attention_mask = attention_mask.to(model_bark.device)
     start = time.time()
+    # Pass ONLY these manually — clean, controlled
     speech_values = model_bark.generate(
         input_ids=input_ids,
         attention_mask=attention_mask,
+        pad_token_id=10000,
+        max_new_tokens=100,
     )
     print(f"✅ Bark finished in {round(time.time() - start, 2)}s")
+    # Post-processing
     speech = speech_values.cpu().numpy().squeeze()
     speech = (speech * 32767).astype(np.int16)
     temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
 # Whisper STT
 print("🔁 Loading Whisper model...")
 whisper_model = whisper.load_model("base", device="cuda")