Update app.py
app.py
CHANGED
@@ -1448,43 +1448,435 @@ def extract_candidate_details(file_path):
1448         "skills": skills
1449     }
1450
1451    import gradio as gr
1452    import time
1453    import tempfile
1454    import numpy as np
1455    import scipy.io.wavfile as wavfile
1456 -  import cv2
1457    import os
1458 -  import json
1459 -  from moviepy.editor import VideoFileClip
1460 -  import shutil
1461 -  from transformers import BarkModel, AutoProcessor
1462 -  import torch, gc
1463 -  import whisper
1464 -  from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
1465 -  import librosa
1466 -
1467    import torch
1468 -  print(torch.cuda.is_available())  # ✅ Tells you if GPU is available
1469 -  torch.cuda.empty_cache()
1470 -  gc.collect()
1471 -
1472 -
1473 -  # Bark TTS
1474 -  print("🔁 Loading Bark model...")
1475 -  model_bark = BarkModel.from_pretrained("suno/bark")
1476 -  print("✅ Bark model loaded")
1477
1478 -  print("🔁 Loading Bark processor...")
1479    processor_bark = AutoProcessor.from_pretrained("suno/bark")
1480 -  print("✅ Bark processor loaded")
1481 -  print("🔁 Moving Bark model to device...")
1482 -  model_bark.to("cuda" if torch.cuda.is_available() else "cpu")
1483 -  print("✅ Bark model on device")
1484    bark_voice_preset = "v2/en_speaker_6"
1485
1486    def bark_tts(text):
1487 -      print(f"🔁 Synthesizing TTS for: {text}")
1488        inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1489        inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
1490        speech_values = model_bark.generate(**inputs)
@@ -1494,366 +1886,34 @@ def bark_tts(text):
1494        wavfile.write(temp_wav.name, 22050, speech)
1495        return temp_wav.name
1496
1497 -  # Whisper STT
1498 -  print("🔁 Loading Whisper model...")
1499 -  whisper_model = whisper.load_model("base", device="cuda")
1500 -  print("✅ Whisper model loaded")
1501    def whisper_stt(audio_path):
1502 -      if not audio_path or not os.path.exists(audio_path): return ""
1503        result = whisper_model.transcribe(audio_path)
1504        return result["text"]
1505
1506
1507 -  # DeepFace (Video Face Emotion)
1508 -  def ensure_mp4(video_input):
1509 -      # video_input could be a file-like object, a path, or a Gradio temp path
1510 -      if isinstance(video_input, str):
1511 -          input_path = video_input
1512 -      else:
1513 -          # It's a file-like object (rare for Gradio video, but handle it)
1514 -          with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_in:
1515 -              temp_in.write(video_input.read())
1516 -              input_path = temp_in.name
1517 -
1518 -      # If already mp4, return as is
1519 -      if input_path.endswith(".mp4"):
1520 -          return input_path
1521
1522 -      # Convert to mp4 using moviepy
1523 -      mp4_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
1524 -      try:
1525 -          clip = VideoFileClip(input_path)
1526 -          clip.write_videofile(mp4_path, codec="libx264", audio=False, verbose=False, logger=None)
1527 -          clip.close()
1528 -      except Exception as e:
1529 -          print("Video conversion failed:", e)
1530 -          # As fallback, just copy original
1531 -          shutil.copy(input_path, mp4_path)
1532 -      return mp4_path
1533 -
1534 -  def analyze_video_emotions(video_input, sample_rate=15):
1535 -      # Convert input to an mp4 file OpenCV can process
1536 -      mp4_path = ensure_mp4(video_input)
1537 -      if not mp4_path or not os.path.exists(mp4_path):
1538 -          return "no_face"
1539 -      cap = cv2.VideoCapture(mp4_path)
1540 -      frame_count = 0
1541 -      emotion_counts = {}
1542 -      while True:
1543 -          ret, frame = cap.read()
1544 -          if not ret: break
1545 -          if frame_count % sample_rate == 0:
1546 -              try:
1547 -                  result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
1548 -                  dominant = result[0]["dominant_emotion"] if isinstance(result, list) else result["dominant_emotion"]
1549 -                  emotion_counts[dominant] = emotion_counts.get(dominant, 0) + 1
1550 -              except Exception: pass
1551 -          frame_count += 1
1552 -      cap.release()
1553 -      if not emotion_counts: return "no_face"
1554 -      return max(emotion_counts.items(), key=lambda x: x[1])[0]
1555 -
1556 -  # Original Hugging Face model: HaniaRuby/speech-emotion-recognition-wav2vec2
1557 -  local_wav2vec_model_path = "HaniaRuby/speech-emotion-recognition-wav2vec2"  # Local path to the downloaded model files
1558 -  print("🔁 Loading Wav2Vec processor and model...")
1559 -  wav2vec_processor = Wav2Vec2Processor.from_pretrained(local_wav2vec_model_path)
1560 -  wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(local_wav2vec_model_path)
1561 -  wav2vec_model = wav2vec_model.to("cuda" if torch.cuda.is_available() else "cpu")
1562 -  print("✅ Wav2Vec model loaded")
1563 -  wav2vec_model.eval()
1564 -  voice_label_map = {
1565 -      0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy',
1566 -      4: 'neutral', 5: 'sad', 6: 'surprise'
1567 -  }
1568
1569
1570
1571 -  def analyze_audio_emotion(audio_path):
1572 -      print(f"🔁 Analyzing audio emotion for: {audio_path}")
1573 -      if not audio_path or not os.path.exists(audio_path): return "neutral"
1574 -
1575 -      speech, sr = librosa.load(audio_path, sr=16000)
1576 -      inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
1577 -
1578 -      # 🔥 Move model and inputs to GPU
1579 -      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1580 -      wav2vec_model.to(device)
1581 -      inputs = {k: v.to(device) for k, v in inputs.items()}
1582 -
1583 -      with torch.no_grad():
1584 -          logits = wav2vec_model(**inputs).logits
1585 -
1586 -      probs = torch.nn.functional.softmax(logits, dim=-1)
1587 -      predicted_id = torch.argmax(probs, dim=-1).item()
1588 -      return voice_label_map.get(predicted_id, "neutral")
1589 -
1590 -
1591 -  # --- Effective confidence calculation
1592 -  def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
1593 -      emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4, "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
1594 -      answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
1595 -      voice_score, face_score, answer_score = emotion_map.get(voice_label, 0.5), emotion_map.get(face_label, 0.5), answer_score_map.get(answer_score_label, 0.5)
1596 -      avg_emotion = (voice_score + face_score) / 2
1597 -      control_bonus = max(0, answer_score - avg_emotion) * k
1598 -      eff_conf = (0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus)
1599 -      return {"effective_confidence": round(eff_conf, 3), "answer_score": round(answer_score, 2), "voice_score": round(voice_score, 2), "face_score": round(face_score, 2), "control_bonus": round(control_bonus, 3)}
1600 -
1601 -  seniority_mapping = {
1602 -      "Entry-level": 1, "Junior": 2, "Mid-Level": 3, "Senior": 4, "Lead": 5
1603 -  }
1604 -
1605 -
1606 -  # --- 2. Gradio App ---
1607 -
1608 -  with gr.Blocks(theme=gr.themes.Soft()) as demo:
1609 -      user_data = gr.State({})
1610 -      interview_state = gr.State({})
1611 -      missing_fields_state = gr.State([])
1612 -
1613 -      # --- UI Layout ---
1614 -      with gr.Column(visible=True) as user_info_section:
1615 -          gr.Markdown("## Candidate Information")
1616 -          cv_file = gr.File(label="Upload CV")
1617 -          job_desc = gr.Textbox(label="Job Description")
1618 -          start_btn = gr.Button("Continue", interactive=False)
1619 -
1620 -      with gr.Column(visible=False) as missing_section:
1621 -          gr.Markdown("## Missing Information")
1622 -          name_in = gr.Textbox(label="Name", visible=False)
1623 -          role_in = gr.Textbox(label="Job Role", visible=False)
1624 -          seniority_in = gr.Dropdown(list(seniority_mapping.keys()), label="Seniority", visible=False)
1625 -          skills_in = gr.Textbox(label="Skills", visible=False)
1626 -          submit_btn = gr.Button("Submit", interactive=False)
1627 -
1628 -      with gr.Column(visible=False) as interview_pre_section:
1629 -          pre_interview_greeting_md = gr.Markdown()
1630 -          start_interview_final_btn = gr.Button("Start Interview")
1631 -
1632 -      with gr.Column(visible=False) as interview_section:
1633 -          gr.Markdown("## Interview in Progress")
1634 -          question_audio = gr.Audio(label="Listen", interactive=False, autoplay=True)
1635 -          question_text = gr.Markdown()
1636 -          user_audio_input = gr.Audio(sources=["microphone"], type="filepath", label="1. Record Audio Answer")
1637 -          user_video_input = gr.Video(sources=["webcam"], label="2. Record Video Answer")
1638 -          stt_transcript = gr.Textbox(label="Transcribed Answer (edit if needed)")
1639 -          confirm_btn = gr.Button("Confirm Answer")
1640 -          evaluation_display = gr.Markdown()
1641 -          emotion_display = gr.Markdown()
1642 -          interview_summary = gr.Markdown(visible=False)
1643 -
1644 -      # --- UI Logic ---
1645 -
1646 -      def validate_start_btn(cv_file, job_desc):
1647 -          return gr.update(interactive=(cv_file is not None and hasattr(cv_file, "name") and bool(job_desc and job_desc.strip())))
1648 -      cv_file.change(validate_start_btn, [cv_file, job_desc], start_btn)
1649 -      job_desc.change(validate_start_btn, [cv_file, job_desc], start_btn)
1650 -
1651 -      def process_and_route_initial(cv_file, job_desc):
1652 -          details = extract_candidate_details(cv_file.name)
1653 -          job_info = extract_job_details(job_desc)
1654 -          data = {
1655 -              "name": details.get("name", "unknown"), "job_role": job_info.get("job_title", "unknown"),
1656 -              "seniority": job_info.get("experience_level", "unknown"), "skills": job_info.get("skills", [])
1657 -          }
1658 -          missing = [k for k, v in data.items() if (isinstance(v, str) and v.lower() == "unknown") or not v]
1659 -          if missing:
1660 -              return data, missing, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
1661 -          else:
1662 -              greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' when ready."
1663 -              return data, missing, gr.update(visible=False), gr.update(visible=False), gr.update(visible=True, value=greeting)
1664 -      start_btn.click(
1665 -          process_and_route_initial,
1666 -          [cv_file, job_desc],
1667 -          [user_data, missing_fields_state, user_info_section, missing_section, pre_interview_greeting_md]
1668 -      )
1669 -
1670 -      def show_missing(missing):
1671 -          if missing is None: missing = []
1672 -          return gr.update(visible="name" in missing), gr.update(visible="job_role" in missing), gr.update(visible="seniority" in missing), gr.update(visible="skills" in missing)
1673 -      missing_fields_state.change(show_missing, missing_fields_state, [name_in, role_in, seniority_in, skills_in])
1674 -
1675 -      def validate_fields(name, role, seniority, skills, missing):
1676 -          if not missing: return gr.update(interactive=False)
1677 -          all_filled = all([(not ("name" in missing) or bool(name.strip())), (not ("job_role" in missing) or bool(role.strip())), (not ("seniority" in missing) or bool(seniority)), (not ("skills" in missing) or bool(skills.strip())),])
1678 -          return gr.update(interactive=all_filled)
1679 -      for inp in [name_in, role_in, seniority_in, skills_in]:
1680 -          inp.change(validate_fields, [name_in, role_in, seniority_in, skills_in, missing_fields_state], submit_btn)
1681 -
1682 -      def complete_manual(data, name, role, seniority, skills):
1683 -          if data["name"].lower() == "unknown": data["name"] = name
1684 -          if data["job_role"].lower() == "unknown": data["job_role"] = role
1685 -          if data["seniority"].lower() == "unknown": data["seniority"] = seniority
1686 -          if not data["skills"]: data["skills"] = [s.strip() for s in skills.split(",")]
1687 -          greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' to begin."
1688 -          return data, gr.update(visible=False), gr.update(visible=True), gr.update(value=greeting)
1689 -      submit_btn.click(complete_manual, [user_data, name_in, role_in, seniority_in, skills_in], [user_data, missing_section, interview_pre_section, pre_interview_greeting_md])
1690 -
1691 -      def start_interview(data):
1692 -          # --- Advanced state with full logging ---
1693 -          state = {
1694 -              "questions": [], "answers": [], "face_labels": [], "voice_labels": [], "timings": [],
1695 -              "question_evaluations": [], "answer_evaluations": [], "effective_confidences": [],
1696 -              "conversation_history": [],
1697 -              "difficulty_adjustment": None,
1698 -              "question_idx": 0, "max_questions": 3, "q_start_time": time.time(),
1699 -              "log": []
1700 -          }
1701 -          # --- Optionally: context retrieval here (currently just blank) ---
1702 -          context = ""
1703 -          prompt = build_interview_prompt(
1704 -              conversation_history=[], user_response="", context=context, job_role=data["job_role"],
1705 -              skills=data["skills"], seniority=data["seniority"], difficulty_adjustment=None,
1706 -              voice_label="neutral", face_label="neutral"
1707 -          )
1708 -          #here the original one
1709 -          # first_q = groq_llm.predict(prompt)
1710 -          # # Evaluate Q for quality
1711 -          # q_eval = eval_question_quality(first_q, data["job_role"], data["seniority"], None)
1712 -          # state["questions"].append(first_q)
1713 -          # state["question_evaluations"].append(q_eval)
1714 -
1715 -          #here the testing one
1716 -          first_q = groq_llm.predict(prompt)
1717 -          q_eval = {
1718 -              "Score": "N/A",
1719 -              "Reasoning": "Skipped to reduce processing time",
1720 -              "Improvements": []
1721 -          }
1722 -          state["questions"].append(first_q)
1723 -          state["question_evaluations"].append(q_eval)
1724 -
1725 -
1726 -          state["conversation_history"].append({'role': 'Interviewer', 'content': first_q})
1727 -          audio_path = bark_tts(first_q)
1728 -          # LOG
1729 -          state["log"].append({"type": "question", "question": first_q, "question_eval": q_eval, "timestamp": time.time()})
1730 -          return state, gr.update(visible=False), gr.update(visible=True), audio_path, f"*Question 1:* {first_q}"
1731 -      start_interview_final_btn.click(start_interview, [user_data], [interview_state, interview_pre_section, interview_section, question_audio, question_text])
1732 -
1733 -      def transcribe(audio_path):
1734 -          return whisper_stt(audio_path)
1735 -      user_audio_input.change(transcribe, user_audio_input, stt_transcript)
1736 -
1737 -      def process_answer(transcript, audio_path, video_path, state, data):
1738 -          if not transcript and not video_path:
1739 -              return state, gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
1740 -          elapsed = round(time.time() - state.get("q_start_time", time.time()), 2)
1741 -          state["timings"].append(elapsed)
1742 -          state["answers"].append(transcript)
1743 -          state["conversation_history"].append({'role': 'Candidate', 'content': transcript})
1744 -
1745 -          # --- 1. Emotion analysis ---
1746 -          # voice_label = analyze_audio_emotion(audio_path)
1747 -          # face_label = analyze_video_emotions(video_path)
1748 -          # state["voice_labels"].append(voice_label)
1749 -          # state["face_labels"].append(face_label)
1750 -
1751 -          #just for testing
1752 -          voice_label = "neutral"
1753 -          face_label = "neutral"
1754 -          state["voice_labels"].append(voice_label)
1755 -          state["face_labels"].append(face_label)
1756 -
1757 -
1758 -
1759 -          # --- 2. Evaluate previous Q and Answer ---
1760 -          last_q = state["questions"][-1]
1761 -          q_eval = state["question_evaluations"][-1]  # Already in state
1762 -          ref_answer = generate_reference_answer(last_q, data["job_role"], data["seniority"])
1763 -          answer_eval = evaluate_answer(last_q, transcript, ref_answer, data["job_role"], data["seniority"], None)
1764 -          state["answer_evaluations"].append(answer_eval)
1765 -          answer_score = answer_eval.get("Score", "medium") if answer_eval else "medium"
1766 -
1767 -          # --- 3. Adaptive difficulty ---
1768 -          if answer_score == "excellent":
1769 -              state["difficulty_adjustment"] = "harder"
1770 -          elif answer_score in ("medium", "poor"):
1771 -              state["difficulty_adjustment"] = "easier"
1772 -          else:
1773 -              state["difficulty_adjustment"] = None
1774 -
1775 -          # --- 4. Effective confidence ---
1776 -          # eff_conf = interpret_confidence(voice_label, face_label, answer_score)
1777 -          # state["effective_confidences"].append(eff_conf)
1778 -
1779 -          #just for testing:
1780 -          eff_conf = {"effective_confidence": 0.6}
1781 -          state["effective_confidences"].append(eff_conf)
1782 -
1783 -
1784 -          # --- LOG ---
1785 -          state["log"].append({
1786 -              "type": "answer",
1787 -              "question": last_q,
1788 -              "answer": transcript,
1789 -              "answer_eval": answer_eval,
1790 -              "ref_answer": ref_answer,
1791 -              "face_label": face_label,
1792 -              "voice_label": voice_label,
1793 -              "effective_confidence": eff_conf,
1794 -              "timing": elapsed,
1795 -              "timestamp": time.time()
1796 -          })
1797 -
1798 -          # --- Next or End ---
1799 -          qidx = state["question_idx"] + 1
1800 -          if qidx >= state["max_questions"]:
1801 -              # Save as JSON (optionally)
1802 -              timestamp = time.strftime("%Y%m%d_%H%M%S")
1803 -              log_file = f"interview_log_{timestamp}.json"
1804 -              with open(log_file, "w", encoding="utf-8") as f:
1805 -                  json.dump(state["log"], f, indent=2, ensure_ascii=False)
1806 -              # Report
1807 -              summary = "# Interview Summary\n"
1808 -              for i, q in enumerate(state["questions"]):
1809 -                  summary += (f"\n### Q{i + 1}: {q}\n"
1810 -                              f"- *Answer*: {state['answers'][i]}\n"
1811 -                              f"- *Q Eval*: {state['question_evaluations'][i]}\n"
1812 -                              f"- *A Eval*: {state['answer_evaluations'][i]}\n"
1813 -                              #also this are removed just for testing :(
1814 -                              # f"- *Face Emotion: {state['face_labels'][i]}, **Voice Emotion*: {state['voice_labels'][i]}\n"
1815 -                              # f"- *Effective Confidence*: {state['effective_confidences'][i]['effective_confidence']}\n"
1816 -                              f"- *Time*: {state['timings'][i]}s\n")
1817 -              summary += f"\n\n⏺ Full log saved as {log_file}."
1818 -              return (state, gr.update(visible=True, value=summary), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"))
1819 -          else:
1820 -              # --- Build next prompt using adaptive difficulty ---
1821 -              state["question_idx"] = qidx
1822 -              state["q_start_time"] = time.time()
1823 -              context = ""  # You can add your context logic here
1824 -              prompt = build_interview_prompt(
1825 -                  conversation_history=state["conversation_history"],
1826 -                  user_response=transcript,
1827 -                  context=context,
1828 -                  job_role=data["job_role"],
1829 -                  skills=data["skills"],
1830 -                  seniority=data["seniority"],
1831 -                  difficulty_adjustment=state["difficulty_adjustment"],
1832 -                  face_label=face_label,
1833 -                  voice_label=voice_label,
1834 -                  effective_confidence=eff_conf
1835 -              )
1836 -              next_q = groq_llm.predict(prompt)
1837 -              # Evaluate Q quality
1838 -              q_eval = eval_question_quality(next_q, data["job_role"], data["seniority"], None)
1839 -              state["questions"].append(next_q)
1840 -              state["question_evaluations"].append(q_eval)
1841 -              state["conversation_history"].append({'role': 'Interviewer', 'content': next_q})
1842 -              state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
1843 -              audio_path = bark_tts(next_q)
1844 -              # Display evaluations
1845 -              eval_md = f"*Last Answer Eval:* {answer_eval}\n\n*Effective Confidence:* {eff_conf}"
1846 -              return (
1847 -                  state, gr.update(visible=False), audio_path, f"*Question {qidx + 1}:* {next_q}",
1848 -                  gr.update(value=None), gr.update(value=None),
1849 -                  gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"),
1850 -              )
1851 -      confirm_btn.click(
1852 -          process_answer,
1853 -          [stt_transcript, user_audio_input, user_video_input, interview_state, user_data],
1854 -          [interview_state, interview_summary, question_audio, question_text, user_audio_input, user_video_input, emotion_display]
1855 -      ).then(
1856 -          lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, user_video_input]
1857 -      )
1858
1859    demo.launch(debug=True)
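The weighting applied by the removed interpret_confidence helper is easy to check in isolation. A minimal standalone sketch: the label maps and the 0.5/0.22/0.18/0.1 weights are copied from the deleted lines above, while the example inputs and the scalar return value are illustrative only.

```python
# Standalone sketch of the confidence weighting from the removed
# interpret_confidence() helper. Label maps and weights come from the
# deleted code above; only the combined score is returned here.
def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
    emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4,
                   "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
    answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
    voice_score = emotion_map.get(voice_label, 0.5)
    face_score = emotion_map.get(face_label, 0.5)
    answer_score = answer_score_map.get(answer_score_label, 0.5)
    # Bonus when the answer quality exceeds the average emotional signal.
    control_bonus = max(0, answer_score - (voice_score + face_score) / 2) * k
    eff_conf = 0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus
    return round(eff_conf, 3)

# Example (illustrative): a "good" answer, neutral voice, happy face
# -> 0.5*0.8 + 0.22*0.6 + 0.18*0.9 + 0.1*0.01 = 0.695
print(interpret_confidence("neutral", "happy", "good"))  # 0.695
```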
1448         "skills": skills
1449     }
1450
1451 +  # import gradio as gr
1452 +  # import time
1453 +  # import tempfile
1454 +  # import numpy as np
1455 +  # import scipy.io.wavfile as wavfile
1456 +  # import cv2
1457 +  # import os
1458 +  # import json
1459 +  # from moviepy.editor import VideoFileClip
1460 +  # import shutil
1461 +  # from transformers import BarkModel, AutoProcessor
1462 +  # import torch, gc
1463 +  # import whisper
1464 +  # from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
1465 +  # import librosa
1466 +
1467 +  # import torch
1468 +  # print(torch.cuda.is_available())  # ✅ Tells you if GPU is available
1469 +  # torch.cuda.empty_cache()
1470 +  # gc.collect()
1471 +
1472 +
1473 +  # # Bark TTS
1474 +  # print("🔁 Loading Bark model...")
1475 +  # model_bark = BarkModel.from_pretrained("suno/bark")
1476 +  # print("✅ Bark model loaded")
1477 +
1478 +  # print("🔁 Loading Bark processor...")
1479 +  # processor_bark = AutoProcessor.from_pretrained("suno/bark")
1480 +  # print("✅ Bark processor loaded")
1481 +  # print("🔁 Moving Bark model to device...")
1482 +  # model_bark.to("cuda" if torch.cuda.is_available() else "cpu")
1483 +  # print("✅ Bark model on device")
1484 +  # bark_voice_preset = "v2/en_speaker_6"
1485 +
1486 +  # def bark_tts(text):
1487 +  #     print(f"🔁 Synthesizing TTS for: {text}")
1488 +  #     inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1489 +  #     inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
1490 +  #     speech_values = model_bark.generate(**inputs)
1491 +  #     speech = speech_values.cpu().numpy().squeeze()
1492 +  #     speech = (speech * 32767).astype(np.int16)
1493 +  #     temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1494 +  #     wavfile.write(temp_wav.name, 22050, speech)
1495 +  #     return temp_wav.name
1496 +
1497 +  # # Whisper STT
1498 +  # print("🔁 Loading Whisper model...")
1499 +  # whisper_model = whisper.load_model("base", device="cuda")
1500 +  # print("✅ Whisper model loaded")
1501 +  # def whisper_stt(audio_path):
1502 +  #     if not audio_path or not os.path.exists(audio_path): return ""
1503 +  #     result = whisper_model.transcribe(audio_path)
1504 +  #     return result["text"]
1505 +
1506 +
1507 +  # # DeepFace (Video Face Emotion)
1508 +  # def ensure_mp4(video_input):
1509 +  #     # video_input could be a file-like object, a path, or a Gradio temp path
1510 +  #     if isinstance(video_input, str):
1511 +  #         input_path = video_input
1512 +  #     else:
1513 +  #         # It's a file-like object (rare for Gradio video, but handle it)
1514 +  #         with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_in:
1515 +  #             temp_in.write(video_input.read())
1516 +  #             input_path = temp_in.name
1517 +
1518 +  #     # If already mp4, return as is
1519 +  #     if input_path.endswith(".mp4"):
1520 +  #         return input_path
1521 +
1522 +  #     # Convert to mp4 using moviepy
1523 +  #     mp4_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
1524 +  #     try:
1525 +  #         clip = VideoFileClip(input_path)
1526 +  #         clip.write_videofile(mp4_path, codec="libx264", audio=False, verbose=False, logger=None)
1527 +  #         clip.close()
1528 +  #     except Exception as e:
1529 +  #         print("Video conversion failed:", e)
1530 +  #         # As fallback, just copy original
1531 +  #         shutil.copy(input_path, mp4_path)
1532 +  #     return mp4_path
1533 +
1534 +  # def analyze_video_emotions(video_input, sample_rate=15):
1535 +  #     # Convert input to an mp4 file OpenCV can process
1536 +  #     mp4_path = ensure_mp4(video_input)
1537 +  #     if not mp4_path or not os.path.exists(mp4_path):
1538 +  #         return "no_face"
1539 +  #     cap = cv2.VideoCapture(mp4_path)
1540 +  #     frame_count = 0
1541 +  #     emotion_counts = {}
1542 +  #     while True:
1543 +  #         ret, frame = cap.read()
1544 +  #         if not ret: break
1545 +  #         if frame_count % sample_rate == 0:
1546 +  #             try:
1547 +  #                 result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
1548 +  #                 dominant = result[0]["dominant_emotion"] if isinstance(result, list) else result["dominant_emotion"]
1549 +  #                 emotion_counts[dominant] = emotion_counts.get(dominant, 0) + 1
1550 +  #             except Exception: pass
1551 +  #         frame_count += 1
1552 +  #     cap.release()
1553 +  #     if not emotion_counts: return "no_face"
1554 +  #     return max(emotion_counts.items(), key=lambda x: x[1])[0]
1555 +
1556 +  # # Original Hugging Face model: HaniaRuby/speech-emotion-recognition-wav2vec2
1557 +  # local_wav2vec_model_path = "HaniaRuby/speech-emotion-recognition-wav2vec2"  # Local path to the downloaded model files
1558 +  # print("🔁 Loading Wav2Vec processor and model...")
1559 +  # wav2vec_processor = Wav2Vec2Processor.from_pretrained(local_wav2vec_model_path)
1560 +  # wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(local_wav2vec_model_path)
1561 +  # wav2vec_model = wav2vec_model.to("cuda" if torch.cuda.is_available() else "cpu")
1562 +  # print("✅ Wav2Vec model loaded")
1563 +  # wav2vec_model.eval()
1564 +  # voice_label_map = {
1565 +  #     0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy',
1566 +  #     4: 'neutral', 5: 'sad', 6: 'surprise'
1567 +  # }
1568 +
1569 +
1570 +
1571 +  # def analyze_audio_emotion(audio_path):
1572 +  #     print(f"🔁 Analyzing audio emotion for: {audio_path}")
1573 +  #     if not audio_path or not os.path.exists(audio_path): return "neutral"
1574 +
1575 +  #     speech, sr = librosa.load(audio_path, sr=16000)
1576 +  #     inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
1577 +
1578 +  #     # 🔥 Move model and inputs to GPU
1579 +  #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1580 +  #     wav2vec_model.to(device)
1581 +  #     inputs = {k: v.to(device) for k, v in inputs.items()}
1582 +
1583 +  #     with torch.no_grad():
1584 +  #         logits = wav2vec_model(**inputs).logits
1585 +
1586 +  #     probs = torch.nn.functional.softmax(logits, dim=-1)
1587 +  #     predicted_id = torch.argmax(probs, dim=-1).item()
1588 +  #     return voice_label_map.get(predicted_id, "neutral")
1589 +
1590 +
1591 +  # # --- Effective confidence calculation
1592 +  # def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
1593 +  #     emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4, "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
1594 +  #     answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
1595 +  #     voice_score, face_score, answer_score = emotion_map.get(voice_label, 0.5), emotion_map.get(face_label, 0.5), answer_score_map.get(answer_score_label, 0.5)
1596 +  #     avg_emotion = (voice_score + face_score) / 2
1597 +  #     control_bonus = max(0, answer_score - avg_emotion) * k
1598 +  #     eff_conf = (0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus)
1599 +  #     return {"effective_confidence": round(eff_conf, 3), "answer_score": round(answer_score, 2), "voice_score": round(voice_score, 2), "face_score": round(face_score, 2), "control_bonus": round(control_bonus, 3)}
1600 +
1601 +  # seniority_mapping = {
1602 +  #     "Entry-level": 1, "Junior": 2, "Mid-Level": 3, "Senior": 4, "Lead": 5
1603 +  # }
1604 +
1605 +
1606 +  # # --- 2. Gradio App ---
1607 +
1608 +  # with gr.Blocks(theme=gr.themes.Soft()) as demo:
1609 +  #     user_data = gr.State({})
1610 +  #     interview_state = gr.State({})
1611 +  #     missing_fields_state = gr.State([])
1612 +
1613 +  #     # --- UI Layout ---
1614 +  #     with gr.Column(visible=True) as user_info_section:
1615 +  #         gr.Markdown("## Candidate Information")
1616 +  #         cv_file = gr.File(label="Upload CV")
1617 +  #         job_desc = gr.Textbox(label="Job Description")
1618 +  #         start_btn = gr.Button("Continue", interactive=False)
1619 +
1620 +  #     with gr.Column(visible=False) as missing_section:
1621 +  #         gr.Markdown("## Missing Information")
1622 +  #         name_in = gr.Textbox(label="Name", visible=False)
1623 +  #         role_in = gr.Textbox(label="Job Role", visible=False)
1624 +  #         seniority_in = gr.Dropdown(list(seniority_mapping.keys()), label="Seniority", visible=False)
1625 +  #         skills_in = gr.Textbox(label="Skills", visible=False)
1626 +  #         submit_btn = gr.Button("Submit", interactive=False)
1627 +
1628 +  #     with gr.Column(visible=False) as interview_pre_section:
1629 +  #         pre_interview_greeting_md = gr.Markdown()
1630 +  #         start_interview_final_btn = gr.Button("Start Interview")
1631 +
1632 +  #     with gr.Column(visible=False) as interview_section:
1633 +  #         gr.Markdown("## Interview in Progress")
1634 +  #         question_audio = gr.Audio(label="Listen", interactive=False, autoplay=True)
1635 +  #         question_text = gr.Markdown()
1636 +  #         user_audio_input = gr.Audio(sources=["microphone"], type="filepath", label="1. Record Audio Answer")
1637 +  #         user_video_input = gr.Video(sources=["webcam"], label="2. Record Video Answer")
1638 +  #         stt_transcript = gr.Textbox(label="Transcribed Answer (edit if needed)")
1639 +  #         confirm_btn = gr.Button("Confirm Answer")
1640 +  #         evaluation_display = gr.Markdown()
1641 +  #         emotion_display = gr.Markdown()
1642 +  #         interview_summary = gr.Markdown(visible=False)
1643 +
1644 +  #     # --- UI Logic ---
1645 +
1646 +  #     def validate_start_btn(cv_file, job_desc):
1647 +  #         return gr.update(interactive=(cv_file is not None and hasattr(cv_file, "name") and bool(job_desc and job_desc.strip())))
1648 +  #     cv_file.change(validate_start_btn, [cv_file, job_desc], start_btn)
1649 +  #     job_desc.change(validate_start_btn, [cv_file, job_desc], start_btn)
1650 +
1651 +  #     def process_and_route_initial(cv_file, job_desc):
1652 +  #         details = extract_candidate_details(cv_file.name)
1653 +  #         job_info = extract_job_details(job_desc)
1654 +  #         data = {
1655 +  #             "name": details.get("name", "unknown"), "job_role": job_info.get("job_title", "unknown"),
1656 +  #             "seniority": job_info.get("experience_level", "unknown"), "skills": job_info.get("skills", [])
1657 +  #         }
1658 +  #         missing = [k for k, v in data.items() if (isinstance(v, str) and v.lower() == "unknown") or not v]
1659 +  #         if missing:
1660 +  #             return data, missing, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
1661 +  #         else:
1662 +  #             greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' when ready."
1663 +  #             return data, missing, gr.update(visible=False), gr.update(visible=False), gr.update(visible=True, value=greeting)
1664 +  #     start_btn.click(
1665 +  #         process_and_route_initial,
1666 +  #         [cv_file, job_desc],
1667 +  #         [user_data, missing_fields_state, user_info_section, missing_section, pre_interview_greeting_md]
1668 +  #     )
1669 +
1670 +  #     def show_missing(missing):
1671 +  #         if missing is None: missing = []
1672 +  #         return gr.update(visible="name" in missing), gr.update(visible="job_role" in missing), gr.update(visible="seniority" in missing), gr.update(visible="skills" in missing)
1673 +  #     missing_fields_state.change(show_missing, missing_fields_state, [name_in, role_in, seniority_in, skills_in])
1674 +
1675 +  #     def validate_fields(name, role, seniority, skills, missing):
1676 +  #         if not missing: return gr.update(interactive=False)
1677 +  #         all_filled = all([(not ("name" in missing) or bool(name.strip())), (not ("job_role" in missing) or bool(role.strip())), (not ("seniority" in missing) or bool(seniority)), (not ("skills" in missing) or bool(skills.strip())),])
1678 +  #         return gr.update(interactive=all_filled)
1679 +  #     for inp in [name_in, role_in, seniority_in, skills_in]:
1680 +  #         inp.change(validate_fields, [name_in, role_in, seniority_in, skills_in, missing_fields_state], submit_btn)
1681 +
1682 +  #     def complete_manual(data, name, role, seniority, skills):
1683 +  #         if data["name"].lower() == "unknown": data["name"] = name
1684 +  #         if data["job_role"].lower() == "unknown": data["job_role"] = role
1685 +  #         if data["seniority"].lower() == "unknown": data["seniority"] = seniority
1686 +  #         if not data["skills"]: data["skills"] = [s.strip() for s in skills.split(",")]
1687 +  #         greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' to begin."
1688 +  #         return data, gr.update(visible=False), gr.update(visible=True), gr.update(value=greeting)
1689 +  #     submit_btn.click(complete_manual, [user_data, name_in, role_in, seniority_in, skills_in], [user_data, missing_section, interview_pre_section, pre_interview_greeting_md])
1690 +
1691 +  #     def start_interview(data):
1692 +  #         # --- Advanced state with full logging ---
1693 +  #         state = {
1694 +  #             "questions": [], "answers": [], "face_labels": [], "voice_labels": [], "timings": [],
1695 +  #             "question_evaluations": [], "answer_evaluations": [], "effective_confidences": [],
1696 +  #             "conversation_history": [],
1697 +  #             "difficulty_adjustment": None,
1698 +  #             "question_idx": 0, "max_questions": 3, "q_start_time": time.time(),
1699 +  #             "log": []
1700 +  #         }
1701 +  #         # --- Optionally: context retrieval here (currently just blank) ---
1702 +  #         context = ""
1703 +  #         prompt = build_interview_prompt(
1704 +  #             conversation_history=[], user_response="", context=context, job_role=data["job_role"],
1705 +  #             skills=data["skills"], seniority=data["seniority"], difficulty_adjustment=None,
1706 +  #             voice_label="neutral", face_label="neutral"
1707 +  #         )
1708 +  #         #here the original one
1709 +  #         # first_q = groq_llm.predict(prompt)
1710 +  #         # # Evaluate Q for quality
1711 +  #         # q_eval = eval_question_quality(first_q, data["job_role"], data["seniority"], None)
1712 +  #         # state["questions"].append(first_q)
1713 +  #         # state["question_evaluations"].append(q_eval)
1714 +
1715 +  #         #here the testing one
1716 +  #         first_q = groq_llm.predict(prompt)
1717 +  #         q_eval = {
1718 +  #             "Score": "N/A",
1719 +  #             "Reasoning": "Skipped to reduce processing time",
1720 +  #             "Improvements": []
1721 +  #         }
1722 +  #         state["questions"].append(first_q)
1723 +  #         state["question_evaluations"].append(q_eval)
1724 +
1725 +
1726 +  #         state["conversation_history"].append({'role': 'Interviewer', 'content': first_q})
1727 +  #         audio_path = bark_tts(first_q)
1728 +  #         # LOG
1729 +  #         state["log"].append({"type": "question", "question": first_q, "question_eval": q_eval, "timestamp": time.time()})
1730 +  #         return state, gr.update(visible=False), gr.update(visible=True), audio_path, f"*Question 1:* {first_q}"
1731 +  #     start_interview_final_btn.click(start_interview, [user_data], [interview_state, interview_pre_section, interview_section, question_audio, question_text])
1732 +
1733 +  #     def transcribe(audio_path):
1734 +  #         return whisper_stt(audio_path)
1735 +  #     user_audio_input.change(transcribe, user_audio_input, stt_transcript)
1736 +
1737 +  #     def process_answer(transcript, audio_path, video_path, state, data):
1738 +  #         if not transcript and not video_path:
1739 +  #             return state, gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
1740 +  #         elapsed = round(time.time() - state.get("q_start_time", time.time()), 2)
1741 +  #         state["timings"].append(elapsed)
1742 +  #         state["answers"].append(transcript)
1743 +  #         state["conversation_history"].append({'role': 'Candidate', 'content': transcript})
1744 +
1745 +  #         # --- 1. Emotion analysis ---
1746 +  #         # voice_label = analyze_audio_emotion(audio_path)
1747 +  #         # face_label = analyze_video_emotions(video_path)
1748 +  #         # state["voice_labels"].append(voice_label)
1749 +  #         # state["face_labels"].append(face_label)
1750 +
1751 +  #         #just for testing
1752 +  #         voice_label = "neutral"
1753 +  #         face_label = "neutral"
1754 +  #         state["voice_labels"].append(voice_label)
1755 +  #         state["face_labels"].append(face_label)
1756 +
1757 +
1758 +
1759 +  #         # --- 2. Evaluate previous Q and Answer ---
1760 +  #         last_q = state["questions"][-1]
1761 +  #         q_eval = state["question_evaluations"][-1]  # Already in state
1762 +  #         ref_answer = generate_reference_answer(last_q, data["job_role"], data["seniority"])
1763 +  #         answer_eval = evaluate_answer(last_q, transcript, ref_answer, data["job_role"], data["seniority"], None)
1764 +  #         state["answer_evaluations"].append(answer_eval)
1765 +  #         answer_score = answer_eval.get("Score", "medium") if answer_eval else "medium"
1766 +
1767 +  #         # --- 3. Adaptive difficulty ---
1768 +  #         if answer_score == "excellent":
1769 +  #             state["difficulty_adjustment"] = "harder"
1770 +  #         elif answer_score in ("medium", "poor"):
1771 +  #             state["difficulty_adjustment"] = "easier"
1772 +  #         else:
1773 +  #             state["difficulty_adjustment"] = None
1774 +
1775 +  #         # --- 4. Effective confidence ---
1776 +  #         # eff_conf = interpret_confidence(voice_label, face_label, answer_score)
1777 +  #         # state["effective_confidences"].append(eff_conf)
1778 +
1779 +  #         #just for testing:
1780 +  #         eff_conf = {"effective_confidence": 0.6}
1781 +  #         state["effective_confidences"].append(eff_conf)
1782 +
1783 +
1784 +  #         # --- LOG ---
1785 +  #         state["log"].append({
1786 +  #             "type": "answer",
1787 +  #             "question": last_q,
1788 +  #             "answer": transcript,
1789 +  #             "answer_eval": answer_eval,
1790 +  #             "ref_answer": ref_answer,
1791 +  #             "face_label": face_label,
1792 +  #             "voice_label": voice_label,
1793 +  #             "effective_confidence": eff_conf,
1794 +  #             "timing": elapsed,
1795 +  #             "timestamp": time.time()
1796 +  #         })
1797 +
1798 +  #         # --- Next or End ---
1799 +  #         qidx = state["question_idx"] + 1
1800 +  #         if qidx >= state["max_questions"]:
1801 +  #             # Save as JSON (optionally)
1802 +  #             timestamp = time.strftime("%Y%m%d_%H%M%S")
1803 +  #             log_file = f"interview_log_{timestamp}.json"
1804 +  #             with open(log_file, "w", encoding="utf-8") as f:
1805 +  #                 json.dump(state["log"], f, indent=2, ensure_ascii=False)
1806 +  #             # Report
1807 +  #             summary = "# Interview Summary\n"
1808 +  #             for i, q in enumerate(state["questions"]):
1809 +  #                 summary += (f"\n### Q{i + 1}: {q}\n"
1810 +  #                             f"- *Answer*: {state['answers'][i]}\n"
1811 +  #                             f"- *Q Eval*: {state['question_evaluations'][i]}\n"
1812 +  #                             f"- *A Eval*: {state['answer_evaluations'][i]}\n"
1813 +  #                             #also this are removed just for testing :(
1814 +  #                             # f"- *Face Emotion: {state['face_labels'][i]}, **Voice Emotion*: {state['voice_labels'][i]}\n"
1815 +  #                             # f"- *Effective Confidence*: {state['effective_confidences'][i]['effective_confidence']}\n"
1816 +  #                             f"- *Time*: {state['timings'][i]}s\n")
1817 +  #             summary += f"\n\n⏺ Full log saved as {log_file}."
1818 +  #             return (state, gr.update(visible=True, value=summary), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"))
1819 +  #         else:
1820 +  #             # --- Build next prompt using adaptive difficulty ---
1821 +  #             state["question_idx"] = qidx
1822 +  #             state["q_start_time"] = time.time()
1823 +  #             context = ""  # You can add your context logic here
1824 +  #             prompt = build_interview_prompt(
1825 +  #                 conversation_history=state["conversation_history"],
1826 +  #                 user_response=transcript,
1827 +  #                 context=context,
1828 +  #                 job_role=data["job_role"],
1829 +  #                 skills=data["skills"],
1830 +  #                 seniority=data["seniority"],
1831 +  #                 difficulty_adjustment=state["difficulty_adjustment"],
1832 +  #                 face_label=face_label,
1833 +  #                 voice_label=voice_label,
1834 +  #                 effective_confidence=eff_conf
1835 +  #             )
1836 +  #             next_q = groq_llm.predict(prompt)
1837 +  #             # Evaluate Q quality
1838 +  #             q_eval = eval_question_quality(next_q, data["job_role"], data["seniority"], None)
1839 +  #             state["questions"].append(next_q)
1840 +  #             state["question_evaluations"].append(q_eval)
1841 +  #             state["conversation_history"].append({'role': 'Interviewer', 'content': next_q})
1842 +  #             state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
1843 +  #             audio_path = bark_tts(next_q)
1844 +  #             # Display evaluations
1845 +  #             eval_md = f"*Last Answer Eval:* {answer_eval}\n\n*Effective Confidence:* {eff_conf}"
1846 +  #             return (
1847 +  #                 state, gr.update(visible=False), audio_path, f"*Question {qidx + 1}:* {next_q}",
1848 +  #                 gr.update(value=None), gr.update(value=None),
1849 +  #                 gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"),
1850 +  #             )
1851 +  #     confirm_btn.click(
1852 +  #         process_answer,
1853 +  #         [stt_transcript, user_audio_input, user_video_input, interview_state, user_data],
1854 +  #         [interview_state, interview_summary, question_audio, question_text, user_audio_input, user_video_input, emotion_display]
1855 +  #     ).then(
1856 +  #         lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, user_video_input]
1857 +  #     )
1858 +
1859 +  # demo.launch(debug=True)
1860 +
1861    import gradio as gr
1862    import time
1863    import tempfile
1864    import numpy as np
1865    import scipy.io.wavfile as wavfile
1866    import os
1867    import torch
1868 +  import whisper
1869 +  from transformers import BarkModel, AutoProcessor
1870
1871 +  # Initialize Bark (TTS)
1872 +  model_bark = BarkModel.from_pretrained("suno/bark").to("cuda" if torch.cuda.is_available() else "cpu")
1873    processor_bark = AutoProcessor.from_pretrained("suno/bark")
1874    bark_voice_preset = "v2/en_speaker_6"
1875
1876 +  # Initialize Whisper (STT)
1877 +  whisper_model = whisper.load_model("base", device="cuda" if torch.cuda.is_available() else "cpu")
1878 +
1879    def bark_tts(text):
1880        inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1881        inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
1882        speech_values = model_bark.generate(**inputs)
1886        wavfile.write(temp_wav.name, 22050, speech)
1887        return temp_wav.name
1888
1889    def whisper_stt(audio_path):
1890 +      if not audio_path or not os.path.exists(audio_path):
1891 +          return ""
1892        result = whisper_model.transcribe(audio_path)
1893        return result["text"]
1894
1895 +  # Dummy Groq API stub (replace with actual logic)
1896 +  def groq_llm_predict(prompt):
1897 +      return f"[Mock Question] Based on: {prompt}"  # Replace with groq_llm.predict(prompt)
1898
1899 +  def interview_loop(state, audio_path):
1900 +      transcript = whisper_stt(audio_path)
1901 +      state["conversation"].append({"role": "Candidate", "content": transcript})
1902
1903 +      prompt = "\n".join([f"{turn['role']}: {turn['content']}" for turn in state["conversation"]])
1904 +      next_q = groq_llm_predict(prompt)
1905 +      state["conversation"].append({"role": "Interviewer", "content": next_q})
1906
1907 +      audio_out = bark_tts(next_q)
1908 +      return state, audio_out, transcript
1909
1910 +  with gr.Blocks() as demo:
1911 +      state = gr.State({"conversation": []})
1912 +      question_audio = gr.Audio(label="Interviewer's Question", interactive=False, autoplay=True)
1913 +      user_audio_input = gr.Audio(source="microphone", type="filepath", label="Your Answer")
1914 +      transcript_box = gr.Textbox(label="Transcript", interactive=False)
1915
1916 +      user_audio_input.change(interview_loop, [state, user_audio_input], [state, question_audio, transcript_box])
1917
1918    demo.launch(debug=True)
1919 +
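A minimal sketch of how the simplified loop added at the end of this commit chains its pieces together outside of Gradio. It assumes the whisper_stt, groq_llm_predict, and bark_tts functions defined above are in scope, and "answer.wav" is a hypothetical local recording of the candidate's answer.

```python
# Illustrative round-trip: speech in -> transcript -> mock LLM question -> speech out.
# Assumes whisper_stt, groq_llm_predict and bark_tts from the new code above are importable.
state = {"conversation": []}

transcript = whisper_stt("answer.wav")                 # speech -> text ("" if the file is missing)
state["conversation"].append({"role": "Candidate", "content": transcript})

prompt = "\n".join(f"{t['role']}: {t['content']}" for t in state["conversation"])
next_q = groq_llm_predict(prompt)                      # mock stub from the diff, not the real Groq call
state["conversation"].append({"role": "Interviewer", "content": next_q})

wav_path = bark_tts(next_q)                            # text -> path of a temporary WAV file
print(transcript, next_q, wav_path, sep="\n")
```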