Update app.py
app.py
CHANGED
@@ -1448,43 +1448,435 @@ def extract_candidate_details(file_path):
1448         "skills": skills
1449     }
1450
1451    import gradio as gr
1452    import time
1453    import tempfile
1454    import numpy as np
1455    import scipy.io.wavfile as wavfile
1456 -  import cv2
1457    import os
1458 -  import json
1459 -  from moviepy.editor import VideoFileClip
1460 -  import shutil
1461 -  from transformers import BarkModel, AutoProcessor
1462 -  import torch, gc
1463 -  import whisper
1464 -  from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
1465 -  import librosa
1466 -
1467    import torch
1468 -  print(torch.cuda.is_available())  # ✅ Tells you if GPU is available
1469 -  torch.cuda.empty_cache()
1470 -  gc.collect()
1471 -
1472 -
1473 -  # Bark TTS
1474 -  print("🔁 Loading Bark model...")
1475 -  model_bark = BarkModel.from_pretrained("suno/bark")
1476 -  print("✅ Bark model loaded")
1477
1478 -  print("🔁 Loading Bark processor...")
1479    processor_bark = AutoProcessor.from_pretrained("suno/bark")
1480 -  print("✅ Bark processor loaded")
1481 -  print("🔁 Moving Bark model to device...")
1482 -  model_bark.to("cuda" if torch.cuda.is_available() else "cpu")
1483 -  print("✅ Bark model on device")
1484    bark_voice_preset = "v2/en_speaker_6"
1485
1486    def bark_tts(text):
1487 -      print(f"🔁 Synthesizing TTS for: {text}")
1488        inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1489        inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
1490        speech_values = model_bark.generate(**inputs)
@@ -1494,366 +1886,34 @@ def bark_tts(text):
1494        wavfile.write(temp_wav.name, 22050, speech)
1495        return temp_wav.name
1496
1497 -  # Whisper STT
1498 -  print("🔁 Loading Whisper model...")
1499 -  whisper_model = whisper.load_model("base", device="cuda")
1500 -  print("✅ Whisper model loaded")
1501    def whisper_stt(audio_path):
1502 -      if not audio_path or not os.path.exists(audio_path): return ""
1503        result = whisper_model.transcribe(audio_path)
1504        return result["text"]
1505
1506
1507 -  # DeepFace (Video Face Emotion)
1508 -  def ensure_mp4(video_input):
1509 -      # video_input could be a file-like object, a path, or a Gradio temp path
1510 -      if isinstance(video_input, str):
1511 -          input_path = video_input
1512 -      else:
1513 -          # It's a file-like object (rare for Gradio video, but handle it)
1514 -          with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_in:
1515 -              temp_in.write(video_input.read())
1516 -              input_path = temp_in.name
1517 -
1518 -      # If already mp4, return as is
1519 -      if input_path.endswith(".mp4"):
1520 -          return input_path
1521
1522 -      # Convert to mp4 using moviepy
1523 -      mp4_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
1524 -      try:
1525 -          clip = VideoFileClip(input_path)
1526 -          clip.write_videofile(mp4_path, codec="libx264", audio=False, verbose=False, logger=None)
1527 -          clip.close()
1528 -      except Exception as e:
1529 -          print("Video conversion failed:", e)
1530 -          # As fallback, just copy original
1531 -          shutil.copy(input_path, mp4_path)
1532 -      return mp4_path
1533 -
1534 -  def analyze_video_emotions(video_input, sample_rate=15):
1535 -      # Convert input to an mp4 file OpenCV can process
1536 -      mp4_path = ensure_mp4(video_input)
1537 -      if not mp4_path or not os.path.exists(mp4_path):
1538 -          return "no_face"
1539 -      cap = cv2.VideoCapture(mp4_path)
1540 -      frame_count = 0
1541 -      emotion_counts = {}
1542 -      while True:
1543 -          ret, frame = cap.read()
1544 -          if not ret: break
1545 -          if frame_count % sample_rate == 0:
1546 -              try:
1547 -                  result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
1548 -                  dominant = result[0]["dominant_emotion"] if isinstance(result, list) else result["dominant_emotion"]
1549 -                  emotion_counts[dominant] = emotion_counts.get(dominant, 0) + 1
1550 -              except Exception: pass
1551 -          frame_count += 1
1552 -      cap.release()
1553 -      if not emotion_counts: return "no_face"
1554 -      return max(emotion_counts.items(), key=lambda x: x[1])[0]
1555 -
1556 -  # Original Hugging Face model: HaniaRuby/speech-emotion-recognition-wav2vec2
1557 -  local_wav2vec_model_path = "HaniaRuby/speech-emotion-recognition-wav2vec2"  # Local path to the downloaded model files
1558 -  print("🔁 Loading Wav2Vec processor and model...")
1559 -  wav2vec_processor = Wav2Vec2Processor.from_pretrained(local_wav2vec_model_path)
1560 -  wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(local_wav2vec_model_path)
1561 -  wav2vec_model = wav2vec_model.to("cuda" if torch.cuda.is_available() else "cpu")
1562 -  print("✅ Wav2Vec model loaded")
1563 -  wav2vec_model.eval()
1564 -  voice_label_map = {
1565 -      0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy',
1566 -      4: 'neutral', 5: 'sad', 6: 'surprise'
1567 -  }
1568
1569
1570
1571 -  def analyze_audio_emotion(audio_path):
1572 -      print(f"🔁 Analyzing audio emotion for: {audio_path}")
1573 -      if not audio_path or not os.path.exists(audio_path): return "neutral"
1574 -
1575 -      speech, sr = librosa.load(audio_path, sr=16000)
1576 -      inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
1577 -
1578 -      # 🔥 Move model and inputs to GPU
1579 -      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1580 -      wav2vec_model.to(device)
1581 -      inputs = {k: v.to(device) for k, v in inputs.items()}
1582 -
1583 -      with torch.no_grad():
1584 -          logits = wav2vec_model(**inputs).logits
1585 -
1586 -      probs = torch.nn.functional.softmax(logits, dim=-1)
1587 -      predicted_id = torch.argmax(probs, dim=-1).item()
1588 -      return voice_label_map.get(predicted_id, "neutral")
1589 -
1590 -
1591 -  # --- Effective confidence calculation
1592 -  def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
1593 -      emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4, "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
1594 -      answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
1595 -      voice_score, face_score, answer_score = emotion_map.get(voice_label, 0.5), emotion_map.get(face_label, 0.5), answer_score_map.get(answer_score_label, 0.5)
1596 -      avg_emotion = (voice_score + face_score) / 2
1597 -      control_bonus = max(0, answer_score - avg_emotion) * k
1598 -      eff_conf = (0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus)
1599 -      return {"effective_confidence": round(eff_conf, 3), "answer_score": round(answer_score, 2), "voice_score": round(voice_score, 2), "face_score": round(face_score, 2), "control_bonus": round(control_bonus, 3)}
1600 -
1601 -  seniority_mapping = {
1602 -      "Entry-level": 1, "Junior": 2, "Mid-Level": 3, "Senior": 4, "Lead": 5
1603 -  }
1604 -
1605 -
1606 -  # --- 2. Gradio App ---
1607 -
1608 -  with gr.Blocks(theme=gr.themes.Soft()) as demo:
1609 -      user_data = gr.State({})
1610 -      interview_state = gr.State({})
1611 -      missing_fields_state = gr.State([])
1612 -
1613 -      # --- UI Layout ---
1614 -      with gr.Column(visible=True) as user_info_section:
1615 -          gr.Markdown("## Candidate Information")
1616 -          cv_file = gr.File(label="Upload CV")
1617 -          job_desc = gr.Textbox(label="Job Description")
1618 -          start_btn = gr.Button("Continue", interactive=False)
1619 -
1620 -      with gr.Column(visible=False) as missing_section:
1621 -          gr.Markdown("## Missing Information")
1622 -          name_in = gr.Textbox(label="Name", visible=False)
1623 -          role_in = gr.Textbox(label="Job Role", visible=False)
1624 -          seniority_in = gr.Dropdown(list(seniority_mapping.keys()), label="Seniority", visible=False)
1625 -          skills_in = gr.Textbox(label="Skills", visible=False)
1626 -          submit_btn = gr.Button("Submit", interactive=False)
1627 -
1628 -      with gr.Column(visible=False) as interview_pre_section:
1629 -          pre_interview_greeting_md = gr.Markdown()
1630 -          start_interview_final_btn = gr.Button("Start Interview")
1631 -
1632 -      with gr.Column(visible=False) as interview_section:
1633 -          gr.Markdown("## Interview in Progress")
1634 -          question_audio = gr.Audio(label="Listen", interactive=False, autoplay=True)
1635 -          question_text = gr.Markdown()
1636 -          user_audio_input = gr.Audio(sources=["microphone"], type="filepath", label="1. Record Audio Answer")
1637 -          user_video_input = gr.Video(sources=["webcam"], label="2. Record Video Answer")
1638 -          stt_transcript = gr.Textbox(label="Transcribed Answer (edit if needed)")
1639 -          confirm_btn = gr.Button("Confirm Answer")
1640 -          evaluation_display = gr.Markdown()
1641 -          emotion_display = gr.Markdown()
1642 -          interview_summary = gr.Markdown(visible=False)
1643 -
1644 -      # --- UI Logic ---
1645 -
1646 -      def validate_start_btn(cv_file, job_desc):
1647 -          return gr.update(interactive=(cv_file is not None and hasattr(cv_file, "name") and bool(job_desc and job_desc.strip())))
1648 -      cv_file.change(validate_start_btn, [cv_file, job_desc], start_btn)
1649 -      job_desc.change(validate_start_btn, [cv_file, job_desc], start_btn)
1650 -
1651 -      def process_and_route_initial(cv_file, job_desc):
1652 -          details = extract_candidate_details(cv_file.name)
1653 -          job_info = extract_job_details(job_desc)
1654 -          data = {
1655 -              "name": details.get("name", "unknown"), "job_role": job_info.get("job_title", "unknown"),
1656 -              "seniority": job_info.get("experience_level", "unknown"), "skills": job_info.get("skills", [])
1657 -          }
1658 -          missing = [k for k, v in data.items() if (isinstance(v, str) and v.lower() == "unknown") or not v]
1659 -          if missing:
1660 -              return data, missing, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
1661 -          else:
1662 -              greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' when ready."
1663 -              return data, missing, gr.update(visible=False), gr.update(visible=False), gr.update(visible=True, value=greeting)
1664 -      start_btn.click(
1665 -          process_and_route_initial,
1666 -          [cv_file, job_desc],
1667 -          [user_data, missing_fields_state, user_info_section, missing_section, pre_interview_greeting_md]
1668 -      )
1669 -
1670 -      def show_missing(missing):
1671 -          if missing is None: missing = []
1672 -          return gr.update(visible="name" in missing), gr.update(visible="job_role" in missing), gr.update(visible="seniority" in missing), gr.update(visible="skills" in missing)
1673 -      missing_fields_state.change(show_missing, missing_fields_state, [name_in, role_in, seniority_in, skills_in])
1674 -
1675 -      def validate_fields(name, role, seniority, skills, missing):
1676 -          if not missing: return gr.update(interactive=False)
1677 -          all_filled = all([(not ("name" in missing) or bool(name.strip())), (not ("job_role" in missing) or bool(role.strip())), (not ("seniority" in missing) or bool(seniority)), (not ("skills" in missing) or bool(skills.strip())),])
1678 -          return gr.update(interactive=all_filled)
1679 -      for inp in [name_in, role_in, seniority_in, skills_in]:
1680 -          inp.change(validate_fields, [name_in, role_in, seniority_in, skills_in, missing_fields_state], submit_btn)
1681 -
1682 -      def complete_manual(data, name, role, seniority, skills):
1683 -          if data["name"].lower() == "unknown": data["name"] = name
1684 -          if data["job_role"].lower() == "unknown": data["job_role"] = role
1685 -          if data["seniority"].lower() == "unknown": data["seniority"] = seniority
1686 -          if not data["skills"]: data["skills"] = [s.strip() for s in skills.split(",")]
1687 -          greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' to begin."
1688 -          return data, gr.update(visible=False), gr.update(visible=True), gr.update(value=greeting)
1689 -      submit_btn.click(complete_manual, [user_data, name_in, role_in, seniority_in, skills_in], [user_data, missing_section, interview_pre_section, pre_interview_greeting_md])
1690 -
1691 -      def start_interview(data):
1692 -          # --- Advanced state with full logging ---
1693 -          state = {
1694 -              "questions": [], "answers": [], "face_labels": [], "voice_labels": [], "timings": [],
1695 -              "question_evaluations": [], "answer_evaluations": [], "effective_confidences": [],
1696 -              "conversation_history": [],
1697 -              "difficulty_adjustment": None,
1698 -              "question_idx": 0, "max_questions": 3, "q_start_time": time.time(),
1699 -              "log": []
1700 -          }
1701 -          # --- Optionally: context retrieval here (currently just blank) ---
1702 -          context = ""
1703 -          prompt = build_interview_prompt(
1704 -              conversation_history=[], user_response="", context=context, job_role=data["job_role"],
1705 -              skills=data["skills"], seniority=data["seniority"], difficulty_adjustment=None,
1706 -              voice_label="neutral", face_label="neutral"
1707 -          )
1708 -          #here the original one
1709 -          # first_q = groq_llm.predict(prompt)
1710 -          # # Evaluate Q for quality
1711 -          # q_eval = eval_question_quality(first_q, data["job_role"], data["seniority"], None)
1712 -          # state["questions"].append(first_q)
1713 -          # state["question_evaluations"].append(q_eval)
1714 -
1715 -          #here the testing one
1716 -          first_q = groq_llm.predict(prompt)
1717 -          q_eval = {
1718 -              "Score": "N/A",
1719 -              "Reasoning": "Skipped to reduce processing time",
1720 -              "Improvements": []
1721 -          }
1722 -          state["questions"].append(first_q)
1723 -          state["question_evaluations"].append(q_eval)
1724 -
1725 -
1726 -          state["conversation_history"].append({'role': 'Interviewer', 'content': first_q})
1727 -          audio_path = bark_tts(first_q)
1728 -          # LOG
1729 -          state["log"].append({"type": "question", "question": first_q, "question_eval": q_eval, "timestamp": time.time()})
1730 -          return state, gr.update(visible=False), gr.update(visible=True), audio_path, f"*Question 1:* {first_q}"
1731 -      start_interview_final_btn.click(start_interview, [user_data], [interview_state, interview_pre_section, interview_section, question_audio, question_text])
1732 -
1733 -      def transcribe(audio_path):
1734 -          return whisper_stt(audio_path)
1735 -      user_audio_input.change(transcribe, user_audio_input, stt_transcript)
1736 -
1737 -      def process_answer(transcript, audio_path, video_path, state, data):
1738 -          if not transcript and not video_path:
1739 -              return state, gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
1740 -          elapsed = round(time.time() - state.get("q_start_time", time.time()), 2)
1741 -          state["timings"].append(elapsed)
1742 -          state["answers"].append(transcript)
1743 -          state["conversation_history"].append({'role': 'Candidate', 'content': transcript})
1744 -
1745 -          # --- 1. Emotion analysis ---
1746 -          # voice_label = analyze_audio_emotion(audio_path)
1747 -          # face_label = analyze_video_emotions(video_path)
1748 -          # state["voice_labels"].append(voice_label)
1749 -          # state["face_labels"].append(face_label)
1750 -
1751 -          #just for testing
1752 -          voice_label = "neutral"
1753 -          face_label = "neutral"
1754 -          state["voice_labels"].append(voice_label)
1755 -          state["face_labels"].append(face_label)
1756 -
1757 -
1758 -
1759 -          # --- 2. Evaluate previous Q and Answer ---
1760 -          last_q = state["questions"][-1]
1761 -          q_eval = state["question_evaluations"][-1]  # Already in state
1762 -          ref_answer = generate_reference_answer(last_q, data["job_role"], data["seniority"])
1763 -          answer_eval = evaluate_answer(last_q, transcript, ref_answer, data["job_role"], data["seniority"], None)
1764 -          state["answer_evaluations"].append(answer_eval)
1765 -          answer_score = answer_eval.get("Score", "medium") if answer_eval else "medium"
1766 -
1767 -          # --- 3. Adaptive difficulty ---
1768 -          if answer_score == "excellent":
1769 -              state["difficulty_adjustment"] = "harder"
1770 -          elif answer_score in ("medium", "poor"):
1771 -              state["difficulty_adjustment"] = "easier"
1772 -          else:
1773 -              state["difficulty_adjustment"] = None
1774 -
1775 -          # --- 4. Effective confidence ---
1776 -          # eff_conf = interpret_confidence(voice_label, face_label, answer_score)
1777 -          # state["effective_confidences"].append(eff_conf)
1778 -
1779 -          #just for testing:
1780 -          eff_conf = {"effective_confidence": 0.6}
1781 -          state["effective_confidences"].append(eff_conf)
1782 -
1783 -
1784 -          # --- LOG ---
1785 -          state["log"].append({
1786 -              "type": "answer",
1787 -              "question": last_q,
1788 -              "answer": transcript,
1789 -              "answer_eval": answer_eval,
1790 -              "ref_answer": ref_answer,
1791 -              "face_label": face_label,
1792 -              "voice_label": voice_label,
1793 -              "effective_confidence": eff_conf,
1794 -              "timing": elapsed,
1795 -              "timestamp": time.time()
1796 -          })
1797 -
1798 -          # --- Next or End ---
1799 -          qidx = state["question_idx"] + 1
1800 -          if qidx >= state["max_questions"]:
1801 -              # Save as JSON (optionally)
1802 -              timestamp = time.strftime("%Y%m%d_%H%M%S")
1803 -              log_file = f"interview_log_{timestamp}.json"
1804 -              with open(log_file, "w", encoding="utf-8") as f:
1805 -                  json.dump(state["log"], f, indent=2, ensure_ascii=False)
1806 -              # Report
1807 -              summary = "# Interview Summary\n"
1808 -              for i, q in enumerate(state["questions"]):
1809 -                  summary += (f"\n### Q{i + 1}: {q}\n"
1810 -                              f"- *Answer*: {state['answers'][i]}\n"
1811 -                              f"- *Q Eval*: {state['question_evaluations'][i]}\n"
1812 -                              f"- *A Eval*: {state['answer_evaluations'][i]}\n"
1813 -                              #also this are removed just for testing :(
1814 -                              # f"- *Face Emotion: {state['face_labels'][i]}, **Voice Emotion*: {state['voice_labels'][i]}\n"
1815 -                              # f"- *Effective Confidence*: {state['effective_confidences'][i]['effective_confidence']}\n"
1816 -                              f"- *Time*: {state['timings'][i]}s\n")
1817 -              summary += f"\n\n⏺ Full log saved as {log_file}."
1818 -              return (state, gr.update(visible=True, value=summary), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"))
1819 -          else:
1820 -              # --- Build next prompt using adaptive difficulty ---
1821 -              state["question_idx"] = qidx
1822 -              state["q_start_time"] = time.time()
1823 -              context = ""  # You can add your context logic here
1824 -              prompt = build_interview_prompt(
1825 -                  conversation_history=state["conversation_history"],
1826 -                  user_response=transcript,
1827 -                  context=context,
1828 -                  job_role=data["job_role"],
1829 -                  skills=data["skills"],
1830 -                  seniority=data["seniority"],
1831 -                  difficulty_adjustment=state["difficulty_adjustment"],
1832 -                  face_label=face_label,
1833 -                  voice_label=voice_label,
1834 -                  effective_confidence=eff_conf
1835 -              )
1836 -              next_q = groq_llm.predict(prompt)
1837 -              # Evaluate Q quality
1838 -              q_eval = eval_question_quality(next_q, data["job_role"], data["seniority"], None)
1839 -              state["questions"].append(next_q)
1840 -              state["question_evaluations"].append(q_eval)
1841 -              state["conversation_history"].append({'role': 'Interviewer', 'content': next_q})
1842 -              state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
1843 -              audio_path = bark_tts(next_q)
1844 -              # Display evaluations
1845 -              eval_md = f"*Last Answer Eval:* {answer_eval}\n\n*Effective Confidence:* {eff_conf}"
1846 -              return (
1847 -                  state, gr.update(visible=False), audio_path, f"*Question {qidx + 1}:* {next_q}",
1848 -                  gr.update(value=None), gr.update(value=None),
1849 -                  gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"),
1850 -              )
1851 -      confirm_btn.click(
1852 -          process_answer,
1853 -          [stt_transcript, user_audio_input, user_video_input, interview_state, user_data],
1854 -          [interview_state, interview_summary, question_audio, question_text, user_audio_input, user_video_input, emotion_display]
1855 -      ).then(
1856 -          lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, user_video_input]
1857 -      )
1858
1859    demo.launch(debug=True)
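The weighting applied by the removed interpret_confidence helper is easy to check in isolation. A minimal standalone sketch: the label maps and the 0.5/0.22/0.18/0.1 weights are copied from the deleted lines above, while the example inputs and the scalar return value are illustrative only.

```python
# Standalone sketch of the confidence weighting from the removed
# interpret_confidence() helper. Label maps and weights come from the
# deleted code above; only the combined score is returned here.
def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
    emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4,
                   "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
    answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
    voice_score = emotion_map.get(voice_label, 0.5)
    face_score = emotion_map.get(face_label, 0.5)
    answer_score = answer_score_map.get(answer_score_label, 0.5)
    # Bonus when the answer quality exceeds the average emotional signal.
    control_bonus = max(0, answer_score - (voice_score + face_score) / 2) * k
    eff_conf = 0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus
    return round(eff_conf, 3)

# Example (illustrative): a "good" answer, neutral voice, happy face
# -> 0.5*0.8 + 0.22*0.6 + 0.18*0.9 + 0.1*0.01 = 0.695
print(interpret_confidence("neutral", "happy", "good"))  # 0.695
```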
1448         "skills": skills
1449     }
1450
1451 +  # import gradio as gr
1452 +  # import time
1453 +  # import tempfile
1454 +  # import numpy as np
1455 +  # import scipy.io.wavfile as wavfile
1456 +  # import cv2
1457 +  # import os
1458 +  # import json
1459 +  # from moviepy.editor import VideoFileClip
1460 +  # import shutil
1461 +  # from transformers import BarkModel, AutoProcessor
1462 +  # import torch, gc
1463 +  # import whisper
1464 +  # from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
1465 +  # import librosa
1466 +
1467 +  # import torch
1468 +  # print(torch.cuda.is_available())  # ✅ Tells you if GPU is available
1469 +  # torch.cuda.empty_cache()
1470 +  # gc.collect()
1471 +
1472 +
1473 +  # # Bark TTS
1474 +  # print("🔁 Loading Bark model...")
1475 +  # model_bark = BarkModel.from_pretrained("suno/bark")
1476 +  # print("✅ Bark model loaded")
1477 +
1478 +  # print("🔁 Loading Bark processor...")
1479 +  # processor_bark = AutoProcessor.from_pretrained("suno/bark")
1480 +  # print("✅ Bark processor loaded")
1481 +  # print("🔁 Moving Bark model to device...")
1482 +  # model_bark.to("cuda" if torch.cuda.is_available() else "cpu")
1483 +  # print("✅ Bark model on device")
1484 +  # bark_voice_preset = "v2/en_speaker_6"
1485 +
1486 +  # def bark_tts(text):
1487 +  #     print(f"🔁 Synthesizing TTS for: {text}")
1488 +  #     inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1489 +  #     inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
1490 +  #     speech_values = model_bark.generate(**inputs)
1491 +  #     speech = speech_values.cpu().numpy().squeeze()
1492 +  #     speech = (speech * 32767).astype(np.int16)
1493 +  #     temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
1494 +  #     wavfile.write(temp_wav.name, 22050, speech)
1495 +  #     return temp_wav.name
1496 +
1497 +  # # Whisper STT
1498 +  # print("🔁 Loading Whisper model...")
1499 +  # whisper_model = whisper.load_model("base", device="cuda")
1500 +  # print("✅ Whisper model loaded")
1501 +  # def whisper_stt(audio_path):
1502 +  #     if not audio_path or not os.path.exists(audio_path): return ""
1503 +  #     result = whisper_model.transcribe(audio_path)
1504 +  #     return result["text"]
1505 +
1506 +
1507 +  # # DeepFace (Video Face Emotion)
1508 +  # def ensure_mp4(video_input):
1509 +  #     # video_input could be a file-like object, a path, or a Gradio temp path
1510 +  #     if isinstance(video_input, str):
1511 +  #         input_path = video_input
1512 +  #     else:
1513 +  #         # It's a file-like object (rare for Gradio video, but handle it)
1514 +  #         with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as temp_in:
1515 +  #             temp_in.write(video_input.read())
1516 +  #             input_path = temp_in.name
1517 +
1518 +  #     # If already mp4, return as is
1519 +  #     if input_path.endswith(".mp4"):
1520 +  #         return input_path
1521 +
1522 +  #     # Convert to mp4 using moviepy
1523 +  #     mp4_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
1524 +  #     try:
1525 +  #         clip = VideoFileClip(input_path)
1526 +  #         clip.write_videofile(mp4_path, codec="libx264", audio=False, verbose=False, logger=None)
1527 +  #         clip.close()
1528 +  #     except Exception as e:
1529 +  #         print("Video conversion failed:", e)
1530 +  #         # As fallback, just copy original
1531 +  #         shutil.copy(input_path, mp4_path)
1532 +  #     return mp4_path
1533 +
1534 +  # def analyze_video_emotions(video_input, sample_rate=15):
1535 +  #     # Convert input to an mp4 file OpenCV can process
1536 +  #     mp4_path = ensure_mp4(video_input)
1537 +  #     if not mp4_path or not os.path.exists(mp4_path):
1538 +  #         return "no_face"
1539 +  #     cap = cv2.VideoCapture(mp4_path)
1540 +  #     frame_count = 0
1541 +  #     emotion_counts = {}
1542 +  #     while True:
1543 +  #         ret, frame = cap.read()
1544 +  #         if not ret: break
1545 +  #         if frame_count % sample_rate == 0:
1546 +  #             try:
1547 +  #                 result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
1548 +  #                 dominant = result[0]["dominant_emotion"] if isinstance(result, list) else result["dominant_emotion"]
1549 +  #                 emotion_counts[dominant] = emotion_counts.get(dominant, 0) + 1
1550 +  #             except Exception: pass
1551 +  #         frame_count += 1
1552 +  #     cap.release()
1553 +  #     if not emotion_counts: return "no_face"
1554 +  #     return max(emotion_counts.items(), key=lambda x: x[1])[0]
1555 +
1556 +  # # Original Hugging Face model: HaniaRuby/speech-emotion-recognition-wav2vec2
1557 +  # local_wav2vec_model_path = "HaniaRuby/speech-emotion-recognition-wav2vec2"  # Local path to the downloaded model files
1558 +  # print("🔁 Loading Wav2Vec processor and model...")
1559 +  # wav2vec_processor = Wav2Vec2Processor.from_pretrained(local_wav2vec_model_path)
1560 +  # wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(local_wav2vec_model_path)
1561 +  # wav2vec_model = wav2vec_model.to("cuda" if torch.cuda.is_available() else "cpu")
1562 +  # print("✅ Wav2Vec model loaded")
1563 +  # wav2vec_model.eval()
1564 +  # voice_label_map = {
1565 +  #     0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy',
1566 +  #     4: 'neutral', 5: 'sad', 6: 'surprise'
1567 +  # }
1568 +
1569 +
1570 +
1571 +  # def analyze_audio_emotion(audio_path):
1572 +  #     print(f"🔁 Analyzing audio emotion for: {audio_path}")
1573 +  #     if not audio_path or not os.path.exists(audio_path): return "neutral"
1574 +
1575 +  #     speech, sr = librosa.load(audio_path, sr=16000)
1576 +  #     inputs = wav2vec_processor(speech, sampling_rate=16000, return_tensors="pt")
1577 +
1578 +  #     # 🔥 Move model and inputs to GPU
1579 +  #     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1580 +  #     wav2vec_model.to(device)
1581 +  #     inputs = {k: v.to(device) for k, v in inputs.items()}
1582 +
1583 +  #     with torch.no_grad():
1584 +  #         logits = wav2vec_model(**inputs).logits
1585 +
1586 +  #     probs = torch.nn.functional.softmax(logits, dim=-1)
1587 +  #     predicted_id = torch.argmax(probs, dim=-1).item()
1588 +  #     return voice_label_map.get(predicted_id, "neutral")
1589 +
1590 +
1591 +  # # --- Effective confidence calculation
1592 +  # def interpret_confidence(voice_label, face_label, answer_score_label, k=0.2):
1593 +  #     emotion_map = {"happy": 0.9, "neutral": 0.6, "surprised": 0.7, "sad": 0.4, "angry": 0.3, "disgust": 0.2, "fear": 0.3, "no_face": 0.5, "unknown": 0.5}
1594 +  #     answer_score_map = {"excellent": 1.0, "good": 0.8, "medium": 0.6, "poor": 0.3}
1595 +  #     voice_score, face_score, answer_score = emotion_map.get(voice_label, 0.5), emotion_map.get(face_label, 0.5), answer_score_map.get(answer_score_label, 0.5)
1596 +  #     avg_emotion = (voice_score + face_score) / 2
1597 +  #     control_bonus = max(0, answer_score - avg_emotion) * k
1598 +  #     eff_conf = (0.5 * answer_score + 0.22 * voice_score + 0.18 * face_score + 0.1 * control_bonus)
1599 +  #     return {"effective_confidence": round(eff_conf, 3), "answer_score": round(answer_score, 2), "voice_score": round(voice_score, 2), "face_score": round(face_score, 2), "control_bonus": round(control_bonus, 3)}
1600 +
1601 +  # seniority_mapping = {
1602 +  #     "Entry-level": 1, "Junior": 2, "Mid-Level": 3, "Senior": 4, "Lead": 5
1603 +  # }
1604 +
1605 +
1606 +  # # --- 2. Gradio App ---
1607 +
1608 +  # with gr.Blocks(theme=gr.themes.Soft()) as demo:
1609 +  #     user_data = gr.State({})
1610 +  #     interview_state = gr.State({})
1611 +  #     missing_fields_state = gr.State([])
1612 +
1613 +  #     # --- UI Layout ---
1614 +  #     with gr.Column(visible=True) as user_info_section:
1615 +  #         gr.Markdown("## Candidate Information")
1616 +  #         cv_file = gr.File(label="Upload CV")
1617 +  #         job_desc = gr.Textbox(label="Job Description")
1618 +  #         start_btn = gr.Button("Continue", interactive=False)
1619 +
1620 +  #     with gr.Column(visible=False) as missing_section:
1621 +  #         gr.Markdown("## Missing Information")
1622 +  #         name_in = gr.Textbox(label="Name", visible=False)
1623 +  #         role_in = gr.Textbox(label="Job Role", visible=False)
1624 +  #         seniority_in = gr.Dropdown(list(seniority_mapping.keys()), label="Seniority", visible=False)
1625 +  #         skills_in = gr.Textbox(label="Skills", visible=False)
1626 +  #         submit_btn = gr.Button("Submit", interactive=False)
1627 +
1628 +  #     with gr.Column(visible=False) as interview_pre_section:
1629 +  #         pre_interview_greeting_md = gr.Markdown()
1630 +  #         start_interview_final_btn = gr.Button("Start Interview")
1631 +
1632 +  #     with gr.Column(visible=False) as interview_section:
1633 +  #         gr.Markdown("## Interview in Progress")
1634 +  #         question_audio = gr.Audio(label="Listen", interactive=False, autoplay=True)
1635 +  #         question_text = gr.Markdown()
1636 +  #         user_audio_input = gr.Audio(sources=["microphone"], type="filepath", label="1. Record Audio Answer")
1637 +  #         user_video_input = gr.Video(sources=["webcam"], label="2. Record Video Answer")
1638 +  #         stt_transcript = gr.Textbox(label="Transcribed Answer (edit if needed)")
1639 +  #         confirm_btn = gr.Button("Confirm Answer")
1640 +  #         evaluation_display = gr.Markdown()
1641 +  #         emotion_display = gr.Markdown()
1642 +  #         interview_summary = gr.Markdown(visible=False)
1643 +
1644 +  #     # --- UI Logic ---
1645 +
1646 +  #     def validate_start_btn(cv_file, job_desc):
1647 +  #         return gr.update(interactive=(cv_file is not None and hasattr(cv_file, "name") and bool(job_desc and job_desc.strip())))
1648 +  #     cv_file.change(validate_start_btn, [cv_file, job_desc], start_btn)
1649 +  #     job_desc.change(validate_start_btn, [cv_file, job_desc], start_btn)
1650 +
1651 +  #     def process_and_route_initial(cv_file, job_desc):
1652 +  #         details = extract_candidate_details(cv_file.name)
1653 +  #         job_info = extract_job_details(job_desc)
1654 +  #         data = {
1655 +  #             "name": details.get("name", "unknown"), "job_role": job_info.get("job_title", "unknown"),
1656 +  #             "seniority": job_info.get("experience_level", "unknown"), "skills": job_info.get("skills", [])
1657 +  #         }
1658 +  #         missing = [k for k, v in data.items() if (isinstance(v, str) and v.lower() == "unknown") or not v]
1659 +  #         if missing:
1660 +  #             return data, missing, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
1661 +  #         else:
1662 +  #             greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' when ready."
1663 +  #             return data, missing, gr.update(visible=False), gr.update(visible=False), gr.update(visible=True, value=greeting)
1664 +  #     start_btn.click(
1665 +  #         process_and_route_initial,
1666 +  #         [cv_file, job_desc],
1667 +  #         [user_data, missing_fields_state, user_info_section, missing_section, pre_interview_greeting_md]
1668 +  #     )
1669 +
1670 +  #     def show_missing(missing):
1671 +  #         if missing is None: missing = []
1672 +  #         return gr.update(visible="name" in missing), gr.update(visible="job_role" in missing), gr.update(visible="seniority" in missing), gr.update(visible="skills" in missing)
1673 +  #     missing_fields_state.change(show_missing, missing_fields_state, [name_in, role_in, seniority_in, skills_in])
1674 +
1675 +  #     def validate_fields(name, role, seniority, skills, missing):
1676 +  #         if not missing: return gr.update(interactive=False)
1677 +  #         all_filled = all([(not ("name" in missing) or bool(name.strip())), (not ("job_role" in missing) or bool(role.strip())), (not ("seniority" in missing) or bool(seniority)), (not ("skills" in missing) or bool(skills.strip())),])
1678 +  #         return gr.update(interactive=all_filled)
1679 +  #     for inp in [name_in, role_in, seniority_in, skills_in]:
1680 +  #         inp.change(validate_fields, [name_in, role_in, seniority_in, skills_in, missing_fields_state], submit_btn)
1681 +
1682 +  #     def complete_manual(data, name, role, seniority, skills):
1683 +  #         if data["name"].lower() == "unknown": data["name"] = name
1684 +  #         if data["job_role"].lower() == "unknown": data["job_role"] = role
1685 +  #         if data["seniority"].lower() == "unknown": data["seniority"] = seniority
1686 +  #         if not data["skills"]: data["skills"] = [s.strip() for s in skills.split(",")]
1687 +  #         greeting = f"Hello {data['name']}, your profile is ready. Click 'Start Interview' to begin."
1688 +  #         return data, gr.update(visible=False), gr.update(visible=True), gr.update(value=greeting)
1689 +  #     submit_btn.click(complete_manual, [user_data, name_in, role_in, seniority_in, skills_in], [user_data, missing_section, interview_pre_section, pre_interview_greeting_md])
1690 +
1691 +  #     def start_interview(data):
1692 +  #         # --- Advanced state with full logging ---
1693 +  #         state = {
1694 +  #             "questions": [], "answers": [], "face_labels": [], "voice_labels": [], "timings": [],
1695 +  #             "question_evaluations": [], "answer_evaluations": [], "effective_confidences": [],
1696 +  #             "conversation_history": [],
1697 +  #             "difficulty_adjustment": None,
1698 +  #             "question_idx": 0, "max_questions": 3, "q_start_time": time.time(),
1699 +  #             "log": []
1700 +  #         }
1701 +  #         # --- Optionally: context retrieval here (currently just blank) ---
1702 +  #         context = ""
1703 +  #         prompt = build_interview_prompt(
1704 +  #             conversation_history=[], user_response="", context=context, job_role=data["job_role"],
1705 +  #             skills=data["skills"], seniority=data["seniority"], difficulty_adjustment=None,
1706 +  #             voice_label="neutral", face_label="neutral"
1707 +  #         )
1708 +  #         #here the original one
1709 +  #         # first_q = groq_llm.predict(prompt)
1710 +  #         # # Evaluate Q for quality
1711 +  #         # q_eval = eval_question_quality(first_q, data["job_role"], data["seniority"], None)
1712 +  #         # state["questions"].append(first_q)
1713 +  #         # state["question_evaluations"].append(q_eval)
1714 +
1715 +  #         #here the testing one
1716 +  #         first_q = groq_llm.predict(prompt)
1717 +  #         q_eval = {
1718 +  #             "Score": "N/A",
1719 +  #             "Reasoning": "Skipped to reduce processing time",
1720 +  #             "Improvements": []
1721 +  #         }
1722 +  #         state["questions"].append(first_q)
1723 +  #         state["question_evaluations"].append(q_eval)
1724 +
1725 +
1726 +  #         state["conversation_history"].append({'role': 'Interviewer', 'content': first_q})
1727 +  #         audio_path = bark_tts(first_q)
1728 +  #         # LOG
1729 +  #         state["log"].append({"type": "question", "question": first_q, "question_eval": q_eval, "timestamp": time.time()})
1730 +  #         return state, gr.update(visible=False), gr.update(visible=True), audio_path, f"*Question 1:* {first_q}"
1731 +  #     start_interview_final_btn.click(start_interview, [user_data], [interview_state, interview_pre_section, interview_section, question_audio, question_text])
1732 +
1733 +  #     def transcribe(audio_path):
1734 +  #         return whisper_stt(audio_path)
1735 +  #     user_audio_input.change(transcribe, user_audio_input, stt_transcript)
1736 +
1737 +  #     def process_answer(transcript, audio_path, video_path, state, data):
1738 +  #         if not transcript and not video_path:
1739 +  #             return state, gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
1740 +  #         elapsed = round(time.time() - state.get("q_start_time", time.time()), 2)
1741 +  #         state["timings"].append(elapsed)
1742 +  #         state["answers"].append(transcript)
1743 +  #         state["conversation_history"].append({'role': 'Candidate', 'content': transcript})
1744 +
1745 +  #         # --- 1. Emotion analysis ---
1746 +  #         # voice_label = analyze_audio_emotion(audio_path)
1747 +  #         # face_label = analyze_video_emotions(video_path)
1748 +  #         # state["voice_labels"].append(voice_label)
1749 +  #         # state["face_labels"].append(face_label)
1750 +
1751 +  #         #just for testing
1752 +  #         voice_label = "neutral"
1753 +  #         face_label = "neutral"
1754 +  #         state["voice_labels"].append(voice_label)
1755 +  #         state["face_labels"].append(face_label)
1756 +
1757 +
1758 +
1759 +  #         # --- 2. Evaluate previous Q and Answer ---
1760 +  #         last_q = state["questions"][-1]
1761 +  #         q_eval = state["question_evaluations"][-1]  # Already in state
1762 +  #         ref_answer = generate_reference_answer(last_q, data["job_role"], data["seniority"])
1763 +  #         answer_eval = evaluate_answer(last_q, transcript, ref_answer, data["job_role"], data["seniority"], None)
1764 +  #         state["answer_evaluations"].append(answer_eval)
1765 +  #         answer_score = answer_eval.get("Score", "medium") if answer_eval else "medium"
1766 +
1767 +  #         # --- 3. Adaptive difficulty ---
1768 +  #         if answer_score == "excellent":
1769 +  #             state["difficulty_adjustment"] = "harder"
1770 +  #         elif answer_score in ("medium", "poor"):
1771 +  #             state["difficulty_adjustment"] = "easier"
1772 +  #         else:
1773 +  #             state["difficulty_adjustment"] = None
1774 +
1775 +  #         # --- 4. Effective confidence ---
1776 +  #         # eff_conf = interpret_confidence(voice_label, face_label, answer_score)
1777 +  #         # state["effective_confidences"].append(eff_conf)
1778 +
1779 +  #         #just for testing:
1780 +  #         eff_conf = {"effective_confidence": 0.6}
1781 +  #         state["effective_confidences"].append(eff_conf)
1782 +
1783 +
1784 +  #         # --- LOG ---
1785 +  #         state["log"].append({
1786 +  #             "type": "answer",
1787 +  #             "question": last_q,
1788 +  #             "answer": transcript,
1789 +  #             "answer_eval": answer_eval,
1790 +  #             "ref_answer": ref_answer,
1791 +  #             "face_label": face_label,
1792 +  #             "voice_label": voice_label,
1793 +  #             "effective_confidence": eff_conf,
1794 +  #             "timing": elapsed,
1795 +  #             "timestamp": time.time()
1796 +  #         })
1797 +
1798 +  #         # --- Next or End ---
1799 +  #         qidx = state["question_idx"] + 1
1800 +  #         if qidx >= state["max_questions"]:
1801 +  #             # Save as JSON (optionally)
1802 +  #             timestamp = time.strftime("%Y%m%d_%H%M%S")
1803 +  #             log_file = f"interview_log_{timestamp}.json"
1804 +  #             with open(log_file, "w", encoding="utf-8") as f:
1805 +  #                 json.dump(state["log"], f, indent=2, ensure_ascii=False)
1806 +  #             # Report
1807 +  #             summary = "# Interview Summary\n"
1808 +  #             for i, q in enumerate(state["questions"]):
1809 +  #                 summary += (f"\n### Q{i + 1}: {q}\n"
1810 +  #                             f"- *Answer*: {state['answers'][i]}\n"
1811 +  #                             f"- *Q Eval*: {state['question_evaluations'][i]}\n"
1812 +  #                             f"- *A Eval*: {state['answer_evaluations'][i]}\n"
1813 +  #                             #also this are removed just for testing :(
1814 +  #                             # f"- *Face Emotion: {state['face_labels'][i]}, **Voice Emotion*: {state['voice_labels'][i]}\n"
1815 +  #                             # f"- *Effective Confidence*: {state['effective_confidences'][i]['effective_confidence']}\n"
1816 +  #                             f"- *Time*: {state['timings'][i]}s\n")
1817 +  #             summary += f"\n\n⏺ Full log saved as {log_file}."
1818 +  #             return (state, gr.update(visible=True, value=summary), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"))
1819 +  #         else:
1820 +  #             # --- Build next prompt using adaptive difficulty ---
1821 +  #             state["question_idx"] = qidx
1822 +  #             state["q_start_time"] = time.time()
1823 +  #             context = ""  # You can add your context logic here
1824 +  #             prompt = build_interview_prompt(
1825 +  #                 conversation_history=state["conversation_history"],
1826 +  #                 user_response=transcript,
1827 +  #                 context=context,
1828 +  #                 job_role=data["job_role"],
1829 +  #                 skills=data["skills"],
1830 +  #                 seniority=data["seniority"],
1831 +  #                 difficulty_adjustment=state["difficulty_adjustment"],
1832 +  #                 face_label=face_label,
1833 +  #                 voice_label=voice_label,
1834 +  #                 effective_confidence=eff_conf
1835 +  #             )
1836 +  #             next_q = groq_llm.predict(prompt)
1837 +  #             # Evaluate Q quality
1838 +  #             q_eval = eval_question_quality(next_q, data["job_role"], data["seniority"], None)
1839 +  #             state["questions"].append(next_q)
1840 +  #             state["question_evaluations"].append(q_eval)
1841 +  #             state["conversation_history"].append({'role': 'Interviewer', 'content': next_q})
1842 +  #             state["log"].append({"type": "question", "question": next_q, "question_eval": q_eval, "timestamp": time.time()})
1843 +  #             audio_path = bark_tts(next_q)
1844 +  #             # Display evaluations
1845 +  #             eval_md = f"*Last Answer Eval:* {answer_eval}\n\n*Effective Confidence:* {eff_conf}"
1846 +  #             return (
1847 +  #                 state, gr.update(visible=False), audio_path, f"*Question {qidx + 1}:* {next_q}",
1848 +  #                 gr.update(value=None), gr.update(value=None),
1849 +  #                 gr.update(visible=True, value=f"Last Detected — Face: {face_label}, Voice: {voice_label}"),
1850 +  #             )
1851 +  #     confirm_btn.click(
1852 +  #         process_answer,
1853 +  #         [stt_transcript, user_audio_input, user_video_input, interview_state, user_data],
1854 +  #         [interview_state, interview_summary, question_audio, question_text, user_audio_input, user_video_input, emotion_display]
1855 +  #     ).then(
1856 +  #         lambda: (gr.update(value=None), gr.update(value=None)), None, [user_audio_input, user_video_input]
1857 +  #     )
1858 +
1859 +  # demo.launch(debug=True)
1860 +
1861    import gradio as gr
1862    import time
1863    import tempfile
1864    import numpy as np
1865    import scipy.io.wavfile as wavfile
1866    import os
1867    import torch
1868 +  import whisper
1869 +  from transformers import BarkModel, AutoProcessor
1870
1871 +  # Initialize Bark (TTS)
1872 +  model_bark = BarkModel.from_pretrained("suno/bark").to("cuda" if torch.cuda.is_available() else "cpu")
1873    processor_bark = AutoProcessor.from_pretrained("suno/bark")
1874    bark_voice_preset = "v2/en_speaker_6"
1875
1876 +  # Initialize Whisper (STT)
1877 +  whisper_model = whisper.load_model("base", device="cuda" if torch.cuda.is_available() else "cpu")
1878 +
1879    def bark_tts(text):
1880        inputs = processor_bark(text, return_tensors="pt", voice_preset=bark_voice_preset)
1881        inputs = {k: v.to(model_bark.device) for k, v in inputs.items()}
1882        speech_values = model_bark.generate(**inputs)
1886        wavfile.write(temp_wav.name, 22050, speech)
1887        return temp_wav.name
1888
1889    def whisper_stt(audio_path):
1890 +      if not audio_path or not os.path.exists(audio_path):
1891 +          return ""
1892        result = whisper_model.transcribe(audio_path)
1893        return result["text"]
1894
1895 +  # Dummy Groq API stub (replace with actual logic)
1896 +  def groq_llm_predict(prompt):
1897 +      return f"[Mock Question] Based on: {prompt}"  # Replace with groq_llm.predict(prompt)
1898
1899 +  def interview_loop(state, audio_path):
1900 +      transcript = whisper_stt(audio_path)
1901 +      state["conversation"].append({"role": "Candidate", "content": transcript})
1902
1903 +      prompt = "\n".join([f"{turn['role']}: {turn['content']}" for turn in state["conversation"]])
1904 +      next_q = groq_llm_predict(prompt)
1905 +      state["conversation"].append({"role": "Interviewer", "content": next_q})
1906
1907 +      audio_out = bark_tts(next_q)
1908 +      return state, audio_out, transcript
1909
1910 +  with gr.Blocks() as demo:
1911 +      state = gr.State({"conversation": []})
1912 +      question_audio = gr.Audio(label="Interviewer's Question", interactive=False, autoplay=True)
1913 +      user_audio_input = gr.Audio(source="microphone", type="filepath", label="Your Answer")
1914 +      transcript_box = gr.Textbox(label="Transcript", interactive=False)
1915
1916 +      user_audio_input.change(interview_loop, [state, user_audio_input], [state, question_audio, transcript_box])
1917
1918    demo.launch(debug=True)
1919 +
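A minimal sketch of how the simplified loop added at the end of this commit chains its pieces together outside of Gradio. It assumes the whisper_stt, groq_llm_predict, and bark_tts functions defined above are in scope, and "answer.wav" is a hypothetical local recording of the candidate's answer.

```python
# Illustrative round-trip: speech in -> transcript -> mock LLM question -> speech out.
# Assumes whisper_stt, groq_llm_predict and bark_tts from the new code above are importable.
state = {"conversation": []}

transcript = whisper_stt("answer.wav")                 # speech -> text ("" if the file is missing)
state["conversation"].append({"role": "Candidate", "content": transcript})

prompt = "\n".join(f"{t['role']}: {t['content']}" for t in state["conversation"])
next_q = groq_llm_predict(prompt)                      # mock stub from the diff, not the real Groq call
state["conversation"].append({"role": "Interviewer", "content": next_q})

wav_path = bark_tts(next_q)                            # text -> path of a temporary WAV file
print(transcript, next_q, wav_path, sep="\n")
```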