Commit 5fc8fb2
Parent(s): 5b71262

updated eval question
app.py CHANGED
@@ -537,79 +537,59 @@ from typing import Dict
 def eval_question_quality(
     question: str,
     job_role: str,
-    seniority: str
-    judge_pipeline=None,
-    max_retries=1  # Allow at least 1 retry on parse fail
+    seniority: str
 ) -> Dict[str, str]:
-
-
-
-
-
-
-    if not judge_pipeline:
-        return {
-            "Score": "Error",
-            "Reasoning": "Judge pipeline not available",
-            "Improvements": "Please provide a valid language model pipeline"
-        }
+    """
+    Evaluate the quality of a generated interview question using Groq LLM.
+    Returns a structured JSON with score, reasoning, and suggestions.
+    """
+    import time, json
 
-
-
-Now evaluate this question:
-\"{question}\"
-    """
+    prompt = f"""
+You are a senior AI hiring expert evaluating the quality of an interview question for a {seniority} {job_role} role.
 
-
-
-
-
-        do_sample=False,
-        temperature=0.1,
-        repetition_penalty=1.2
-    )[0]["generated_text"]
+Evaluate the question based on:
+- Relevance to the role and level
+- Clarity and conciseness
+- Depth of technical insight
 
-
-
-
-    if not match:
-        raise ValueError("Could not locate JSON structure in model output.")
-    json_str = match.group(0)
-    result = json.loads(json_str)
-
-    # Validate required fields and values
-    required_keys = ["Score", "Reasoning", "Improvements"]
-    valid_scores = {"Poor", "Medium", "Good", "Excellent"}
-    if not all(k in result for k in required_keys):
-        raise ValueError("Missing required fields.")
-    if result["Score"] not in valid_scores:
-        raise ValueError("Invalid score value.")
-    return result
+---
+Question: {question}
+---
 
-
-
-
+Respond only with a valid JSON like:
+{{
+  "Score": "Poor" | "Medium" | "Good" | "Excellent",
+  "Reasoning": "short justification",
+  "Improvements": ["tip1", "tip2"]
+}}
+"""
 
-
-
-
-
-
-
-
-
-
+    try:
+        start = time.time()
+        response = groq_llm.invoke(prompt)
+        print("⏱️ eval_question_quality duration:", round(time.time() - start, 2), "s")
+
+        # Extract JSON safely
+        start_idx = response.rfind("{")
+        end_idx = response.rfind("}") + 1
+        json_str = response[start_idx:end_idx]
+        result = json.loads(json_str)
+
+        if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
+            return result
+        else:
+            raise ValueError("Invalid Score value in model output")
 
     except Exception as e:
-
+        print(f"⚠️ eval_question_quality fallback: {e}")
         return {
             "Score": "Poor",
-            "Reasoning":
+            "Reasoning": "Evaluation failed, using fallback.",
             "Improvements": [
-                "
-                "
-                "
+                "Ensure the question is relevant and clear.",
+                "Avoid vague or overly generic phrasing.",
+                "Include role-specific context if needed."
             ]
         }
 
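One behavioral note on this hunk: the removed version validated all of "Score", "Reasoning", and "Improvements" before returning (old lines 580-584), while the rewrite only checks "Score", so a reply missing the other two keys now passes through untouched. If that guard is still wanted, a sketch of how it could be restored (hypothetical, not part of this commit):

def check_required_fields(result: dict) -> dict:
    # Hypothetical guard mirroring the validation removed at old lines 580-584.
    required_keys = ["Score", "Reasoning", "Improvements"]
    if not all(k in result for k in required_keys):
        raise ValueError("Missing required fields.")
    return result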
@@ -621,9 +601,10 @@ def evaluate_answer(
     seniority: str,
 ) -> Dict[str, str]:
     """
-    Fast and structured answer evaluation using Groq LLM (e.g. Mixtral
+    Fast and structured answer evaluation using Groq LLM (e.g. Mixtral or LLaMA 3).
    """
     import time, json
+    from langchain_core.messages import AIMessage
 
     prompt = f"""
 You are a technical interviewer evaluating a candidate for a {seniority} {job_role} role.

@@ -650,12 +631,19 @@ Respond ONLY with valid JSON in the following format:
 
     try:
         start = time.time()
-
+        raw = groq_llm.invoke(prompt)
         print("⏱️ evaluate_answer duration:", round(time.time() - start, 2), "s")
-
-
-
-
+
+        if isinstance(raw, AIMessage):
+            output = raw.content
+        else:
+            output = str(raw)
+
+        print("🔍 Raw Groq Response:\n", output)
+
+        start_idx = output.rfind("{")
+        end_idx = output.rfind("}") + 1
+        json_str = output[start_idx:end_idx]
 
         result = json.loads(json_str)
         if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
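Both functions now locate the JSON payload by brace-scanning the raw completion. Two caveats worth keeping in mind. First, rfind("{") returns the last opening brace, so a nested object in the reply would shift the slice past the outer "{" and send json.loads, and with it the whole call, into the fallback branch; scanning from the first "{" instead is more forgiving. A hypothetical shared helper along those lines (not part of this commit):

import json

def extract_json_block(text: str) -> dict:
    # Slice from the FIRST "{" to the LAST "}" so nested objects
    # stay inside the captured span. Hypothetical helper, not in app.py.
    start = text.find("{")
    end = text.rfind("}") + 1
    if start == -1 or end == 0:
        raise ValueError("No JSON object found in model output")
    return json.loads(text[start:end])

Second, evaluate_answer normalizes an AIMessage result to a string before slicing, but eval_question_quality calls .rfind on the invoke return value directly; if groq_llm.invoke returns an AIMessage there too, the resulting AttributeError is swallowed by the broad except and every question is scored "Poor".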
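For reference, a minimal smoke test of the new eval_question_quality signature; the argument values are illustrative only, and it assumes groq_llm is already configured in app.py as the diff implies:

# Hypothetical call; the question/role/seniority values are made up.
result = eval_question_quality(
    question="How would you detect data drift in a deployed model?",
    job_role="Machine Learning Engineer",
    seniority="Senior",
)
print(result["Score"])      # "Poor" | "Medium" | "Good" | "Excellent"
print(result["Reasoning"])  # short justification from the judge LLM
for tip in result.get("Improvements", []):
    print("-", tip)         # list of improvement suggestions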