Commit · 6e96b44
1 Parent(s): 0c4015d

evaluate answer updated
app.py CHANGED
@@ -620,40 +620,39 @@ def evaluate_answer(
    job_role: str,
    seniority: str,
    judge_pipeline=None,
-    max_retries=1
) -> Dict[str, str]:
    """
-    Evaluates
-    Guarantees a valid, actionable result even if the model fails.
    """

    import time
-
-
-    judge_pipeline = globals().get("judge_pipeline")

-
-
-            "Score": "Error",
-            "Reasoning": "Judge pipeline not available",
-            "Improvements": [
-                "Please provide a valid language model pipeline"
-            ]
-        }

-
-
You are an expert technical interviewer evaluating a candidate's response for a {job_role} position at the {seniority} level.
You are provided with:
- The question asked
- The candidate's response
- A reference answer that represents a high-quality expected answer
Evaluate the candidate's response based on:
- Technical correctness
- Clarity and depth of explanation
- Relevance to the job role and seniority
- Completeness and structure
-
--------------------------
Question:
{question}
@@ -662,72 +661,50 @@ Candidate Answer:
Reference Answer:
{ref_answer}
--------------------------
-Now return your evaluation as a valid JSON object using
- "Score": One of ["Poor", "Medium", "Good", "Excellent"]
-- "Reasoning": 2-3 sentence explanation
-- "Improvements":
-Example:
-{{
-  "Score": "Good",
-  "Reasoning": "The answer demonstrates a good understanding of the concept and touches on key ideas, but lacks depth in explaining the trade-offs between techniques.",
-  "Improvements": [
-    "Explain when this method might fail or produce biased results",
-    "Include examples or metrics to support the explanation",
-    "Clarify the specific business impact or outcome achieved"
-  ]
-}}
Respond only with the JSON:
"""
-    for attempt in range(max_retries + 1):
-        output = judge_pipeline(
-            prompt,
-            max_new_tokens=512,
-            temperature=0.3,
-            do_sample=False
-        )[0]["generated_text"]

-    ... (old lines 689-710 removed; content not shown)

-    # Fallback: always return a default 'Poor' score if all attempts fail
-    return {
-        "Score": "Poor",
-        "Reasoning": "The evaluation model failed to produce a valid score or parse output; defaulted to 'Poor'. Please check model output and prompt formatting.",
-        "Improvements": [
-            "Be more specific and detailed in the answer.",
-            "Structure your response with clear points.",
-            "Relate your answer more closely to the job role and question."
-        ]
-    }
    except Exception as e:
-        logging.
        return {
            "Score": "Poor",
-            "Reasoning":
            "Improvements": [
-                "
-                "
-                "
            ]
        }
    job_role: str,
    seniority: str,
    judge_pipeline=None,
) -> Dict[str, str]:
    """
+    Evaluates candidate answer via judge_pipeline with faster, robust output.
    """

    import time
+    import json
+    import logging

+    if judge_pipeline is None:
+        judge_pipeline = globals().get("judge_pipeline")

+    if not judge_pipeline:
+        return {
+            "Score": "Error",
+            "Reasoning": "Judge pipeline not available",
+            "Improvements": ["Provide a valid language model pipeline"]
+        }
+
+    prompt = f"""
You are an expert technical interviewer evaluating a candidate's response for a {job_role} position at the {seniority} level.
You are provided with:
- The question asked
- The candidate's response
- A reference answer that represents a high-quality expected answer
+
Evaluate the candidate's response based on:
- Technical correctness
- Clarity and depth of explanation
- Relevance to the job role and seniority
- Completeness and structure
+
+Be objective, concise, and professional.
--------------------------
Question:
{question}

Reference Answer:
{ref_answer}
--------------------------
+Now return your evaluation as a valid JSON object using these keys:
- "Score": One of ["Poor", "Medium", "Good", "Excellent"]
+- "Reasoning": 2-3 sentence explanation
+- "Improvements": List of 2-3 suggestions
Respond only with the JSON:
"""

+    try:
+        start = time.time()
+        output = judge_pipeline(
+            prompt,
+            max_new_tokens=512,
+            temperature=0.3,
+            do_sample=False
+        )[0]["generated_text"]
+
+        duration = round(time.time() - start, 2)
+        print(f"⏱️ evaluate_answer duration: {duration}s")
+
+        # Fast JSON parse
+        start_idx = output.rfind("{")
+        end_idx = output.rfind("}") + 1
+
+        if start_idx != -1 and end_idx > start_idx:
+            json_str = output[start_idx:end_idx]
+            result = json.loads(json_str)
+            if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
+                return {
+                    "Score": result["Score"],
+                    "Reasoning": result.get("Reasoning", ""),
+                    "Improvements": result.get("Improvements", [])
+                }
+
+        raise ValueError("No valid JSON with score found")

    except Exception as e:
+        logging.warning(f"[evaluate_answer] fallback triggered: {e}")
        return {
            "Score": "Poor",
+            "Reasoning": "Auto fallback due to model error or slow response.",
            "Improvements": [
+                "Structure your response better.",
+                "Clarify technical points.",
+                "Include practical examples."
            ]
        }
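
For reference, a minimal sketch of how the updated evaluate_answer might be called with a Hugging Face text-generation pipeline acting as the judge. The leading parameter names (question, answer, ref_answer) and the model checkpoint are assumptions, since they are not shown in this hunk.

# Sketch only: parameter names before job_role and the checkpoint are assumed,
# not taken from this diff.
from transformers import pipeline

judge = pipeline("text-generation", model="gpt2")  # stand-in; a real judge would be an instruction-tuned model

result = evaluate_answer(
    question="What is gradient descent?",
    answer="It updates parameters step by step against the loss gradient.",
    ref_answer="Gradient descent minimizes a loss by iteratively stepping opposite its gradient.",
    job_role="Data Scientist",
    seniority="Junior",
    judge_pipeline=judge,
)
print(result["Score"], result["Improvements"])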
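
The new parsing path takes the last brace-delimited span from the generated text and only accepts it when the Score field is one of the four allowed labels. Below is a standalone sketch of that logic on a made-up output string (illustrative only); note that rfind("{") assumes a flat JSON object, since a nested object would shift the start index.

import json

# Made-up pipeline output: text-generation pipelines echo the prompt by default,
# so the evaluation JSON is expected to be the last {...} span in the string.
sample_output = (
    "Respond only with the JSON:\n"
    '{"Score": "Good", "Reasoning": "Covers the key idea.", '
    '"Improvements": ["Add a concrete example"]}'
)

start_idx = sample_output.rfind("{")
end_idx = sample_output.rfind("}") + 1

if start_idx != -1 and end_idx > start_idx:
    result = json.loads(sample_output[start_idx:end_idx])
    if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
        print(result["Score"])  # prints: Good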
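
And a sketch of the guard path added at the top of the function: when no pipeline is passed and none is bound at module level, the function is expected to return the "Error" result instead of raising. Parameter names are again assumed.

# Assumes app.py has no module-level judge_pipeline bound at this point;
# parameter names before job_role are assumed, not taken from this diff.
result = evaluate_answer(
    question="Explain overfitting.",
    answer="The model memorizes noise in the training data.",
    ref_answer="Overfitting means fitting training noise and generalizing poorly.",
    job_role="ML Engineer",
    seniority="Senior",
    judge_pipeline=None,
)
print(result["Score"])  # expected: "Error" when no judge pipeline is available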