husseinelsaadi committed
Commit 6e96b44 · 1 Parent(s): 0c4015d

evaluate answer updated

Files changed (1)
  1. app.py +51 -74
app.py CHANGED
@@ -620,40 +620,39 @@ def evaluate_answer(
     job_role: str,
     seniority: str,
     judge_pipeline=None,
-    max_retries=1
 ) -> Dict[str, str]:
     """
-    Evaluates a candidate's answer to an interview question and returns a structured judgment.
-    Guarantees a valid, actionable result even if the model fails.
+    Evaluates candidate answer via judge_pipeline with faster, robust output.
     """
 
     import time
-    try:
-        if judge_pipeline is None:
-            judge_pipeline = globals().get("judge_pipeline")
+    import json
+    import logging
 
-        if not judge_pipeline:
-            return {
-                "Score": "Error",
-                "Reasoning": "Judge pipeline not available",
-                "Improvements": [
-                    "Please provide a valid language model pipeline"
-                ]
-            }
+    if judge_pipeline is None:
+        judge_pipeline = globals().get("judge_pipeline")
 
-        # Enhanced prompt (your version)
-        prompt = f"""
+    if not judge_pipeline:
+        return {
+            "Score": "Error",
+            "Reasoning": "Judge pipeline not available",
+            "Improvements": ["Provide a valid language model pipeline"]
+        }
+
+    prompt = f"""
 You are an expert technical interviewer evaluating a candidate's response for a {job_role} position at the {seniority} level.
 You are provided with:
 - The question asked
 - The candidate's response
 - A reference answer that represents a high-quality expected answer
+
 Evaluate the candidate's response based on:
 - Technical correctness
 - Clarity and depth of explanation
 - Relevance to the job role and seniority
 - Completeness and structure
-Be objective, concise, and use professional language. Be fair but critical.
+
+Be objective, concise, and professional.
 --------------------------
 Question:
 {question}
@@ -662,72 +661,50 @@ Candidate Answer:
 Reference Answer:
 {ref_answer}
 --------------------------
-Now return your evaluation as a valid JSON object using exactly these keys:
+Now return your evaluation as a valid JSON object using these keys:
 - "Score": One of ["Poor", "Medium", "Good", "Excellent"]
-- "Reasoning": 2-3 sentence explanation justifying the score, covering clarity, accuracy, completeness, or relevance
-- "Improvements": A list of 2-3 specific and constructive suggestions to help the candidate improve this answer
-Example:
-{{
-  "Score": "Good",
-  "Reasoning": "The answer demonstrates a good understanding of the concept and touches on key ideas, but lacks depth in explaining the trade-offs between techniques.",
-  "Improvements": [
-    "Explain when this method might fail or produce biased results",
-    "Include examples or metrics to support the explanation",
-    "Clarify the specific business impact or outcome achieved"
-  ]
-}}
+- "Reasoning": 2-3 sentence explanation
+- "Improvements": List of 2-3 suggestions
 Respond only with the JSON:
 """
-        for attempt in range(max_retries + 1):
-            output = judge_pipeline(
-                prompt,
-                max_new_tokens=512,
-                temperature=0.3,
-                do_sample=False
-            )[0]["generated_text"]
 
-            # Try to extract JSON response from output robustly
-            try:
-                start_idx = output.rfind("{")
-                end_idx = output.rfind("}") + 1
-
-                if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
-                    json_str = output[start_idx:end_idx]
-                    result = json.loads(json_str)
-                    valid_scores = {"Poor", "Medium", "Good", "Excellent"}
-                    if result.get("Score") in valid_scores:
-                        return {
-                            "Score": result["Score"],
-                            "Reasoning": result.get("Reasoning", "No explanation provided."),
-                            "Improvements": result.get("Improvements", ["No improvement suggestions provided."])
-                        }
-                    else:
-                        raise ValueError(f"Invalid Score value: {result.get('Score')}")
-                else:
-                    raise ValueError("JSON format not found in output")
-            except Exception as e:
-                logging.warning(f"evaluate_answer: Attempt {attempt+1} failed to parse model output: {e}")
-                time.sleep(0.2)  # Small wait before retry
+    try:
+        start = time.time()
+        output = judge_pipeline(
+            prompt,
+            max_new_tokens=512,
+            temperature=0.3,
+            do_sample=False
+        )[0]["generated_text"]
+
+        duration = round(time.time() - start, 2)
+        print(f"⏱️ evaluate_answer duration: {duration}s")
+
+        # Fast JSON parse
+        start_idx = output.rfind("{")
+        end_idx = output.rfind("}") + 1
+
+        if start_idx != -1 and end_idx > start_idx:
+            json_str = output[start_idx:end_idx]
+            result = json.loads(json_str)
+            if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
+                return {
+                    "Score": result["Score"],
+                    "Reasoning": result.get("Reasoning", ""),
+                    "Improvements": result.get("Improvements", [])
+                }
+
+        raise ValueError("No valid JSON with score found")
 
-        # Fallback: always return a default 'Poor' score if all attempts fail
-        return {
-            "Score": "Poor",
-            "Reasoning": "The evaluation model failed to produce a valid score or parse output; defaulted to 'Poor'. Please check model output and prompt formatting.",
-            "Improvements": [
-                "Be more specific and detailed in the answer.",
-                "Structure your response with clear points.",
-                "Relate your answer more closely to the job role and question."
-            ]
-        }
     except Exception as e:
-        logging.error(f"Evaluation failed: {e}", exc_info=True)
+        logging.warning(f"[evaluate_answer] fallback triggered: {e}")
         return {
             "Score": "Poor",
-            "Reasoning": f"Critical error occurred: {str(e)}. Defaulted to 'Poor'.",
+            "Reasoning": "Auto fallback due to model error or slow response.",
             "Improvements": [
-                "Try again with a different answer.",
-                "Check your judge pipeline connection.",
-                "Contact support if the error persists."
+                "Structure your response better.",
+                "Clarify technical points.",
+                "Include practical examples."
             ]
         }
 
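
For context, a minimal usage sketch of the updated `evaluate_answer` with a `transformers` text-generation pipeline acting as the judge. Only `job_role`, `seniority`, and `judge_pipeline` are visible in the changed hunks; the `question`, `answer`, and `ref_answer` parameter names, the example inputs, and the judge model are assumptions for illustration.

```python
# Hypothetical usage sketch. The question/answer/ref_answer parameter names
# are assumed from the prompt placeholders in the diff; the judge model is an
# arbitrary choice, not specified by this commit.
from transformers import pipeline

from app import evaluate_answer  # assumes app.py is importable as a module

judge_pipeline = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",  # assumed judge model
)

result = evaluate_answer(
    question="Explain the bias-variance trade-off.",
    answer="Complex models fit training data better but can overfit; simpler models tend to underfit.",
    ref_answer="Bias is error from overly simple assumptions, variance is sensitivity to the training data; total error trades one against the other.",
    job_role="Data Scientist",
    seniority="Junior",
    judge_pipeline=judge_pipeline,
)

print(result["Score"])         # "Poor", "Medium", "Good", or "Excellent"
print(result["Reasoning"])
print(result["Improvements"])
```

If the judge output cannot be parsed into JSON with a valid "Score", the updated function logs a warning and returns the fixed "Poor" fallback shown in the diff, rather than retrying as the previous `max_retries` loop did.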