Commit · 6e96b44
1 Parent(s): 0c4015d

evaluate answer updated
app.py CHANGED
@@ -620,40 +620,39 @@ def evaluate_answer(
    job_role: str,
    seniority: str,
    judge_pipeline=None,
-    max_retries=1
) -> Dict[str, str]:
    """
-    Evaluates
-    Guarantees a valid, actionable result even if the model fails.
    """

    import time
-
-
-    judge_pipeline = globals().get("judge_pipeline")

-
-
-            "Score": "Error",
-            "Reasoning": "Judge pipeline not available",
-            "Improvements": [
-                "Please provide a valid language model pipeline"
-            ]
-        }

-
-
You are an expert technical interviewer evaluating a candidate's response for a {job_role} position at the {seniority} level.
You are provided with:
- The question asked
- The candidate's response
- A reference answer that represents a high-quality expected answer
Evaluate the candidate's response based on:
- Technical correctness
- Clarity and depth of explanation
- Relevance to the job role and seniority
- Completeness and structure
-
--------------------------
Question:
{question}
@@ -662,72 +661,50 @@ Candidate Answer:
Reference Answer:
{ref_answer}
--------------------------
-Now return your evaluation as a valid JSON object using
- "Score": One of ["Poor", "Medium", "Good", "Excellent"]
-- "Reasoning": 2-3 sentence explanation
-- "Improvements":
-Example:
-{{
-  "Score": "Good",
-  "Reasoning": "The answer demonstrates a good understanding of the concept and touches on key ideas, but lacks depth in explaining the trade-offs between techniques.",
-  "Improvements": [
-    "Explain when this method might fail or produce biased results",
-    "Include examples or metrics to support the explanation",
-    "Clarify the specific business impact or outcome achieved"
-  ]
-}}
Respond only with the JSON:
"""
-    for attempt in range(max_retries + 1):
-        output = judge_pipeline(
-            prompt,
-            max_new_tokens=512,
-            temperature=0.3,
-            do_sample=False
-        )[0]["generated_text"]

-    ... (old lines 689-710 removed; content not shown)

-    # Fallback: always return a default 'Poor' score if all attempts fail
-    return {
-        "Score": "Poor",
-        "Reasoning": "The evaluation model failed to produce a valid score or parse output; defaulted to 'Poor'. Please check model output and prompt formatting.",
-        "Improvements": [
-            "Be more specific and detailed in the answer.",
-            "Structure your response with clear points.",
-            "Relate your answer more closely to the job role and question."
-        ]
-    }
    except Exception as e:
-        logging.
        return {
            "Score": "Poor",
-            "Reasoning":
            "Improvements": [
-                "
-                "
-                "
            ]
        }
    job_role: str,
    seniority: str,
    judge_pipeline=None,
) -> Dict[str, str]:
    """
+    Evaluates candidate answer via judge_pipeline with faster, robust output.
    """

    import time
+    import json
+    import logging

+    if judge_pipeline is None:
+        judge_pipeline = globals().get("judge_pipeline")

+    if not judge_pipeline:
+        return {
+            "Score": "Error",
+            "Reasoning": "Judge pipeline not available",
+            "Improvements": ["Provide a valid language model pipeline"]
+        }
+
+    prompt = f"""
You are an expert technical interviewer evaluating a candidate's response for a {job_role} position at the {seniority} level.
You are provided with:
- The question asked
- The candidate's response
- A reference answer that represents a high-quality expected answer
+
Evaluate the candidate's response based on:
- Technical correctness
- Clarity and depth of explanation
- Relevance to the job role and seniority
- Completeness and structure
+
+Be objective, concise, and professional.
--------------------------
Question:
{question}

Reference Answer:
{ref_answer}
--------------------------
+Now return your evaluation as a valid JSON object using these keys:
- "Score": One of ["Poor", "Medium", "Good", "Excellent"]
+- "Reasoning": 2-3 sentence explanation
+- "Improvements": List of 2-3 suggestions
Respond only with the JSON:
"""

+    try:
+        start = time.time()
+        output = judge_pipeline(
+            prompt,
+            max_new_tokens=512,
+            temperature=0.3,
+            do_sample=False
+        )[0]["generated_text"]
+
+        duration = round(time.time() - start, 2)
+        print(f"⏱️ evaluate_answer duration: {duration}s")
+
+        # Fast JSON parse
+        start_idx = output.rfind("{")
+        end_idx = output.rfind("}") + 1
+
+        if start_idx != -1 and end_idx > start_idx:
+            json_str = output[start_idx:end_idx]
+            result = json.loads(json_str)
+            if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
+                return {
+                    "Score": result["Score"],
+                    "Reasoning": result.get("Reasoning", ""),
+                    "Improvements": result.get("Improvements", [])
+                }
+
+        raise ValueError("No valid JSON with score found")

    except Exception as e:
+        logging.warning(f"[evaluate_answer] fallback triggered: {e}")
        return {
            "Score": "Poor",
+            "Reasoning": "Auto fallback due to model error or slow response.",
            "Improvements": [
+                "Structure your response better.",
+                "Clarify technical points.",
+                "Include practical examples."
            ]
        }
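
For reference, a minimal sketch of how the updated evaluate_answer might be called with a Hugging Face text-generation pipeline acting as the judge. The leading parameter names (question, answer, ref_answer) and the model checkpoint are assumptions, since they are not shown in this hunk.

# Sketch only: parameter names before job_role and the checkpoint are assumed,
# not taken from this diff.
from transformers import pipeline

judge = pipeline("text-generation", model="gpt2")  # stand-in; a real judge would be an instruction-tuned model

result = evaluate_answer(
    question="What is gradient descent?",
    answer="It updates parameters step by step against the loss gradient.",
    ref_answer="Gradient descent minimizes a loss by iteratively stepping opposite its gradient.",
    job_role="Data Scientist",
    seniority="Junior",
    judge_pipeline=judge,
)
print(result["Score"], result["Improvements"])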
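
The new parsing path takes the last brace-delimited span from the generated text and only accepts it when the Score field is one of the four allowed labels. Below is a standalone sketch of that logic on a made-up output string (illustrative only); note that rfind("{") assumes a flat JSON object, since a nested object would shift the start index.

import json

# Made-up pipeline output: text-generation pipelines echo the prompt by default,
# so the evaluation JSON is expected to be the last {...} span in the string.
sample_output = (
    "Respond only with the JSON:\n"
    '{"Score": "Good", "Reasoning": "Covers the key idea.", '
    '"Improvements": ["Add a concrete example"]}'
)

start_idx = sample_output.rfind("{")
end_idx = sample_output.rfind("}") + 1

if start_idx != -1 and end_idx > start_idx:
    result = json.loads(sample_output[start_idx:end_idx])
    if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
        print(result["Score"])  # prints: Good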
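
And a sketch of the guard path added at the top of the function: when no pipeline is passed and none is bound at module level, the function is expected to return the "Error" result instead of raising. Parameter names are again assumed.

# Assumes app.py has no module-level judge_pipeline bound at this point;
# parameter names before job_role are assumed, not taken from this diff.
result = evaluate_answer(
    question="Explain overfitting.",
    answer="The model memorizes noise in the training data.",
    ref_answer="Overfitting means fitting training noise and generalizing poorly.",
    job_role="ML Engineer",
    seniority="Senior",
    judge_pipeline=None,
)
print(result["Score"])  # expected: "Error" when no judge pipeline is available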