husseinelsaadi committed
Commit 5fc8fb2 · 1 Parent(s): 5b71262

updated eval question

Files changed (1)
  1. app.py +56 -68
app.py CHANGED
@@ -537,79 +537,59 @@ from typing import Dict
 def eval_question_quality(
     question: str,
     job_role: str,
-    seniority: str,
-    judge_pipeline=None,
-    max_retries=1  # Allow at least 1 retry on parse fail
+    seniority: str
 ) -> Dict[str, str]:
-    import time
-    try:
-        # Use provided pipeline or fall back to global
-        if judge_pipeline is None:
-            judge_pipeline = globals().get("judge_pipeline")
-
-        if not judge_pipeline:
-            return {
-                "Score": "Error",
-                "Reasoning": "Judge pipeline not available",
-                "Improvements": "Please provide a valid language model pipeline"
-            }
-
-        prompt = f"""
-        ... (same as your prompt) ...
-        Now evaluate this question:
-        \"{question}\"
-        """
-
-        for attempt in range(max_retries + 1):
-            response = judge_pipeline(
-                prompt,
-                max_new_tokens=512,
-                do_sample=False,
-                temperature=0.1,
-                repetition_penalty=1.2
-            )[0]["generated_text"]
-
-            try:
-                # Fallback to last {...} block
-                match = re.search(r'\{.*\}', response, re.DOTALL)
-                if not match:
-                    raise ValueError("Could not locate JSON structure in model output.")
-                json_str = match.group(0)
-                result = json.loads(json_str)
-
-                # Validate required fields and values
-                required_keys = ["Score", "Reasoning", "Improvements"]
-                valid_scores = {"Poor", "Medium", "Good", "Excellent"}
-                if not all(k in result for k in required_keys):
-                    raise ValueError("Missing required fields.")
-                if result["Score"] not in valid_scores:
-                    raise ValueError("Invalid score value.")
-                return result
-
-            except Exception as e:
-                logging.warning(f"Attempt {attempt+1} JSON parsing failed: {e}")
-                time.sleep(0.2)  # Small delay before retry
-
-        # If all attempts fail, return a default valid dict
-        return {
-            "Score": "Poor",
-            "Reasoning": "The evaluation model failed to produce a valid score, so defaulted to 'Poor'. Check model output and prompt formatting.",
-            "Improvements": [
-                "Ensure the question is clear and role-relevant.",
-                "Double-check prompt and formatting.",
-                "Try rephrasing the question to match rubric."
-            ]
-        }
+    """
+    Evaluate the quality of a generated interview question using Groq LLM.
+    Returns a structured JSON with score, reasoning, and suggestions.
+    """
+    import time, json
+
+    prompt = f"""
+    You are a senior AI hiring expert evaluating the quality of an interview question for a {seniority} {job_role} role.
+
+    Evaluate the question based on:
+    - Relevance to the role and level
+    - Clarity and conciseness
+    - Depth of technical insight
+
+    ---
+    Question: {question}
+    ---
+
+    Respond only with a valid JSON like:
+    {{
+        "Score": "Poor" | "Medium" | "Good" | "Excellent",
+        "Reasoning": "short justification",
+        "Improvements": ["tip1", "tip2"]
+    }}
+    """
+
+    try:
+        start = time.time()
+        response = groq_llm.invoke(prompt)
+        print("⏱️ eval_question_quality duration:", round(time.time() - start, 2), "s")
+
+        # Extract JSON safely
+        start_idx = response.rfind("{")
+        end_idx = response.rfind("}") + 1
+        json_str = response[start_idx:end_idx]
+        result = json.loads(json_str)
+
+        if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
+            return result
+        else:
+            raise ValueError("Invalid Score value in model output")
 
     except Exception as e:
-        logging.error(f"Error in eval_question_quality: {type(e).__name__}: {e}", exc_info=True)
+        print(f"⚠️ eval_question_quality fallback: {e}")
         return {
             "Score": "Poor",
-            "Reasoning": f"Critical error occurred: {str(e)}. Defaulted to 'Poor'.",
+            "Reasoning": "Evaluation failed, using fallback.",
             "Improvements": [
-                "Retry with a different question.",
-                "Check your judge pipeline connection.",
-                "Contact support if this persists."
+                "Ensure the question is relevant and clear.",
+                "Avoid vague or overly generic phrasing.",
+                "Include role-specific context if needed."
             ]
         }
 
@@ -621,9 +601,10 @@ def evaluate_answer(
     seniority: str,
 ) -> Dict[str, str]:
     """
-    Fast and structured answer evaluation using Groq LLM (e.g. Mixtral-8x7b).
+    Fast and structured answer evaluation using Groq LLM (e.g. Mixtral or LLaMA 3).
     """
     import time, json
+    from langchain_core.messages import AIMessage
 
     prompt = f"""
     You are a technical interviewer evaluating a candidate for a {seniority} {job_role} role.
@@ -650,12 +631,19 @@ Respond ONLY with valid JSON in the following format:
 
     try:
         start = time.time()
-        response = groq_llm.invoke(prompt)
+        raw = groq_llm.invoke(prompt)
         print("⏱️ evaluate_answer duration:", round(time.time() - start, 2), "s")
-        print("🔍 Raw Groq Response:\n", response)
-        start_idx = response.rfind("{")
-        end_idx = response.rfind("}") + 1
-        json_str = response[start_idx:end_idx]
+
+        if isinstance(raw, AIMessage):
+            output = raw.content
+        else:
+            output = str(raw)
+
+        print("🔍 Raw Groq Response:\n", output)
+
+        start_idx = output.rfind("{")
+        end_idx = output.rfind("}") + 1
+        json_str = output[start_idx:end_idx]
 
         result = json.loads(json_str)
         if result.get("Score") in {"Poor", "Medium", "Good", "Excellent"}:
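
For reference, a minimal standalone sketch of the slice-and-parse step that both updated functions now rely on: take the substring from the last "{" to the last "}" of the raw reply, json.loads it, and check the Score field. The helper name extract_score_json and the sample reply are illustrative only; the real code inlines this logic and returns a default dict when parsing fails. It assumes the model ends its reply with a single flat JSON object, as the prompts above request.

    import json

    VALID_SCORES = {"Poor", "Medium", "Good", "Excellent"}

    def extract_score_json(output: str) -> dict:
        """Illustrative helper: pull the trailing JSON object out of a raw LLM reply.

        Mirrors the extraction in the updated eval_question_quality / evaluate_answer:
        slice from the last '{' to the last '}' and parse. Assumes the reply ends with
        a single flat JSON object (no nested braces after it).
        """
        start_idx = output.rfind("{")
        end_idx = output.rfind("}") + 1
        if start_idx == -1 or end_idx == 0:
            raise ValueError("No JSON object found in model output")

        result = json.loads(output[start_idx:end_idx])
        if result.get("Score") not in VALID_SCORES:
            raise ValueError("Invalid Score value in model output")
        return result

    # Hypothetical raw reply, used only to exercise the parsing path:
    raw = 'Here is my evaluation:\n{"Score": "Good", "Reasoning": "Clear and role-relevant.", "Improvements": ["Ask a scaling follow-up."]}'
    print(extract_score_json(raw))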
 
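Likewise, a small sketch of the response normalization behind the new isinstance(raw, AIMessage) branch in evaluate_answer: LangChain chat models return an AIMessage whose text lives in .content, while other callables may return a bare string. The to_text helper name is illustrative; the diff above inlines this check directly after groq_llm.invoke(prompt).

    from langchain_core.messages import AIMessage

    def to_text(raw) -> str:
        """Illustrative helper: normalize an LLM response to plain text.

        LangChain chat models (e.g. a Groq-backed chat model) return an AIMessage
        whose text is in .content; anything else is coerced with str(), matching
        the updated evaluate_answer.
        """
        if isinstance(raw, AIMessage):
            return raw.content
        return str(raw)

    # Usage matching the diff (groq_llm and prompt come from the surrounding code):
    # output = to_text(groq_llm.invoke(prompt))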