Spaces:

DeepJudge
/

Applicant-Task-Submission

Running

App Files Files

Timothy-Vinzent committed on Feb 20

Commit

122c32d

verified ·

1 Parent(s): ebb0c31

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -10

app.py CHANGED Viewed

@@ -107,22 +107,57 @@ def submit_prompt(email, name, system_prompt):
             print("llm answer", answer)
         except Exception as e:
             answer = f"Error during OpenAI API call: {str(e)}"
-        # Simple evaluation: check if the expected output is a substring of the answer (case-insensitive).
-        if expected.lower() in answer.lower():
             score += 1
             verdict = "Correct"
-            print(f"{expected.lower()} DOES NOT MATCH {answer.lower()}")
-        else:
-            verdict = "Incorrect"
-            print(f"{expected.lower()} MATCHES {answer.lower()}")
         responses.append(
             f"Question: {question}\n"
-            f"Answer: {answer}\n"
-            f"Expected: {expected}\n"
             f"Result: {verdict}\n"
         )
     result_details = "\n".join(responses)

             print("llm answer", answer)
         except Exception as e:
             answer = f"Error during OpenAI API call: {str(e)}"
+        # Step 1: Check if the answer is valid JSON
+        try:
+            parsed_answer = json.loads(answer)
+        except json.JSONDecodeError as e:
+            verdict = f"Incorrect (Invalid JSON): {str(e)}"
+            responses.append(
+                f"Question: {question}\n"
+                f"Answer: {answer}\n"
+                f"Expected: {json.dumps(expected)}\n"
+                f"Result: {verdict}\n"
+            )
+            print(verdict)
+            continue
+        # Step 2: Check if all required keys are present
+        required_keys = ["document_level", "clause_level"]
+        missing_keys = [key for key in required_keys if key not in parsed_answer]
+        if missing_keys:
+            verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
+            responses.append(
+                f"Question: {question}\n"
+                f"Answer: {json.dumps(parsed_answer)}\n"
+                f"Expected: {json.dumps(expected)}\n"
+                f"Result: {verdict}\n"
+            )
+            print(verdict)
+            continue
+        # Step 3: Check if values for each key match
+        incorrect_values = []
+        for key in required_keys:
+            if parsed_answer[key] != expected[key]:
+                incorrect_values.append(key)
+        if len(incorrect_values) == 2:
+            verdict = "Incorrect (Both values are incorrect)"
+        elif len(incorrect_values) == 1:
+            verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
+        else:
             score += 1
             verdict = "Correct"
         responses.append(
             f"Question: {question}\n"
+            f"Answer: {json.dumps(parsed_answer)}\n"
+            f"Expected: {json.dumps(expected)}\n"
             f"Result: {verdict}\n"
         )
+        print(verdict)
     result_details = "\n".join(responses)