Spaces:

DeepJudge
/

Applicant-Task-Submission

Running

App Files Files

arvind6599 committed on Jul 21

Commit

64feb25

1 Parent(s): 54a0bc8

Added evaluation methods

Browse files

Files changed (1) hide show

app.py +126 -19

app.py CHANGED Viewed

@@ -96,7 +96,11 @@ def validate_email(email):
     email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
     return re.match(email_regex, email) is not None
-def submit_prompt(email, name, system_prompt):
     """
     Handles the full submission process:
      - Validates email format.
@@ -129,32 +133,122 @@ def submit_prompt(email, name, system_prompt):
     # Sanitize inputs.
     email = sanitize_input(email)
     name = sanitize_input(name)
-    system_prompt = sanitize_prompt(system_prompt)
     score = 0
     responses = []    # For display output.
     verdicts = []     # For storing each question's verdict in the sheet.
     answers_list = [] # For storing each question's answer in the sheet.
     # Process each evaluation question.
     for item in EVALUATION_QUESTIONS:
         question = item["question"]
-        docs = item["docs"]
         expected = item["expected"]
         try:
             response = client.chat.completions.create(
                 model="gpt-4o-mini",  # Ensure this model identifier matches your deployed model.
                 messages=[
-                    {"role": "system", "content": system_prompt},
                     {"role": "user", "content": question}
                 ]
             )
-            answer = response.choices[0].message.content.strip()
         except Exception as e:
-            answer = f"Error during OpenAI API call: {str(e)}"
         verdict = ""
-        # Check if the answer is a valid JSON.
         try:
             parsed_answer = json.loads(answer)
             answer_to_store = json.dumps(parsed_answer)  # Normalize parsed JSON as string.
@@ -171,7 +265,7 @@ def submit_prompt(email, name, system_prompt):
             continue
         # Verify that all required keys are present.
-        required_keys = ["document_level", "clause_level"]
         missing_keys = [key for key in required_keys if key not in parsed_answer]
         if missing_keys:
             verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
@@ -212,7 +306,8 @@ def submit_prompt(email, name, system_prompt):
     # Record this email locally so that subsequent submissions are blocked.
     submitted_emails.add(email)
     # Prepare the row for Google Sheets:
     # The row format is: Name, Email, System Prompt, Score, then for each of the 7 test questions: Verdict, Answer.
     row = [name, email, system_prompt, str(score)]
@@ -341,25 +436,37 @@ def build_interface():
         email_input = gr.Textbox(label="Email", placeholder="[email protected]")
         name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
-        system_prompt_input = gr.Textbox(
-            label="System Prompt",
             placeholder="Enter your system prompt here...",
             lines=6,
         )
         submit_button = gr.Button("Submit")
         output_text = gr.Textbox(label="Results", lines=15)
         submit_button.click(
             fn=submit_prompt,
-            inputs=[email_input, name_input, system_prompt_input],
             outputs=output_text,
         )
     return demo
 if __name__ == "__main__":
-    interface = build_interface()
-    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
-    interface.launch(server_name="0.0.0.0", server_port=7860)

     email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
     return re.match(email_regex, email) is not None
+def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3):
     """
     Handles the full submission process:
      - Validates email format.
     # Sanitize inputs.
     email = sanitize_input(email)
     name = sanitize_input(name)
+    system_prompt_1 = sanitize_prompt(system_prompt_1)
+    system_prompt_2 = sanitize_prompt(system_prompt_2)
+    system_prompt_3 = sanitize_prompt(system_prompt_3)
     score = 0
     responses = []    # For display output.
     verdicts = []     # For storing each question's verdict in the sheet.
     answers_list = [] # For storing each question's answer in the sheet.
+    start_tag = "<user_message>"
+    end_tag = "</user_message>"
     # Process each evaluation question.
     for item in EVALUATION_QUESTIONS:
+        # Usual assumption is that the question is relevant unless proven otherwise.
+        notRelevant = False
         question = item["question"]
+        docs = item["docs"].split("---") if item["docs"] else []
         expected = item["expected"]
         try:
             response = client.chat.completions.create(
                 model="gpt-4o-mini",  # Ensure this model identifier matches your deployed model.
                 messages=[
+                    {"role": "system", "content": system_prompt_1},
                     {"role": "user", "content": question}
                 ]
             )
+            output_1 = response.choices[0].message.content.strip()
         except Exception as e:
+            output_1 = f"Error during OpenAI API call: {str(e)}"
+        # Check if the answer contains the user message tags.
+        if start_tag in output_1 and end_tag in output_1:
+            # Extract the content between the tags.
+            start_index = output_1.index(start_tag) + len(start_tag)
+            end_index = output_1.index(end_tag)
+            # Extract the answer between the tags and stop the execution for this question as the query is deemed irrelevant.
+            answer = output_1[start_index:end_index].strip()
+            notRelevant = True
+        else:
+            # No tags: the query is treated as relevant, so LLM1's whole output becomes the context passed to LLM2.
+            output1 = output_1.strip()
+            output2 = ""
+            for doc in docs:
+                try:
+                    response = client.chat.completions.create(
+                        model="gpt-4o-mini",
+                        messages=[
+                            {"role": "system", "content": system_prompt_2},
+                            {"role": "user", "content": f"Target company context: \n{output1} \n\nDocument:\n {doc}"}
+                        ]
+                    )
+                    output2 += "\n" + response.choices[0].message.content.strip()
+                except Exception as e:
+                    output2 += f"\nError processing document: {str(e)}"
+            # Prepare the final output for LLM3.
+            answer = output2.strip()
+            try:
+                response = client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[
+                        {"role": "system", "content": system_prompt_3},
+                        {"role": "user", "content": f"Extracted information: \n{answer}"}
+                    ]
+                )
+                answer = response.choices[0].message.content.strip()
+            except Exception as e:
+                answer = f"Error during final OpenAI API call: {str(e)}"
         verdict = ""
+        # When the expected output is a string, it indicates that the query was irrelevant.
+        if isinstance(expected, str):
+            if notRelevant:
+                verdict = f"Correct"
+                score += 1
+                responses.append(
+                    f"Question: {question}\n"
+                    f"Answer: {answer}\n"
+                    f"Expected: {expected}\n"
+                    f"Result: {verdict}\n"
+                )
+                verdicts.append(verdict)
+                answers_list.append(answer)
+                continue
+            else:
+                verdict = "Incorrect (Query was irrelevant, but no user message found)"
+                responses.append(
+                    f"Question: {question}\n"
+                    f"Answer: {answer}\n"
+                    f"Expected: {expected}\n"
+                    f"Result: {verdict}\n"
+                )
+                verdicts.append(verdict)
+                answers_list.append(answer)
+                continue
+        # If the expected output is a JSON object but answer is a String
+        if notRelevant and not isinstance(expected, str):
+            verdict = "Incorrect (Query was relevant, but user message found)"
+            responses.append(
+                f"Question: {question}\n"
+                f"Answer: {answer}\n"
+                f"Expected: {json.dumps(expected)}\n"
+                f"Result: {verdict}\n"
+            )
+            verdicts.append(verdict)
+            answers_list.append(answer)
+            continue
         try:
             parsed_answer = json.loads(answer)
             answer_to_store = json.dumps(parsed_answer)  # Normalize parsed JSON as string.
             continue
         # Verify that all required keys are present.
+        required_keys = ["buyer_firm", "seller_firm", "third_party", "contains_target_firm"]
         missing_keys = [key for key in required_keys if key not in parsed_answer]
         if missing_keys:
             verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
     # Record this email locally so that subsequent submissions are blocked.
     submitted_emails.add(email)
+    system_prompt = f"{system_prompt_1}\n---\n{system_prompt_2}\n---\n{system_prompt_3}"
     # Prepare the row for Google Sheets:
     # The row format is: Name, Email, System Prompt (the three prompts joined by "---" lines), Score, then for each of the 7 test questions: Verdict, Answer.
     row = [name, email, system_prompt, str(score)]
         email_input = gr.Textbox(label="Email", placeholder="[email protected]")
         name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
+        system_prompt_input_1 = gr.Textbox(
+            label="System Prompt for LLM1",
             placeholder="Enter your system prompt here...",
             lines=6,
         )
+        system_prompt_input_2 = gr.Textbox(
+            label="System Prompt for LLM2",
+            placeholder="Enter your system prompt here...",
+            lines=10,
+        )
+        system_prompt_input_3 = gr.Textbox(
+            label="System Prompt for LLM3",
+            placeholder="Enter your system prompt here...",
+            lines=6,
+        )
         submit_button = gr.Button("Submit")
         output_text = gr.Textbox(label="Results", lines=15)
         submit_button.click(
             fn=submit_prompt,
+            inputs=[email_input, name_input, system_prompt_input_1, system_prompt_input_2, system_prompt_input_3],
             outputs=output_text,
         )
     return demo
 if __name__ == "__main__":
     # Script entry point: build the Gradio UI and serve it.
     # submit_prompt() must not be called directly here: it requires five
     # arguments (email, name and the three system prompts) that are only
     # supplied by the form's Submit button, so a bare call raises TypeError
     # and the app never starts.
     interface = build_interface()
     # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
     interface.launch(server_name="0.0.0.0", server_port=7860)