arvind6599 committed on
Commit 64feb25 · 1 Parent(s): 54a0bc8

Added evaluation methods

Files changed (1): app.py (+126, -19)
app.py CHANGED
@@ -96,7 +96,11 @@ def validate_email(email):
     email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
     return re.match(email_regex, email) is not None
 
-def submit_prompt(email, name, system_prompt):
+
+
+
+
+def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3):
     """
     Handles the full submission process:
     - Validates email format.
@@ -129,32 +133,122 @@ def submit_prompt(email, name, system_prompt):
     # Sanitize inputs.
     email = sanitize_input(email)
     name = sanitize_input(name)
-    system_prompt = sanitize_prompt(system_prompt)
-
+    system_prompt_1 = sanitize_prompt(system_prompt_1)
+    system_prompt_2 = sanitize_prompt(system_prompt_2)
+    system_prompt_3 = sanitize_prompt(system_prompt_3)
+
     score = 0
     responses = [] # For display output.
     verdicts = [] # For storing each question's verdict in the sheet.
     answers_list = [] # For storing each question's answer in the sheet.
 
+    start_tag = "<user_message>"
+    end_tag = "</user_message>"
+
+
+
     # Process each evaluation question.
     for item in EVALUATION_QUESTIONS:
+        # Usual assumption is that the question is relevant unless proven otherwise.
+        notRelevant = False
         question = item["question"]
-        docs = item["docs"]
+        docs = item["docs"].split("---") if item["docs"] else []
         expected = item["expected"]
         try:
             response = client.chat.completions.create(
                 model="gpt-4o-mini", # Ensure this model identifier matches your deployed model.
                 messages=[
-                    {"role": "system", "content": system_prompt},
+                    {"role": "system", "content": system_prompt_1},
                     {"role": "user", "content": question}
                 ]
             )
-            answer = response.choices[0].message.content.strip()
+            output_1 = response.choices[0].message.content.strip()
         except Exception as e:
-            answer = f"Error during OpenAI API call: {str(e)}"
+            output_1 = f"Error during OpenAI API call: {str(e)}"
+
+        # Check if the answer contains the user message tags.
+        if start_tag in output_1 and end_tag in output_1:
+            # Extract the content between the tags.
+            start_index = output_1.index(start_tag) + len(start_tag)
+            end_index = output_1.index(end_tag)
+            # Extract the answer between the tags and stop the execution for this question as the query is deemed irrelevant.
+            answer = output_1[start_index:end_index].strip()
+            notRelevant = True
+        else:
+            # If no tags, treat the entire answer as the response.
+            output1 = output_1.strip()
+            output2 = ""
+
+            for doc in docs:
+                try:
+                    response = client.chat.completions.create(
+                        model="gpt-4o-mini",
+                        messages=[
+                            {"role": "system", "content": system_prompt_2},
+                            {"role": "user", "content": f"Target company context: \n{output1} \n\nDocument:\n {doc}"}
+                        ]
+                    )
+                    output2 += "\n" + response.choices[0].message.content.strip()
+                except Exception as e:
+                    output2 += f"\nError processing document: {str(e)}"
+
+            # Prepare the final output for LLM3.
+
+            answer = output2.strip()
+            try:
+                response = client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[
+                        {"role": "system", "content": system_prompt_3},
+                        {"role": "user", "content": f"Extracted information: \n{answer}"}
+                    ]
+                )
+                answer = response.choices[0].message.content.strip()
+            except Exception as e:
+                answer = f"Error during final OpenAI API call: {str(e)}"
+
 
         verdict = ""
-        # Check if the answer is a valid JSON.
+
+        # When the expected output is a string, it indicates that the query was irrelevant.
+        if isinstance(expected, str):
+            if notRelevant:
+                verdict = f"Correct"
+                score += 1
+                responses.append(
+                    f"Question: {question}\n"
+                    f"Answer: {answer}\n"
+                    f"Expected: {expected}\n"
+                    f"Result: {verdict}\n"
+                )
+                verdicts.append(verdict)
+                answers_list.append(answer)
+                continue
+            else:
+                verdict = "Incorrect (Query was irrelevant, but no user message found)"
+                responses.append(
+                    f"Question: {question}\n"
+                    f"Answer: {answer}\n"
+                    f"Expected: {expected}\n"
+                    f"Result: {verdict}\n"
+                )
+                verdicts.append(verdict)
+                answers_list.append(answer)
+                continue
+
+        # If the expected output is a JSON object but answer is a String
+        if notRelevant and not isinstance(expected, str):
+            verdict = "Incorrect (Query was relevant, but user message found)"
+            responses.append(
+                f"Question: {question}\n"
+                f"Answer: {answer}\n"
+                f"Expected: {json.dumps(expected)}\n"
+                f"Result: {verdict}\n"
+            )
+            verdicts.append(verdict)
+            answers_list.append(answer)
+            continue
+
         try:
             parsed_answer = json.loads(answer)
             answer_to_store = json.dumps(parsed_answer) # Normalize parsed JSON as string.
@@ -171,7 +265,7 @@ def submit_prompt(email, name, system_prompt):
             continue
 
         # Verify that all required keys are present.
-        required_keys = ["document_level", "clause_level"]
+        required_keys = ["buyer_firm", "seller_firm", "third_party", "contains_target_firm"]
         missing_keys = [key for key in required_keys if key not in parsed_answer]
         if missing_keys:
             verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
@@ -212,7 +306,8 @@ def submit_prompt(email, name, system_prompt):
 
     # Record this email locally so that subsequent submissions are blocked.
    submitted_emails.add(email)
-
+    system_prompt = f"{system_prompt_1}\n---\n{system_prompt_2}\n---\n{system_prompt_3}"
+
     # Prepare the row for Google Sheets:
     # The row format is: Name, Email, System Prompt, Score, then for each of the 7 test questions: Verdict, Answer.
     row = [name, email, system_prompt, str(score)]
@@ -341,25 +436,37 @@ def build_interface():
 
     email_input = gr.Textbox(label="Email", placeholder="[email protected]")
     name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
-    system_prompt_input = gr.Textbox(
-        label="System Prompt",
+    system_prompt_input_1 = gr.Textbox(
+        label="System Prompt for LLM1",
         placeholder="Enter your system prompt here...",
         lines=6,
     )
+
+    system_prompt_input_2 = gr.Textbox(
+        label="System Prompt for LLM2",
+        placeholder="Enter your system prompt here...",
+        lines=10,
+    )
+
+    system_prompt_input_3 = gr.Textbox(
+        label="System Prompt for LLM3",
+        placeholder="Enter your system prompt here...",
+        lines=6,
+    )
+
     submit_button = gr.Button("Submit")
    output_text = gr.Textbox(label="Results", lines=15)
 
     submit_button.click(
         fn=submit_prompt,
-        inputs=[email_input, name_input, system_prompt_input],
+        inputs=[email_input, name_input, system_prompt_input_1, system_prompt_input_2, system_prompt_input_3],
         outputs=output_text,
     )
-
-
-
     return demo
 
 if __name__ == "__main__":
-    interface = build_interface()
-    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
-    interface.launch(server_name="0.0.0.0", server_port=7860)
+    # interface = build_interface()
+    # # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
+    # interface.launch(server_name="0.0.0.0", server_port=7860)
+
+    submit_prompt()
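
A note on the data shape the new loop assumes: each entry of EVALUATION_QUESTIONS must now supply "question", "docs" (a single string whose individual documents are separated by "---", or empty for irrelevant queries), and "expected" (a dict for relevant queries, a plain string for irrelevant ones). The test set itself is not part of this diff, so the entries below are purely hypothetical, one of each kind:

    EVALUATION_QUESTIONS = [
        {
            # Relevant query: "expected" is a dict holding the graded fields.
            "question": "Which firms are party to the attached agreement?",
            "docs": "Text of document 1...\n---\nText of document 2...",
            "expected": {
                "buyer_firm": "...",
                "seller_firm": "...",
                "third_party": "...",
                "contains_target_firm": True,
            },
        },
        {
            # Irrelevant query: "expected" is a plain string, and LLM1 is
            # expected to wrap the query in <user_message> tags.
            "question": "What is the capital of France?",
            "docs": "",
            "expected": "Irrelevant query",
        },
    ]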
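The updated required_keys check implies that LLM3's final answer must parse as a JSON object carrying at least the four graded fields. A minimal sketch of that shape, with illustrative placeholder values rather than anything taken from the test set:

    {
        "buyer_firm": "Acme Capital Partners",
        "seller_firm": "Example Industries Ltd",
        "third_party": "Smith & Jones LLP",
        "contains_target_firm": true
    }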
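One caveat in the new __main__ block: the Gradio launch is commented out and submit_prompt() is called with no arguments, which will raise a TypeError now that the function requires five parameters. A local smoke test would presumably pass explicit values, roughly as in the sketch below (the email, name, and prompt strings are placeholders, not values from this commit):

    if __name__ == "__main__":
        # Hypothetical direct call for local testing; in normal use the
        # arguments come from the Gradio inputs wired up in build_interface().
        result = submit_prompt(
            email="tester@example.com",
            name="Test, User",
            system_prompt_1="Prompt for LLM1 (relevance check and company context)...",
            system_prompt_2="Prompt for LLM2 (per-document extraction)...",
            system_prompt_3="Prompt for LLM3 (merging into the final JSON object)...",
        )
        print(result)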