arvind6599 committed on
Commit
7e8b548
·
1 Parent(s): 8096fdc

Logging llm1 output and corrected error message for wrong keys

Browse files
Files changed (1) hide show
  1. app.py +8 -9
app.py CHANGED
@@ -146,7 +146,6 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
146
  end_tag = "</user_message>"
147
 
148
 
149
-
150
  # Process each evaluation question.
151
  for item in EVALUATION_QUESTIONS:
152
  # Usual assumption is that the question is relevant unless proven otherwise.
@@ -220,7 +219,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
220
  score += 1
221
  responses.append(
222
  f"Question: {question}\n"
223
- f"Answer: {answer}\n"
224
  f"Expected: {expected}\n"
225
  f"Result: {verdict}\n"
226
  )
@@ -231,7 +230,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
231
  verdict = "Incorrect (Query was irrelevant, but no user message found)"
232
  responses.append(
233
  f"Question: {question}\n"
234
- f"Answer: {answer}\n"
235
  f"Expected: {expected}\n"
236
  f"Result: {verdict}\n"
237
  )
@@ -244,7 +243,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
244
  verdict = "Incorrect (Query was relevant, but user message found)"
245
  responses.append(
246
  f"Question: {question}\n"
247
- f"Answer: {answer}\n"
248
  f"Expected: {json.dumps(expected)}\n"
249
  f"Result: {verdict}\n"
250
  )
@@ -259,7 +258,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
259
  verdict = f"Incorrect (Invalid JSON: {str(e)})"
260
  responses.append(
261
  f"Question: {question}\n"
262
- f"Answer: {answer}\n"
263
  f"Expected: {json.dumps(expected)}\n"
264
  f"Result: {verdict}\n"
265
  )
@@ -274,7 +273,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
274
  verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
275
  responses.append(
276
  f"Question: {question}\n"
277
- f"Answer: {json.dumps(parsed_answer)}\n"
278
  f"Expected: {json.dumps(expected)}\n"
279
  f"Result: {verdict}\n"
280
  )
@@ -288,8 +287,8 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
288
  if parsed_answer[key] != expected[key]:
289
  incorrect_values.append(key)
290
 
291
- if len(incorrect_values) == 2:
292
- verdict = "Incorrect (Both values are incorrect)"
293
  elif len(incorrect_values) == 1:
294
  verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
295
  else:
@@ -298,7 +297,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
298
 
299
  responses.append(
300
  f"Question: {question}\n"
301
- f"Answer: {json.dumps(parsed_answer)}\n"
302
  f"Expected: {json.dumps(expected)}\n"
303
  f"Result: {verdict}\n"
304
  )
 
146
  end_tag = "</user_message>"
147
 
148
 
 
149
  # Process each evaluation question.
150
  for item in EVALUATION_QUESTIONS:
151
  # Usual assumption is that the question is relevant unless proven otherwise.
 
219
  score += 1
220
  responses.append(
221
  f"Question: {question}\n"
222
+ f"Answer: {output1}\n --- \n{answer}\n"
223
  f"Expected: {expected}\n"
224
  f"Result: {verdict}\n"
225
  )
 
230
  verdict = "Incorrect (Query was irrelevant, but no user message found)"
231
  responses.append(
232
  f"Question: {question}\n"
233
+ f"Answer: {output1}\n --- \n{answer}\n"
234
  f"Expected: {expected}\n"
235
  f"Result: {verdict}\n"
236
  )
 
243
  verdict = "Incorrect (Query was relevant, but user message found)"
244
  responses.append(
245
  f"Question: {question}\n"
246
+ f"Answer: {output1}\n --- \n{answer}\n"
247
  f"Expected: {json.dumps(expected)}\n"
248
  f"Result: {verdict}\n"
249
  )
 
258
  verdict = f"Incorrect (Invalid JSON: {str(e)})"
259
  responses.append(
260
  f"Question: {question}\n"
261
+ f"Answer: {output1}\n --- \n{answer}\n"
262
  f"Expected: {json.dumps(expected)}\n"
263
  f"Result: {verdict}\n"
264
  )
 
273
  verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
274
  responses.append(
275
  f"Question: {question}\n"
276
+ f"Answer: {output1}\n --- \n{json.dumps(parsed_answer)}\n"
277
  f"Expected: {json.dumps(expected)}\n"
278
  f"Result: {verdict}\n"
279
  )
 
287
  if parsed_answer[key] != expected[key]:
288
  incorrect_values.append(key)
289
 
290
+ if len(incorrect_values) > 1:
291
+ verdict = f"Incorrect (Values for keys {', '.join([repr(k) for k in incorrect_values])} are incorrect)"
292
  elif len(incorrect_values) == 1:
293
  verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
294
  else:
 
297
 
298
  responses.append(
299
  f"Question: {question}\n"
300
+ f"Answer: {output1}\n --- \n{json.dumps(parsed_answer)}\n"
301
  f"Expected: {json.dumps(expected)}\n"
302
  f"Result: {verdict}\n"
303
  )