Spaces:
Running
Running
arvind6599
committed on
Commit
·
7e8b548
1
Parent(s):
8096fdc
Logging llm1 output and corrected error message for wrong keys
Browse files
app.py
CHANGED
@@ -146,7 +146,6 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
146 |
end_tag = "</user_message>"
|
147 |
|
148 |
|
149 |
-
|
150 |
# Process each evaluation question.
|
151 |
for item in EVALUATION_QUESTIONS:
|
152 |
# Usual assumption is that the question is relevant unless proven otherwise.
|
@@ -220,7 +219,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
220 |
score += 1
|
221 |
responses.append(
|
222 |
f"Question: {question}\n"
|
223 |
-
f"Answer: {answer}\n"
|
224 |
f"Expected: {expected}\n"
|
225 |
f"Result: {verdict}\n"
|
226 |
)
|
@@ -231,7 +230,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
231 |
verdict = "Incorrect (Query was irrelevant, but no user message found)"
|
232 |
responses.append(
|
233 |
f"Question: {question}\n"
|
234 |
-
f"Answer: {answer}\n"
|
235 |
f"Expected: {expected}\n"
|
236 |
f"Result: {verdict}\n"
|
237 |
)
|
@@ -244,7 +243,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
244 |
verdict = "Incorrect (Query was relevant, but user message found)"
|
245 |
responses.append(
|
246 |
f"Question: {question}\n"
|
247 |
-
f"Answer: {answer}\n"
|
248 |
f"Expected: {json.dumps(expected)}\n"
|
249 |
f"Result: {verdict}\n"
|
250 |
)
|
@@ -259,7 +258,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
259 |
verdict = f"Incorrect (Invalid JSON: {str(e)})"
|
260 |
responses.append(
|
261 |
f"Question: {question}\n"
|
262 |
-
f"Answer: {answer}\n"
|
263 |
f"Expected: {json.dumps(expected)}\n"
|
264 |
f"Result: {verdict}\n"
|
265 |
)
|
@@ -274,7 +273,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
274 |
verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
|
275 |
responses.append(
|
276 |
f"Question: {question}\n"
|
277 |
-
f"Answer: {json.dumps(parsed_answer)}\n"
|
278 |
f"Expected: {json.dumps(expected)}\n"
|
279 |
f"Result: {verdict}\n"
|
280 |
)
|
@@ -288,8 +287,8 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
288 |
if parsed_answer[key] != expected[key]:
|
289 |
incorrect_values.append(key)
|
290 |
|
291 |
-
if len(incorrect_values)
|
292 |
-
verdict = "Incorrect (
|
293 |
elif len(incorrect_values) == 1:
|
294 |
verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
|
295 |
else:
|
@@ -298,7 +297,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
298 |
|
299 |
responses.append(
|
300 |
f"Question: {question}\n"
|
301 |
-
f"Answer: {json.dumps(parsed_answer)}\n"
|
302 |
f"Expected: {json.dumps(expected)}\n"
|
303 |
f"Result: {verdict}\n"
|
304 |
)
|
|
|
146 |
end_tag = "</user_message>"
|
147 |
|
148 |
|
|
|
149 |
# Process each evaluation question.
|
150 |
for item in EVALUATION_QUESTIONS:
|
151 |
# Usual assumption is that the question is relevant unless proven otherwise.
|
|
|
219 |
score += 1
|
220 |
responses.append(
|
221 |
f"Question: {question}\n"
|
222 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
223 |
f"Expected: {expected}\n"
|
224 |
f"Result: {verdict}\n"
|
225 |
)
|
|
|
230 |
verdict = "Incorrect (Query was irrelevant, but no user message found)"
|
231 |
responses.append(
|
232 |
f"Question: {question}\n"
|
233 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
234 |
f"Expected: {expected}\n"
|
235 |
f"Result: {verdict}\n"
|
236 |
)
|
|
|
243 |
verdict = "Incorrect (Query was relevant, but user message found)"
|
244 |
responses.append(
|
245 |
f"Question: {question}\n"
|
246 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
247 |
f"Expected: {json.dumps(expected)}\n"
|
248 |
f"Result: {verdict}\n"
|
249 |
)
|
|
|
258 |
verdict = f"Incorrect (Invalid JSON: {str(e)})"
|
259 |
responses.append(
|
260 |
f"Question: {question}\n"
|
261 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
262 |
f"Expected: {json.dumps(expected)}\n"
|
263 |
f"Result: {verdict}\n"
|
264 |
)
|
|
|
273 |
verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
|
274 |
responses.append(
|
275 |
f"Question: {question}\n"
|
276 |
+
f"Answer: {output1}\n --- \n{json.dumps(parsed_answer)}\n"
|
277 |
f"Expected: {json.dumps(expected)}\n"
|
278 |
f"Result: {verdict}\n"
|
279 |
)
|
|
|
287 |
if parsed_answer[key] != expected[key]:
|
288 |
incorrect_values.append(key)
|
289 |
|
290 |
+
if len(incorrect_values) > 1:
|
291 |
+
verdict = f"Incorrect (Values for keys {', '.join([repr(k) for k in incorrect_values])} are incorrect)"
|
292 |
elif len(incorrect_values) == 1:
|
293 |
verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
|
294 |
else:
|
|
|
297 |
|
298 |
responses.append(
|
299 |
f"Question: {question}\n"
|
300 |
+
f"Answer: {output1}\n --- \n{json.dumps(parsed_answer)}\n"
|
301 |
f"Expected: {json.dumps(expected)}\n"
|
302 |
f"Result: {verdict}\n"
|
303 |
)
|