Spaces:
Running
Running
arvind6599
committed on
Commit
·
7e8b548
1
Parent(s):
8096fdc
Log LLM1 output and correct the error message for wrong keys
Browse files
app.py
CHANGED
|
@@ -146,7 +146,6 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 146 |
end_tag = "</user_message>"
|
| 147 |
|
| 148 |
|
| 149 |
-
|
| 150 |
# Process each evaluation question.
|
| 151 |
for item in EVALUATION_QUESTIONS:
|
| 152 |
# Usual assumption is that the question is relevant unless proven otherwise.
|
|
@@ -220,7 +219,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 220 |
score += 1
|
| 221 |
responses.append(
|
| 222 |
f"Question: {question}\n"
|
| 223 |
-
f"Answer: {answer}\n"
|
| 224 |
f"Expected: {expected}\n"
|
| 225 |
f"Result: {verdict}\n"
|
| 226 |
)
|
|
@@ -231,7 +230,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 231 |
verdict = "Incorrect (Query was irrelevant, but no user message found)"
|
| 232 |
responses.append(
|
| 233 |
f"Question: {question}\n"
|
| 234 |
-
f"Answer: {answer}\n"
|
| 235 |
f"Expected: {expected}\n"
|
| 236 |
f"Result: {verdict}\n"
|
| 237 |
)
|
|
@@ -244,7 +243,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 244 |
verdict = "Incorrect (Query was relevant, but user message found)"
|
| 245 |
responses.append(
|
| 246 |
f"Question: {question}\n"
|
| 247 |
-
f"Answer: {answer}\n"
|
| 248 |
f"Expected: {json.dumps(expected)}\n"
|
| 249 |
f"Result: {verdict}\n"
|
| 250 |
)
|
|
@@ -259,7 +258,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 259 |
verdict = f"Incorrect (Invalid JSON: {str(e)})"
|
| 260 |
responses.append(
|
| 261 |
f"Question: {question}\n"
|
| 262 |
-
f"Answer: {answer}\n"
|
| 263 |
f"Expected: {json.dumps(expected)}\n"
|
| 264 |
f"Result: {verdict}\n"
|
| 265 |
)
|
|
@@ -274,7 +273,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 274 |
verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
|
| 275 |
responses.append(
|
| 276 |
f"Question: {question}\n"
|
| 277 |
-
f"Answer: {json.dumps(parsed_answer)}\n"
|
| 278 |
f"Expected: {json.dumps(expected)}\n"
|
| 279 |
f"Result: {verdict}\n"
|
| 280 |
)
|
|
@@ -288,8 +287,8 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 288 |
if parsed_answer[key] != expected[key]:
|
| 289 |
incorrect_values.append(key)
|
| 290 |
|
| 291 |
-
if len(incorrect_values)
|
| 292 |
-
verdict = "Incorrect (
|
| 293 |
elif len(incorrect_values) == 1:
|
| 294 |
verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
|
| 295 |
else:
|
|
@@ -298,7 +297,7 @@ def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3
|
|
| 298 |
|
| 299 |
responses.append(
|
| 300 |
f"Question: {question}\n"
|
| 301 |
-
f"Answer: {json.dumps(parsed_answer)}\n"
|
| 302 |
f"Expected: {json.dumps(expected)}\n"
|
| 303 |
f"Result: {verdict}\n"
|
| 304 |
)
|
|
|
|
| 146 |
end_tag = "</user_message>"
|
| 147 |
|
| 148 |
|
|
|
|
| 149 |
# Process each evaluation question.
|
| 150 |
for item in EVALUATION_QUESTIONS:
|
| 151 |
# Usual assumption is that the question is relevant unless proven otherwise.
|
|
|
|
| 219 |
score += 1
|
| 220 |
responses.append(
|
| 221 |
f"Question: {question}\n"
|
| 222 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
| 223 |
f"Expected: {expected}\n"
|
| 224 |
f"Result: {verdict}\n"
|
| 225 |
)
|
|
|
|
| 230 |
verdict = "Incorrect (Query was irrelevant, but no user message found)"
|
| 231 |
responses.append(
|
| 232 |
f"Question: {question}\n"
|
| 233 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
| 234 |
f"Expected: {expected}\n"
|
| 235 |
f"Result: {verdict}\n"
|
| 236 |
)
|
|
|
|
| 243 |
verdict = "Incorrect (Query was relevant, but user message found)"
|
| 244 |
responses.append(
|
| 245 |
f"Question: {question}\n"
|
| 246 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
| 247 |
f"Expected: {json.dumps(expected)}\n"
|
| 248 |
f"Result: {verdict}\n"
|
| 249 |
)
|
|
|
|
| 258 |
verdict = f"Incorrect (Invalid JSON: {str(e)})"
|
| 259 |
responses.append(
|
| 260 |
f"Question: {question}\n"
|
| 261 |
+
f"Answer: {output1}\n --- \n{answer}\n"
|
| 262 |
f"Expected: {json.dumps(expected)}\n"
|
| 263 |
f"Result: {verdict}\n"
|
| 264 |
)
|
|
|
|
| 273 |
verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
|
| 274 |
responses.append(
|
| 275 |
f"Question: {question}\n"
|
| 276 |
+
f"Answer: {output1}\n --- \n{json.dumps(parsed_answer)}\n"
|
| 277 |
f"Expected: {json.dumps(expected)}\n"
|
| 278 |
f"Result: {verdict}\n"
|
| 279 |
)
|
|
|
|
| 287 |
if parsed_answer[key] != expected[key]:
|
| 288 |
incorrect_values.append(key)
|
| 289 |
|
| 290 |
+
if len(incorrect_values) > 1:
|
| 291 |
+
verdict = f"Incorrect (Values for keys {', '.join([repr(k) for k in incorrect_values])} are incorrect)"
|
| 292 |
elif len(incorrect_values) == 1:
|
| 293 |
verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
|
| 294 |
else:
|
|
|
|
| 297 |
|
| 298 |
responses.append(
|
| 299 |
f"Question: {question}\n"
|
| 300 |
+
f"Answer: {output1}\n --- \n{json.dumps(parsed_answer)}\n"
|
| 301 |
f"Expected: {json.dumps(expected)}\n"
|
| 302 |
f"Result: {verdict}\n"
|
| 303 |
)
|