Update app.py
app.py CHANGED
(original version; removed or changed lines are marked with "-")

@@ -1,67 +1,88 @@
import os
import re
import json
-
import gradio as gr
from openai import OpenAI

# Initialize the OpenAI client with the API key from environment variables.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

-# In-memory …
submitted_emails = set()

def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.

-    Expected environment …
-
-

    Both lists must be of equal length.
    """
    questions_str = os.environ.get("TEST_QUESTION_1")
-    print("questions",questions_str)
    expected_str = os.environ.get("TEST_EXPECTED_1")
-    print("expected",expected_str)
    if not questions_str or not expected_str:
        return []
    try:
        questions_list = json.loads(questions_str)
-        print("questions lst ",questions_list)
    except Exception as e:
        print(f"Error parsing questions: {str(e)}")
        return []
-    try:
        expected_list = json.loads(expected_str)
-        print("expected lst",expected_list)
    except Exception as e:
-        print(f"Error parsing …
        return []
    if len(questions_list) != len(expected_list):
-        print("length of question list", len(questions_list))
-        print("length of solution list", len(expected_list))
        print("Mismatch in length: questions list and expected answers list must have the same length.")
        return []
    return [{"question": q, "expected": e} for q, e in zip(questions_list, expected_list)]

-# Load …
EVALUATION_QUESTIONS = get_evaluation_questions()

def sanitize_input(text):
    """
-    Sanitizes input to …
    """
-    # Allow alphanumerics and some punctuation, then truncate to 500 characters.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]
-
def sanitize_prompt(text):
    """
-    Sanitizes …
    """
    return text.strip()[:8000]
def validate_email(email):
    """
    Validates that the provided email is in a valid format.
@@ -72,64 +93,79 @@ def validate_email(email):

def submit_prompt(email, name, system_prompt):
    """
-    Handles …
-
-
-
-
-
    """
    # Validate email format.
    if not validate_email(email):
        return "Invalid email address. Please enter a valid email."
-
-    # Check if this email has already been …
    if email in submitted_emails:
        return f"Submission already received for {email}. You can only submit once."
-
    # Sanitize inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
-    print("UNSAINITIZED SYSTEM PROMPT", system_prompt)
    system_prompt = sanitize_prompt(system_prompt)
-
-
    score = 0
-    responses = []
-
    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        try:
-            # Use the new client-based API for chat completions.
            response = client.chat.completions.create(
-                model="gpt-4o-mini",  # Ensure this identifier matches …
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
            )
-            # Extract the answer from the response object.
            answer = response.choices[0].message.content.strip()
-            print(" …
        except Exception as e:
            answer = f"Error during OpenAI API call: {str(e)}"
-
-
        try:
            parsed_answer = json.loads(answer)
        except json.JSONDecodeError as e:
-            verdict = f"Incorrect (Invalid JSON …
            responses.append(
                f"Question: {question}\n"
                f"Answer: {answer}\n"
                f"Expected: {json.dumps(expected)}\n"
                f"Result: {verdict}\n"
            )
-
            continue
-
-        # …
        required_keys = ["document_level", "clause_level"]
        missing_keys = [key for key in required_keys if key not in parsed_answer]
        if missing_keys:
@@ -140,15 +176,16 @@ def submit_prompt(email, name, system_prompt):
                f"Expected: {json.dumps(expected)}\n"
                f"Result: {verdict}\n"
            )
-
            continue
-
-        # …
        incorrect_values = []
        for key in required_keys:
            if parsed_answer[key] != expected[key]:
                incorrect_values.append(key)
-
        if len(incorrect_values) == 2:
            verdict = "Incorrect (Both values are incorrect)"
        elif len(incorrect_values) == 1:
@@ -156,24 +193,38 @@ def submit_prompt(email, name, system_prompt):
        else:
            score += 1
            verdict = "Correct"
-
        responses.append(
            f"Question: {question}\n"
            f"Answer: {json.dumps(parsed_answer)}\n"
            f"Expected: {json.dumps(expected)}\n"
            f"Result: {verdict}\n"
        )
-
-
-
    result_details = "\n".join(responses)
-
-    # Record this email …
    submitted_emails.add(email)
-
    return (
        f"Thank you for your submission, {name}!\n\n"
-        f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\ …
    )

def build_interface():
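For context before the full updated listing below: the new get_google_sheet() helper assumes a spreadsheet (identified by SPREADSHEET_ID) that already contains a worksheet named "Submissions" whose second column holds email addresses under a header row, and submit_prompt() appends rows of the form Name, Email, System Prompt, Score, followed by verdict/answer pairs. A minimal one-off setup sketch under those assumptions; the header labels themselves are illustrative and not taken from the commit:

    import json
    import os

    import gspread
    from google.oauth2.service_account import Credentials

    SCOPES = [
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]

    creds = Credentials.from_service_account_info(
        json.loads(os.environ["GOOGLE_CREDS_JSON"]), scopes=SCOPES
    )
    sh = gspread.authorize(creds).open_by_key(os.environ["SPREADSHEET_ID"])

    # One-off: create the "Submissions" worksheet with a header row.
    # Column 2 must hold the email addresses, since submit_prompt() reads
    # col_values(2) and skips the first (header) row.
    ws = sh.add_worksheet(title="Submissions", rows=100, cols=20)
    header = ["Name", "Email", "System Prompt", "Score"]
    for i in range(1, 8):  # the docstring mentions 7 test questions
        header.extend([f"Verdict {i}", f"Answer {i}"])  # illustrative labels
    ws.append_row(header)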
(updated version; added or changed lines are marked with "+")

import os
import re
import json
import gradio as gr
from openai import OpenAI
+import gspread
+from google.oauth2.service_account import Credentials
+
+# Define scopes for Google Sheets and Drive API access.
+SCOPES = [
+    "https://www.googleapis.com/auth/spreadsheets",
+    "https://www.googleapis.com/auth/drive"
+]

# Initialize the OpenAI client with the API key from environment variables.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

+# In-memory set to track submitted emails (this resets when the app restarts).
submitted_emails = set()

+def get_google_sheet():
+    """
+    Connects to the Google Sheet using service account credentials stored
+    in the environment variable "GOOGLE_CREDS_JSON" and returns the worksheet
+    named "Submissions" from the spreadsheet identified by "SPREADSHEET_ID".
+    """
+    creds = Credentials.from_service_account_info(
+        json.loads(os.environ["GOOGLE_CREDS_JSON"]),
+        scopes=SCOPES
+    )
+    gc = gspread.authorize(creds)
+    sh = gc.open_by_key(os.environ["SPREADSHEET_ID"])
+    worksheet = sh.worksheet("Submissions")
+    return worksheet
+
def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.

+    Expected environment variables:
+    - TEST_QUESTION_1: a JSON array of user query strings.
+    - TEST_EXPECTED_1: a JSON array of JSON-like strings representing expected outputs.

    Both lists must be of equal length.
    """
    questions_str = os.environ.get("TEST_QUESTION_1")
+    print("questions", questions_str)
    expected_str = os.environ.get("TEST_EXPECTED_1")
+    print("expected", expected_str)
    if not questions_str or not expected_str:
        return []
    try:
        questions_list = json.loads(questions_str)
+        print("questions lst ", questions_list)
    except Exception as e:
        print(f"Error parsing questions: {str(e)}")
        return []
+    try:
        expected_list = json.loads(expected_str)
+        print("expected lst", expected_list)
    except Exception as e:
+        print(f"Error parsing expected answers: {str(e)}")
        return []
    if len(questions_list) != len(expected_list):
        print("Mismatch in length: questions list and expected answers list must have the same length.")
        return []
    return [{"question": q, "expected": e} for q, e in zip(questions_list, expected_list)]

+# Load evaluation questions at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()

def sanitize_input(text):
    """
+    Sanitizes input to allow only alphanumerics and some punctuation,
+    then truncates to 500 characters.
    """
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]
+
def sanitize_prompt(text):
    """
+    Sanitizes the system prompt by stripping and limiting its length.
    """
    return text.strip()[:8000]
+
def validate_email(email):
    """
    Validates that the provided email is in a valid format.

@@ -72,64 +93,79 @@ def submit_prompt(email, name, system_prompt):

def submit_prompt(email, name, system_prompt):
    """
+    Handles the full submission process:
+    - Validates email format.
+    - Checks if the email has already been used (by in-memory set and Google Sheet).
+    - Sanitizes input fields.
+    - Processes the system prompt against each evaluation question using the OpenAI API.
+    - For each test question, records the verdict and answer.
+    - Appends the submission as a new row in the Google Sheet with columns:
+      Name, Email, System Prompt, Score, and for each of the 7 test questions: verdict and answer.
+    Returns a result message with evaluation details.
    """
    # Validate email format.
    if not validate_email(email):
        return "Invalid email address. Please enter a valid email."
+
+    # Check if this email has already been submitted (in-memory).
    if email in submitted_emails:
        return f"Submission already received for {email}. You can only submit once."
+
+    # Connect to Google Sheet and check if the email already exists.
+    try:
+        sheet = get_google_sheet()
+        email_col = sheet.col_values(2)  # Assumes column 2 contains the email addresses.
+        if email in email_col[1:]:  # Skip header row.
+            return f"Submission already received for {email}. You can only submit once."
+    except Exception as e:
+        print(f"Error accessing Google Sheet: {str(e)}")
+        return f"Error accessing Google Sheet: {str(e)}"
+
    # Sanitize inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_prompt(system_prompt)
+
    score = 0
+    responses = []  # For display output.
+    verdicts = []  # For storing each question's verdict in the sheet.
+    answers_list = []  # For storing each question's answer in the sheet.
+
+    # Process each evaluation question.
    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        try:
            response = client.chat.completions.create(
+                model="gpt-4o-mini",  # Ensure this model identifier matches your deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
            )
            answer = response.choices[0].message.content.strip()
+            print("LLM answer:", answer)
        except Exception as e:
            answer = f"Error during OpenAI API call: {str(e)}"
+
+        verdict = ""
+        # Check if the answer is a valid JSON.
        try:
            parsed_answer = json.loads(answer)
+            answer_to_store = json.dumps(parsed_answer)  # Normalize parsed JSON as string.
        except json.JSONDecodeError as e:
+            verdict = f"Incorrect (Invalid JSON: {str(e)})"
            responses.append(
                f"Question: {question}\n"
                f"Answer: {answer}\n"
                f"Expected: {json.dumps(expected)}\n"
                f"Result: {verdict}\n"
            )
+            verdicts.append(verdict)
+            answers_list.append(answer)
            continue
+
+        # Verify that all required keys are present.
        required_keys = ["document_level", "clause_level"]
        missing_keys = [key for key in required_keys if key not in parsed_answer]
        if missing_keys:

@@ -140,15 +176,16 @@ def submit_prompt(email, name, system_prompt):

                f"Expected: {json.dumps(expected)}\n"
                f"Result: {verdict}\n"
            )
+            verdicts.append(verdict)
+            answers_list.append(json.dumps(parsed_answer))
            continue
+
+        # Compare values for each required key.
        incorrect_values = []
        for key in required_keys:
            if parsed_answer[key] != expected[key]:
                incorrect_values.append(key)
+
        if len(incorrect_values) == 2:
            verdict = "Incorrect (Both values are incorrect)"
        elif len(incorrect_values) == 1:

@@ -156,24 +193,38 @@ def submit_prompt(email, name, system_prompt):

        else:
            score += 1
            verdict = "Correct"
+
        responses.append(
            f"Question: {question}\n"
            f"Answer: {json.dumps(parsed_answer)}\n"
            f"Expected: {json.dumps(expected)}\n"
            f"Result: {verdict}\n"
        )
+        verdicts.append(verdict)
+        answers_list.append(json.dumps(parsed_answer))
+
    result_details = "\n".join(responses)
+
+    # Record this email locally so that subsequent submissions are blocked.
    submitted_emails.add(email)
+
+    # Prepare the row for Google Sheets:
+    # The row format is: Name, Email, System Prompt, Score, then for each of the 7 test questions: Verdict, Answer.
+    row = [name, email, system_prompt, str(score)]
+    for v, a in zip(verdicts, answers_list):
+        row.extend([v, a])
+
+    # Append the new row to the Google Sheet.
+    try:
+        sheet.append_row(row)
+    except Exception as e:
+        print(f"Error appending row to Google Sheet: {str(e)}")
+        return f"Error saving submission: {str(e)}"
+
    return (
        f"Thank you for your submission, {name}!\n\n"
+        f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\n"
+        f"Details:\n{result_details}"
    )

def build_interface():