import os
import re
import json

import gradio as gr
from openai import OpenAI
import gspread
from google.oauth2.service_account import Credentials

SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive",
]
# Initialize the OpenAI client with the API key from environment variables.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# In-memory set to track submitted emails (this resets when the app restarts).
submitted_emails = set()
def get_google_sheet():
    """
    Connects to the Google Sheet using service account credentials stored
    in the environment variable "GOOGLE_CREDS_JSON" and returns the worksheet
    named "Submissions" from the spreadsheet identified by "SPREADSHEET_ID".
    """
    creds = Credentials.from_service_account_info(
        json.loads(os.environ["GOOGLE_CREDS_JSON"]),
        scopes=SCOPES
    )
    gc = gspread.authorize(creds)
    sh = gc.open_by_key(os.environ["SPREADSHEET_ID"])
    worksheet = sh.worksheet("Submissions")
    return worksheet
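
# GOOGLE_CREDS_JSON is expected to hold a full Google service-account key file.
# A hypothetical, truncated example of its shape (not real credentials):
# {"type": "service_account", "project_id": "...", "private_key_id": "...",
#  "private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "..."}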
def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.
    Expected environment variables:
      - TEST_QUESTION_1: a JSON array of user query strings.
      - TEST_EXPECTED_1: a JSON array of expected output objects, each with the
        keys "document_level" and "clause_level".
    Both arrays must have the same length.
    """
    questions_str = os.environ.get("TEST_QUESTION_1")
    expected_str = os.environ.get("TEST_EXPECTED_1")
    if not questions_str or not expected_str:
        return []
    try:
        questions_list = json.loads(questions_str)
    except Exception as e:
        print(f"Error parsing questions: {str(e)}")
        return []
    try:
        expected_list = json.loads(expected_str)
    except Exception as e:
        print(f"Error parsing expected answers: {str(e)}")
        return []
    if len(questions_list) != len(expected_list):
        print("Mismatch in length: questions list and expected answers list must have the same length.")
        return []
    return [{"question": q, "expected": e} for q, e in zip(questions_list, expected_list)]
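
# Illustrative (hypothetical) values -- the real test set is kept private:
# TEST_QUESTION_1='["Extract the arbitration clause from this service agreement."]'
# TEST_EXPECTED_1='[{"document_level": "single", "clause_level": "single"}]'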
# Load evaluation questions at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()

def sanitize_input(text):
    """
    Sanitizes input to allow only alphanumerics, whitespace, and the
    punctuation .,!?@:- then truncates to 500 characters.
    """
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]
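
# Example: sanitize_input("Hello <b>world</b>!") returns "Hello bworldb!" --
# disallowed characters are dropped rather than escaped.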
def sanitize_prompt(text):
    """
    Sanitizes the system prompt by stripping surrounding whitespace and
    truncating to 8000 characters.
    """
    return text.strip()[:8000]

def validate_email(email):
    """
    Validates that the provided email is in a plausible format.
    Returns True if valid, False otherwise.
    """
    email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
    return re.match(email_regex, email) is not None
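
# Examples: validate_email("[email protected]") -> True,
#           validate_email("not-an-email")    -> False.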
def submit_prompt(email, name, system_prompt):
    """
    Handles the full submission process:
      - Validates the email format.
      - Checks whether the email has already been used (in-memory set and Google Sheet).
      - Sanitizes the input fields.
      - Runs the system prompt against each evaluation question via the OpenAI API.
      - Records a verdict and answer for each test question.
      - Appends the submission as a new row in the Google Sheet with the columns:
        Name, Email, System Prompt, Score, then a verdict and answer for each of the 7 test questions.
    Returns a result message with evaluation details.
    """
    # Validate email format.
    if not validate_email(email):
        return "Invalid email address. Please enter a valid email."

    # Check if this email has already been submitted (in-memory).
    if email in submitted_emails:
        return f"Submission already received for {email}. You can only submit once."

    # Connect to the Google Sheet and check if the email already exists there.
    try:
        sheet = get_google_sheet()
        email_col = sheet.col_values(2)  # Assumes column 2 contains the email addresses.
        if email in email_col[1:]:  # Skip the header row.
            return f"Submission already received for {email}. You can only submit once."
    except Exception as e:
        print(f"Error accessing Google Sheet: {str(e)}")
        return f"Error accessing Google Sheet: {str(e)}"

    # Sanitize inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_prompt(system_prompt)

    score = 0
    responses = []     # For display output.
    verdicts = []      # For storing each question's verdict in the sheet.
    answers_list = []  # For storing each question's answer in the sheet.

    # Process each evaluation question.
    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        try:
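            # No temperature is passed, so the API default (1.0) applies,
            # matching the "temp=1" guidance shown to applicants below.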
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # Ensure this model identifier matches your deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
            )
            answer = response.choices[0].message.content.strip()
        except Exception as e:
            answer = f"Error during OpenAI API call: {str(e)}"

        verdict = ""
        # Check whether the answer is valid JSON.
        try:
            parsed_answer = json.loads(answer)
            answer_to_store = json.dumps(parsed_answer)  # Normalized JSON string for the sheet.
        except json.JSONDecodeError as e:
            verdict = f"Incorrect (Invalid JSON: {str(e)})"
            responses.append(
                f"Question: {question}\n"
                f"Answer: {answer}\n"
                f"Expected: {json.dumps(expected)}\n"
                f"Result: {verdict}\n"
            )
            verdicts.append(verdict)
            answers_list.append(answer)
            continue

        # Verify that all required keys are present; the isinstance check guards
        # against answers that parse as JSON but are not objects (e.g., a bare list).
        required_keys = ["document_level", "clause_level"]
        missing_keys = [
            key for key in required_keys
            if not isinstance(parsed_answer, dict) or key not in parsed_answer
        ]
        if missing_keys:
            verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
            responses.append(
                f"Question: {question}\n"
                f"Answer: {answer_to_store}\n"
                f"Expected: {json.dumps(expected)}\n"
                f"Result: {verdict}\n"
            )
            verdicts.append(verdict)
            answers_list.append(answer_to_store)
            continue

        # Compare the values for each required key.
        incorrect_values = []
        for key in required_keys:
            if parsed_answer[key] != expected[key]:
                incorrect_values.append(key)
        if len(incorrect_values) == 2:
            verdict = "Incorrect (Both values are incorrect)"
        elif len(incorrect_values) == 1:
            verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
        else:
            score += 1
            verdict = "Correct"
        responses.append(
            f"Question: {question}\n"
            f"Answer: {answer_to_store}\n"
            f"Expected: {json.dumps(expected)}\n"
            f"Result: {verdict}\n"
        )
        verdicts.append(verdict)
        answers_list.append(answer_to_store)

    result_details = "\n".join(responses)

    # Record this email locally so that subsequent submissions are blocked.
    submitted_emails.add(email)

    # Prepare the row for Google Sheets:
    # Name, Email, System Prompt, Score, then a Verdict and Answer for each of the 7 test questions.
    row = [name, email, system_prompt, str(score)]
    for v, a in zip(verdicts, answers_list):
        row.extend([v, a])

    # Append the new row to the Google Sheet.
    try:
        sheet.append_row(row)
    except Exception as e:
        print(f"Error appending row to Google Sheet: {str(e)}")
        return f"Error saving submission: {str(e)}"

    return (
        f"Thank you for your submission, {name}!\n\n"
        f"{result_details}"
    )
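
# For reference, the appended sheet row layout (4 fixed columns, then 2 per test question):
# | Name | Email | System Prompt | Score | Q1 Verdict | Q1 Answer | ... | Q7 Verdict | Q7 Answer |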
def build_interface():
    """
    Constructs the Gradio interface with a submission form and a
    single-submission mechanism.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Applicant Task: Writing a System Prompt")
        gr.Markdown("## Document and Clause Level Classification")
        # General description.
        gr.Markdown("""
Applicants must create a system prompt for a language model that classifies user queries about finding information into two specific categories:

1. **Document Level**: Determines whether the query refers to a single document or multiple documents.
2. **Clause Level**: Identifies whether the query is focused on:
   - A single clause,
   - Multiple clauses, or
   - General information not constrained to any specific clause.

Imagine the user is a legal professional trying to find information related to their work. We need this classification to guide our downstream approach to retrieving that information.
That's why the model must return a valid JSON object with the following structure:
```
{
    "document_level": "single/multiple",
    "clause_level": "single/multiple/general"
}
```
The goal is to ensure that the model's output adheres to the prescribed JSON structure and accurately classifies 7 test queries into the two respective categories.
This task is designed to evaluate your ability to craft prompts that enforce the required structure, without relying on constrained decoding or "JSON mode", while simultaneously producing accurate responses.
""")
        # Example inputs and expected outputs in an accordion.
        with gr.Accordion("Example Inputs and Expected Outputs", open=False):
            gr.Markdown("""
1. **User Message Example 1:**
   - *"Please provide the contract for the lease agreement."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "general"}
     ```
2. **User Message Example 2:**
   - *"I need all clauses related to termination in the employment contract."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "multiple"}
     ```
3. **User Message Example 3:**
   - *"Can you send me the financial reports and the partnership agreement?"*
   - **Expected Output:**
     ```
     {"document_level": "multiple", "clause_level": "general"}
     ```
4. **User Message Example 4:**
   - *"What are the key clauses in the NDA?"*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "multiple"}
     ```
5. **User Message Example 5:**
   - *"Tell me about the company’s financials."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "general"}
     ```
6. **User Message Example 6:**
   - *"Provide all contracts and their confidentiality clauses."*
   - **Expected Output:**
     ```
     {"document_level": "multiple", "clause_level": "multiple"}
     ```
7. **User Message Example 7:**
   - *"Extract the arbitration clause from this service agreement."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "single"}
     ```
""")
        # Task instructions in another accordion.
        with gr.Accordion("Task Instructions", open=False):
            gr.Markdown("""
Design a system prompt that ensures gpt-4o-mini generates outputs like those above when given similar user messages.

The system prompt should:

1. Specify formatting requirements (e.g., *"Output must be a valid JSON object"*).
   - Note that we are not using constrained decoding or any sort of JSON mode; if not prompted correctly, the LLM will output plain text.
   - Every LLM response is passed to `json.loads(response)`; responses that fail JSON parsing are deemed incorrect (beware of triple backticks, etc.).
2. Emphasize strict adherence to the classification definitions:
   - *Single Document:* Refers to one document.
   - *Multiple Documents:* Refers to more than one document.
   - *Single Clause:* Refers to one specific clause.
   - *Multiple Clauses:* Refers to more than one specific clause.
   - *General Information:* Refers to general content not tied to specific clauses.

**You can only submit once, so test your system prompt thoroughly before submission!**

You will be scored on the outputs for the 7 test user messages, according to the following criteria:
- The response is valid JSON.
- The response contains the keys "document_level" and "clause_level".
- The values for both keys are correct.

Good luck!
""")
        gr.Markdown(
            """Please enter the same name and email as listed in your CV, and submit your system prompt below.
You can only submit once, so test and build out your system prompt using gpt-4o-mini with temperature=1 before submitting your solution.
We look forward to your submission!
"""
        )
        email_input = gr.Textbox(label="Email", placeholder="[email protected]")
        name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
        system_prompt_input = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6,
        )
        submit_button = gr.Button("Submit")
        output_text = gr.Textbox(label="Results", lines=15)
        submit_button.click(
            fn=submit_prompt,
            inputs=[email_input, name_input, system_prompt_input],
            outputs=output_text,
        )
    return demo


if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
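    # Port 7860 is the default port a Hugging Face Space expects a Gradio app to serve on.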
    interface.launch(server_name="0.0.0.0", server_port=7860)