import os
import re
import json

import gradio as gr
from openai import OpenAI
import gspread
from google.oauth2.service_account import Credentials

SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive",
]

# Initialize the OpenAI client with the API key from environment variables.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# In-memory set to track submitted emails (this resets when the app restarts).
submitted_emails = set()


def get_google_sheet():
    """
    Open the worksheet that stores applicant submissions.

    Service account credentials are read as JSON from the environment
    variable "GOOGLE_CREDS_JSON"; the spreadsheet is looked up by the key in
    "SPREADSHEET_ID".

    Returns:
        The worksheet named "Submissions" of that spreadsheet.

    Raises:
        KeyError: if one of the environment variables is not set.
        Any error raised while decoding the credentials or by gspread is
        propagated unchanged to the caller.
    """
    creds = Credentials.from_service_account_info(
        json.loads(os.environ["GOOGLE_CREDS_JSON"]),
        scopes=SCOPES,
    )
    gc = gspread.authorize(creds)
    sh = gc.open_by_key(os.environ["SPREADSHEET_ID"])
    return sh.worksheet("Submissions")


def _load_json_list(raw, label):
    """
    Decode *raw* as a JSON array.

    Returns the decoded list, or None (after printing the reason) when *raw*
    is not valid JSON or does not decode to an array. *label* only names the
    value in the printed message.
    """
    try:
        parsed = json.loads(raw)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass.
        print(f"Error parsing {label}: {str(e)}")
        return None
    # A scalar would make len() fail later, and a string or object would be
    # zipped character by character / key by key, so insist on an array.
    if not isinstance(parsed, list):
        print(
            f"Error parsing {label}: expected a JSON array, "
            f"got {type(parsed).__name__}."
        )
        return None
    return parsed


def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.

    Expected environment variables:
      - TEST_QUESTION_1: a JSON array of user query strings.
      - TEST_EXPECTED_1: a JSON array of expected outputs, one per question.
    Both arrays must be of equal length. The elements themselves are passed
    through untouched.

    Returns:
        A list of {"question": ..., "expected": ...} dicts in the order given.
        An empty list is returned (and the reason printed, except when a
        variable is simply unset or empty) if either variable is missing,
        is not valid JSON, is not a JSON array, or the lengths differ.
    """
    questions_str = os.environ.get("TEST_QUESTION_1")
    expected_str = os.environ.get("TEST_EXPECTED_1")
    if not questions_str or not expected_str:
        return []

    questions_list = _load_json_list(questions_str, "questions")
    if questions_list is None:
        return []

    expected_list = _load_json_list(expected_str, "expected answers")
    if expected_list is None:
        return []

    if len(questions_list) != len(expected_list):
        print("Mismatch in length: questions list and expected answers list must have the same length.")
        return []

    return [
        {"question": q, "expected": e}
        for q, e in zip(questions_list, expected_list)
    ]


# Load evaluation questions at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()

# Model every submission is evaluated against.
# Ensure this model identifier matches your deployed model.
_EVALUATION_MODEL = "gpt-4o-mini"

# Keys every model answer must contain; their values are compared with the
# expected output of the corresponding test question.
_REQUIRED_KEYS = ("document_level", "clause_level")

# Upper bound for a deliverable address (RFC 5321 path length limit).
_MAX_EMAIL_LENGTH = 254
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
_DISALLOWED_INPUT_CHARS = re.compile(r"[^a-zA-Z0-9\s.,!?@:\-]")

# Markdown shown in the UI. Kept at column 0 so that rendering does not depend
# on how the Markdown component treats leading indentation.
_TASK_DESCRIPTION_MD = """
Applicants must create a system prompt for a language model that classifies user requests about finding information into two specific categories:

1. **Document Level**: Determines whether the query refers to a single document or multiple documents.
2. **Clause Level**: Identifies whether the query is focused on:
   - A single clause,
   - Multiple clauses, or
   - General information not constrained to any specific clause.

Imagine the user is a legal professional who is trying to find information related to their work. We need this classification to guide our downstream approach on how to retrieve this information.

That's why the model must return a valid JSON object with the following structure:

```
{
  "document_level": "single/multiple",
  "clause_level": "single/multiple/general"
}
```

The goal is to ensure that the model's output adheres to the precscibed JSON structure and accurately classifies 7 test queries into the two respective categories. This task is designed to evaluate your ability to craft prompts by adhering to the required structure, without relying on constrained decoding or "JSON mode," while simultaneously providing accurate responses.
"""

_EXAMPLES_MD = """
1. **User Message Example 1:**
   - *"Please provide the contract for the lease agreement."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "general"}
     ```

2. **User Message Example 2:**
   - *"I need all clauses related to termination in the employment contract."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "multiple"}
     ```

3. **User Message Example 3:**
   - *"Can you send me the financial reports and the partnership agreement?"*
   - **Expected Output:**
     ```
     {"document_level": "multiple", "clause_level": "general"}
     ```

4. **User Message Example 4:**
   - *"What are the key clauses in the NDA?"*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "multiple"}
     ```

5. **User Message Example 5:**
   - *"Tell me about the company’s financials."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "general"}
     ```

6. **User Message Example 6:**
   - *"Provide all contracts and their confidentiality clauses."*
   - **Expected Output:**
     ```
     {"document_level": "multiple", "clause_level": "multiple"}
     ```

7. **User Message Example 7:**
   - *"Extract the arbitration clause from this service agreement."*
   - **Expected Output:**
     ```
     {"document_level": "single", "clause_level": "single"}
     ```
"""

_TASK_INSTRUCTIONS_MD = """
- Design a system prompt that ensures gpt4o-mini generates outputs like those above when given similar user messages. The system prompt should:
  1. Specify formatting requirements (e.g., *"Output must be a valid JSON object"*).
     - Note that we are not using constrained decoding or any sort of JSON mode; if not correctly prompted, the LLM will output plain text.
     - All LLM responses will be passed to json.loads(response), responses that fail the json parsing are deemed incorrect (beware of tripple backtricks etc.)
  2. Emphasize strict adherence to classification definitions:
     - *Single Document:* Refers to one document.
     - *Multiple Documents:* Refers to more than one document.
     - *Single Clause:* Refers to one specific clause.
     - *Multiple Clauses:* Refers to more than one specific clause.
     - *General Information:* Refers to general content not tied to specific clauses.

**You can only submit once, so test your system prompt thoroughly before submission!**

You will be scored according to the following criteria with respect to the outputs of 7 test user messages:
- Response is valid JSON
- The response contains the keys: "document_level" and "clause_level"
- The values for each of the keys are correct

Good Luck!
"""

_SUBMISSION_NOTE_MD = """Please enter the same name and email as listed in your CV and submit your system prompt below. You can only submit once, try to test and build out your system prompt using gpt4o-mini with temp=1 before submitting your solution.

We look forward to your submission!
"""


def sanitize_input(text):
    """
    Sanitizes input to allow only alphanumerics, whitespace and some
    punctuation (. , ! ? @ : -), then strips it and truncates to 500
    characters. None is treated as an empty string.
    """
    clean_text = _DISALLOWED_INPUT_CHARS.sub("", text or "")
    return clean_text.strip()[:500]


def sanitize_prompt(text):
    """
    Sanitizes the system prompt by stripping surrounding whitespace and
    limiting its length to 8000 characters. None is treated as an empty
    string.
    """
    return (text or "").strip()[:8000]


def validate_email(email):
    """
    Validates that the provided email is in a valid format.

    The whole string must match (a trailing newline is rejected) and must not
    exceed the maximum length of a deliverable address.

    Returns True if valid, False otherwise (including for non-string input).
    """
    if not isinstance(email, str) or len(email) > _MAX_EMAIL_LENGTH:
        return False
    return _EMAIL_PATTERN.fullmatch(email) is not None


def _normalize_expected(expected):
    """Return the expected output as a dict, decoding a JSON-encoded string; {} if unusable."""
    if isinstance(expected, str):
        try:
            expected = json.loads(expected)
        except json.JSONDecodeError:
            expected = None
    if not isinstance(expected, dict):
        print(f"Expected answer is not a JSON object: {expected!r}")
        return {}
    return expected


def _ask_model(system_prompt, question):
    """Return the model's stripped reply to *question*, or an error text if the API call fails."""
    try:
        response = client.chat.completions.create(
            model=_EVALUATION_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ],
        )
        # The content may be None; treat it as an empty answer, which is
        # later graded as invalid JSON.
        return (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Boundary with the remote service: any failure is graded as an
        # incorrect answer for this question instead of aborting the run.
        return f"Error during OpenAI API call: {str(e)}"


def _grade_answer(answer, expected):
    """Grade one model answer; return (verdict, answer text to store, is_correct)."""
    try:
        parsed_answer = json.loads(answer)
    except (json.JSONDecodeError, RecursionError) as e:
        return f"Incorrect (Invalid JSON: {str(e)})", answer, False

    # Normalize parsed JSON as string for storage.
    answer_to_store = json.dumps(parsed_answer)

    # Valid JSON that is not an object (e.g. 42, null, a list) cannot carry
    # the required keys and must not reach the key lookups below.
    if not isinstance(parsed_answer, dict):
        return "Incorrect (Response is not a JSON object)", answer_to_store, False

    missing_keys = [key for key in _REQUIRED_KEYS if key not in parsed_answer]
    if missing_keys:
        return (
            f"Incorrect (Missing Keys: {', '.join(missing_keys)})",
            answer_to_store,
            False,
        )

    expected_values = _normalize_expected(expected)
    incorrect_values = [
        key
        for key in _REQUIRED_KEYS
        if key not in expected_values or parsed_answer[key] != expected_values[key]
    ]
    if len(incorrect_values) > 1:
        return "Incorrect (Both values are incorrect)", answer_to_store, False
    if incorrect_values:
        return (
            f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)",
            answer_to_store,
            False,
        )
    return "Correct", answer_to_store, True


def submit_prompt(email, name, system_prompt):
    """
    Handles the full submission process:
      - Validates the email format and rejects an empty system prompt.
      - Checks if the email has already been used (by in-memory set and
        Google Sheet), ignoring case and surrounding whitespace.
      - Sanitizes the name and the system prompt. The email is stored as
        entered (stripped): it has already been restricted to a safe
        character set by validation, and altering it would both corrupt the
        address and defeat the duplicate check.
      - Processes the system prompt against each evaluation question using
        the OpenAI API and grades every answer.
      - Appends the submission as a new row in the Google Sheet with columns:
        Name, Email, System Prompt, Score, and for each test question:
        verdict and answer.

    Returns:
        A message for the applicant: a thank-you note on success, otherwise
        the reason the submission was rejected or could not be saved.
        Per-question evaluation details are written to the sheet only and are
        not returned.
    """
    email = email.strip() if isinstance(email, str) else email

    # Validate email format.
    if not validate_email(email):
        return "Invalid email address. Please enter a valid email."

    name = sanitize_input(name)
    system_prompt = sanitize_prompt(system_prompt)

    # Do not let an accidental empty submission use up the single attempt.
    if not system_prompt:
        return "System prompt is empty. Please enter a system prompt before submitting."

    # Without questions every submission would be recorded with score 0.
    if not EVALUATION_QUESTIONS:
        print("No evaluation questions are configured; submission rejected.")
        return "Evaluation is currently unavailable. Please try again later."

    # Key used for duplicate detection.
    email_key = email.lower()

    # Check if this email has already been submitted (in-memory).
    if email_key in submitted_emails:
        return f"Submission already received for {email}. You can only submit once."

    # Connect to Google Sheet and check if the email already exists.
    try:
        sheet = get_google_sheet()
        # Assumes column 2 contains the email addresses; skip the header row.
        recorded_emails = sheet.col_values(2)[1:]
    except Exception as e:
        print(f"Error accessing Google Sheet: {str(e)}")
        return f"Error accessing Google Sheet: {str(e)}"
    if any(str(cell).strip().lower() == email_key for cell in recorded_emails):
        return f"Submission already received for {email}. You can only submit once."

    # Reserve the email while the (slow) evaluation runs so that a repeated
    # click cannot produce a second row; the reservation is released again if
    # the submission is not saved.
    submitted_emails.add(email_key)
    saved = False
    try:
        score = 0
        verdicts = []  # Each question's verdict, in question order.
        answers_list = []  # Each question's answer, in question order.

        # Process each evaluation question.
        for item in EVALUATION_QUESTIONS:
            answer = _ask_model(system_prompt, item["question"])
            verdict, answer_to_store, is_correct = _grade_answer(
                answer, item["expected"]
            )
            if is_correct:
                score += 1
            verdicts.append(verdict)
            answers_list.append(answer_to_store)

        # Row format: Name, Email, System Prompt, Score, then for each test
        # question: Verdict, Answer.
        row = [name, email, system_prompt, str(score)]
        for v, a in zip(verdicts, answers_list):
            row.extend([v, a])

        # Append the new row to the Google Sheet.
        try:
            sheet.append_row(row)
        except Exception as e:
            print(f"Error appending row to Google Sheet: {str(e)}")
            return f"Error saving submission: {str(e)}"
        saved = True
    finally:
        if not saved:
            submitted_emails.discard(email_key)

    return (
        f"Thank you for your submission, {name}!\n\n"
    )


def build_interface():
    """
    Constructs the Gradio interface: task description, examples and
    instructions, the email / name / system prompt inputs, and a submit
    button wired to submit_prompt whose message is shown in the results box.

    Returns:
        The gr.Blocks application (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Applicant Task: Writing a System Prompt")
        gr.Markdown("## Document and Clause Level Classification")

        # General description
        gr.Markdown(_TASK_DESCRIPTION_MD)

        # Example Inputs and Outputs in an Accordion
        with gr.Accordion("Example Inputs and Expected Outputs", open=False):
            gr.Markdown(_EXAMPLES_MD)

        # Challenge instructions in another Accordion
        with gr.Accordion("Task Instructions", open=False):
            gr.Markdown(_TASK_INSTRUCTIONS_MD)

        gr.Markdown(_SUBMISSION_NOTE_MD)

        email_input = gr.Textbox(label="Email", placeholder="your.email@example.com")
        name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
        system_prompt_input = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6,
        )
        submit_button = gr.Button("Submit")
        output_text = gr.Textbox(label="Results", lines=15)

        submit_button.click(
            fn=submit_prompt,
            inputs=[email_input, name_input, system_prompt_input],
            outputs=output_text,
        )

    return demo


if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)