File size: 14,687 Bytes
821e9b3
 
3f8b483
821e9b3
9ed8b92
b3013af
 
 
 
 
 
 
5521e44
9ed8b92
 
 
b3013af
9ed8b92
5521e44
b3013af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
821e9b3
 
 
3f8b483
b3013af
 
 
3f8b483
 
821e9b3
3f8b483
54a0bc8
c1cd0b6
54a0bc8
 
3f8b483
 
 
47934fb
 
 
b3013af
3f8b483
 
b3013af
3f8b483
54a0bc8
 
 
 
 
 
 
 
3f8b483
 
54a0bc8
 
5521e44
b3013af
821e9b3
5521e44
821e9b3
 
b3013af
 
821e9b3
 
 
b3013af
bee7793
 
b3013af
bee7793
f5e6b21
b3013af
9ed8b92
 
 
 
821e9b3
9ed8b92
 
 
 
821e9b3
b3013af
 
 
 
 
 
 
 
 
9ed8b92
 
 
 
b3013af
 
9ed8b92
 
b3013af
 
 
 
 
 
 
 
 
 
 
9ed8b92
821e9b3
 
bee7793
b3013af
821e9b3
b3013af
 
 
 
 
821e9b3
 
54a0bc8
821e9b3
 
9ed8b92
b3013af
821e9b3
 
 
 
 
9ed8b92
821e9b3
 
b3013af
 
 
122c32d
 
b3013af
122c32d
b3013af
122c32d
 
 
 
 
 
b3013af
 
122c32d
b3013af
 
122c32d
 
 
 
 
 
 
 
 
 
b3013af
 
122c32d
b3013af
 
122c32d
 
 
 
b3013af
122c32d
 
 
 
 
821e9b3
 
b3013af
821e9b3
 
122c32d
 
821e9b3
 
b3013af
 
 
821e9b3
b3013af
 
9ed8b92
b3013af
 
 
 
 
 
 
 
 
 
 
 
 
 
9ed8b92
 
 
5521e44
821e9b3
 
9ed8b92
821e9b3
 
54a0bc8
 
 
1feb2ff
5771b1d
54a0bc8
df2b29c
54a0bc8
1feb2ff
54a0bc8
 
 
 
 
 
 
 
 
 
 
 
1feb2ff
54a0bc8
1feb2ff
54a0bc8
 
 
 
1feb2ff
 
 
54a0bc8
a032499
54a0bc8
 
 
1feb2ff
 
 
54a0bc8
1feb2ff
54a0bc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1feb2ff
 
 
54a0bc8
 
1feb2ff
54a0bc8
 
 
1feb2ff
54a0bc8
 
 
 
 
 
 
5771b1d
54a0bc8
 
 
 
 
 
5771b1d
 
54a0bc8
 
 
 
1feb2ff
54a0bc8
 
 
9ed8b92
821e9b3
7446fcd
821e9b3
 
 
9ed8b92
821e9b3
9ed8b92
821e9b3
9ed8b92
 
 
821e9b3
9ed8b92
821e9b3
54a0bc8
 
9ed8b92
821e9b3
5521e44
 
821e9b3
 
54a0bc8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import os
import re
import json
import gradio as gr
from openai import OpenAI
import gspread
from google.oauth2.service_account import Credentials

# OAuth scopes requested for the Google service account: read/write access to
# Google Sheets plus Google Drive.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive"
]

# Initialize the OpenAI client with the API key from environment variables.
# The key is read with os.environ[...], so importing this module raises
# KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# In-memory set to track submitted emails (this resets when the app restarts).
submitted_emails = set()

def get_google_sheet():
    """
    Return the "Submissions" worksheet of the configured spreadsheet.

    The service-account credentials are parsed from the JSON held in the
    "GOOGLE_CREDS_JSON" environment variable and authorized for SCOPES; the
    spreadsheet is looked up by the key held in "SPREADSHEET_ID".
    """
    service_account_info = json.loads(os.environ["GOOGLE_CREDS_JSON"])
    credentials = Credentials.from_service_account_info(
        service_account_info,
        scopes=SCOPES,
    )
    spreadsheet = gspread.authorize(credentials).open_by_key(
        os.environ["SPREADSHEET_ID"]
    )
    return spreadsheet.worksheet("Submissions")

def get_evaluation_questions():
    """
    Loads the evaluation set from environment variables.

    Expected environment variables, each holding a JSON array:
    - TEST_QUESTION_1: user query strings.
    - TEST_DOCUMENTS_1: one document entry per query.
    - TEST_EXPECTED_1: one expected output per query.

    All three arrays must have the same length.

    Returns:
        A list of dicts with the keys "question", "expected" and "docs", one
        per evaluation item. An empty list is returned (after printing the
        reason, except for missing variables) when a variable is missing or
        empty, is not valid JSON, is not a JSON array, or when the array
        lengths differ.
    """
    # (label used in messages, environment variable), in parsing order.
    sources = (
        ("questions", "TEST_QUESTION_1"),
        ("expected answers", "TEST_EXPECTED_1"),
        ("documents", "TEST_DOCUMENTS_1"),
    )

    raw_values = {label: os.environ.get(var) for label, var in sources}
    if not all(raw_values.values()):
        return []

    parsed = {}
    for label, _ in sources:
        try:
            value = json.loads(raw_values[label])
        except ValueError as e:  # json.JSONDecodeError is a ValueError.
            print(f"Error parsing {label}: {str(e)}")
            return []
        # Valid JSON that is not an array (e.g. "5" or "{}") cannot be zipped
        # into evaluation items; reject it instead of failing later in len().
        if not isinstance(value, list):
            print(f"Error parsing {label}: expected a JSON array.")
            return []
        parsed[label] = value

    questions_list = parsed["questions"]
    expected_list = parsed["expected answers"]
    docs_list = parsed["documents"]

    # Ensure all lists are of the same length.
    if not (len(questions_list) == len(expected_list) == len(docs_list)):
        print(
            "Mismatch in length: questions, documents and expected answers "
            "lists must all have the same length."
        )
        return []

    return [
        {"question": q, "expected": e, "docs": d}
        for q, e, d in zip(questions_list, expected_list, docs_list)
    ]

# Load evaluation questions once, when the module is imported. The result is
# an empty list when get_evaluation_questions() cannot build the evaluation set.
EVALUATION_QUESTIONS = get_evaluation_questions()

def sanitize_input(text):
    """
    Return a cleaned-up copy of ``text``.

    Every character other than ASCII letters, digits, whitespace and the
    punctuation ``. , ! ? @ : -`` is removed; the result is then stripped of
    surrounding whitespace and cut to at most 500 characters.
    """
    max_length = 500
    disallowed = re.compile(r"[^a-zA-Z0-9\s.,!?@:\-]")
    stripped = disallowed.sub("", text).strip()
    return stripped[:max_length]

def sanitize_prompt(text):
    """
    Return the system prompt with surrounding whitespace removed, limited to
    its first 8000 characters.
    """
    max_length = 8000
    trimmed = text.strip()
    return trimmed[:max_length]

def validate_email(email):
    """
    Validates that the provided email is in a valid format.

    The whole string must match ``local@domain.tld`` using only the characters
    the pattern below allows; anything else (including non-string input or a
    trailing newline) is rejected.

    Returns True if valid, False otherwise.
    """
    if not isinstance(email, str):
        return False
    email_regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+"
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    return re.fullmatch(email_regex, email) is not None

# Longest address accepted. Longer input is rejected rather than truncated,
# because a truncated address would no longer identify the applicant.
_MAX_EMAIL_LENGTH = 254

# Keys every model answer must contain in order to be graded.
_REQUIRED_KEYS = ("document_level", "clause_level")


def _grade_answer(answer, expected):
    """
    Grade one model answer against the expected output.

    Returns a tuple ``(is_correct, verdict, answer_to_store)`` where
    ``answer_to_store`` is the normalized JSON text when the answer parses,
    and the raw answer text otherwise.
    """
    try:
        parsed_answer = json.loads(answer)
    except json.JSONDecodeError as e:
        return False, f"Incorrect (Invalid JSON: {str(e)})", answer

    answer_to_store = json.dumps(parsed_answer)

    # Valid JSON such as a number, string or null cannot hold the required
    # keys; report it instead of failing on the key lookups below.
    if not isinstance(parsed_answer, dict):
        return False, "Incorrect (Answer is not a JSON object)", answer_to_store

    missing_keys = [key for key in _REQUIRED_KEYS if key not in parsed_answer]
    if missing_keys:
        verdict = f"Incorrect (Missing Keys: {', '.join(missing_keys)})"
        return False, verdict, answer_to_store

    incorrect_values = [
        key for key in _REQUIRED_KEYS if parsed_answer[key] != expected[key]
    ]
    if not incorrect_values:
        return True, "Correct", answer_to_store
    if len(incorrect_values) == 1:
        verdict = f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)"
        return False, verdict, answer_to_store
    return False, "Incorrect (Both values are incorrect)", answer_to_store


def submit_prompt(email, name, system_prompt):
    """
    Handles the full submission process:
     - Validates the email format.
     - Rejects the submission if the email was already used (checked against
       the in-memory set and column 2 of the Google Sheet, case-insensitively).
     - Sanitizes the name and the system prompt.
     - Runs the system prompt against each evaluation question using the
       OpenAI API and grades each answer.
     - Appends the submission as a new row in the Google Sheet with columns:
         Name, Email, System Prompt, Score, and then a (verdict, answer) pair
         for every evaluation question.

    Args:
        email: Applicant email address; used as the single-submission key.
        name: Applicant name.
        system_prompt: The system prompt to evaluate.

    Returns:
        A message for the user: a thank-you text on success, otherwise a
        description of why the submission was rejected or could not be saved.
        Per-question results are only written to the sheet, not returned.
    """
    # The validated address is used as-is. Passing it through sanitize_input
    # would strip "_" and "+", storing a different address than the one the
    # duplicate checks were run against.
    email = (email or "").strip()
    if len(email) > _MAX_EMAIL_LENGTH or not validate_email(email):
        return "Invalid email address. Please enter a valid email."

    # Compare case-insensitively so "A@x.com" and "a@x.com" count as one applicant.
    email_key = email.lower()
    already_submitted = (
        f"Submission already received for {email}. You can only submit once."
    )

    # Check if this email has already been submitted (in-memory).
    if email_key in submitted_emails:
        return already_submitted

    # Connect to Google Sheet and check if the email already exists.
    try:
        sheet = get_google_sheet()
        email_col = sheet.col_values(2)  # Assumes column 2 contains the email addresses.
    except Exception as e:
        print(f"Error accessing Google Sheet: {str(e)}")
        return f"Error accessing Google Sheet: {str(e)}"
    # Skip header row.
    if any(str(existing).strip().lower() == email_key for existing in email_col[1:]):
        return already_submitted

    # Sanitize the remaining inputs.
    name = sanitize_input(name or "")
    system_prompt = sanitize_prompt(system_prompt or "")

    score = 0
    graded_cells = []  # Flat list: verdict, answer, verdict, answer, ...

    # Process each evaluation question.
    # NOTE(review): item["docs"] is loaded with every question but is not sent
    # to the model here - confirm whether the documents should be part of the
    # user message.
    for item in EVALUATION_QUESTIONS:
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # Ensure this model identifier matches your deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": item["question"]}
                ]
            )
            answer = response.choices[0].message.content.strip()
        except Exception as e:
            # The error text is graded like any answer and ends up recorded
            # in the sheet as invalid JSON.
            answer = f"Error during OpenAI API call: {str(e)}"

        is_correct, verdict, answer_to_store = _grade_answer(answer, item["expected"])
        if is_correct:
            score += 1
        graded_cells.extend([verdict, answer_to_store])

    # The row format is: Name, Email, System Prompt, Score, then Verdict and
    # Answer for each evaluation question.
    row = [name, email, system_prompt, str(score), *graded_cells]

    # Append the new row to the Google Sheet.
    try:
        sheet.append_row(row)
    except Exception as e:
        print(f"Error appending row to Google Sheet: {str(e)}")
        return f"Error saving submission: {str(e)}"

    # Remember the email only once the row is saved, so a failed save does
    # not lock the applicant out of retrying.
    submitted_emails.add(email_key)

    return (
        f"Thank you for your submission, {name}!\n\n"
    )

def build_interface():
    """
    Constructs the Gradio interface for the applicant task.

    The page shows the task description, an example workflow and the task
    instructions, followed by the email, name and system prompt inputs. The
    submit button calls submit_prompt with those three inputs and shows the
    returned message in the "Results" box.

    Returns:
        The gr.Blocks application, not yet launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Applicant Task: Target Company & Law Firm Identification")
        gr.Markdown("## Identifying Parties, Law Firms, and Target Company Presence")
        
        # General description
        gr.Markdown("""
        This task involves processing a user query to determine the relevance to the intended task, followed by analyzing textual data to extract information about law firms representing parties (Buyer, Seller, and Third Parties) and verifying the presence of a target company. 

        The system is designed to sequentially leverage three LLM functions:
        
        **Step 1:** LLM1 determines if the user's query mentions any target company.  
        - If no target company is found, LLM1 responds with a message wrapped in `<user_message></user_message>` XML tags to inform the user that the query is irrelevant to this task.  
        - If the query contains a target company, LLM1 moves forward with a formatted acknowledgment of the identified target company.

        **Step 2:** LLM2 examines four separate paragraphs provided as input. For each paragraph, it extracts specific information about:
        - The Buyer's representative law firm.
        - The Seller's representative law firm.
        - Any third-party law firm present.
        - Whether the target company is mentioned in the paragraph.
        Each paragraph's results are formatted and concatenated for the next step.

        **Step 3:** LLM3 compiles the information from all analyzed paragraphs and outputs a structured JSON object with the following keys:
        
        ```json
        {
        "buyer_firm": "string",
        "seller_firm": "string",
        "third_party": "string",
        "contains_target_firm": boolean
        }
        ```
        
        The goal is to identify the representative law firms of involved parties and determine if the target company is mentioned, ensuring the results are structured and accurate.
        
        **Key Considerations:**
        - The output must adhere to the prescribed JSON format for the final step.
        - Ensure the system can accurately extract and classify relevant information from the input paragraphs.
        """)

        # Example Inputs and Outputs in an Accordion
        with gr.Accordion("Example Workflow", open=False):
            gr.Markdown("""
            **Example Query and System Output:**

            **User Query:**  
            *"Is Kirkland present?"*

            Step 1 (LLM1):  
            - If no target company is identified:  
            Output: `<user_message>Query is not relevant to the intended task.</user_message>`  

            - If a target company is identified:  
            Output: *"The query mentions the target company Kirkland."*

            Step 2 (LLM2 for Paragraphs):  
            **Input Paragraph Example:**  
            *"Representation agreements between Buyers and Kirkland & Ellis are included."*  

            **Output:**  
            *"Buyer Firm: Kirkland & Ellis, Seller Firm: None, Third Party Firm: None, Contains Target Firm: True."*  

            Step 3 (LLM3 Final Output):  
            Compiled JSON:
            ```json
            {
            "buyer_firm": "Kirkland & Ellis",
            "seller_firm": null,
            "third_party": null,
            "contains_target_firm": true
            }
            ```
            """)

        # Challenge instructions and testing guidance
        with gr.Accordion("Task Instructions and Testing", open=False):
            gr.Markdown("""
            - Design prompts that ensure proper interaction between the three LLM systems, with each step contributing to the final output.
            - Ensure strict adherence to JSON formatting requirements (e.g., no extra characters that may cause JSON parsing errors).
            - Test extensively to verify accurate law firm and target company identification.
            
            **Output Requirements:**
            - Ensure final LLM3 JSON output has the following keys:
            - `"buyer_firm"`
            - `"seller_firm"`
            - `"third_party"`
            - `"contains_target_firm"`
            - Values must be accurately extracted or classified based on LLM2's parsed data.

            **Hints for Crafting System Prompts:**
            - Explicitly specify formatting requirements at each step.
            - Clarify the task definitions and expected classifications in each system prompt for LLM1, LLM2, and LLM3.
            - Test using diverse sample data for robustness.

            You can only submit once, so validate your system prompts thoroughly using mock queries and example data before final submission.
            
            Good Luck!
            """)

        gr.Markdown("""
        Enter your name and email below, as listed in your CV, and submit your designed prompts.
        
        Remember: Focus on clarity, accuracy, and structured responses to achieve a high score!
        """)


        # The placeholder is a real-looking example address; the previous text
        # was an email-obfuscation artifact rather than an example.
        email_input = gr.Textbox(label="Email", placeholder="name@example.com")
        name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
        system_prompt_input = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6,
        )
        submit_button = gr.Button("Submit")
        output_text = gr.Textbox(label="Results", lines=15)

        # Input order must match submit_prompt(email, name, system_prompt).
        submit_button.click(
            fn=submit_prompt,
            inputs=[email_input, name_input, system_prompt_input],
            outputs=output_text,
        )
        
        
    
    return demo

if __name__ == "__main__":
    # Bind to every network interface on port 7860 so the app can be reached
    # from outside the host (e.g., when running inside a container).
    app = build_interface()
    app.launch(server_port=7860, server_name="0.0.0.0")