File size: 5,096 Bytes
821e9b3
 
 
5521e44
821e9b3
 
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
 
821e9b3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import re
from datetime import datetime

import gradio as gr
import openai
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Set OpenAI API key from an environment variable.
# Deliberately uses indexing (not .get) so the app fails fast at import
# time with a KeyError if OPENAI_API_KEY is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]

def get_evaluation_questions():
    """
    Collect test question/answer pairs from environment variables.

    Reads TEST_QUESTION_i / TEST_EXPECTED_i for i = 1, 2, ... and stops
    at the first index where either variable is missing or empty, so the
    pairs must be numbered consecutively starting at 1.

    Returns:
        list[dict]: dicts with "question" and "expected" string keys.
    """
    pairs = []
    index = 1
    while True:
        q = os.environ.get(f"TEST_QUESTION_{index}")
        a = os.environ.get(f"TEST_EXPECTED_{index}")
        if not q or not a:
            return pairs
        pairs.append({"question": q, "expected": a})
        index += 1

# Load the evaluation questions once at module import time; changes to the
# TEST_QUESTION_*/TEST_EXPECTED_* environment variables after startup have
# no effect on this snapshot.
EVALUATION_QUESTIONS = get_evaluation_questions()

def init_sheet():
    """
    Open and return the first worksheet of the evaluation spreadsheet.

    The spreadsheet title is taken from the SHEET_NAME environment
    variable, defaulting to "Prompt Evaluations".

    Requires a service-account key file named credentials.json in the
    working directory, and the target sheet must be shared with that
    service account's email address.
    """
    oauth_scopes = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        "credentials.json", oauth_scopes
    )
    gc = gspread.authorize(credentials)
    title = os.environ.get("SHEET_NAME", "Prompt Evaluations")
    return gc.open(title).sheet1

def sanitize_input(text):
    """
    Sanitizes input to prevent harmful content and limits its length.
    """
    # Allow alphanumerics and some punctuation, then truncate to 500 characters.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]

def evaluate_prompt(email, name, system_prompt):
    """
    Score a candidate system prompt against the evaluation questions.

    Each question is sent to the gpt-4o-mini model alongside the
    user-supplied system prompt; an answer counts as correct when it
    contains the expected substring (case-insensitive comparison).
    The email, name, score, and prompt are appended to the Google Sheet;
    a failure there is printed rather than raised, so the evaluation
    result is still returned to the user.

    Returns:
        str: human-readable score summary plus per-question details.
    """
    # Sanitize all user-provided fields before use.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    correct_count = 0
    detail_lines = []
    for entry in EVALUATION_QUESTIONS:
        question = entry["question"]
        expected = entry["expected"]
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # must match the deployed model identifier
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
            )
            answer = completion.choices[0].message["content"].strip()
        except Exception as e:
            # API failures are folded into the answer text so scoring continues.
            answer = f"Error during OpenAI API call: {str(e)}"

        # A simple case-insensitive substring check decides correctness.
        is_correct = expected.lower() in answer.lower()
        if is_correct:
            correct_count += 1
        verdict = "Correct" if is_correct else "Incorrect"

        detail_lines.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )

    result_details = "\n".join(detail_lines)

    # Best-effort logging: never let sheet problems block the response.
    try:
        stamp = datetime.now().isoformat()
        init_sheet().append_row([stamp, email, name, correct_count, system_prompt])
    except Exception as err:
        print("Error writing to Google Sheet:", err)

    return f"Your evaluation score is {correct_count} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"

def build_interface():
    """
    Build and return the Gradio Blocks UI for prompt evaluation.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")

        email_box = gr.Textbox(label="Email", placeholder="[email protected]")
        name_box = gr.Textbox(label="Name", placeholder="Your name")
        prompt_box = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        run_button = gr.Button("Evaluate")
        results_box = gr.Textbox(label="Results", lines=15)

        # Wire the button to the evaluator; outputs land in the results box.
        run_button.click(
            fn=evaluate_prompt,
            inputs=[email_box, name_box, prompt_box],
            outputs=results_box
        )
    return demo

if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    # Port 7860 is Gradio's conventional default.
    interface.launch(server_name="0.0.0.0", server_port=7860)