File size: 3,937 Bytes
821e9b3
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
07b4a92
821e9b3
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
 
821e9b3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import re
from datetime import datetime

import gradio as gr
import openai

# Set OpenAI API key from an environment variable.
# NOTE: os.environ[...] (not .get) is used deliberately, so the app fails fast
# with a KeyError at import time if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

def get_evaluation_questions():
    """
    Read evaluation question/answer pairs from environment variables.

    Pairs are scanned sequentially as TEST_QUESTION_1/TEST_EXPECTED_1,
    TEST_QUESTION_2/TEST_EXPECTED_2, ... and collection stops at the first
    index where either member of the pair is missing or empty.

    Returns a list of dicts with "question" and "expected" keys.
    """
    pairs = []
    index = 1
    while True:
        q = os.environ.get(f"TEST_QUESTION_{index}")
        a = os.environ.get(f"TEST_EXPECTED_{index}")
        if not (q and a):
            # First gap in the numbered sequence ends the scan.
            break
        pairs.append({"question": q, "expected": a})
        index += 1
    return pairs

# Load the evaluation questions once at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()

def sanitize_input(text):
    """
    Strip disallowed characters from *text* and cap its length.

    Only letters, digits, whitespace and the punctuation . , ! ? @ : - are
    kept; everything else is removed. The surviving text is stripped of
    leading/trailing whitespace and truncated to 500 characters.
    """
    allowed_only = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    trimmed = allowed_only.strip()
    return trimmed[:500]

def evaluate_prompt(email, name, system_prompt):
    """
    Score a user-supplied system prompt against the evaluation questions.

    Each question in EVALUATION_QUESTIONS is sent to GPT-4o Mini with the
    given system prompt; an answer counts as correct when it contains the
    expected substring (case-insensitive). All three inputs are sanitized
    before use.

    Returns a human-readable summary string with the score and per-question
    details.
    """
    # Sanitize the inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    correct_count = 0
    report_sections = []
    for entry in EVALUATION_QUESTIONS:
        question, expected = entry["question"], entry["expected"]
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
            )
            answer = completion.choices[0].message["content"].strip()
        except Exception as e:
            # API failures are reported inline rather than aborting the run.
            answer = f"Error during OpenAI API call: {str(e)}"

        # Simple evaluation: check if the answer contains the expected substring.
        matched = expected.lower() in answer.lower()
        if matched:
            correct_count += 1
        verdict = "Correct" if matched else "Incorrect"

        report_sections.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )

    result_details = "\n".join(report_sections)

    return f"Your evaluation score is {correct_count} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"

def build_interface():
    """
    Assemble and return the Gradio Blocks UI for the evaluator.

    The layout is: two headings, three text inputs (email, name, system
    prompt), an Evaluate button, and a results textbox wired to
    evaluate_prompt.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")

        email_box = gr.Textbox(label="Email", placeholder="[email protected]")
        name_box = gr.Textbox(label="Name", placeholder="Your name")
        prompt_box = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        run_button = gr.Button("Evaluate")
        results_box = gr.Textbox(label="Results", lines=15)

        # Wire the button to the evaluation routine.
        run_button.click(
            fn=evaluate_prompt,
            inputs=[email_box, name_box, prompt_box],
            outputs=results_box
        )
    return demo

# Script entry point: construct the Gradio UI and start the web server.
if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)