import os
import re
from datetime import datetime
import gradio as gr
import openai
# Set OpenAI API key from an environment variable.
openai.api_key = os.environ["OPENAI_API_KEY"]
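# Note: this module uses the legacy (pre-1.0) openai Python SDK interface
# (openai.api_key and openai.ChatCompletion); pinning openai<1.0 is assumed here.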


def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.
    Expected environment variable names are:
        TEST_QUESTION_1, TEST_EXPECTED_1,
        TEST_QUESTION_2, TEST_EXPECTED_2, and so on.
    """
    questions = []
    i = 1
    while True:
        question = os.environ.get(f"TEST_QUESTION_{i}")
        expected = os.environ.get(f"TEST_EXPECTED_{i}")
        if not question or not expected:
            break
        questions.append({"question": question, "expected": expected})
        i += 1
    return questions
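
# Illustrative example (hypothetical values): with
#   TEST_QUESTION_1="What is the capital of France?" and TEST_EXPECTED_1="Paris",
# get_evaluation_questions() returns
#   [{"question": "What is the capital of France?", "expected": "Paris"}].
# Numbering must be contiguous; the loop stops at the first missing pair.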
# Load the evaluation questions once at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()


def sanitize_input(text):
    """
    Sanitizes input to prevent harmful content and limits its length.
    """
    # Allow alphanumerics and some punctuation, then truncate to 500 characters.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]
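
# Illustrative example (hypothetical input): sanitize_input("Name: <John_Doe>!")
# returns "Name: JohnDoe!" -- the angle brackets and underscore are stripped,
# while letters, digits, whitespace, and . , ! ? @ : - are preserved.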


def evaluate_prompt(email, name, system_prompt):
    """
    For each test question:
      - Uses the provided system prompt to generate a response with GPT-4o Mini.
      - Checks if the expected substring is present.
      - Computes an aggregate score.
    Returns the evaluation results as a string.
    """
    # Sanitize the inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    score = 0
    responses = []
    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
            )
            answer = response.choices[0].message["content"].strip()
        except Exception as e:
            answer = f"Error during OpenAI API call: {str(e)}"
        # Simple evaluation: check if the answer contains the expected substring.
        if expected.lower() in answer.lower():
            score += 1
            verdict = "Correct"
        else:
            verdict = "Incorrect"
        responses.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )
    result_details = "\n".join(responses)
    return f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"


def build_interface():
    """
    Constructs the Gradio interface.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")
        email_input = gr.Textbox(label="Email", placeholder="[email protected]")
        name_input = gr.Textbox(label="Name", placeholder="Your name")
        system_prompt_input = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        eval_button = gr.Button("Evaluate")
        output_text = gr.Textbox(label="Results", lines=15)
        eval_button.click(
            fn=evaluate_prompt,
            inputs=[email_input, name_input, system_prompt_input],
            outputs=output_text
        )
    return demo


if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)
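
# To run locally (assumed setup): export OPENAI_API_KEY plus at least one
# TEST_QUESTION_N/TEST_EXPECTED_N pair, then run `python app.py` and open
# http://localhost:7860 in a browser.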