import os
import re

import gradio as gr
from openai import OpenAI

# Create the OpenAI client (requires the openai>=1.0 SDK, which is needed for
# gpt-4o-mini). The client reads the OPENAI_API_KEY environment variable by default.
client = OpenAI()


def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.

    Expected environment variable names are:
    TEST_QUESTION_1, TEST_EXPECTED_1, TEST_QUESTION_2, TEST_EXPECTED_2, and so on.
    """
    questions = []
    i = 1
    while True:
        question = os.environ.get(f"TEST_QUESTION_{i}")
        expected = os.environ.get(f"TEST_EXPECTED_{i}")
        if not question or not expected:
            break
        questions.append({"question": question, "expected": expected})
        i += 1
    return questions


# Load the evaluation questions once at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()


def sanitize_input(text):
    """
    Sanitizes input to prevent harmful content and limits its length.
    """
    # Allow alphanumerics and some punctuation, then truncate to 500 characters.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]


def evaluate_prompt(email, name, system_prompt):
    """
    For each test question:
      - Uses the provided system prompt to generate a response with GPT-4o Mini.
      - Checks whether the expected substring is present in the answer.
      - Computes an aggregate score.

    Returns the evaluation results as a string.
    """
    # Sanitize the inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    score = 0
    responses = []

    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
            )
            answer = response.choices[0].message.content.strip()
        except Exception as e:
            answer = f"Error during OpenAI API call: {e}"

        # Simple evaluation: check if the answer contains the expected substring.
        if expected.lower() in answer.lower():
            score += 1
            verdict = "Correct"
        else:
            verdict = "Incorrect"

        responses.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )

    result_details = "\n".join(responses)
    return (
        f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\n"
        f"Details:\n{result_details}"
    )


def build_interface():
    """
    Constructs the Gradio interface.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")
        email_input = gr.Textbox(label="Email", placeholder="your.email@example.com")
        name_input = gr.Textbox(label="Name", placeholder="Your name")
        system_prompt_input = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6,
        )
        eval_button = gr.Button("Evaluate")
        output_text = gr.Textbox(label="Results", lines=15)
        eval_button.click(
            fn=evaluate_prompt,
            inputs=[email_input, name_input, system_prompt_input],
            outputs=output_text,
        )
    return demo


if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)
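
# ---------------------------------------------------------------------------
# Example environment configuration (illustrative only): a minimal sketch of
# the variables this script reads, assuming two test cases. The question and
# expected-answer values below are made-up placeholders, not part of the
# original script. Because EVALUATION_QUESTIONS is populated at import time,
# these must be set before the script starts, e.g. in the shell or container:
#
#   export OPENAI_API_KEY="sk-..."
#   export TEST_QUESTION_1="What is the capital of France?"
#   export TEST_EXPECTED_1="Paris"
#   export TEST_QUESTION_2="How many days are in a leap year?"
#   export TEST_EXPECTED_2="366"
#
# With these set, get_evaluation_questions() returns two entries and each
# Evaluate click scores the submitted system prompt out of 2.
# ---------------------------------------------------------------------------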