File size: 3,937 Bytes
821e9b3
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
07b4a92
821e9b3
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
 
821e9b3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import re
from datetime import datetime

import gradio as gr
import openai

# Set OpenAI API key from an environment variable.
# NOTE: os.environ[...] (not .get) is used deliberately, so the app fails fast
# with a KeyError at import time if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

def get_evaluation_questions():
    """
    Read evaluation question/answer pairs from environment variables.

    Pairs are scanned sequentially as TEST_QUESTION_1/TEST_EXPECTED_1,
    TEST_QUESTION_2/TEST_EXPECTED_2, ... and collection stops at the first
    index where either member of the pair is missing or empty.

    Returns a list of dicts with "question" and "expected" keys.
    """
    pairs = []
    index = 1
    while True:
        q = os.environ.get(f"TEST_QUESTION_{index}")
        a = os.environ.get(f"TEST_EXPECTED_{index}")
        if not (q and a):
            # First gap in the numbered sequence ends the scan.
            break
        pairs.append({"question": q, "expected": a})
        index += 1
    return pairs

# Load the evaluation questions once at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()

def sanitize_input(text):
    """
    Strip disallowed characters from *text* and cap its length.

    Only letters, digits, whitespace and the punctuation . , ! ? @ : - are
    kept; everything else is removed. The surviving text is stripped of
    leading/trailing whitespace and truncated to 500 characters.
    """
    allowed_only = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    trimmed = allowed_only.strip()
    return trimmed[:500]

def evaluate_prompt(email, name, system_prompt):
    """
    Score a user-supplied system prompt against the evaluation questions.

    Each question in EVALUATION_QUESTIONS is sent to GPT-4o Mini with the
    given system prompt; an answer counts as correct when it contains the
    expected substring (case-insensitive). All three inputs are sanitized
    before use.

    Returns a human-readable summary string with the score and per-question
    details.
    """
    # Sanitize the inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    correct_count = 0
    report_sections = []
    for entry in EVALUATION_QUESTIONS:
        question, expected = entry["question"], entry["expected"]
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
            )
            answer = completion.choices[0].message["content"].strip()
        except Exception as e:
            # API failures are reported inline rather than aborting the run.
            answer = f"Error during OpenAI API call: {str(e)}"

        # Simple evaluation: check if the answer contains the expected substring.
        matched = expected.lower() in answer.lower()
        if matched:
            correct_count += 1
        verdict = "Correct" if matched else "Incorrect"

        report_sections.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )

    result_details = "\n".join(report_sections)

    return f"Your evaluation score is {correct_count} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"

def build_interface():
    """
    Assemble and return the Gradio Blocks UI for the evaluator.

    The layout is: two headings, three text inputs (email, name, system
    prompt), an Evaluate button, and a results textbox wired to
    evaluate_prompt.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")

        email_box = gr.Textbox(label="Email", placeholder="[email protected]")
        name_box = gr.Textbox(label="Name", placeholder="Your name")
        prompt_box = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        run_button = gr.Button("Evaluate")
        results_box = gr.Textbox(label="Results", lines=15)

        # Wire the button to the evaluation routine.
        run_button.click(
            fn=evaluate_prompt,
            inputs=[email_box, name_box, prompt_box],
            outputs=results_box
        )
    return demo

# Script entry point: construct the Gradio UI and start the web server.
if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)