File size: 5,096 Bytes
821e9b3
 
 
5521e44
821e9b3
 
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
821e9b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5521e44
 
821e9b3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import re
from datetime import datetime

import gradio as gr
import openai
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Set OpenAI API key from an environment variable.
# Deliberately uses indexing (not .get) so the app fails fast at import
# time with a KeyError if OPENAI_API_KEY is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]

def get_evaluation_questions():
    """
    Collect test question/answer pairs from environment variables.

    Reads TEST_QUESTION_i / TEST_EXPECTED_i for i = 1, 2, ... and stops
    at the first index where either variable is missing or empty, so the
    pairs must be numbered consecutively starting at 1.

    Returns:
        list[dict]: dicts with "question" and "expected" string keys.
    """
    pairs = []
    index = 1
    while True:
        q = os.environ.get(f"TEST_QUESTION_{index}")
        a = os.environ.get(f"TEST_EXPECTED_{index}")
        if not q or not a:
            return pairs
        pairs.append({"question": q, "expected": a})
        index += 1

# Load the evaluation questions once at module import time; changes to the
# TEST_QUESTION_*/TEST_EXPECTED_* environment variables after startup have
# no effect on this snapshot.
EVALUATION_QUESTIONS = get_evaluation_questions()

def init_sheet():
    """
    Open and return the first worksheet of the evaluation spreadsheet.

    The spreadsheet title is taken from the SHEET_NAME environment
    variable, defaulting to "Prompt Evaluations".

    Requires a service-account key file named credentials.json in the
    working directory, and the target sheet must be shared with that
    service account's email address.
    """
    oauth_scopes = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        "credentials.json", oauth_scopes
    )
    gc = gspread.authorize(credentials)
    title = os.environ.get("SHEET_NAME", "Prompt Evaluations")
    return gc.open(title).sheet1

def sanitize_input(text):
    """
    Sanitizes input to prevent harmful content and limits its length.
    """
    # Allow alphanumerics and some punctuation, then truncate to 500 characters.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]

def evaluate_prompt(email, name, system_prompt):
    """
    Score a candidate system prompt against the evaluation questions.

    Each question is sent to the gpt-4o-mini model alongside the
    user-supplied system prompt; an answer counts as correct when it
    contains the expected substring (case-insensitive comparison).
    The email, name, score, and prompt are appended to the Google Sheet;
    a failure there is printed rather than raised, so the evaluation
    result is still returned to the user.

    Returns:
        str: human-readable score summary plus per-question details.
    """
    # Sanitize all user-provided fields before use.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    correct_count = 0
    detail_lines = []
    for entry in EVALUATION_QUESTIONS:
        question = entry["question"]
        expected = entry["expected"]
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # must match the deployed model identifier
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
            )
            answer = completion.choices[0].message["content"].strip()
        except Exception as e:
            # API failures are folded into the answer text so scoring continues.
            answer = f"Error during OpenAI API call: {str(e)}"

        # A simple case-insensitive substring check decides correctness.
        is_correct = expected.lower() in answer.lower()
        if is_correct:
            correct_count += 1
        verdict = "Correct" if is_correct else "Incorrect"

        detail_lines.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )

    result_details = "\n".join(detail_lines)

    # Best-effort logging: never let sheet problems block the response.
    try:
        stamp = datetime.now().isoformat()
        init_sheet().append_row([stamp, email, name, correct_count, system_prompt])
    except Exception as err:
        print("Error writing to Google Sheet:", err)

    return f"Your evaluation score is {correct_count} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"

def build_interface():
    """
    Build and return the Gradio Blocks UI for prompt evaluation.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")

        email_box = gr.Textbox(label="Email", placeholder="[email protected]")
        name_box = gr.Textbox(label="Name", placeholder="Your name")
        prompt_box = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        run_button = gr.Button("Evaluate")
        results_box = gr.Textbox(label="Results", lines=15)

        # Wire the button to the evaluator; outputs land in the results box.
        run_button.click(
            fn=evaluate_prompt,
            inputs=[email_box, name_box, prompt_box],
            outputs=results_box
        )
    return demo

if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    # Port 7860 is Gradio's conventional default.
    interface.launch(server_name="0.0.0.0", server_port=7860)