import os
import re
from datetime import datetime
import gradio as gr
import openai
# Set OpenAI API key from an environment variable.
openai.api_key = os.environ["OPENAI_API_KEY"]
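# Note: this module uses the legacy (pre-1.0) openai Python SDK interface
# (openai.api_key and openai.ChatCompletion); pinning openai<1.0 is assumed here.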


def get_evaluation_questions():
    """
    Loads evaluation questions and expected answers from environment variables.
    Expected environment variable names are:
        TEST_QUESTION_1, TEST_EXPECTED_1,
        TEST_QUESTION_2, TEST_EXPECTED_2, and so on.
    """
    questions = []
    i = 1
    while True:
        question = os.environ.get(f"TEST_QUESTION_{i}")
        expected = os.environ.get(f"TEST_EXPECTED_{i}")
        if not question or not expected:
            break
        questions.append({"question": question, "expected": expected})
        i += 1
    return questions
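
# Illustrative example (hypothetical values): with
#   TEST_QUESTION_1="What is the capital of France?" and TEST_EXPECTED_1="Paris",
# get_evaluation_questions() returns
#   [{"question": "What is the capital of France?", "expected": "Paris"}].
# Numbering must be contiguous; the loop stops at the first missing pair.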
# Load the evaluation questions once at startup.
EVALUATION_QUESTIONS = get_evaluation_questions()


def sanitize_input(text):
    """
    Sanitizes input to prevent harmful content and limits its length.
    """
    # Allow alphanumerics and some punctuation, then truncate to 500 characters.
    clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text)
    return clean_text.strip()[:500]
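
# Illustrative example (hypothetical input): sanitize_input("Name: <John_Doe>!")
# returns "Name: JohnDoe!" -- the angle brackets and underscore are stripped,
# while letters, digits, whitespace, and . , ! ? @ : - are preserved.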


def evaluate_prompt(email, name, system_prompt):
    """
    For each test question:
      - Uses the provided system prompt to generate a response with GPT-4o Mini.
      - Checks if the expected substring is present.
      - Computes an aggregate score.
    Returns the evaluation results as a string.
    """
    # Sanitize the inputs.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    score = 0
    responses = []
    for item in EVALUATION_QUESTIONS:
        question = item["question"]
        expected = item["expected"]
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
            )
            answer = response.choices[0].message["content"].strip()
        except Exception as e:
            answer = f"Error during OpenAI API call: {str(e)}"
        # Simple evaluation: check if the answer contains the expected substring.
        if expected.lower() in answer.lower():
            score += 1
            verdict = "Correct"
        else:
            verdict = "Incorrect"
        responses.append(
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )
    result_details = "\n".join(responses)
    return f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"


def build_interface():
    """
    Constructs the Gradio interface.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")
        email_input = gr.Textbox(label="Email", placeholder="[email protected]")
        name_input = gr.Textbox(label="Name", placeholder="Your name")
        system_prompt_input = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        eval_button = gr.Button("Evaluate")
        output_text = gr.Textbox(label="Results", lines=15)
        eval_button.click(
            fn=evaluate_prompt,
            inputs=[email_input, name_input, system_prompt_input],
            outputs=output_text
        )
    return demo


if __name__ == "__main__":
    interface = build_interface()
    # Launch the app on 0.0.0.0 so it is accessible externally (e.g., in a container).
    interface.launch(server_name="0.0.0.0", server_port=7860)
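
# To run locally (assumed setup): export OPENAI_API_KEY plus at least one
# TEST_QUESTION_N/TEST_EXPECTED_N pair, then run `python app.py` and open
# http://localhost:7860 in a browser.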