Spaces:
Running
Running
File size: 5,096 Bytes
821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 5521e44 821e9b3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import os
import re
from datetime import datetime
import gradio as gr
import openai
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Set OpenAI API key from an environment variable.
# NOTE: uses indexed access, so a missing OPENAI_API_KEY raises KeyError at
# import time — the app refuses to start without a key rather than failing later.
openai.api_key = os.environ["OPENAI_API_KEY"]
def get_evaluation_questions():
    """
    Build the list of evaluation question/answer pairs from the environment.

    Reads TEST_QUESTION_<n> / TEST_EXPECTED_<n> starting at n=1 and stops at
    the first index where either variable is missing or empty, so the pairs
    must be numbered consecutively.

    Returns:
        list[dict]: dicts with "question" and "expected" keys, in index order.
    """
    loaded = []
    index = 1
    while True:
        q = os.environ.get(f"TEST_QUESTION_{index}")
        a = os.environ.get(f"TEST_EXPECTED_{index}")
        # Both halves of the pair must be present and non-empty to continue.
        if q and a:
            loaded.append({"question": q, "expected": a})
            index += 1
        else:
            return loaded
# Load the evaluation questions once at startup.
# Cached module-wide so every request scores against the same test set;
# changes to the TEST_* environment variables after import are not picked up.
EVALUATION_QUESTIONS = get_evaluation_questions()
def init_sheet():
    """
    Open and return the first worksheet of the results spreadsheet.

    The spreadsheet name comes from the SHEET_NAME environment variable and
    falls back to "Prompt Evaluations". Authentication uses the service
    account key file "credentials.json" in the working directory; the sheet
    must be shared with that service account's email address.

    Returns:
        gspread worksheet: the spreadsheet's first sheet (``sheet1``).
    """
    oauth_scopes = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive"
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        "credentials.json", oauth_scopes
    )
    gs_client = gspread.authorize(credentials)
    target_name = os.environ.get("SHEET_NAME", "Prompt Evaluations")
    return gs_client.open(target_name).sheet1
def sanitize_input(text):
    """
    Remove disallowed characters from *text* and cap its length.

    Keeps letters, digits, whitespace, and the punctuation . , ! ? @ : -
    (everything else is dropped), strips surrounding whitespace, and
    truncates the result to at most 500 characters.

    Args:
        text: the raw user-supplied string.

    Returns:
        str: the sanitized, length-limited string.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s.,!?@:\-]", "", text).strip()
    return cleaned[:500]
def evaluate_prompt(email, name, system_prompt):
    """
    Score a candidate system prompt against the loaded evaluation questions.

    Each question is sent to the "gpt-4o-mini" chat model together with the
    (sanitized) system prompt; an answer counts as correct when it contains
    the expected substring, compared case-insensitively. The timestamp,
    email, name, score, and prompt are appended to the Google Sheet — a
    failure there is printed but does not abort the evaluation — and a
    human-readable summary of every question/answer/verdict is returned.

    Args:
        email: submitter's email address (sanitized before use).
        name: submitter's name (sanitized before use).
        system_prompt: the system prompt under evaluation (sanitized).

    Returns:
        str: score summary followed by the per-question details.
    """
    # All three inputs are user-controlled; sanitize before logging or sending.
    email = sanitize_input(email)
    name = sanitize_input(name)
    system_prompt = sanitize_input(system_prompt)

    score = 0
    details = []
    for test_case in EVALUATION_QUESTIONS:
        q = test_case["question"]
        expected = test_case["expected"]
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-4o-mini",  # Ensure this identifier matches the deployed model.
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": q}
                ]
            )
            answer = completion.choices[0].message["content"].strip()
        except Exception as e:
            # Keep going on API failure; the error text stands in for the answer.
            answer = f"Error during OpenAI API call: {str(e)}"
        # Simple evaluation: case-insensitive substring match against expected.
        matched = expected.lower() in answer.lower()
        if matched:
            score += 1
        verdict = "Correct" if matched else "Incorrect"
        details.append(
            f"Question: {q}\n"
            f"Answer: {answer}\n"
            f"Expected: {expected}\n"
            f"Result: {verdict}\n"
        )
    result_details = "\n".join(details)
    # Best-effort logging: a sheet failure should not lose the user's result.
    try:
        sheet = init_sheet()
        timestamp = datetime.now().isoformat()
        sheet.append_row([timestamp, email, name, score, system_prompt])
    except Exception as err:
        print("Error writing to Google Sheet:", err)
    return f"Your evaluation score is {score} out of {len(EVALUATION_QUESTIONS)}.\n\nDetails:\n{result_details}"
def build_interface():
    """
    Assemble the Gradio Blocks UI for the prompt evaluator.

    Returns:
        gr.Blocks: the app with email/name/prompt inputs, an Evaluate button
        wired to ``evaluate_prompt``, and a results textbox.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# GPT-4o Mini Prompt Evaluation")
        gr.Markdown("Enter your email, name, and a system prompt below:")
        email_box = gr.Textbox(label="Email", placeholder="[email protected]")
        name_box = gr.Textbox(label="Name", placeholder="Your name")
        prompt_box = gr.Textbox(
            label="System Prompt",
            placeholder="Enter your system prompt here...",
            lines=6
        )
        run_button = gr.Button("Evaluate")
        results_box = gr.Textbox(label="Results", lines=15)
        # Clicking the button runs the evaluation and shows the summary text.
        run_button.click(
            fn=evaluate_prompt,
            inputs=[email_box, name_box, prompt_box],
            outputs=results_box
        )
    return demo
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside a container.
    app = build_interface()
    app.launch(server_name="0.0.0.0", server_port=7860)
|