|
import os |
|
|
|
import gradio as gr |
|
from openai import OpenAI |
|
import time |
|
|
|
|
|
# Read the OpenAI key from the environment at import time; a missing key
# raises KeyError immediately (fail fast) rather than on the first request.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Shared OpenAI client used by both the chat and evaluation callbacks below.
client = OpenAI(api_key = OPENAI_API_KEY)
|
|
|
def chat_with_gpt(message, history, system_prompt):
    """Send *message* to the OpenAI chat API and append the exchange to *history*.

    Gradio callback for both the Send button and textbox submit.

    Parameters:
        message: The user's new message text.
        history: Prior (user, assistant) exchanges in Gradio tuple format.
        system_prompt: Optional system instructions; skipped when blank.

    Returns:
        (history, "") — the updated history and an empty string that clears
        the message input box.
    """
    # Ignore empty / whitespace-only submissions instead of wasting an API
    # call and appending a degenerate turn to the history.
    if not message or not message.strip():
        return history, ""

    messages = []

    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})

    # Replay prior turns so the model has the full conversational context.
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=500,
            temperature=0.7,
            top_p=0.9,
        )

        assistant_response = response.choices[0].message.content
        history.append((message, assistant_response))
        return history, ""

    except Exception as e:
        # Surface API failures inside the chat window rather than crashing
        # the UI; the error text becomes the assistant's reply for this turn.
        error_msg = f"Error: {str(e)}"
        history.append((message, error_msg))
        return history, ""
|
|
|
def _format_transcript(history, system_prompt):
    """Render the (user, assistant) history as a plain-text transcript.

    The system prompt (when non-blank) is included at the top so the judge
    model can score role consistency against it.
    """
    transcript = ""
    if system_prompt.strip():
        transcript += f"System Prompt: {system_prompt}\n\n"

    transcript += "Conversation:\n"
    for i, (user_msg, assistant_msg) in enumerate(history, 1):
        transcript += f"Turn {i}:\n"
        transcript += f"User: {user_msg}\n"
        transcript += f"Assistant: {assistant_msg}\n\n"
    return transcript


def evaluate_conversation(history, system_prompt, evaluation_metrics, progress=gr.Progress()):
    """Ask a stronger model (gpt-4o) to grade the conversation against the metrics.

    Parameters:
        history: (user, assistant) exchanges from the chatbot component.
        system_prompt: The system prompt the assistant was running under.
        evaluation_metrics: Free-text rubric the judge is asked to score against.
        progress: Injected by Gradio to drive the progress bar
            (the gr.Progress() default is Gradio's injection sentinel).

    Returns:
        A Markdown evaluation report, or an error message string on failure.
    """
    if not history:
        return "β No conversation to evaluate. Please have a conversation first."

    progress(0, desc="Starting evaluation...")

    progress(0.2, desc="Preparing conversation transcript...")
    conversation_text = _format_transcript(history, system_prompt)

    progress(0.4, desc="Crafting evaluation prompt...")
    evaluation_prompt = f"""Please evaluate the following conversation based on these specific criteria:

{evaluation_metrics}

CONVERSATION TO EVALUATE:
{conversation_text}

Please provide a detailed evaluation report that:
1. Scores each criterion on a scale of 1-10
2. Provides specific examples from the conversation to support your scores
3. Offers constructive feedback for improvement
4. Gives an overall assessment

Format your response clearly with headings for each evaluation criterion."""

    try:
        progress(0.6, desc="Sending request to OpenAI...")
        # Low temperature for reproducible grading; larger token budget than
        # the chat calls because the report is long-form.
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert conversation analyst. Provide thorough, objective evaluations with specific examples and actionable feedback."},
                {"role": "user", "content": evaluation_prompt}
            ],
            max_tokens=1000,
            temperature=0.3
        )

        progress(0.9, desc="Processing evaluation results...")

        evaluation_result = f"""# π Conversation Evaluation Report

{response.choices[0].message.content}

---
*Evaluation completed at {time.strftime('%Y-%m-%d %H:%M:%S')}*
*Conversation length: {len(history)} exchanges*
"""

        progress(1.0, desc="Evaluation complete!")
        return evaluation_result

    except Exception as e:
        # Report the failure into the evaluation panel instead of raising.
        progress(1.0, desc="Evaluation failed")
        return f"β **Error during evaluation:** {str(e)}"
|
|
|
def start_evaluation():
    """Produce the interim status message shown while the evaluation runs."""
    status = (
        "π **Evaluating conversation...** \n\n"
        "Please wait while we analyze your conversation. "
        "This may take 10-30 seconds depending on conversation length."
    )
    return status
|
|
|
def reset_conversation():
    """Clear the chat history, the message input, and the evaluation panel."""
    empty_history = []
    cleared_input = ""
    placeholder = "No evaluation yet. Have a conversation and click 'Evaluate' to see detailed feedback."
    return empty_history, cleared_input, placeholder
|
|
|
def load_preset_prompt(preset):
    """Look up the canned system prompt for a preset name.

    Unknown names — and the "Custom" entry — map to an empty string so the
    user can type their own prompt into the textbox.
    """
    prompt_library = {
        "General Assistant": "You are a helpful, knowledgeable, and friendly AI assistant.",
        "Therapist": "You are a compassionate and professional therapist. Provide supportive, empathetic responses while maintaining appropriate boundaries. Ask thoughtful questions to help the user explore their feelings.",
        "Distressed Teen Jen": "You are a distressed 14 year old teenager dealing with typical teenage problems like school stress, peer pressure, and family issues. Respond with the emotional intensity and perspective of a troubled teen seeking help.",
        "Distressed Teen John": "You are a distressed 15 year old teenager dealing with typical teenage problems like school stress, peer pressure, and family issues. Respond with the emotional intensity and perspective of a troubled teen seeking help.",
        "Technical Expert": "You are a technical expert with deep knowledge in programming, engineering, and technology. Provide detailed, accurate technical explanations and solutions.",
        "Creative Writer": "You are a creative writing assistant. Help with storytelling, character development, plot ideas, and provide creative inspiration with vivid descriptions.",
        "Custom": "",
    }
    return prompt_library[preset] if preset in prompt_library else ""
|
|
|
|
|
# Default judge rubric pre-filled into the "Evaluation Metrics" textbox;
# the user can edit it in the UI before running an evaluation. This is a
# runtime string shown to (and sent on behalf of) the user — do not reword
# casually.
default_evaluation = """Please evaluate the conversation according to:

1) **Coherence**: How logically consistent and well-structured are the responses? Do they flow naturally from one turn to the next?

2) **Relevance**: How well do the assistant's responses address the user's specific questions, needs, and context?

3) **Engagement**: How natural, conversational, and engaging is the interaction? Does it feel like a meaningful dialogue?

4) **Helpfulness**: How useful and actionable are the assistant's responses? Do they provide value to the user?

5) **Role Consistency**: How well does the assistant maintain its assigned role/persona throughout the conversation? Are there any character breaks?"""
|
|
|
|
|
# Build the Gradio UI: a configuration column (left), a chat column (right),
# a collapsible evaluation report, and all of the event wiring.
with gr.Blocks(title="OpenAI Chatbot with Evaluation") as demo:
    gr.Markdown("# OpenAI Chatbot with Conversation Evaluation")

    with gr.Row():

        # Left column: preset picker, system prompt, and evaluation rubric.
        with gr.Column(scale=1, min_width=350):
            gr.Markdown("## System Configuration")

            preset_dropdown = gr.Dropdown(
                choices=["General Assistant", "Therapist", "Distressed Teen Jen", "Distressed Teen John", "Technical Expert", "Creative Writer", "Custom"],
                value="General Assistant",
                label="Quick Presets",
                info="Select a preset or choose 'Custom' to write your own"
            )

            # Default value matches the "General Assistant" preset selected above.
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Enter system instructions here...",
                value="You are a helpful, knowledgeable, and friendly AI assistant.",
                lines=4,
                info="This guides the AI's behavior and personality"
            )

            gr.Markdown("## Evaluation Configuration")

            # Pre-filled with the module-level default_evaluation rubric.
            evaluation_metrics = gr.Textbox(
                label="Evaluation Metrics",
                placeholder="Enter evaluation criteria here...",
                value=default_evaluation,
                lines=8,
                info="Customize how you want the conversation to be evaluated"
            )

            gr.Markdown("### Usage")
            gr.Markdown("β’ Configure system prompt and evaluation criteria")
            gr.Markdown("β’ Have a conversation with the AI")
            gr.Markdown("β’ Click 'Evaluate' to get detailed feedback")
            gr.Markdown("β’ Evaluation takes 10-30 seconds β±οΈ")

        # Right column: chat display, message input, and the action buttons.
        with gr.Column(scale=2):
            gr.Markdown("**Chat with your configured AI assistant**")

            chatbot = gr.Chatbot(
                label="Conversation",
                value=[],
                height=500
            )

            msg_input = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                scale=4
            )

            with gr.Row():
                send_btn = gr.Button("Send", variant="primary", scale=1)
                reset_btn = gr.Button("Reset Chat", variant="secondary", scale=1)
                evaluate_btn = gr.Button("π Evaluate", variant="huggingface", scale=1)

            # Collapsed by default; opened by the user to read the report.
            with gr.Accordion("π Evaluation Report", open=False) as evaluation_accordion:
                evaluation_output = gr.Markdown(
                    value="No evaluation yet. Have a conversation and click 'Evaluate' to see detailed feedback.",
                    label="Evaluation Results"
                )

    # Selecting a preset overwrites the system-prompt textbox.
    preset_dropdown.change(
        fn=load_preset_prompt,
        inputs=[preset_dropdown],
        outputs=[system_prompt]
    )

    # Sending via button click or pressing Enter in the textbox are wired to
    # the same handler: both update the chat history and clear the input.
    send_btn.click(
        fn=chat_with_gpt,
        inputs=[msg_input, chatbot, system_prompt],
        outputs=[chatbot, msg_input]
    )

    msg_input.submit(
        fn=chat_with_gpt,
        inputs=[msg_input, chatbot, system_prompt],
        outputs=[chatbot, msg_input]
    )

    # Two-step evaluation: show an interim "working" status immediately,
    # then run the slow evaluation and replace the status with the report.
    evaluate_btn.click(
        fn=start_evaluation,
        inputs=[],
        outputs=[evaluation_output]
    ).then(
        fn=evaluate_conversation,
        inputs=[chatbot, system_prompt, evaluation_metrics],
        outputs=[evaluation_output]
    )

    # Reset clears the chat, the input box, and the evaluation panel.
    reset_btn.click(
        fn=reset_conversation,
        inputs=[],
        outputs=[chatbot, msg_input, evaluation_output]
    )

# NOTE(review): share=True publishes a public Gradio link — anyone with the
# URL can chat on this API key's spend; confirm that exposure is intended.
demo.launch(share=True)