import os
import time

import gradio as gr
from openai import OpenAI

# Initialize the OpenAI client (requires OPENAI_API_KEY in the environment)
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=OPENAI_API_KEY)

def chat_with_gpt(message, history, system_prompt):
    """Send the message with full conversation context to OpenAI and update history."""
    # Convert Gradio history format to OpenAI messages format
    messages = []

    # Add system prompt if provided
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})

    # Add conversation history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Add current user message
    messages.append({"role": "user", "content": message})

    try:
        # Call OpenAI API with full conversation context
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # or "gpt-4o" for better quality
            messages=messages,
            max_tokens=500,
            temperature=0.7,
            top_p=0.9
        )

        assistant_response = response.choices[0].message.content

        # Update history with new exchange
        history.append((message, assistant_response))

        return history, ""  # Return updated history and clear input

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        history.append((message, error_msg))
        return history, ""

def evaluate_conversation(history, system_prompt, evaluation_metrics, progress=gr.Progress()):
    """Ask a stronger model to score the conversation against the given criteria."""
    if not history:
        return "❌ No conversation to evaluate. Please have a conversation first."

    # Initialize progress
    progress(0, desc="Starting evaluation...")
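    # (With queuing enabled, gr.Progress renders a progress bar over the
    #  output component while this handler runs; the fractions below are
    #  rough stage markers, not measured progress.)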

    # Prepare the conversation transcript
    progress(0.2, desc="Preparing conversation transcript...")
    conversation_text = ""
    if system_prompt.strip():
        conversation_text += f"System Prompt: {system_prompt}\n\n"

    conversation_text += "Conversation:\n"
    for i, (user_msg, assistant_msg) in enumerate(history, 1):
        conversation_text += f"Turn {i}:\n"
        conversation_text += f"User: {user_msg}\n"
        conversation_text += f"Assistant: {assistant_msg}\n\n"

    # Create evaluation prompt
    progress(0.4, desc="Crafting evaluation prompt...")
    evaluation_prompt = f"""Please evaluate the following conversation based on these specific criteria:

{evaluation_metrics}

CONVERSATION TO EVALUATE:
{conversation_text}

Please provide a detailed evaluation report that:
1. Scores each criterion on a scale of 1-10
2. Provides specific examples from the conversation to support your scores
3. Offers constructive feedback for improvement
4. Gives an overall assessment

Format your response clearly with headings for each evaluation criterion."""

    try:
        # Call OpenAI API for evaluation
        progress(0.6, desc="Sending request to OpenAI...")
        response = client.chat.completions.create(
            model="gpt-4o",  # Use better model for evaluation
            messages=[
                {"role": "system", "content": "You are an expert conversation analyst. Provide thorough, objective evaluations with specific examples and actionable feedback."},
                {"role": "user", "content": evaluation_prompt}
            ],
            max_tokens=1000,
            temperature=0.3  # Lower temperature for more consistent evaluation
        )

        progress(0.9, desc="Processing evaluation results...")

        # Format the response nicely
        evaluation_result = f"""# πŸ“Š Conversation Evaluation Report

{response.choices[0].message.content}

---
*Evaluation completed at {time.strftime('%Y-%m-%d %H:%M:%S')}*
*Conversation length: {len(history)} exchanges*
"""

        progress(1.0, desc="Evaluation complete!")
        return evaluation_result

    except Exception as e:
        progress(1.0, desc="Evaluation failed")
        return f"❌ **Error during evaluation:** {str(e)}"

def start_evaluation():
    """Return initial evaluation status"""
    return "πŸ”„ **Evaluating conversation...** \n\nPlease wait while we analyze your conversation. This may take 10-30 seconds depending on conversation length."

def reset_conversation():
    """Clear the chat history, the message input, and the evaluation report."""
    return [], "", "No evaluation yet. Have a conversation and click 'Evaluate' to see detailed feedback."

def load_preset_prompt(preset):
    """Load predefined system prompts"""
    presets = {
        "General Assistant": "You are a helpful, knowledgeable, and friendly AI assistant.",
        "Therapist": "You are a compassionate and professional therapist. Provide supportive, empathetic responses while maintaining appropriate boundaries. Ask thoughtful questions to help the user explore their feelings.",
        "Distressed Teen Jen": "You are a distressed 14 year old teenager dealing with typical teenage problems like school stress, peer pressure, and family issues. Respond with the emotional intensity and perspective of a troubled teen seeking help.",
        "Distressed Teen John": "You are a distressed 15 year old teenager dealing with typical teenage problems like school stress, peer pressure, and family issues. Respond with the emotional intensity and perspective of a troubled teen seeking help.",
        "Technical Expert": "You are a technical expert with deep knowledge in programming, engineering, and technology. Provide detailed, accurate technical explanations and solutions.",
        "Creative Writer": "You are a creative writing assistant. Help with storytelling, character development, plot ideas, and provide creative inspiration with vivid descriptions.",
        "Custom": ""
    }
    return presets.get(preset, "")

# Default evaluation metrics
default_evaluation = """Please evaluate the conversation according to:

1) **Coherence**: How logically consistent and well-structured are the responses? Do they flow naturally from one turn to the next?

2) **Relevance**: How well do the assistant's responses address the user's specific questions, needs, and context?

3) **Engagement**: How natural, conversational, and engaging is the interaction? Does it feel like a meaningful dialogue?

4) **Helpfulness**: How useful and actionable are the assistant's responses? Do they provide value to the user?

5) **Role Consistency**: How well does the assistant maintain its assigned role/persona throughout the conversation? Are there any character breaks?"""

# Create the Gradio interface
with gr.Blocks(title="OpenAI Chatbot with Evaluation") as demo:
    gr.Markdown("# OpenAI Chatbot with Conversation Evaluation")

    with gr.Row():
        # Left sidebar for system prompt and evaluation configuration
        with gr.Column(scale=1, min_width=350):
            gr.Markdown("## System Configuration")

            # Preset dropdown
            preset_dropdown = gr.Dropdown(
                choices=["General Assistant", "Therapist", "Distressed Teen Jen", "Distressed Teen John", "Technical Expert", "Creative Writer", "Custom"],
                value="General Assistant",
                label="Quick Presets",
                info="Select a preset or choose 'Custom' to write your own"
            )

            # System prompt textbox
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Enter system instructions here...",
                value="You are a helpful, knowledgeable, and friendly AI assistant.",
                lines=4,
                info="This guides the AI's behavior and personality"
            )

            gr.Markdown("## Evaluation Configuration")

            # Evaluation metrics textbox
            evaluation_metrics = gr.Textbox(
                label="Evaluation Metrics",
                placeholder="Enter evaluation criteria here...",
                value=default_evaluation,
                lines=8,
                info="Customize how you want the conversation to be evaluated"
            )

            gr.Markdown("### Usage")
            gr.Markdown("β€’ Configure system prompt and evaluation criteria")
            gr.Markdown("β€’ Have a conversation with the AI")
            gr.Markdown("β€’ Click 'Evaluate' to get detailed feedback")
            gr.Markdown("β€’ Evaluation takes 10-30 seconds ⏱️")

        # Right side for chat interface
        with gr.Column(scale=2):
            gr.Markdown("**Chat with your configured AI assistant**")

            # Chatbot component to display conversation
            chatbot = gr.Chatbot(
                label="Conversation",
                value=[],
                height=500
            )

            # Input textbox
            msg_input = gr.Textbox(
                label="Your message",
                placeholder="Type your message here...",
                lines=2,
                scale=4
            )

            # Buttons row
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary", scale=1)
                reset_btn = gr.Button("Reset Chat", variant="secondary", scale=1)
                evaluate_btn = gr.Button("πŸ” Evaluate", variant="huggingface", scale=1)

    # Evaluation results section (collapsible)
    with gr.Accordion("πŸ“Š Evaluation Report", open=False) as evaluation_accordion:
        evaluation_output = gr.Markdown(
            value="No evaluation yet. Have a conversation and click 'Evaluate' to see detailed feedback.",
            label="Evaluation Results"
        )

    # Event handlers

    # Load preset prompts
    preset_dropdown.change(
        fn=load_preset_prompt,
        inputs=[preset_dropdown],
        outputs=[system_prompt]
    )

    # Send message
    send_btn.click(
        fn=chat_with_gpt,
        inputs=[msg_input, chatbot, system_prompt],
        outputs=[chatbot, msg_input]
    )

    # Allow Enter key to send message
    msg_input.submit(
        fn=chat_with_gpt,
        inputs=[msg_input, chatbot, system_prompt],
        outputs=[chatbot, msg_input]
    )

    # Evaluate conversation with progress tracking
    evaluate_btn.click(
        fn=start_evaluation,
        inputs=[],
        outputs=[evaluation_output]
    ).then(
        fn=evaluate_conversation,
        inputs=[chatbot, system_prompt, evaluation_metrics],
        outputs=[evaluation_output]
    )
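    # (The .then() chain fires evaluate_conversation only after
    #  start_evaluation returns, so the interim status message appears
    #  in the report panel while the slower API call runs.)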

    # Reset button functionality
    reset_btn.click(
        fn=reset_conversation,
        inputs=[],
        outputs=[chatbot, msg_input, evaluation_output]
    )

# Launch the app (share=True creates a temporary public link).
# queue() lets gr.Progress stream updates to the UI; it is on by default
# in recent Gradio versions but is enabled explicitly here for older ones.
demo.queue()
demo.launch(share=True)