Spaces:

AtlaAI
/

judge-arena

Running

App Files Files Community

kaikaidai committed on Nov 14, 2024

Commit

d4256bf

verified ·

1 Parent(s): 44387c3

13-14 Nov changes

Browse files

Files changed (1) hide show

app.py +201 -299

app.py CHANGED Viewed

@@ -4,13 +4,19 @@ import random
 from collections import defaultdict
 from datetime import datetime, timezone
 import hashlib
 from dotenv import load_dotenv
 load_dotenv()
 import gradio as gr
-from gen_api_answer import get_model_response, parse_model_response, get_random_human_ai_pair
 from db import add_vote, create_db_connection, get_votes
 from utils import Vote
 from common import (
@@ -26,12 +32,16 @@ from common import (
     EVAL_DESCRIPTION,
     VOTING_HEADER,
 )
-from example_metrics import EXAMPLE_METRICS
-# Model and ELO score data
-DEFAULT_ELO = 1200  # Starting ELO for new models
-K_FACTOR = 32  # Standard chess K-factor, adjust as needed
 elo_scores = defaultdict(lambda: DEFAULT_ELO)
 vote_counts = defaultdict(int)
@@ -143,6 +153,30 @@ def get_ip(request: gr.Request) -> str:
     return hashlib.sha256(ip.encode()).hexdigest()[:16]
 def vote(
     choice,
     model_a,
@@ -192,16 +226,20 @@ def vote(
     store_vote_data(
         final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
     )
     # Return updates for UI components
     return [
-        gr.update(visible=False),  # vote_a
-        gr.update(visible=False),  # vote_b
-        gr.update(visible=False),  # tie_button_row
         gr.update(value=f"*Model: {model_a}*"),  # model_name_a
         gr.update(value=f"*Model: {model_b}*"),  # model_name_b
-        gr.update(interactive=True, value="Run the evaluators", variant="primary"),  # send_btn
-        gr.update(visible=True),  # spacing_div
     ]
@@ -210,150 +248,24 @@ def get_current_votes():
     return get_votes(db)
-def get_leaderboard(show_preliminary=True):
-    """Generate leaderboard data using fresh votes from MongoDB."""
-    # Get fresh voting data
-    voting_data = get_current_votes()
-    print(f"Fetched {len(voting_data)} votes from database")  # Debug log
-    # Initialize dictionaries for tracking
-    ratings = defaultdict(lambda: DEFAULT_ELO)
-    matches = defaultdict(int)
-    # Process each vote
-    for vote in voting_data:
-        try:
-            model_a = vote.get("model_a")
-            model_b = vote.get("model_b")
-            winner = vote.get("winner")
-            # Skip if models aren't in current model_data
-            if (
-                not all([model_a, model_b, winner])
-                or model_a not in model_data
-                or model_b not in model_data
-            ):
-                continue
-            # Update match counts
-            matches[model_a] += 1
-            matches[model_b] += 1
-            # Calculate ELO changes
-            elo_a = ratings[model_a]
-            elo_b = ratings[model_b]
-            # Expected scores
-            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
-            expected_b = 1 - expected_a
-            # Actual scores
-            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
-            score_b = 1 - score_a
-            # Update ratings
-            ratings[model_a] += K_FACTOR * (score_a - expected_a)
-            ratings[model_b] += K_FACTOR * (score_b - expected_b)
-        except Exception as e:
-            print(f"Error processing vote: {e}")
-            continue
-    # Generate leaderboard data
-    leaderboard = []
-    for model in model_data.keys():
-        votes = matches[model]
-        # Skip models with < 500 votes if show_preliminary is False
-        if not show_preliminary and votes < 500:
-            continue
-        elo = ratings[model]
-        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
-        data = {
-            "Model": model,
-            "ELO Score": f"{int(elo)}",
-            "95% CI": f"±{int(ci)}",
-            "# Votes": votes,
-            "Organization": model_data[model]["organization"],
-            "License": model_data[model]["license"],
-        }
-        leaderboard.append(data)
-    # Sort leaderboard by ELO score in descending order
-    leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
-    return leaderboard
-def calculate_elo_change(rating_a, rating_b, winner):
-    """Calculate ELO rating changes for both players."""
-    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
-    expected_b = 1 - expected_a
-    if winner == "A":
-        score_a, score_b = 1, 0
-    elif winner == "B":
-        score_a, score_b = 0, 1
-    else:  # Handle ties
-        score_a, score_b = 0.5, 0.5
-    change_a = K_FACTOR * (score_a - expected_a)
-    change_b = K_FACTOR * (score_b - expected_b)
-    return change_a, change_b
-def update_leaderboard():
-    """Generate leaderboard DataFrame using fresh votes from MongoDB.
-
-    As written, this only counts how many valid votes each model took part
-    in and logs progress; see the review note at the end of the function.
-    """
-    # Get fresh voting data
     voting_data = get_current_votes()
-    print(f"Found {len(voting_data)} votes in database")
-    matches = defaultdict(int)
-    # Process each vote in the order returned by get_current_votes()
-    for vote in voting_data:
-        # Extract model names from the vote document
-        try:
-            model_a = vote.get("model_a")
-            model_b = vote.get("model_b")
-            winner = vote.get("winner")
-            print(f"Processing vote: {model_a} vs {model_b}, winner: {winner}")
-            # Skip if any required field is missing or models aren't in current model_data
-            if not all([model_a, model_b, winner]):
-                print(f"Missing required fields in vote: {vote}")
-                continue
-            if model_a not in model_data:
-                print(f"Model A '{model_a}' not found in model_data")
-                continue
-            if model_b not in model_data:
-                print(f"Model B '{model_b}' not found in model_data")
-                continue
-            # Update match counts
-            matches[model_a] += 1
-            matches[model_b] += 1
-            print(
-                f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
-            )
-        except Exception as e:
-            # Best effort: a malformed vote is logged and skipped.
-            print(f"Error processing vote: {e}")
-            print(f"Problematic vote data: {vote}")
-            continue
-    # NOTE(review): no return statement is visible, so this returns None even
-    # though the docstring promises a DataFrame - confirm the intended result.
-# Update the display_leaderboard function
-def display_leaderboard():
-    df = update_leaderboard()
-    return gr.DataFrame(
-        value=df,
-        headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
-        datatype=["str", "number", "str", "number", "str", "str", "str"],
-        row_count=(len(df) + 1, "dynamic"),
-    )
 # Update the leaderboard table definition in the UI
@@ -363,63 +275,22 @@ leaderboard_table = gr.Dataframe(
 )
-def get_leaderboard_stats():
-    """Get summary statistics for the leaderboard."""
-    now = datetime.now(timezone.utc)
-    total_votes = len(get_current_votes())
-    total_models = len(model_data)
-    last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
-        "%B %d, %Y at %H:00 UTC"
-    )
-    return f"""
-### Leaderboard Stats
-- **Total Models**: {total_models}
-- **Total Votes**: {total_votes}
-- **Last Updated**: {last_updated}
-"""
-#def set_example_metric(metric_name):
-#    if metric_name == "Custom":
-#        variables = parse_variables(DEFAULT_EVAL_PROMPT)
-#        variable_values = []
-#        for var in variables:
-#            if var == "input":
-#                variable_values.append(DEFAULT_INPUT)
-#            elif var == "response":
-#                variable_values.append(DEFAULT_RESPONSE)
-#            else:
-#                variable_values.append("")  # Default empty value
-        # Pad variable_values to match the length of variable_rows
-#        while len(variable_values) < len(variable_rows):
-#            variable_values.append("")
-#        return [DEFAULT_EVAL_PROMPT] + variable_values
-#    metric_data = EXAMPLE_METRICS[metric_name]
-#    variables = parse_variables(metric_data["prompt"])
-#    variable_values = []
-#    for var in variables:
-#        value = metric_data.get(var, "")  # Default to empty string if not found
-#        variable_values.append(value)
-    # Pad variable_values to match the length of variable_rows
-#    while len(variable_values) < len(variable_rows):
-#        variable_values.append("")
-#    return [metric_data["prompt"]] + variable_values
-# Select random metric at startup
-#  def get_random_metric():
-#    metrics = list(EXAMPLE_METRICS.keys())
-#    return set_example_metric(random.choice(metrics))
 def populate_random_example(request: gr.Request):
-    """Generate a random human-AI conversation example."""
     human_msg, ai_msg = get_random_human_ai_pair()
     return [
         gr.update(value=human_msg),
-        gr.update(value=ai_msg)
     ]
@@ -435,27 +306,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     with gr.Tabs():
         with gr.TabItem("Judge Arena"):
-            random_btn = gr.Button("🎲", scale=0)
             with gr.Row():
                 # Left side - Input section
                 with gr.Column(scale=1):
                     with gr.Group():
                         human_input = gr.TextArea(
                             label="👩 Human Input",
-                            lines=13,
                             placeholder="Enter the human message here..."
                         )
                         ai_response = gr.TextArea(
                             label="🤖 AI Response",
-                            lines=13,
                             placeholder="Enter the AI response here..."
                         )
                         send_btn = gr.Button(
-                            value="Run the evaluators",
                             variant="primary",
-                            size="lg"
                         )
                 # Right side - Model outputs
@@ -466,17 +345,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                         with gr.Row():
                             with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
                                 score_a = gr.Textbox(label="Score", lines=6, interactive=False)
-                                vote_a = gr.Button("Vote A", variant="primary", visible=False)
                             with gr.Column(scale=9, min_width=400):  # Wider width for critique
                                 critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
-                    # Spacing div that's visible only when tie button is hidden
-                    spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
                     # Tie button row
-                    with gr.Row(visible=False) as tie_button_row:
                         with gr.Column():
-                            vote_tie = gr.Button("Tie", variant="secondary")
                     gr.Markdown("### 🧑‍⚖️ Judge B")
@@ -485,13 +361,17 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                         with gr.Row():
                             with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
                                 score_b = gr.Textbox(label="Score", lines=6, interactive=False)
-                                vote_b = gr.Button("Vote B", variant="primary", visible=False)
                             with gr.Column(scale=9, min_width=400):  # Wider width for critique
                                 critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
                     # Place Vote B button directly under Judge B
             gr.Markdown("<br>")
             # Add spacing and acknowledgements at the bottom
             gr.Markdown(ACKNOWLEDGEMENTS)
@@ -510,24 +390,6 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
                 datatype=["str", "number", "str", "number", "str", "str", "str"],
             )
-            # Update refresh_leaderboard to use the checkbox value
-            def refresh_leaderboard(show_preliminary):
-                """Refresh the leaderboard data and stats."""
-                leaderboard = get_leaderboard(show_preliminary)
-                data = [
-                    [
-                        entry["Model"],
-                        float(entry["ELO Score"]),
-                        entry["95% CI"],
-                        entry["# Votes"],
-                        entry["Organization"],
-                        entry["License"],
-                    ]
-                    for entry in leaderboard
-                ]
-                stats = get_leaderboard_stats()
-                return [gr.update(value=data), gr.update(value=stats)]
             # Add change handler for checkbox
             show_preliminary.change(
                 fn=refresh_leaderboard,
@@ -551,35 +413,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     final_prompt_state = gr.State()
     # Update variable inputs based on the eval prompt
-    def update_variables(eval_prompt):
-        variables = parse_variables(eval_prompt)
-        updates = []
-        for i in range(len(variable_rows)):
-            var_row, var_input = variable_rows[i]
-            if i < len(variables):
-                var_name = variables[i]
-                # Set the number of lines based on the variable name
-                if var_name == "response":
-                    lines = 4  # Adjust this number as needed
-                else:
-                    lines = 1  # Default to single line for other variables
-                updates.extend(
-                    [
-                        gr.update(visible=True),  # Show the variable row
-                        gr.update(
-                            label=var_name, visible=True, lines=lines
-                        ),  # Update label and lines
-                    ]
-                )
-            else:
-                updates.extend(
-                    [
-                        gr.update(visible=False),  # Hide the variable row
-                        gr.update(value="", visible=False),  # Clear value when hidden
-                    ]
-                )
-        return updates
     #eval_prompt.change(
     #    fn=update_variables,
@@ -619,7 +481,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     vote_a.click(
         fn=vote,
         inputs=[
-            gr.State("A"),  # Choice
             model_a_state,
             model_b_state,
             final_prompt_state,
@@ -631,18 +493,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         outputs=[
             vote_a,
             vote_b,
-            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            spacing_div,
         ],
     )
     vote_b.click(
         fn=vote,
         inputs=[
-            gr.State("B"),  # Choice
             model_a_state,
             model_b_state,
             final_prompt_state,
@@ -654,18 +517,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         outputs=[
             vote_a,
             vote_b,
-            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            spacing_div,
         ],
     )
     vote_tie.click(
         fn=vote,
         inputs=[
-            gr.State("Tie"),  # Choice
             model_a_state,
             model_b_state,
             final_prompt_state,
@@ -677,11 +541,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
         outputs=[
             vote_a,
             vote_b,
-            tie_button_row,
             model_name_a,
             model_name_b,
             send_btn,
-            spacing_div,
         ],
     )
@@ -717,21 +582,20 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_a,
             score_b,
             critique_b,
-            gr.update(visible=True),  # vote_a
-            gr.update(visible=True),  # vote_b
-            gr.update(visible=True),  # tie_button_row
             model_a,
             model_b,
-            final_prompt,  # Add final_prompt to state
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
-            # Change the button to "Regenerate" mode after evaluation
             gr.update(
-                value="Regenerate with different models",
                 variant="secondary",
                 interactive=True
             ),
-            gr.update(visible=False),  # spacing_div
         )
     send_btn.click(
@@ -744,29 +608,29 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
             critique_b,
             vote_a,
             vote_b,
-            tie_button_row,
             model_a_state,
             model_b_state,
             final_prompt_state,
             model_name_a,
             model_name_b,
             send_btn,
-            spacing_div,
         ],
     )
     # Update the input change handlers to also disable regenerate button
-    def handle_input_changes(prompt, *variables):
-        """Enable send button and manage regenerate button based on input changes"""
-        last_inputs = last_submission.value
-        current_inputs = {"prompt": prompt, "variables": variables}
-        inputs_changed = last_inputs != current_inputs
-        return [
-            gr.update(interactive=True),  # send button always enabled
-            gr.update(
-                interactive=not inputs_changed
-            ),  # regenerate button disabled if inputs changed
-        ]
     # Update the change handlers for prompt and variables
     #eval_prompt.change(
@@ -813,24 +677,62 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
     random_btn.click(
         fn=populate_random_example,
         inputs=[],
-        outputs=[human_input, ai_response]
     )
     # Add new input change handlers
     def handle_input_change():
-        return gr.update(value="Run the evaluators", variant="primary")
     # Update the change handlers for inputs
     human_input.change(
         fn=handle_input_change,
         inputs=[],
-        outputs=[send_btn]
     )
     ai_response.change(
         fn=handle_input_change,
         inputs=[],
-        outputs=[send_btn]
     )
     # Update the demo.load to include the random example population

 from collections import defaultdict
 from datetime import datetime, timezone
 import hashlib
+from typing import Dict, List
 from dotenv import load_dotenv
 load_dotenv()
 import gradio as gr
+from gen_api_answer import (
+    get_model_response,
+    parse_model_response,
+    get_random_human_ai_pair,
+    generate_ai_response
+)
 from db import add_vote, create_db_connection, get_votes
 from utils import Vote
 from common import (
     EVAL_DESCRIPTION,
     VOTING_HEADER,
 )
+from leaderboard import (
+    get_leaderboard,
+    get_leaderboard_stats,
+    calculate_elo_change,
+    get_model_rankings,
+    DEFAULT_ELO,
+    K_FACTOR
+)
 elo_scores = defaultdict(lambda: DEFAULT_ELO)
 vote_counts = defaultdict(int)
     return hashlib.sha256(ip.encode()).hexdigest()[:16]
+def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
+    """Generate appropriate message based on vote and model rankings."""
+    voting_data = get_current_votes()
+    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
+    rankings = get_model_rankings(leaderboard)
+    pos_a = rankings.get(model_a, 0)
+    pos_b = rankings.get(model_b, 0)
+    if choice == "Tie":
+        return f"It's a tie! Currently, {model_a} ranks #{pos_a} and {model_b} ranks #{pos_b}. \nYour votes shapes the leaderboard, carry on voting responsibly :)"
+    # Get chosen and rejected models based on vote
+    model_chosen = model_a if choice == "A" else model_b
+    model_rejected = model_b if choice == "A" else model_a
+    pos_chosen = pos_a if choice == "A" else pos_b
+    pos_rejected = pos_b if choice == "A" else pos_a
+    # Check if vote aligns with leaderboard
+    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
+        return f"You're in touch with the community! {model_chosen} ranks #{pos_chosen} ahead of {model_rejected} in #{pos_rejected}. \nYour votes shapes the leaderboard, carry on voting responsibly :)"
+    else:
+        return f"You don't think like everyone else ;) {model_chosen} ranks #{pos_chosen} which is behind {model_rejected} in #{pos_rejected}. \nYour votes shapes the leaderboard, carry on voting responsibly :)"
 def vote(
     choice,
     model_a,
     store_vote_data(
         final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
     )
+    # Generate vote message
+    message = get_vote_message(choice, model_a, model_b)
     # Return updates for UI components
     return [
+        gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"),  # vote_a
+        gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"),  # vote_b
+        gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"),  # vote_tie
         gr.update(value=f"*Model: {model_a}*"),  # model_name_a
         gr.update(value=f"*Model: {model_b}*"),  # model_name_b
+        gr.update(interactive=True, value="Regenerate judges", variant="secondary"),  # send_btn
+        gr.update(value="🎲 New round", variant="primary"),  # random_btn
+        gr.Info(message, title = "🥳 Thanks for your vote!"),  # success message
     ]
     return get_votes(db)
+# Update the refresh_leaderboard function
+def refresh_leaderboard(show_preliminary):
+    """Refresh the leaderboard data and stats."""
     voting_data = get_current_votes()
+    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary)
+    data = [
+        [
+            entry["Model"],
+            float(entry["ELO Score"]),
+            entry["95% CI"],
+            entry["# Votes"],
+            entry["Organization"],
+            entry["License"],
+        ]
+        for entry in leaderboard
+    ]
+    stats = get_leaderboard_stats(model_data, voting_data)
+    return [gr.update(value=data), gr.update(value=stats)]
 # Update the leaderboard table definition in the UI
 )
 def populate_random_example(request: gr.Request):
+    """Generate a random human-AI conversation example and reset judge outputs."""
     human_msg, ai_msg = get_random_human_ai_pair()
     return [
         gr.update(value=human_msg),
+        gr.update(value=ai_msg),
+        gr.update(value="🎲", variant="secondary"),  # Reset random button appearance
+        gr.update(value=""),  # Clear score A
+        gr.update(value=""),  # Clear critique A
+        gr.update(value=""),  # Clear score B
+        gr.update(value=""),  # Clear critique B
+        gr.update(interactive=False, variant="primary"),  # Reset vote A
+        gr.update(interactive=False, variant="primary"),  # Reset vote B
+        gr.update(interactive=False, variant="primary"),  # Reset vote tie
+        gr.update(value="*Model: Hidden*"),  # Reset model name A
+        gr.update(value="*Model: Hidden*"),  # Reset model name B
     ]
     with gr.Tabs():
         with gr.TabItem("Judge Arena"):
             with gr.Row():
                 # Left side - Input section
                 with gr.Column(scale=1):
                     with gr.Group():
                         human_input = gr.TextArea(
                             label="👩 Human Input",
+                            lines=10,
                             placeholder="Enter the human message here..."
                         )
+                        with gr.Row():
+                            generate_btn = gr.Button(
+                                "Generate AI Response",
+                                size="sm",
+                                interactive=False
+                            )
                         ai_response = gr.TextArea(
                             label="🤖 AI Response",
+                            lines=15,
                             placeholder="Enter the AI response here..."
                         )
+                    with gr.Row():
+                        random_btn = gr.Button("🎲", scale=2)
                         send_btn = gr.Button(
+                            value="Run judges",
                             variant="primary",
+                            size="lg",
+                            scale=8
                         )
                 # Right side - Model outputs
                         with gr.Row():
                             with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
                                 score_a = gr.Textbox(label="Score", lines=6, interactive=False)
+                                vote_a = gr.Button("Vote A", variant="primary", interactive=False)
                             with gr.Column(scale=9, min_width=400):  # Wider width for critique
                                 critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
                     # Tie button row
+                    with gr.Row() as tie_button_row:
                         with gr.Column():
+                            vote_tie = gr.Button("Tie", variant="primary", interactive=False)
                     gr.Markdown("### 🧑‍⚖️ Judge B")
                         with gr.Row():
                             with gr.Column(scale=1, min_width=100):  # Fixed narrow width for score
                                 score_b = gr.Textbox(label="Score", lines=6, interactive=False)
+                                vote_b = gr.Button("Vote B", variant="primary", interactive=False)
                             with gr.Column(scale=9, min_width=400):  # Wider width for critique
                                 critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
                     # Place Vote B button directly under Judge B
             gr.Markdown("<br>")
+            # Add Evaluator Prompt Accordion
+            with gr.Accordion("📝 Evaluator Prompt", open=False):
+                gr.Markdown(f"```\n{DEFAULT_EVAL_PROMPT}\n```")
             # Add spacing and acknowledgements at the bottom
             gr.Markdown(ACKNOWLEDGEMENTS)
                 datatype=["str", "number", "str", "number", "str", "str", "str"],
             )
             # Add change handler for checkbox
             show_preliminary.change(
                 fn=refresh_leaderboard,
     final_prompt_state = gr.State()
     # Update variable inputs based on the eval prompt
+    #def update_variables(eval_prompt):
+    #    variables = parse_variables(eval_prompt)
+    #    updates = []
+    #    for i in range(len(variable_rows)):
+    #        var_row, var_input = variable_rows[i]
+    #        if i < len(variables):
+    #            var_name = variables[i]
+    #            # Set the number of lines based on the variable name
+    #            if var_name == "response":
+    #                lines = 4  # Adjust this number as needed
+    #            else:
+    #                lines = 1  # Default to single line for other variables
+    #            updates.extend(
+    #                [
+    #                    gr.update(visible=True),  # Show the variable row
+    #                    gr.update(
+    #                        label=var_name, visible=True, lines=lines
+    #                    ),  # Update label and lines
+    #                ]
+    #            )
+    #        else:
+    #            updates.extend(
+    #                [
+    #                        gr.update(visible=False),  # Hide the variable row
+    #                        gr.update(value="", visible=False),  # Clear value when hidden
+    #                    ]
+    #            )
+    #    return updates
     #eval_prompt.change(
     #    fn=update_variables,
     vote_a.click(
         fn=vote,
         inputs=[
+            gr.State("A"),
             model_a_state,
             model_b_state,
             final_prompt_state,
         outputs=[
             vote_a,
             vote_b,
+            vote_tie,
             model_name_a,
             model_name_b,
             send_btn,
+            random_btn,
+            gr.State(),  # placeholder for success message
         ],
     )
     vote_b.click(
         fn=vote,
         inputs=[
+            gr.State("B"),
             model_a_state,
             model_b_state,
             final_prompt_state,
         outputs=[
             vote_a,
             vote_b,
+            vote_tie,
             model_name_a,
             model_name_b,
             send_btn,
+            random_btn,
+            gr.State(),  # placeholder for success message
         ],
     )
     vote_tie.click(
         fn=vote,
         inputs=[
+            gr.State("Tie"),
             model_a_state,
             model_b_state,
             final_prompt_state,
         outputs=[
             vote_a,
             vote_b,
+            vote_tie,
             model_name_a,
             model_name_b,
             send_btn,
+            random_btn,
+            gr.State(),  # placeholder for success message
         ],
     )
             critique_a,
             score_b,
             critique_b,
+            gr.update(interactive=True, variant="primary"),  # vote_a
+            gr.update(interactive=True, variant="primary"),  # vote_b
+            gr.update(interactive=True, variant="primary"),  # vote_tie
             model_a,
             model_b,
+            final_prompt,
             gr.update(value="*Model: Hidden*"),
             gr.update(value="*Model: Hidden*"),
             gr.update(
+                value="Regenerate judges",
                 variant="secondary",
                 interactive=True
             ),
+            gr.update(value="🎲"),  # random_btn
         )
     send_btn.click(
             critique_b,
             vote_a,
             vote_b,
+            vote_tie,
             model_a_state,
             model_b_state,
             final_prompt_state,
             model_name_a,
             model_name_b,
             send_btn,
+            random_btn,
         ],
     )
     # Update the input change handlers to also disable regenerate button
+    # def handle_input_changes(prompt, *variables):
+    #    """Enable send button and manage regenerate button based on input changes"""
+    #    last_inputs = last_submission.value
+    #    current_inputs = {"prompt": prompt, "variables": variables}
+    #    inputs_changed = last_inputs != current_inputs
+    #    return [
+    #        gr.update(interactive=True),  # send button always enabled
+    #        gr.update(
+    #            interactive=not inputs_changed
+    #        ),  # regenerate button disabled if inputs changed
+    #    ]
     # Update the change handlers for prompt and variables
     #eval_prompt.change(
     random_btn.click(
         fn=populate_random_example,
         inputs=[],
+        outputs=[
+            human_input,
+            ai_response,
+            random_btn,
+            score_a,
+            critique_a,
+            score_b,
+            critique_b,
+            vote_a,
+            vote_b,
+            vote_tie,
+            model_name_a,
+            model_name_b,
+        ]
     )
     # Add new input change handlers
     def handle_input_change():
+        """Reset UI state when inputs are changed"""
+        return [
+            gr.update(interactive=False),  # vote_a
+            gr.update(interactive=False),  # vote_b
+            gr.update(interactive=False),  # vote_tie
+            gr.update(value="Run judges", variant="primary"),  # send_btn
+            gr.update(value="🎲", variant="secondary"),  # random_btn
+        ]
     # Update the change handlers for inputs
     human_input.change(
         fn=handle_input_change,
         inputs=[],
+        outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
     )
     ai_response.change(
         fn=handle_input_change,
         inputs=[],
+        outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
+    )
+    generate_btn.click(
+        fn=lambda msg: (
+            generate_ai_response(msg)[0],  # Only take the response text
+            gr.update(
+                value="Generate AI Response",  # Keep the label
+                interactive=False  # Disable the button
+            )
+        ),
+        inputs=[human_input],
+        outputs=[ai_response, generate_btn]
+    )
+    human_input.change(
+        fn=lambda x: gr.update(interactive=bool(x.strip())),
+        inputs=[human_input],
+        outputs=[generate_btn]
     )
     # Update the demo.load to include the random example population