Spaces:
Running
Running
13-14 Nov changes
Browse files
app.py
CHANGED
|
@@ -4,13 +4,19 @@ import random
|
|
| 4 |
from collections import defaultdict
|
| 5 |
from datetime import datetime, timezone
|
| 6 |
import hashlib
|
|
|
|
| 7 |
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
-
from gen_api_answer import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from db import add_vote, create_db_connection, get_votes
|
| 15 |
from utils import Vote
|
| 16 |
from common import (
|
|
@@ -26,12 +32,16 @@ from common import (
|
|
| 26 |
EVAL_DESCRIPTION,
|
| 27 |
VOTING_HEADER,
|
| 28 |
)
|
| 29 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
-
# Model and ELO score data
|
| 33 |
-
DEFAULT_ELO = 1200 # Starting ELO for new models
|
| 34 |
-
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
| 35 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
| 36 |
vote_counts = defaultdict(int)
|
| 37 |
|
|
@@ -143,6 +153,30 @@ def get_ip(request: gr.Request) -> str:
|
|
| 143 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
| 144 |
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
def vote(
|
| 147 |
choice,
|
| 148 |
model_a,
|
|
@@ -192,16 +226,20 @@ def vote(
|
|
| 192 |
store_vote_data(
|
| 193 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
| 194 |
)
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
# Return updates for UI components
|
| 197 |
return [
|
| 198 |
-
gr.update(
|
| 199 |
-
gr.update(
|
| 200 |
-
gr.update(
|
| 201 |
gr.update(value=f"*Model: {model_a}*"), # model_name_a
|
| 202 |
gr.update(value=f"*Model: {model_b}*"), # model_name_b
|
| 203 |
-
gr.update(interactive=True, value="
|
| 204 |
-
gr.update(
|
|
|
|
| 205 |
]
|
| 206 |
|
| 207 |
|
|
@@ -210,150 +248,24 @@ def get_current_votes():
|
|
| 210 |
return get_votes(db)
|
| 211 |
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
voting_data = get_current_votes()
|
| 217 |
-
print(f"Fetched {len(voting_data)} votes from database") # Debug log
|
| 218 |
-
|
| 219 |
-
# Initialize dictionaries for tracking
|
| 220 |
-
ratings = defaultdict(lambda: DEFAULT_ELO)
|
| 221 |
-
matches = defaultdict(int)
|
| 222 |
-
|
| 223 |
-
# Process each vote
|
| 224 |
-
for vote in voting_data:
|
| 225 |
-
try:
|
| 226 |
-
model_a = vote.get("model_a")
|
| 227 |
-
model_b = vote.get("model_b")
|
| 228 |
-
winner = vote.get("winner")
|
| 229 |
-
|
| 230 |
-
# Skip if models aren't in current model_data
|
| 231 |
-
if (
|
| 232 |
-
not all([model_a, model_b, winner])
|
| 233 |
-
or model_a not in model_data
|
| 234 |
-
or model_b not in model_data
|
| 235 |
-
):
|
| 236 |
-
continue
|
| 237 |
-
|
| 238 |
-
# Update match counts
|
| 239 |
-
matches[model_a] += 1
|
| 240 |
-
matches[model_b] += 1
|
| 241 |
-
|
| 242 |
-
# Calculate ELO changes
|
| 243 |
-
elo_a = ratings[model_a]
|
| 244 |
-
elo_b = ratings[model_b]
|
| 245 |
-
|
| 246 |
-
# Expected scores
|
| 247 |
-
expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
|
| 248 |
-
expected_b = 1 - expected_a
|
| 249 |
-
|
| 250 |
-
# Actual scores
|
| 251 |
-
score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
|
| 252 |
-
score_b = 1 - score_a
|
| 253 |
-
|
| 254 |
-
# Update ratings
|
| 255 |
-
ratings[model_a] += K_FACTOR * (score_a - expected_a)
|
| 256 |
-
ratings[model_b] += K_FACTOR * (score_b - expected_b)
|
| 257 |
-
|
| 258 |
-
except Exception as e:
|
| 259 |
-
print(f"Error processing vote: {e}")
|
| 260 |
-
continue
|
| 261 |
-
|
| 262 |
-
# Generate leaderboard data
|
| 263 |
-
leaderboard = []
|
| 264 |
-
for model in model_data.keys():
|
| 265 |
-
votes = matches[model]
|
| 266 |
-
# Skip models with < 500 votes if show_preliminary is False
|
| 267 |
-
if not show_preliminary and votes < 500:
|
| 268 |
-
continue
|
| 269 |
-
|
| 270 |
-
elo = ratings[model]
|
| 271 |
-
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
| 272 |
-
data = {
|
| 273 |
-
"Model": model,
|
| 274 |
-
"ELO Score": f"{int(elo)}",
|
| 275 |
-
"95% CI": f"±{int(ci)}",
|
| 276 |
-
"# Votes": votes,
|
| 277 |
-
"Organization": model_data[model]["organization"],
|
| 278 |
-
"License": model_data[model]["license"],
|
| 279 |
-
}
|
| 280 |
-
leaderboard.append(data)
|
| 281 |
-
|
| 282 |
-
# Sort leaderboard by ELO score in descending order
|
| 283 |
-
leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
|
| 284 |
-
|
| 285 |
-
return leaderboard
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
def calculate_elo_change(rating_a, rating_b, winner, k_factor=None):
    """Calculate the ELO rating changes for both players of a single match.

    Args:
        rating_a: Current rating of player A.
        rating_b: Current rating of player B.
        winner: "A" if A won, "B" if B won. Any other value is scored as a
            tie (half a point each).
        k_factor: Scale applied to the difference between actual and expected
            score. When omitted, the module-level ``K_FACTOR`` is used, which
            matches the previous behaviour.

    Returns:
        A ``(change_a, change_b)`` tuple with the amount to add to each
        player's rating.
    """
    if k_factor is None:
        k_factor = K_FACTOR

    # Expected scores from the logistic ELO curve (400-point scale).
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a

    if winner == "A":
        score_a, score_b = 1, 0
    elif winner == "B":
        score_a, score_b = 0, 1
    else:  # Handle ties
        score_a, score_b = 0.5, 0.5

    change_a = k_factor * (score_a - expected_a)
    change_b = k_factor * (score_b - expected_b)

    return change_a, change_b
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
def update_leaderboard():
|
| 307 |
-
"""Generate leaderboard DataFrame using fresh votes from MongoDB."""
|
| 308 |
-
# Get fresh voting data
|
| 309 |
voting_data = get_current_votes()
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
if not all([model_a, model_b, winner]):
|
| 325 |
-
print(f"Missing required fields in vote: {vote}")
|
| 326 |
-
continue
|
| 327 |
-
|
| 328 |
-
if model_a not in model_data:
|
| 329 |
-
print(f"Model A '{model_a}' not found in model_data")
|
| 330 |
-
continue
|
| 331 |
-
|
| 332 |
-
if model_b not in model_data:
|
| 333 |
-
print(f"Model B '{model_b}' not found in model_data")
|
| 334 |
-
continue
|
| 335 |
-
|
| 336 |
-
# Update match counts
|
| 337 |
-
matches[model_a] += 1
|
| 338 |
-
matches[model_b] += 1
|
| 339 |
-
print(
|
| 340 |
-
f"Updated matches - {model_a}: {matches[model_a]}, {model_b}: {matches[model_b]}"
|
| 341 |
-
)
|
| 342 |
-
except Exception as e:
|
| 343 |
-
print(f"Error processing vote: {e}")
|
| 344 |
-
print(f"Problematic vote data: {vote}")
|
| 345 |
-
continue
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
# Update the display_leaderboard function
|
| 349 |
-
def display_leaderboard():
    """Build the leaderboard table component from ``update_leaderboard()``.

    Returns:
        A ``gr.DataFrame`` whose value is the result of
        ``update_leaderboard()`` and whose columns are Model, ELO, 95% CI,
        Matches, Organization and License.
    """
    df = update_leaderboard()
    return gr.DataFrame(
        value=df,
        headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
        # One datatype per header; the previous list carried a seventh "str"
        # with no matching column.
        datatype=["str", "number", "str", "number", "str", "str"],
        row_count=(len(df) + 1, "dynamic"),
    )
|
| 357 |
|
| 358 |
|
| 359 |
# Update the leaderboard table definition in the UI
|
|
@@ -363,63 +275,22 @@ leaderboard_table = gr.Dataframe(
|
|
| 363 |
)
|
| 364 |
|
| 365 |
|
| 366 |
-
def get_leaderboard_stats():
    """Return a Markdown summary of the leaderboard.

    The summary lists the number of entries in ``model_data``, the number of
    votes returned by ``get_current_votes()``, and the current UTC time
    truncated to the start of the hour.
    """
    current_time = datetime.now(timezone.utc)
    total_votes = len(get_current_votes())
    total_models = len(model_data)

    # Report the timestamp at hour granularity.
    start_of_hour = current_time.replace(minute=0, second=0, microsecond=0)
    last_updated = start_of_hour.strftime("%B %d, %Y at %H:00 UTC")

    return f"""
### Leaderboard Stats
- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
#def set_example_metric(metric_name):
|
| 384 |
-
# if metric_name == "Custom":
|
| 385 |
-
# variables = parse_variables(DEFAULT_EVAL_PROMPT)
|
| 386 |
-
# variable_values = []
|
| 387 |
-
# for var in variables:
|
| 388 |
-
# if var == "input":
|
| 389 |
-
# variable_values.append(DEFAULT_INPUT)
|
| 390 |
-
# elif var == "response":
|
| 391 |
-
# variable_values.append(DEFAULT_RESPONSE)
|
| 392 |
-
# else:
|
| 393 |
-
# variable_values.append("") # Default empty value
|
| 394 |
-
# Pad variable_values to match the length of variable_rows
|
| 395 |
-
# while len(variable_values) < len(variable_rows):
|
| 396 |
-
# variable_values.append("")
|
| 397 |
-
# return [DEFAULT_EVAL_PROMPT] + variable_values
|
| 398 |
-
|
| 399 |
-
# metric_data = EXAMPLE_METRICS[metric_name]
|
| 400 |
-
# variables = parse_variables(metric_data["prompt"])
|
| 401 |
-
# variable_values = []
|
| 402 |
-
# for var in variables:
|
| 403 |
-
# value = metric_data.get(var, "") # Default to empty string if not found
|
| 404 |
-
# variable_values.append(value)
|
| 405 |
-
# Pad variable_values to match the length of variable_rows
|
| 406 |
-
# while len(variable_values) < len(variable_rows):
|
| 407 |
-
# variable_values.append("")
|
| 408 |
-
# return [metric_data["prompt"]] + variable_values
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
# Select random metric at startup
|
| 412 |
-
# def get_random_metric():
|
| 413 |
-
# metrics = list(EXAMPLE_METRICS.keys())
|
| 414 |
-
# return set_example_metric(random.choice(metrics))
|
| 415 |
-
|
| 416 |
-
|
| 417 |
def populate_random_example(request: gr.Request):
|
| 418 |
-
"""Generate a random human-AI conversation example."""
|
| 419 |
human_msg, ai_msg = get_random_human_ai_pair()
|
| 420 |
return [
|
| 421 |
gr.update(value=human_msg),
|
| 422 |
-
gr.update(value=ai_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
]
|
| 424 |
|
| 425 |
|
|
@@ -435,27 +306,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 435 |
|
| 436 |
with gr.Tabs():
|
| 437 |
with gr.TabItem("Judge Arena"):
|
| 438 |
-
random_btn = gr.Button("🎲", scale=0)
|
| 439 |
with gr.Row():
|
| 440 |
# Left side - Input section
|
| 441 |
with gr.Column(scale=1):
|
| 442 |
with gr.Group():
|
| 443 |
human_input = gr.TextArea(
|
| 444 |
label="👩 Human Input",
|
| 445 |
-
lines=
|
| 446 |
placeholder="Enter the human message here..."
|
| 447 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
ai_response = gr.TextArea(
|
| 450 |
label="🤖 AI Response",
|
| 451 |
-
lines=
|
| 452 |
placeholder="Enter the AI response here..."
|
| 453 |
)
|
| 454 |
|
|
|
|
|
|
|
| 455 |
send_btn = gr.Button(
|
| 456 |
-
value="Run
|
| 457 |
variant="primary",
|
| 458 |
-
size="lg"
|
|
|
|
| 459 |
)
|
| 460 |
|
| 461 |
# Right side - Model outputs
|
|
@@ -466,17 +345,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 466 |
with gr.Row():
|
| 467 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
| 468 |
score_a = gr.Textbox(label="Score", lines=6, interactive=False)
|
| 469 |
-
vote_a = gr.Button("Vote A", variant="primary",
|
| 470 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
| 471 |
critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
|
| 472 |
|
| 473 |
-
# Spacing div that's visible only when tie button is hidden
|
| 474 |
-
spacing_div = gr.HTML('<div style="height: 42px;"></div>', visible=True, elem_id="spacing-div")
|
| 475 |
-
|
| 476 |
# Tie button row
|
| 477 |
-
with gr.Row(
|
| 478 |
with gr.Column():
|
| 479 |
-
vote_tie = gr.Button("Tie", variant="
|
| 480 |
|
| 481 |
|
| 482 |
gr.Markdown("### 🧑⚖️ Judge B")
|
|
@@ -485,13 +361,17 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 485 |
with gr.Row():
|
| 486 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
| 487 |
score_b = gr.Textbox(label="Score", lines=6, interactive=False)
|
| 488 |
-
vote_b = gr.Button("Vote B", variant="primary",
|
| 489 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
| 490 |
critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
|
| 491 |
# Place Vote B button directly under Judge B
|
| 492 |
|
| 493 |
gr.Markdown("<br>")
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
# Add spacing and acknowledgements at the bottom
|
| 496 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
| 497 |
|
|
@@ -510,24 +390,6 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 510 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
| 511 |
)
|
| 512 |
|
| 513 |
-
# Update refresh_leaderboard to use the checkbox value
|
| 514 |
-
def refresh_leaderboard(show_preliminary):
    """Rebuild the leaderboard rows and the stats text.

    Args:
        show_preliminary: Forwarded unchanged to ``get_leaderboard``.

    Returns:
        A two-item list of ``gr.update`` objects: the first carries the table
        rows, the second the text from ``get_leaderboard_stats()``.
    """
    rows = []
    for entry in get_leaderboard(show_preliminary):
        rows.append(
            [
                entry["Model"],
                # The score is converted to float for the table.
                float(entry["ELO Score"]),
                entry["95% CI"],
                entry["# Votes"],
                entry["Organization"],
                entry["License"],
            ]
        )

    summary = get_leaderboard_stats()
    return [gr.update(value=rows), gr.update(value=summary)]
|
| 530 |
-
|
| 531 |
# Add change handler for checkbox
|
| 532 |
show_preliminary.change(
|
| 533 |
fn=refresh_leaderboard,
|
|
@@ -551,35 +413,35 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 551 |
final_prompt_state = gr.State()
|
| 552 |
|
| 553 |
# Update variable inputs based on the eval prompt
|
| 554 |
-
def update_variables(eval_prompt):
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
gr.update(visible=False), # Hide the variable row
|
| 579 |
-
gr.update(value="", visible=False), # Clear value when hidden
|
| 580 |
-
]
|
| 581 |
-
|
| 582 |
-
|
| 583 |
|
| 584 |
#eval_prompt.change(
|
| 585 |
# fn=update_variables,
|
|
@@ -619,7 +481,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 619 |
vote_a.click(
|
| 620 |
fn=vote,
|
| 621 |
inputs=[
|
| 622 |
-
gr.State("A"),
|
| 623 |
model_a_state,
|
| 624 |
model_b_state,
|
| 625 |
final_prompt_state,
|
|
@@ -631,18 +493,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 631 |
outputs=[
|
| 632 |
vote_a,
|
| 633 |
vote_b,
|
| 634 |
-
|
| 635 |
model_name_a,
|
| 636 |
model_name_b,
|
| 637 |
send_btn,
|
| 638 |
-
|
|
|
|
| 639 |
],
|
| 640 |
)
|
| 641 |
|
| 642 |
vote_b.click(
|
| 643 |
fn=vote,
|
| 644 |
inputs=[
|
| 645 |
-
gr.State("B"),
|
| 646 |
model_a_state,
|
| 647 |
model_b_state,
|
| 648 |
final_prompt_state,
|
|
@@ -654,18 +517,19 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 654 |
outputs=[
|
| 655 |
vote_a,
|
| 656 |
vote_b,
|
| 657 |
-
|
| 658 |
model_name_a,
|
| 659 |
model_name_b,
|
| 660 |
send_btn,
|
| 661 |
-
|
|
|
|
| 662 |
],
|
| 663 |
)
|
| 664 |
|
| 665 |
vote_tie.click(
|
| 666 |
fn=vote,
|
| 667 |
inputs=[
|
| 668 |
-
gr.State("Tie"),
|
| 669 |
model_a_state,
|
| 670 |
model_b_state,
|
| 671 |
final_prompt_state,
|
|
@@ -677,11 +541,12 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 677 |
outputs=[
|
| 678 |
vote_a,
|
| 679 |
vote_b,
|
| 680 |
-
|
| 681 |
model_name_a,
|
| 682 |
model_name_b,
|
| 683 |
send_btn,
|
| 684 |
-
|
|
|
|
| 685 |
],
|
| 686 |
)
|
| 687 |
|
|
@@ -717,21 +582,20 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 717 |
critique_a,
|
| 718 |
score_b,
|
| 719 |
critique_b,
|
| 720 |
-
gr.update(
|
| 721 |
-
gr.update(
|
| 722 |
-
gr.update(
|
| 723 |
model_a,
|
| 724 |
model_b,
|
| 725 |
-
final_prompt,
|
| 726 |
gr.update(value="*Model: Hidden*"),
|
| 727 |
gr.update(value="*Model: Hidden*"),
|
| 728 |
-
# Change the button to "Regenerate" mode after evaluation
|
| 729 |
gr.update(
|
| 730 |
-
value="Regenerate
|
| 731 |
variant="secondary",
|
| 732 |
interactive=True
|
| 733 |
),
|
| 734 |
-
gr.update(
|
| 735 |
)
|
| 736 |
|
| 737 |
send_btn.click(
|
|
@@ -744,29 +608,29 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 744 |
critique_b,
|
| 745 |
vote_a,
|
| 746 |
vote_b,
|
| 747 |
-
|
| 748 |
model_a_state,
|
| 749 |
model_b_state,
|
| 750 |
final_prompt_state,
|
| 751 |
model_name_a,
|
| 752 |
model_name_b,
|
| 753 |
send_btn,
|
| 754 |
-
|
| 755 |
],
|
| 756 |
)
|
| 757 |
|
| 758 |
# Update the input change handlers to also disable regenerate button
|
| 759 |
-
def handle_input_changes(prompt, *variables):
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
|
| 771 |
# Update the change handlers for prompt and variables
|
| 772 |
#eval_prompt.change(
|
|
@@ -813,24 +677,62 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
| 813 |
random_btn.click(
|
| 814 |
fn=populate_random_example,
|
| 815 |
inputs=[],
|
| 816 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
)
|
| 818 |
|
| 819 |
# Add new input change handlers
|
| 820 |
def handle_input_change():
|
| 821 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
|
| 823 |
# Update the change handlers for inputs
|
| 824 |
human_input.change(
|
| 825 |
fn=handle_input_change,
|
| 826 |
inputs=[],
|
| 827 |
-
outputs=[send_btn]
|
| 828 |
)
|
| 829 |
|
| 830 |
ai_response.change(
|
| 831 |
fn=handle_input_change,
|
| 832 |
inputs=[],
|
| 833 |
-
outputs=[send_btn]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
)
|
| 835 |
|
| 836 |
# Update the demo.load to include the random example population
|
|
|
|
| 4 |
from collections import defaultdict
|
| 5 |
from datetime import datetime, timezone
|
| 6 |
import hashlib
|
| 7 |
+
from typing import Dict, List
|
| 8 |
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
|
| 11 |
load_dotenv()
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
+
from gen_api_answer import (
|
| 15 |
+
get_model_response,
|
| 16 |
+
parse_model_response,
|
| 17 |
+
get_random_human_ai_pair,
|
| 18 |
+
generate_ai_response
|
| 19 |
+
)
|
| 20 |
from db import add_vote, create_db_connection, get_votes
|
| 21 |
from utils import Vote
|
| 22 |
from common import (
|
|
|
|
| 32 |
EVAL_DESCRIPTION,
|
| 33 |
VOTING_HEADER,
|
| 34 |
)
|
| 35 |
+
from leaderboard import (
|
| 36 |
+
get_leaderboard,
|
| 37 |
+
get_leaderboard_stats,
|
| 38 |
+
calculate_elo_change,
|
| 39 |
+
get_model_rankings,
|
| 40 |
+
DEFAULT_ELO,
|
| 41 |
+
K_FACTOR
|
| 42 |
+
)
|
| 43 |
|
| 44 |
|
|
|
|
|
|
|
|
|
|
| 45 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
| 46 |
vote_counts = defaultdict(int)
|
| 47 |
|
|
|
|
| 153 |
return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
| 154 |
|
| 155 |
|
| 156 |
+
def get_vote_message(choice: str, model_a: str, model_b: str) -> str:
    """Build the message shown to the user after a vote.

    The current leaderboard (including preliminary entries) is recomputed
    from fresh votes, and the message says whether the vote agrees with the
    models' current positions. A lower position counts as "ahead".

    Args:
        choice: "A", "B" or "Tie". Any value other than "A" or "Tie" is
            treated as a vote for model B.
        model_a: Name of the model behind judge A.
        model_b: Name of the model behind judge B.

    Returns:
        The message text. A model missing from the rankings is described as
        "not ranked yet" instead of being shown as rank #0.
    """
    closing = "Your votes shapes the leaderboard, carry on voting responsibly :)"

    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)

    # None marks a model that has no ranking. The previous default of 0 was
    # printed as "#0" and compared as ahead of every ranked model.
    pos_a = rankings.get(model_a)
    pos_b = rankings.get(model_b)

    def describe(model, position):
        """Return a short phrase for one model's current standing."""
        if position is None:
            return f"{model} is not ranked yet"
        return f"{model} ranks #{position}"

    if choice == "Tie":
        return (
            f"It's a tie! Currently, {describe(model_a, pos_a)} "
            f"and {describe(model_b, pos_b)}. \n{closing}"
        )

    # Get chosen and rejected models based on vote
    if choice == "A":
        model_chosen, model_rejected = model_a, model_b
        pos_chosen, pos_rejected = pos_a, pos_b
    else:
        model_chosen, model_rejected = model_b, model_a
        pos_chosen, pos_rejected = pos_b, pos_a

    # Without both positions there is nothing to compare the vote against.
    if pos_chosen is None or pos_rejected is None:
        return (
            f"Thanks for voting! {describe(model_chosen, pos_chosen)} "
            f"and {describe(model_rejected, pos_rejected)}. \n{closing}"
        )

    # Check if vote aligns with leaderboard
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return (
            f"You're in touch with the community! {model_chosen} ranks "
            f"#{pos_chosen} ahead of {model_rejected} in #{pos_rejected}. "
            f"\n{closing}"
        )
    return (
        f"You don't think like everyone else ;) {model_chosen} ranks "
        f"#{pos_chosen} which is behind {model_rejected} in #{pos_rejected}. "
        f"\n{closing}"
    )
|
| 178 |
+
|
| 179 |
+
|
| 180 |
def vote(
|
| 181 |
choice,
|
| 182 |
model_a,
|
|
|
|
| 226 |
store_vote_data(
|
| 227 |
final_prompt, response_a, response_b, model_a, model_b, choice, judge_id
|
| 228 |
)
|
| 229 |
+
|
| 230 |
+
# Generate vote message
|
| 231 |
+
message = get_vote_message(choice, model_a, model_b)
|
| 232 |
+
|
| 233 |
# Return updates for UI components
|
| 234 |
return [
|
| 235 |
+
gr.update(interactive=False, variant="primary" if choice == "A" else "secondary"), # vote_a
|
| 236 |
+
gr.update(interactive=False, variant="primary" if choice == "B" else "secondary"), # vote_b
|
| 237 |
+
gr.update(interactive=False, variant="primary" if choice == "Tie" else "secondary"), # vote_tie
|
| 238 |
gr.update(value=f"*Model: {model_a}*"), # model_name_a
|
| 239 |
gr.update(value=f"*Model: {model_b}*"), # model_name_b
|
| 240 |
+
gr.update(interactive=True, value="Regenerate judges", variant="secondary"), # send_btn
|
| 241 |
+
gr.update(value="🎲 New round", variant="primary"), # random_btn
|
| 242 |
+
gr.Info(message, title = "🥳 Thanks for your vote!"), # success message
|
| 243 |
]
|
| 244 |
|
| 245 |
|
|
|
|
| 248 |
return get_votes(db)
|
| 249 |
|
| 250 |
|
| 251 |
+
# Update the refresh_leaderboard function
|
| 252 |
+
def refresh_leaderboard(show_preliminary):
    """Rebuild the leaderboard rows and the stats text from fresh votes.

    Args:
        show_preliminary: Forwarded unchanged to ``get_leaderboard``.

    Returns:
        A two-item list of ``gr.update`` objects: the first carries the table
        rows, the second the text from ``get_leaderboard_stats``.
    """
    # Fetch the votes once and share them between the table and the stats.
    voting_data = get_current_votes()

    rows = []
    for entry in get_leaderboard(model_data, voting_data, show_preliminary):
        rows.append(
            [
                entry["Model"],
                # The score is converted to float for the table.
                float(entry["ELO Score"]),
                entry["95% CI"],
                entry["# Votes"],
                entry["Organization"],
                entry["License"],
            ]
        )

    summary = get_leaderboard_stats(model_data, voting_data)
    return [gr.update(value=rows), gr.update(value=summary)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
|
| 271 |
# Update the leaderboard table definition in the UI
|
|
|
|
| 275 |
)
|
| 276 |
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
def populate_random_example(request: gr.Request):
    """Load a random human/AI pair and reset the judge outputs.

    Returns twelve ``gr.update`` objects in this order: human input,
    AI response, random button, score A, critique A, score B, critique B,
    vote A, vote B, vote tie, model name A, model name B.
    """
    human_msg, ai_msg = get_random_human_ai_pair()

    example_fields = [gr.update(value=human_msg), gr.update(value=ai_msg)]
    # Reset random button appearance
    dice_button = [gr.update(value="🎲", variant="secondary")]
    # Clear score A, critique A, score B, critique B
    cleared_outputs = [gr.update(value="") for _ in range(4)]
    # Disable vote A, vote B and vote tie, and restore the primary variant
    locked_votes = [
        gr.update(interactive=False, variant="primary") for _ in range(3)
    ]
    # Hide both model names again
    hidden_names = [gr.update(value="*Model: Hidden*") for _ in range(2)]

    return (
        example_fields
        + dice_button
        + cleared_outputs
        + locked_votes
        + hidden_names
    )
|
| 295 |
|
| 296 |
|
|
|
|
| 306 |
|
| 307 |
with gr.Tabs():
|
| 308 |
with gr.TabItem("Judge Arena"):
|
|
|
|
| 309 |
with gr.Row():
|
| 310 |
# Left side - Input section
|
| 311 |
with gr.Column(scale=1):
|
| 312 |
with gr.Group():
|
| 313 |
human_input = gr.TextArea(
|
| 314 |
label="👩 Human Input",
|
| 315 |
+
lines=10,
|
| 316 |
placeholder="Enter the human message here..."
|
| 317 |
)
|
| 318 |
+
with gr.Row():
|
| 319 |
+
generate_btn = gr.Button(
|
| 320 |
+
"Generate AI Response",
|
| 321 |
+
size="sm",
|
| 322 |
+
interactive=False
|
| 323 |
+
)
|
| 324 |
|
| 325 |
ai_response = gr.TextArea(
|
| 326 |
label="🤖 AI Response",
|
| 327 |
+
lines=15,
|
| 328 |
placeholder="Enter the AI response here..."
|
| 329 |
)
|
| 330 |
|
| 331 |
+
with gr.Row():
|
| 332 |
+
random_btn = gr.Button("🎲", scale=2)
|
| 333 |
send_btn = gr.Button(
|
| 334 |
+
value="Run judges",
|
| 335 |
variant="primary",
|
| 336 |
+
size="lg",
|
| 337 |
+
scale=8
|
| 338 |
)
|
| 339 |
|
| 340 |
# Right side - Model outputs
|
|
|
|
| 345 |
with gr.Row():
|
| 346 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
| 347 |
score_a = gr.Textbox(label="Score", lines=6, interactive=False)
|
| 348 |
+
vote_a = gr.Button("Vote A", variant="primary", interactive=False)
|
| 349 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
| 350 |
critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
|
| 351 |
|
|
|
|
|
|
|
|
|
|
| 352 |
# Tie button row
|
| 353 |
+
with gr.Row() as tie_button_row:
|
| 354 |
with gr.Column():
|
| 355 |
+
vote_tie = gr.Button("Tie", variant="primary", interactive=False)
|
| 356 |
|
| 357 |
|
| 358 |
gr.Markdown("### 🧑⚖️ Judge B")
|
|
|
|
| 361 |
with gr.Row():
|
| 362 |
with gr.Column(scale=1, min_width=100): # Fixed narrow width for score
|
| 363 |
score_b = gr.Textbox(label="Score", lines=6, interactive=False)
|
| 364 |
+
vote_b = gr.Button("Vote B", variant="primary", interactive=False)
|
| 365 |
with gr.Column(scale=9, min_width=400): # Wider width for critique
|
| 366 |
critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
|
| 367 |
# Place Vote B button directly under Judge B
|
| 368 |
|
| 369 |
gr.Markdown("<br>")
|
| 370 |
|
| 371 |
+
# Add Evaluator Prompt Accordion
|
| 372 |
+
with gr.Accordion("📝 Evaluator Prompt", open=False):
|
| 373 |
+
gr.Markdown(f"```\n{DEFAULT_EVAL_PROMPT}\n```")
|
| 374 |
+
|
| 375 |
# Add spacing and acknowledgements at the bottom
|
| 376 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
| 377 |
|
|
|
|
| 390 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
| 391 |
)
|
| 392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
# Add change handler for checkbox
|
| 394 |
show_preliminary.change(
|
| 395 |
fn=refresh_leaderboard,
|
|
|
|
| 413 |
final_prompt_state = gr.State()
|
| 414 |
|
| 415 |
# Update variable inputs based on the eval prompt
|
| 416 |
+
#def update_variables(eval_prompt):
|
| 417 |
+
# variables = parse_variables(eval_prompt)
|
| 418 |
+
# updates = []
|
| 419 |
+
|
| 420 |
+
# for i in range(len(variable_rows)):
|
| 421 |
+
# var_row, var_input = variable_rows[i]
|
| 422 |
+
# if i < len(variables):
|
| 423 |
+
# var_name = variables[i]
|
| 424 |
+
# # Set the number of lines based on the variable name
|
| 425 |
+
# if var_name == "response":
|
| 426 |
+
# lines = 4 # Adjust this number as needed
|
| 427 |
+
# else:
|
| 428 |
+
# lines = 1 # Default to single line for other variables
|
| 429 |
+
# updates.extend(
|
| 430 |
+
# [
|
| 431 |
+
# gr.update(visible=True), # Show the variable row
|
| 432 |
+
# gr.update(
|
| 433 |
+
# label=var_name, visible=True, lines=lines
|
| 434 |
+
# ), # Update label and lines
|
| 435 |
+
# ]
|
| 436 |
+
# )
|
| 437 |
+
# else:
|
| 438 |
+
# updates.extend(
|
| 439 |
+
# [
|
| 440 |
+
# gr.update(visible=False), # Hide the variable row
|
| 441 |
+
# gr.update(value="", visible=False), # Clear value when hidden
|
| 442 |
+
# ]
|
| 443 |
+
# )
|
| 444 |
+
# return updates
|
| 445 |
|
| 446 |
#eval_prompt.change(
|
| 447 |
# fn=update_variables,
|
|
|
|
| 481 |
vote_a.click(
|
| 482 |
fn=vote,
|
| 483 |
inputs=[
|
| 484 |
+
gr.State("A"),
|
| 485 |
model_a_state,
|
| 486 |
model_b_state,
|
| 487 |
final_prompt_state,
|
|
|
|
| 493 |
outputs=[
|
| 494 |
vote_a,
|
| 495 |
vote_b,
|
| 496 |
+
vote_tie,
|
| 497 |
model_name_a,
|
| 498 |
model_name_b,
|
| 499 |
send_btn,
|
| 500 |
+
random_btn,
|
| 501 |
+
gr.State(), # placeholder for success message
|
| 502 |
],
|
| 503 |
)
|
| 504 |
|
| 505 |
vote_b.click(
|
| 506 |
fn=vote,
|
| 507 |
inputs=[
|
| 508 |
+
gr.State("B"),
|
| 509 |
model_a_state,
|
| 510 |
model_b_state,
|
| 511 |
final_prompt_state,
|
|
|
|
| 517 |
outputs=[
|
| 518 |
vote_a,
|
| 519 |
vote_b,
|
| 520 |
+
vote_tie,
|
| 521 |
model_name_a,
|
| 522 |
model_name_b,
|
| 523 |
send_btn,
|
| 524 |
+
random_btn,
|
| 525 |
+
gr.State(), # placeholder for success message
|
| 526 |
],
|
| 527 |
)
|
| 528 |
|
| 529 |
vote_tie.click(
|
| 530 |
fn=vote,
|
| 531 |
inputs=[
|
| 532 |
+
gr.State("Tie"),
|
| 533 |
model_a_state,
|
| 534 |
model_b_state,
|
| 535 |
final_prompt_state,
|
|
|
|
| 541 |
outputs=[
|
| 542 |
vote_a,
|
| 543 |
vote_b,
|
| 544 |
+
vote_tie,
|
| 545 |
model_name_a,
|
| 546 |
model_name_b,
|
| 547 |
send_btn,
|
| 548 |
+
random_btn,
|
| 549 |
+
gr.State(), # placeholder for success message
|
| 550 |
],
|
| 551 |
)
|
| 552 |
|
|
|
|
| 582 |
critique_a,
|
| 583 |
score_b,
|
| 584 |
critique_b,
|
| 585 |
+
gr.update(interactive=True, variant="primary"), # vote_a
|
| 586 |
+
gr.update(interactive=True, variant="primary"), # vote_b
|
| 587 |
+
gr.update(interactive=True, variant="primary"), # vote_tie
|
| 588 |
model_a,
|
| 589 |
model_b,
|
| 590 |
+
final_prompt,
|
| 591 |
gr.update(value="*Model: Hidden*"),
|
| 592 |
gr.update(value="*Model: Hidden*"),
|
|
|
|
| 593 |
gr.update(
|
| 594 |
+
value="Regenerate judges",
|
| 595 |
variant="secondary",
|
| 596 |
interactive=True
|
| 597 |
),
|
| 598 |
+
gr.update(value="🎲"), # random_btn
|
| 599 |
)
|
| 600 |
|
| 601 |
send_btn.click(
|
|
|
|
| 608 |
critique_b,
|
| 609 |
vote_a,
|
| 610 |
vote_b,
|
| 611 |
+
vote_tie,
|
| 612 |
model_a_state,
|
| 613 |
model_b_state,
|
| 614 |
final_prompt_state,
|
| 615 |
model_name_a,
|
| 616 |
model_name_b,
|
| 617 |
send_btn,
|
| 618 |
+
random_btn,
|
| 619 |
],
|
| 620 |
)
|
| 621 |
|
| 622 |
# Update the input change handlers to also disable regenerate button
|
| 623 |
+
# def handle_input_changes(prompt, *variables):
|
| 624 |
+
# """Enable send button and manage regenerate button based on input changes"""
|
| 625 |
+
# last_inputs = last_submission.value
|
| 626 |
+
# current_inputs = {"prompt": prompt, "variables": variables}
|
| 627 |
+
# inputs_changed = last_inputs != current_inputs
|
| 628 |
+
# return [
|
| 629 |
+
# gr.update(interactive=True), # send button always enabled
|
| 630 |
+
# gr.update(
|
| 631 |
+
# interactive=not inputs_changed
|
| 632 |
+
# ), # regenerate button disabled if inputs changed
|
| 633 |
+
# ]
|
| 634 |
|
| 635 |
# Update the change handlers for prompt and variables
|
| 636 |
#eval_prompt.change(
|
|
|
|
| 677 |
random_btn.click(
|
| 678 |
fn=populate_random_example,
|
| 679 |
inputs=[],
|
| 680 |
+
outputs=[
|
| 681 |
+
human_input,
|
| 682 |
+
ai_response,
|
| 683 |
+
random_btn,
|
| 684 |
+
score_a,
|
| 685 |
+
critique_a,
|
| 686 |
+
score_b,
|
| 687 |
+
critique_b,
|
| 688 |
+
vote_a,
|
| 689 |
+
vote_b,
|
| 690 |
+
vote_tie,
|
| 691 |
+
model_name_a,
|
| 692 |
+
model_name_b,
|
| 693 |
+
]
|
| 694 |
)
|
| 695 |
|
| 696 |
# Add new input change handlers
|
| 697 |
def handle_input_change():
    """Reset the voting and action buttons after an input is edited.

    Returns five ``gr.update`` objects in this order: vote A, vote B,
    vote tie, send button, random button.
    """
    # Disable all three vote buttons (vote_a, vote_b, vote_tie).
    disabled_votes = [gr.update(interactive=False) for _ in range(3)]
    # Put the send button back into its initial "Run judges" state.
    run_button = gr.update(value="Run judges", variant="primary")
    # Restore the plain dice on the random button.
    dice_button = gr.update(value="🎲", variant="secondary")
    return [*disabled_votes, run_button, dice_button]
|
| 706 |
|
| 707 |
# Update the change handlers for inputs
|
| 708 |
human_input.change(
|
| 709 |
fn=handle_input_change,
|
| 710 |
inputs=[],
|
| 711 |
+
outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
|
| 712 |
)
|
| 713 |
|
| 714 |
ai_response.change(
|
| 715 |
fn=handle_input_change,
|
| 716 |
inputs=[],
|
| 717 |
+
outputs=[vote_a, vote_b, vote_tie, send_btn, random_btn]
|
| 718 |
+
)
|
| 719 |
+
|
| 720 |
+
generate_btn.click(
|
| 721 |
+
fn=lambda msg: (
|
| 722 |
+
generate_ai_response(msg)[0], # Only take the response text
|
| 723 |
+
gr.update(
|
| 724 |
+
value="Generate AI Response", # Keep the label
|
| 725 |
+
interactive=False # Disable the button
|
| 726 |
+
)
|
| 727 |
+
),
|
| 728 |
+
inputs=[human_input],
|
| 729 |
+
outputs=[ai_response, generate_btn]
|
| 730 |
+
)
|
| 731 |
+
|
| 732 |
+
human_input.change(
|
| 733 |
+
fn=lambda x: gr.update(interactive=bool(x.strip())),
|
| 734 |
+
inputs=[human_input],
|
| 735 |
+
outputs=[generate_btn]
|
| 736 |
)
|
| 737 |
|
| 738 |
# Update the demo.load to include the random example population
|