import os
import json

import gradio as gr
import pandas as pd

from agents.llm_registry import LLM_REGISTRY  # Dynamically fetch LLM models
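# Only the registry keys (model names) are used in this file; the registry
# values (assumed to be agent classes or configs) are never accessed here.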
# Extract available LLM models
llm_models = list(LLM_REGISTRY.keys())
# Define game list manually (for now)
games_list = [
"rock_paper_scissors",
"prisoners_dilemma",
"tic_tac_toe",
"connect_four",
"matching_pennies",
"kuhn_poker",
]
# File to persist results
RESULTS_TRACKER_FILE = "results_tracker.json"
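# On disk the tracker is a nested dict:
# {model_name: {game_name: {"wins": int, "ties": int, "losses": int, "games": int}}}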
# Load or initialize the results tracker
if os.path.exists(RESULTS_TRACKER_FILE):
    with open(RESULTS_TRACKER_FILE, "r") as f:
        results_tracker = json.load(f)
else:
    results_tracker = {
        llm: {game: {"wins": 0, "ties": 0, "losses": 0, "games": 0} for game in games_list}
        for llm in llm_models
    }

def save_results_tracker():
    """Save the results tracker to a JSON file."""
    with open(RESULTS_TRACKER_FILE, "w") as f:
        json.dump(results_tracker, f, indent=4)

def calculate_leaderboard():
    """Generate a leaderboard table summarizing LLM performance across games."""
    leaderboard_data = {"LLM Model": llm_models}
    for game in games_list:
        # Each cell reads "win% W / tie% T / loss% L"; max(1, games) guards
        # against division by zero for models that have not played yet.
        leaderboard_data[game] = [
            f"{(results_tracker[llm][game]['wins'] / max(1, results_tracker[llm][game]['games']) * 100):.1f}% W / "
            f"{(results_tracker[llm][game]['ties'] / max(1, results_tracker[llm][game]['games']) * 100):.1f}% T / "
            f"{(results_tracker[llm][game]['losses'] / max(1, results_tracker[llm][game]['games']) * 100):.1f}% L"
            for llm in llm_models
        ]
    # Return a DataFrame so gr.Dataframe renders the columns directly.
    return pd.DataFrame(leaderboard_data)

def get_model_details(model_name):
    """Returns detailed performance of the selected LLM model."""
    if model_name not in results_tracker:
        return "No data available for this model."
    details = f"### {model_name} Performance Breakdown\n"
    for game, record in results_tracker[model_name].items():
        total_games = record["games"]
        details += (
            f"- **{game.capitalize()}**: {record['wins']} Wins, {record['ties']} Ties, "
            f"{record['losses']} Losses (Total: {total_games})\n"
        )
    return details

# Gradio Interface
with gr.Blocks() as interface:
    with gr.Tab("Game Arena"):
        gr.Markdown("# LLM Game Arena\nPlay against LLMs or other players in classic games!")
        # (Game selection and play functionality remains unchanged)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!")
        leaderboard_table = gr.Dataframe(label="Leaderboard", value=calculate_leaderboard())

        with gr.Row():
            model_dropdown = gr.Dropdown(choices=llm_models, label="Select LLM Model")
            details_output = gr.Markdown(label="Model Performance Details")

        def update_leaderboard():
            """Updates the leaderboard table."""
            return calculate_leaderboard()

        def update_details(model_name):
            """Updates the details section when an LLM is selected."""
            return get_model_details(model_name)

        update_leaderboard_button = gr.Button("Refresh Leaderboard")
        update_leaderboard_button.click(update_leaderboard, inputs=[], outputs=leaderboard_table)
        model_dropdown.change(update_details, inputs=[model_dropdown], outputs=details_output)

interface.launch()