Spaces:
Sleeping
Sleeping
File size: 7,395 Bytes
d778057 0d67af5 d778057 106f4f6 d778057 70c682f d778057 70c682f 84f0932 d778057 70c682f 634c45e 70c682f 634c45e d778057 70c682f d778057 4c30414 70c682f d778057 70c682f 4c30414 70c682f 634c45e 0d67af5 70c682f 634c45e 27ff230 0d67af5 d778057 84f0932 70c682f 84f0932 106f4f6 84f0932 d778057 84f0932 70c682f 84f0932 634c45e 84f0932 634c45e 70c682f 634c45e 70c682f 634c45e 70c682f 634c45e 70c682f 634c45e 70c682f 634c45e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import os
import json
import pandas as pd
import gradio as gr
from agents.llm_registry import LLM_REGISTRY # Dynamically fetch LLM models
from simulators.tic_tac_toe_simulator import TicTacToeSimulator
from simulators.prisoners_dilemma_simulator import PrisonersDilemmaSimulator
from simulators.rock_paper_scissors_simulator import RockPaperScissorsSimulator
from games_registry import GAMES_REGISTRY
from simulators.base_simulator import PlayerType
from typing import Dict
# Extract available LLM models from the registry; these names become the
# leaderboard rows and the choices of every model dropdown.
llm_models = list(LLM_REGISTRY.keys())

# List of available games (manually defined for now)
games_list = [
    "rock_paper_scissors",
    "prisoners_dilemma",
    "tic_tac_toe",
    "connect_four",
    "matching_pennies",
    "kuhn_poker",
]

# Special leaderboard option for aggregating stats across all games.
# It always sits at index 0, so the real games are games_list[1:].
games_list.insert(0, "Total Performance")

# File to persist game results
RESULTS_TRACKER_FILE = "results_tracker.json"


def _empty_game_stats():
    """Return a fresh, zeroed statistics record for one (LLM, game) pair."""
    return {
        "games": 0,
        "moves/game": 0,
        "illegal-moves": 0,
        "win-rate": 0,
        "vs Random": 0,
    }


def _load_results_tracker(path, models, games):
    """Load the persisted tracker from *path* and backfill missing entries.

    Args:
        path: JSON file holding a ``{llm: {game: stats}}`` mapping.
        models: LLM names that must be present in the returned tracker.
        games: Game names that must be present under every LLM.

    Returns:
        The tracker dict. Entries found in the file are kept as they are;
        any LLM, game or statistic key that the file lacks is added with a
        zero value. When the file does not exist the tracker is all zeros.
    """
    if os.path.exists(path):
        with open(path, "r") as f:
            tracker = json.load(f)
    else:
        tracker = {}

    # A file written before a model or game was added would otherwise make
    # later results_tracker[llm][game] lookups fail with KeyError.
    for model in models:
        per_game = tracker.setdefault(model, {})
        for game in games:
            stats = per_game.setdefault(game, {})
            for key, zero in _empty_game_stats().items():
                stats.setdefault(key, zero)
    return tracker


# Load or initialize the results tracker
results_tracker = _load_results_tracker(
    RESULTS_TRACKER_FILE, llm_models, games_list[1:]
)
def save_results_tracker():
    """Save the results tracker to ``RESULTS_TRACKER_FILE`` as indented JSON.

    The tracker is serialized to a string before the file is opened for
    writing. Opening with mode "w" truncates the file immediately, so
    serializing first means a tracker that cannot be encoded raises without
    wiping the previously saved results.
    """
    payload = json.dumps(results_tracker, indent=4)
    with open(RESULTS_TRACKER_FILE, "w") as f:
        f.write(payload)
def generate_stats_file(model_name: str) -> str:
    """Generate a JSON file with detailed statistics for the selected LLM model.

    Args:
        model_name: Key of the model in ``results_tracker``. A name that is
            not tracked produces a file containing an empty JSON object.

    Returns:
        Path of the written file, ``"<model_name>_stats.json"`` relative to
        the current working directory. Path separators inside the model name
        are replaced with ``_`` in the file name.
    """
    # A separator in the name would turn the file name into a path that
    # points into a (probably missing) sub-directory or outside the cwd.
    safe_name = str(model_name).replace("/", "_").replace("\\", "_")
    file_path = f"{safe_name}_stats.json"

    # Serialize before opening so a failure cannot leave a truncated file.
    payload = json.dumps(results_tracker.get(model_name, {}), indent=4)
    with open(file_path, "w") as f:
        f.write(payload)
    return file_path
def provide_download_file(model_name):
    """Write the statistics JSON for *model_name* and return the file's path.

    Thin adapter around ``generate_stats_file``; it is wired to the model
    dropdown's change event, and its return value feeds the download
    component.
    """
    stats_path = generate_stats_file(model_name)
    return stats_path
def _stats_for(llm, game):
    """Return the stored stats dict for one LLM/game pair, or {} if absent."""
    return results_tracker.get(llm, {}).get(game, {})


def _games_weighted_mean(per_game, key, total_games):
    """Average *key* over *per_game* records, weighted by games played."""
    if total_games <= 0:
        return 0
    weighted = sum(s.get(key, 0) * s.get("games", 0) for s in per_game)
    return weighted / total_games


def calculate_leaderboard(selected_game: str) -> pd.DataFrame:
    """
    Generate a structured leaderboard table.

    - If a specific game is selected, returns performance stats per LLM for that game.
    - If 'Total Performance' is selected, aggregates stats across all games:
      game and illegal-move counts are summed, while moves/game, win-rate and
      vs Random are averaged with each game weighted by its number of games.

    Missing tracker entries (an LLM, a game, or a single statistic) count as
    zero in both modes instead of raising ``KeyError``.

    Args:
        selected_game: A game name, or "Total Performance".

    Returns:
        A DataFrame with one row per entry of ``llm_models`` and the columns
        "LLM Model", "# games", "moves/game", "illegal-moves", "win-rate",
        "vs Random". "moves/game" is a string with one decimal; the two rate
        columns are strings with one decimal followed by "%".
    """
    columns = ["LLM Model", "# games", "moves/game", "illegal-moves", "win-rate", "vs Random"]
    aggregate = selected_game == "Total Performance"

    rows = []
    for llm in llm_models:
        if aggregate:
            # games_list[0] is the "Total Performance" pseudo-entry; skip it.
            per_game = [_stats_for(llm, game) for game in games_list[1:]]
            games_played = sum(s.get("games", 0) for s in per_game)
            illegal_moves = sum(s.get("illegal-moves", 0) for s in per_game)
            moves_per_game = _games_weighted_mean(per_game, "moves/game", games_played)
            win_rate = _games_weighted_mean(per_game, "win-rate", games_played)
            vs_random = _games_weighted_mean(per_game, "vs Random", games_played)
        else:
            # Retrieve stats for the selected game
            game_stats = _stats_for(llm, selected_game)
            games_played = game_stats.get("games", 0)
            illegal_moves = game_stats.get("illegal-moves", 0)
            moves_per_game = game_stats.get("moves/game", 0)
            win_rate = game_stats.get("win-rate", 0)
            vs_random = game_stats.get("vs Random", 0)

        # Same cell formatting in both modes so the two views read alike.
        rows.append([
            llm,
            games_played,
            f"{moves_per_game:.1f}",
            illegal_moves,
            f"{win_rate:.1f}%",
            f"{vs_random:.1f}%",
        ])

    # dtype=object keeps the cells exactly as built (no numeric coercion).
    return pd.DataFrame(rows, columns=columns, dtype=object)
def play_game(game_name, player1_type, player2_type, player1_model, player2_model, rounds):
    """Run a simulated session for the chosen game and return a text transcript.

    Args:
        game_name: Key into ``GAMES_REGISTRY`` selecting the simulator class.
        player1_type: Player 1 kind; only the value "llm" is acted on here.
        player2_type: Player 2 kind; only the value "llm" is acted on here.
        player1_model: Model passed to the simulator for "Player 1" when
            that player is an LLM.
        player2_model: Model passed to the simulator for "Player 2" when
            that player is an LLM.
        rounds: Number of rounds; converted with ``int()`` before use.

    Returns:
        The logged states joined by newlines, followed by a final
        "Game Result: ..." line holding whatever ``simulate`` returned.
    """
    seats = (
        ("Player 1", player1_type, player1_model),
        ("Player 2", player2_type, player2_model),
    )
    # Only LLM-controlled seats are handed to the simulator.
    llms = {seat: model for seat, kind, model in seats if kind == "llm"}

    simulator = GAMES_REGISTRY[game_name](game_name, llms=llms)
    transcript = []

    def record_state(state):
        """Append a snapshot of *state* (mover, board, legal moves) to the transcript."""
        mover = state.current_player()
        moves = state.legal_actions(mover)
        transcript.append(f"Current Player: {mover}\nBoard:\n{state!s}\nLegal Moves: {moves}")

    results = simulator.simulate(rounds=int(rounds), log_fn=record_state)
    history = "\n".join(transcript)
    return f"{history}\nGame Result: {results}"
def _model_picker_update(player_type):
    """Return an update that shows a model dropdown only for "llm" players."""
    return gr.update(visible=(player_type == "llm"))


# Gradio Interface
with gr.Blocks() as interface:
    # Game Arena Tab
    with gr.Tab("Game Arena"):
        gr.Markdown("# LLM Game Arena\nSelect a game and players to play against LLMs.")
        # games_list[0] is the "Total Performance" pseudo-entry, not a playable game.
        game_dropdown = gr.Dropdown(choices=games_list[1:], label="Select a Game", value=games_list[1])
        player1_dropdown = gr.Dropdown(choices=["human", "random_bot", "llm"], label="Player 1 Type", value="llm")
        player2_dropdown = gr.Dropdown(choices=["human", "random_bot", "llm"], label="Player 2 Type", value="random_bot")

        # Preselect a model so an LLM player never reaches play_game with None.
        _default_model = llm_models[0] if llm_models else None
        # Initial visibility mirrors the default player types above:
        # Player 1 starts as "llm", Player 2 as "random_bot".
        player1_model_dropdown = gr.Dropdown(
            choices=llm_models, label="Player 1 Model", value=_default_model, visible=True
        )
        player2_model_dropdown = gr.Dropdown(
            choices=llm_models, label="Player 2 Model", value=_default_model, visible=False
        )
        rounds_slider = gr.Slider(1, 10, step=1, label="Rounds")
        result_output = gr.Textbox(label="Game Result")
        play_button = gr.Button("Play Game")

        # Reveal or hide each model picker whenever its player type changes;
        # without these handlers the pickers could never be shown.
        player1_dropdown.change(
            fn=_model_picker_update, inputs=[player1_dropdown], outputs=[player1_model_dropdown]
        )
        player2_dropdown.change(
            fn=_model_picker_update, inputs=[player2_dropdown], outputs=[player2_model_dropdown]
        )

        # The handler is bound to the component objects here, so rebinding the
        # name game_dropdown in the Leaderboard tab below does not affect it.
        play_button.click(
            play_game,
            inputs=[game_dropdown, player1_dropdown, player2_dropdown, player1_model_dropdown, player2_model_dropdown, rounds_slider],
            outputs=result_output,
        )

    # Leaderboard Tab
    with gr.Tab("Leaderboard"):
        gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!")
        game_dropdown = gr.Dropdown(choices=games_list, label="Select Game", value="Total Performance")
        # The initial table is computed once, while the UI is being built.
        leaderboard_table = gr.Dataframe(value=calculate_leaderboard("Total Performance"), label="Leaderboard")
        model_dropdown = gr.Dropdown(choices=llm_models, label="Select LLM Model")
        download_button = gr.File(label="Download Statistics File")
        refresh_button = gr.Button("Refresh Leaderboard")

        def update_leaderboard(selected_game):
            """Updates the leaderboard based on the selected game."""
            return calculate_leaderboard(selected_game)

        # Picking a model writes its stats file and offers it for download.
        model_dropdown.change(fn=provide_download_file, inputs=[model_dropdown], outputs=[download_button])
        game_dropdown.change(fn=update_leaderboard, inputs=[game_dropdown], outputs=[leaderboard_table])
        refresh_button.click(fn=update_leaderboard, inputs=[game_dropdown], outputs=[leaderboard_table])

interface.launch()
|