File size: 7,395 Bytes
d778057
 
0d67af5
d778057
 
106f4f6
 
 
 
 
 
d778057
70c682f
d778057
 
70c682f
84f0932
 
 
 
 
 
 
 
d778057
70c682f
 
634c45e
70c682f
 
634c45e
d778057
 
 
 
 
70c682f
d778057
4c30414
70c682f
d778057
 
 
 
 
 
 
 
70c682f
 
 
 
 
 
 
 
 
 
 
4c30414
70c682f
 
 
 
 
 
 
 
 
634c45e
0d67af5
70c682f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634c45e
27ff230
 
0d67af5
d778057
84f0932
70c682f
84f0932
 
 
 
 
106f4f6
84f0932
 
 
d778057
84f0932
70c682f
84f0932
 
 
 
634c45e
84f0932
 
634c45e
 
 
70c682f
634c45e
 
 
70c682f
634c45e
 
 
 
 
 
 
 
 
 
 
 
 
 
70c682f
634c45e
 
 
70c682f
 
634c45e
 
 
 
 
70c682f
634c45e
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import os
import json
import pandas as pd
import gradio as gr
from agents.llm_registry import LLM_REGISTRY  # Dynamically fetch LLM models
from simulators.tic_tac_toe_simulator import TicTacToeSimulator
from simulators.prisoners_dilemma_simulator import PrisonersDilemmaSimulator
from simulators.rock_paper_scissors_simulator import RockPaperScissorsSimulator
from games_registry import GAMES_REGISTRY
from simulators.base_simulator import PlayerType
from typing import Dict

# Extract available LLM models from the registry
llm_models = list(LLM_REGISTRY.keys())

# Leaderboard choices (manually defined for now). The first entry is a special
# option that aggregates stats across all games; games_list[1:] are the games.
games_list = [
    "Total Performance",
    "rock_paper_scissors",
    "prisoners_dilemma",
    "tic_tac_toe",
    "connect_four",
    "matching_pennies",
    "kuhn_poker",
]

# File to persist game results
RESULTS_TRACKER_FILE = "results_tracker.json"


def _backfill_missing_stats(tracker: Dict) -> Dict:
    """Ensure ``tracker`` has a complete stats record for every LLM and game.

    Entries already present are left untouched; only missing LLMs, games or
    individual stat keys are added with a value of 0. The tracker is modified
    in place and returned for convenience.
    """
    zeroed_stats = {"games": 0, "moves/game": 0, "illegal-moves": 0,
                    "win-rate": 0, "vs Random": 0}
    for llm in llm_models:
        llm_stats = tracker.setdefault(llm, {})
        for game in games_list[1:]:
            game_stats = llm_stats.setdefault(game, {})
            for stat_name, zero in zeroed_stats.items():
                game_stats.setdefault(stat_name, zero)
    return tracker


# Load previously persisted results if present, otherwise start empty
if os.path.exists(RESULTS_TRACKER_FILE):
    with open(RESULTS_TRACKER_FILE, "r") as f:
        results_tracker = json.load(f)
else:
    results_tracker = {}

# A tracker file written before a model or game was added lacks those entries;
# fill them in so later lookups by LLM and game never hit a missing key.
results_tracker = _backfill_missing_stats(results_tracker)

def save_results_tracker():
    """Save the results tracker to a JSON file.

    The tracker is serialized up front and written to a sibling temporary
    file, which is then moved over ``RESULTS_TRACKER_FILE``. Opening the
    target directly in "w" mode truncates it immediately, so a serialization
    error or an interrupted write would leave a corrupt file that makes
    ``json.load`` fail the next time the module is loaded.
    """
    serialized = json.dumps(results_tracker, indent=4)
    tmp_path = f"{RESULTS_TRACKER_FILE}.tmp"
    with open(tmp_path, "w") as f:
        f.write(serialized)
    # Swap the finished file into place in a single rename step.
    os.replace(tmp_path, RESULTS_TRACKER_FILE)

def generate_stats_file(model_name: str) -> str:
    """Generate a JSON file with detailed statistics for the selected LLM model.

    Args:
        model_name: Key of the model in ``results_tracker``. A model with no
            recorded results produces a file containing an empty JSON object.

    Returns:
        Path of the written file, relative to the current working directory.
    """
    # A model identifier containing a path separator (e.g. "org/model") would
    # otherwise be treated as a directory component, making open() fail or
    # write outside the working directory. Flatten it for the file name only;
    # the tracker lookup below still uses the unmodified name.
    safe_name = str(model_name).replace("/", "_").replace("\\", "_")
    file_path = f"{safe_name}_stats.json"
    with open(file_path, "w") as f:
        json.dump(results_tracker.get(model_name, {}), f, indent=4)
    return file_path

def provide_download_file(model_name):
    """Write the stats JSON for ``model_name`` and return the path to offer for download."""
    stats_file_path = generate_stats_file(model_name)
    return stats_file_path

def _weighted_mean(stats_by_game, stat_name, total_games):
    """Average ``stat_name`` over games, weighting each game by its games played."""
    if total_games <= 0:
        return 0
    weighted_sum = sum(
        stats.get(stat_name, 0) * stats.get("games", 0) for stats in stats_by_game
    )
    return weighted_sum / total_games


def calculate_leaderboard(selected_game: str) -> pd.DataFrame:
    """
    Generate a structured leaderboard table.
    - If a specific game is selected, returns performance stats per LLM for that game.
    - If 'Total Performance' is selected, aggregates stats across all games.

    Missing tracker entries (an LLM, a game, or a single stat) are treated as
    zero instead of raising ``KeyError``, so an incomplete tracker still
    yields a full table.

    Args:
        selected_game: A game name, or "Total Performance" for the aggregate.

    Returns:
        A DataFrame with one row per entry of ``llm_models`` and the columns
        "LLM Model", "# games", "moves/game", "illegal-moves", "win-rate"
        and "vs Random".
    """
    leaderboard_df = pd.DataFrame(
        index=llm_models,
        columns=["# games", "moves/game", "illegal-moves", "win-rate", "vs Random"]
    )
    # The first entry of games_list is the aggregate option, not a game.
    game_names = games_list[1:]

    for llm in llm_models:
        llm_stats = results_tracker.get(llm, {})
        if selected_game == "Total Performance":
            # Aggregate stats across all games
            stats_by_game = [llm_stats.get(game, {}) for game in game_names]
            total_games = sum(stats.get("games", 0) for stats in stats_by_game)
            total_illegal_moves = sum(stats.get("illegal-moves", 0) for stats in stats_by_game)
            # Per-game figures are averages/rates, so weight them by the
            # number of games played to get the overall value.
            avg_moves = _weighted_mean(stats_by_game, "moves/game", total_games)
            avg_win_rate = _weighted_mean(stats_by_game, "win-rate", total_games)
            avg_vs_random = _weighted_mean(stats_by_game, "vs Random", total_games)

            leaderboard_df.loc[llm] = [
                total_games,
                f"{avg_moves:.1f}",
                total_illegal_moves,
                f"{avg_win_rate:.1f}%",
                f"{avg_vs_random:.1f}%"
            ]
        else:
            # Retrieve stats for the selected game
            game_stats = llm_stats.get(selected_game, {})
            leaderboard_df.loc[llm] = [
                game_stats.get("games", 0),
                game_stats.get("moves/game", 0),
                game_stats.get("illegal-moves", 0),
                f"{game_stats.get('win-rate', 0):.1f}%",
                f"{game_stats.get('vs Random', 0):.1f}%"
            ]

    # Turn the model-name index into a regular, labelled column.
    leaderboard_df = leaderboard_df.reset_index()
    leaderboard_df.rename(columns={"index": "LLM Model"}, inplace=True)
    return leaderboard_df

def play_game(game_name, player1_type, player2_type, player1_model, player2_model, rounds):
    """Run a game session for the chosen players and return a text log of it.

    A model is passed to the simulator only for seats whose type is "llm".
    The returned string holds one entry per state handed to the logging
    callback, followed by a final "Game Result" line with whatever
    ``simulate`` returned.
    """
    seats = (
        ("Player 1", player1_type, player1_model),
        ("Player 2", player2_type, player2_model),
    )
    llm_by_seat = {seat: model for seat, kind, model in seats if kind == "llm"}

    simulator = GAMES_REGISTRY[game_name](game_name, llms=llm_by_seat)
    transcript = []

    def record_state(state):
        """Append the player to move, the board and the legal moves to the transcript."""
        mover = state.current_player()
        moves = state.legal_actions(mover)
        board_text = str(state)
        transcript.append(
            f"Current Player: {mover}\nBoard:\n{board_text}\nLegal Moves: {moves}"
        )

    outcome = simulator.simulate(rounds=int(rounds), log_fn=record_state)
    return "\n".join(transcript) + f"\nGame Result: {outcome}"

# Gradio Interface
with gr.Blocks() as interface:
    # Game Arena Tab
    with gr.Tab("Game Arena"):
        gr.Markdown("# LLM Game Arena\nSelect a game and players to play against LLMs.")

        player_types = ["human", "random_bot", "llm"]
        player1_default_type = "llm"
        player2_default_type = "random_bot"
        # Preselect a model so an "llm" player never reaches play_game without one.
        default_model = llm_models[0] if llm_models else None

        game_dropdown = gr.Dropdown(choices=games_list[1:], label="Select a Game", value=games_list[1])
        player1_dropdown = gr.Dropdown(choices=player_types, label="Player 1 Type", value=player1_default_type)
        player2_dropdown = gr.Dropdown(choices=player_types, label="Player 2 Type", value=player2_default_type)
        # A model selector is only relevant (and only shown) for "llm" players.
        player1_model_dropdown = gr.Dropdown(
            choices=llm_models,
            label="Player 1 Model",
            value=default_model,
            visible=(player1_default_type == "llm"),
        )
        player2_model_dropdown = gr.Dropdown(
            choices=llm_models,
            label="Player 2 Model",
            value=default_model,
            visible=(player2_default_type == "llm"),
        )
        rounds_slider = gr.Slider(1, 10, step=1, label="Rounds")
        result_output = gr.Textbox(label="Game Result")

        def toggle_model_dropdown(player_type):
            """Show the model selector only while the player type is "llm"."""
            return gr.update(visible=(player_type == "llm"))

        player1_dropdown.change(fn=toggle_model_dropdown, inputs=[player1_dropdown], outputs=[player1_model_dropdown])
        player2_dropdown.change(fn=toggle_model_dropdown, inputs=[player2_dropdown], outputs=[player2_model_dropdown])

        play_button = gr.Button("Play Game")
        play_button.click(
            play_game,
            inputs=[game_dropdown, player1_dropdown, player2_dropdown, player1_model_dropdown, player2_model_dropdown, rounds_slider],
            outputs=result_output,
        )

    # Leaderboard Tab
    with gr.Tab("Leaderboard"):
        gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!")

        # Named separately from the arena's game selector to avoid shadowing it.
        leaderboard_game_dropdown = gr.Dropdown(choices=games_list, label="Select Game", value="Total Performance")
        leaderboard_table = gr.Dataframe(value=calculate_leaderboard("Total Performance"), label="Leaderboard")
        model_dropdown = gr.Dropdown(choices=llm_models, label="Select LLM Model")
        download_button = gr.File(label="Download Statistics File")
        refresh_button = gr.Button("Refresh Leaderboard")

        def update_leaderboard(selected_game):
            """Updates the leaderboard based on the selected game."""
            return calculate_leaderboard(selected_game)

        # Selecting a model writes its stats file and exposes it for download.
        model_dropdown.change(fn=provide_download_file, inputs=[model_dropdown], outputs=[download_button])
        leaderboard_game_dropdown.change(fn=update_leaderboard, inputs=[leaderboard_game_dropdown], outputs=[leaderboard_table])
        refresh_button.click(fn=update_leaderboard, inputs=[leaderboard_game_dropdown], outputs=[leaderboard_table])

interface.launch()