File size: 3,338 Bytes
d778057
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import json
import gradio as gr
from agents.llm_registry import LLM_REGISTRY  # Dynamically fetch LLM models

# Extract available LLM models
# Extract available LLM models
llm_models = list(LLM_REGISTRY.keys())

# Define game list manually (for now)
games_list = [
    "rock_paper_scissors",
    "prisoners_dilemma",
    "tic_tac_toe",
    "connect_four",
    "matching_pennies",
    "kuhn_poker",
]

# File to persist results
RESULTS_TRACKER_FILE = "results_tracker.json"


def _default_record():
    """Return a fresh per-game win/tie/loss record."""
    return {"wins": 0, "ties": 0, "losses": 0, "games": 0}


# Load the persisted results tracker if present; otherwise start empty.
if os.path.exists(RESULTS_TRACKER_FILE):
    with open(RESULTS_TRACKER_FILE, "r") as f:
        results_tracker = json.load(f)
else:
    results_tracker = {}

# Backfill any models or games added to the registry since the file was
# last saved, so later results_tracker[llm][game] lookups never KeyError.
for llm in llm_models:
    model_stats = results_tracker.setdefault(llm, {})
    for game in games_list:
        model_stats.setdefault(game, _default_record())


def save_results_tracker():
    """Persist the in-memory results tracker to disk as indented JSON."""
    serialized = json.dumps(results_tracker, indent=4)
    with open(RESULTS_TRACKER_FILE, "w") as fh:
        fh.write(serialized)


def calculate_leaderboard(tracker=None, models=None, games=None):
    """Generate a leaderboard table summarizing LLM performance across games.

    Args:
        tracker: Results mapping ``{model: {game: record}}``. Defaults to the
            module-level ``results_tracker``.
        models: Model names to include. Defaults to ``llm_models``.
        games: Game names to include. Defaults to ``games_list``.

    Returns:
        dict: ``{"LLM Model": [...], <game>: ["x% W / y% T / z% L", ...]}``,
        one column per game, suitable for a gr.Dataframe.
    """
    tracker = results_tracker if tracker is None else tracker
    models = llm_models if models is None else models
    games = games_list if games is None else games

    leaderboard_data = {"LLM Model": list(models)}
    for game in games:
        column = []
        for llm in models:
            # Hoist the record and its total once instead of re-resolving
            # the nested lookups three times per cell.
            record = tracker[llm][game]
            total = max(1, record["games"])  # guard against division by zero
            column.append(
                f"{record['wins'] / total * 100:.1f}% W / "
                f"{record['ties'] / total * 100:.1f}% T / "
                f"{record['losses'] / total * 100:.1f}% L"
            )
        leaderboard_data[game] = column

    return leaderboard_data


def get_model_details(model_name, tracker=None):
    """Return a Markdown breakdown of one model's per-game performance.

    Args:
        model_name: Key into the results tracker.
        tracker: Results mapping ``{model: {game: record}}``. Defaults to the
            module-level ``results_tracker``.

    Returns:
        str: Markdown text, or a notice string when the model is unknown.
    """
    tracker = results_tracker if tracker is None else tracker
    if model_name not in tracker:
        return "No data available for this model."

    # Build lines in a list and join once instead of repeated string +=.
    lines = [f"### {model_name} Performance Breakdown"]
    for game, record in tracker[model_name].items():
        lines.append(
            f"- **{game.capitalize()}**: {record['wins']} Wins, "
            f"{record['ties']} Ties, {record['losses']} Losses "
            f"(Total: {record['games']})"
        )
    return "\n".join(lines) + "\n"


# Gradio Interface: one tab for playing games, one for the leaderboard.
with gr.Blocks() as interface:
    with gr.Tab("Game Arena"):
        gr.Markdown("# LLM Game Arena\nPlay against LLMs or other players in classic games!")

        # (Game selection and play functionality remains unchanged)

    with gr.Tab("Leaderboard"):
        gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!")

        # Initial table contents come from the tracker loaded at import time.
        leaderboard_table = gr.Dataframe(label="Leaderboard", value=calculate_leaderboard())

        with gr.Row():
            model_dropdown = gr.Dropdown(choices=llm_models, label="Select LLM Model")
        details_output = gr.Markdown(label="Model Performance Details")

        def update_leaderboard():
            """Recompute the leaderboard table from the current tracker."""
            return calculate_leaderboard()

        def update_details(model_name):
            """Render the per-game breakdown for the selected model."""
            return get_model_details(model_name)

        update_leaderboard_button = gr.Button("Refresh Leaderboard")
        update_leaderboard_button.click(update_leaderboard, inputs=[], outputs=leaderboard_table)

        model_dropdown.change(update_details, inputs=[model_dropdown], outputs=details_output)

# Guard the launch so importing this module (e.g. from tests or another
# entry point) does not start the web server as an import side effect.
if __name__ == "__main__":
    interface.launch()