lcipolina committed on
Commit
7ec49b6
·
verified ·
1 Parent(s): 9b0fc0b

New app with newer functionalities

Browse files
Files changed (1) hide show
  1. app.py +677 -0
app.py ADDED
@@ -0,0 +1,677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Game Reasoning Arena — Hugging Face Spaces Gradio App
4
+
5
+ Pipeline:
6
+ User clicks "Start Game" in Gradio
7
+ ↓
8
+ app.py (play_game)
9
+ ↓
10
+ ui/gradio_config_generator.py (run_game_with_existing_infrastructure)
11
+ ↓
12
+ src/game_reasoning_arena/ (core game infrastructure)
13
+ ↓
14
+ Game results + metrics displayed in Gradio
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import sqlite3
20
+
21
+ import sys
22
+ from pathlib import Path
23
+ from typing import List, Dict, Any, Tuple, Generator
24
+
25
+ import pandas as pd
26
+ import gradio as gr
27
+
28
+ # Logging (optional)
29
+ import logging
30
+ logging.basicConfig(level=logging.INFO)
31
+ log = logging.getLogger("arena_space")
32
+
33
+ # Optional transformers import (only needed if your backend uses it here)
34
+ try:
35
+ from transformers import pipeline # noqa: F401
36
+ except Exception:
37
+ pass
38
+
39
+ # Make sure src is on PYTHONPATH
40
+ src_path = Path(__file__).parent / "src"
41
+ if str(src_path) not in sys.path:
42
+ sys.path.insert(0, str(src_path))
43
+
44
+ # Try to import game registry
45
+ try:
46
+ from src.game_reasoning_arena.arena.games.registry import (
47
+ registry as games_registry
48
+ )
49
+ except Exception as e:
50
+ log.warning("Game registry not available: %s", e)
51
+ games_registry = None
52
+
53
+ # Optional: import backend & LLM registry
54
+ try:
55
+ from src.game_reasoning_arena.backends.huggingface_backend import (
56
+ HuggingFaceBackend,
57
+ )
58
+ from src.game_reasoning_arena.backends import (
59
+ initialize_llm_registry, LLM_REGISTRY,
60
+ )
61
+ BACKEND_SYSTEM_AVAILABLE = True
62
+ log.info("Backend system available - using proper LLM infrastructure.")
63
+ except Exception as e:
64
+ BACKEND_SYSTEM_AVAILABLE = False
65
+ log.warning("Backend system not available: %s", e)
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # Config & constants
69
+ # -----------------------------------------------------------------------------
70
+
71
# Small Hugging Face checkpoints offered in the demo (chosen to be CPU
# friendly). Keys are the names used by the UI, values the model ids.
HUGGINGFACE_MODELS: Dict[str, str] = {
    model_id: model_id
    for model_id in (
        "gpt2",
        "distilgpt2",
        "google/flan-t5-small",
        "EleutherAI/gpt-neo-125M",
    )
}

# Filled later from the arena's game registry when it can be imported.
GAMES_REGISTRY: Dict[str, Any] = {}

# Directory holding the per-agent SQLite result databases.
db_dir = Path(__file__).resolve().parent / "scripts" / "results"

# Column order of the leaderboard table.
LEADERBOARD_COLUMNS = [
    "agent_name",
    "agent_type",
    "# games",
    "total rewards",
    "avg_generation_time (sec)",
    "win-rate",
    "win vs_random (%)",
]
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Init backend + register models (optional)
89
+ # -----------------------------------------------------------------------------
90
+
91
# Stays None when the backend package is missing or fails to initialise.
huggingface_backend = None
if BACKEND_SYSTEM_AVAILABLE:
    try:
        huggingface_backend = HuggingFaceBackend()
        initialize_llm_registry()

        # Expose every demo model the backend reports as available under
        # an "hf_"-prefixed key in the shared LLM registry.
        for model_name in HUGGINGFACE_MODELS:
            if not huggingface_backend.is_model_available(model_name):
                continue
            registry_key = f"hf_{model_name}"
            LLM_REGISTRY[registry_key] = {
                "backend": huggingface_backend,
                "model_name": model_name,
            }
            log.info("Registered HuggingFace model: %s", registry_key)
    except Exception as e:
        log.error("Failed to initialize HuggingFace backend: %s", e)
        huggingface_backend = None
108
+
109
+ # -----------------------------------------------------------------------------
110
+ # Load games registry
111
+ # -----------------------------------------------------------------------------
112
+
113
# Copy the arena's registered games into GAMES_REGISTRY; any failure
# leaves the mapping empty so the UI can still start.
try:
    if games_registry is None:
        GAMES_REGISTRY = {}
    else:
        GAMES_REGISTRY = dict(games_registry._registry.items())
        log.info("Successfully imported full arena - games are playable.")
except Exception as e:
    log.warning("Failed to load games registry: %s", e)
    GAMES_REGISTRY = {}
124
+
125
+ # -----------------------------------------------------------------------------
126
+ # DB helpers
127
+ # -----------------------------------------------------------------------------
128
+
129
+
130
def ensure_results_dir() -> None:
    """Create the results directory ``db_dir`` if it is missing.

    Missing parent directories are created too; an existing directory is
    left untouched.
    """
    db_dir.mkdir(exist_ok=True, parents=True)
132
+
133
+
134
def iter_agent_databases() -> Generator[Tuple[str, str, str], None, None]:
    """Iterate over the result databases of all non-random agents.

    Yields:
        ``(db_file, agent_type, model_name)`` for every database returned
        by ``find_or_download_db`` whose agent type is not ``"random"``.
    """
    for db_path in find_or_download_db():
        kind, model = extract_agent_info(db_path)
        if kind == "random":
            continue
        yield db_path, kind, model
140
+
141
+
142
def find_or_download_db() -> List[str]:
    """Return the paths of all ``.db`` files in the results directory.

    Despite the name, nothing is downloaded here: the function only makes
    sure the results directory exists and that a ``random_None.db``
    placeholder with a minimal ``games`` table is present.

    Returns:
        The database paths as strings, sorted so that callers (tables,
        plots) see the agents in a stable order.
    """
    ensure_results_dir()

    random_db_path = db_dir / "random_None.db"
    if not random_db_path.exists():
        conn = sqlite3.connect(str(random_db_path))
        try:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS games (
                    id INTEGER PRIMARY KEY,
                    game_name TEXT,
                    player1 TEXT,
                    player2 TEXT,
                    winner INTEGER,
                    timestamp TEXT
                )
                """
            )
            conn.commit()
        finally:
            conn.close()

    # Path.glob yields entries in arbitrary, filesystem-dependent order.
    return sorted(str(p) for p in db_dir.glob("*.db"))
167
+
168
+
169
def extract_agent_info(filename: str) -> Tuple[str, str]:
    """Split a database file name into ``(agent_type, model_name)``.

    The file stem is cut at its first underscore, e.g. ``llm_gpt2.db``
    gives ``("llm", "gpt2")``. A stem without any underscore is returned
    as the agent type with ``"Unknown"`` as the model name.
    """
    stem = Path(filename).stem
    agent_type, separator, model_name = stem.partition("_")
    if not separator:
        return agent_type, "Unknown"
    return agent_type, model_name
175
+
176
+
177
def get_available_games(include_aggregated: bool = True) -> List[str]:
    """Return the union of games seen in the databases and the registry.

    Args:
        include_aggregated: When true, ``"Aggregated Performance"`` is
            placed at the front of the returned list.

    Returns:
        Alphabetically sorted game names. If neither the databases nor
        the registry provide any game, a small default set is used.
    """
    game_names = set()

    # From DBs
    for db_file in find_or_download_db():
        conn = sqlite3.connect(db_file)
        try:
            df = pd.read_sql_query(
                "SELECT DISTINCT game_name FROM moves", conn
            )
            # Drop NULLs and normalise to str: a stray None would make the
            # sorted() call below fail when mixed with string names.
            game_names.update(df["game_name"].dropna().astype(str).tolist())
        except Exception as exc:
            # Best effort: databases without a ``moves`` table (such as the
            # random_None.db placeholder) are simply skipped.
            log.debug("No game names read from %s: %s", db_file, exc)
        finally:
            conn.close()

    # From registry
    if GAMES_REGISTRY:
        game_names.update(GAMES_REGISTRY)

    if not game_names:
        game_names.update(["tic_tac_toe", "kuhn_poker", "connect_four"])

    game_list = sorted(game_names)
    if include_aggregated:
        game_list.insert(0, "Aggregated Performance")
    return game_list
205
+
206
+
207
def extract_illegal_moves_summary() -> pd.DataFrame:
    """Count the illegal moves recorded for every non-random agent.

    Returns:
        A frame with the columns ``agent_name`` and ``illegal_moves``, one
        row per agent database. The columns are present even when there
        are no databases, so plots that address them by name still work.
        Databases whose ``illegal_moves`` table cannot be read count as 0.
    """
    summary = []
    for db_file, _agent_type, model_name in iter_agent_databases():
        conn = sqlite3.connect(db_file)
        try:
            df = pd.read_sql_query(
                "SELECT COUNT(*) AS illegal_moves FROM illegal_moves", conn
            )
            count = int(df["illegal_moves"].iloc[0]) if not df.empty else 0
        except Exception as exc:
            log.debug("No illegal-move data in %s: %s", db_file, exc)
            count = 0
        finally:
            conn.close()
        summary.append({"agent_name": model_name, "illegal_moves": count})
    return pd.DataFrame(summary, columns=["agent_name", "illegal_moves"])
223
+
224
+ # -----------------------------------------------------------------------------
225
+ # Player config
226
+ # -----------------------------------------------------------------------------
227
+
228
+
229
# TypedDict comes from the standard library; it is not a Gradio export.
from typing import TypedDict


class PlayerConfigData(TypedDict, total=False):
    """Player-related options offered by the Game Arena tab."""

    # Internal keys of the selectable player types (e.g. "random_bot").
    player_types: List[str]
    # Maps each player-type key to the label shown in the UI.
    player_type_display: Dict[str, str]
    # Model names offered in the model dropdown.
    available_models: List[str]


class GameArenaConfig(TypedDict, total=False):
    """Configuration bundle returned by ``create_player_config``."""

    # Names of the games that can be selected.
    available_games: List[str]
    # Player types, their display labels and the known models.
    player_config: PlayerConfigData
    # Short status text about the model backend, shown in the UI.
    model_info: str
    # Whether the LLM backend system could be imported.
    backend_available: bool
240
+
241
+
242
def setup_player_config(
    player_type: str, player_model: str, player_id: str
) -> Dict[str, Any]:
    """Translate a dropdown selection into an agent config for the runner.

    ``"random_bot"`` maps to a random agent. A type prefixed with ``llm_``
    or ``hf_`` carries the model name after the first underscore, while the
    plain type ``"llm"`` takes the model from ``player_model``. An LLM
    config is only produced when the backend is available and the model is
    one of ``HUGGINGFACE_MODELS``; everything else falls back to a random
    agent. ``player_id`` is accepted but not used.
    """
    if player_type == "random_bot":
        return {"type": "random"}

    has_model_prefix = bool(player_type) and player_type.startswith(
        ("llm_", "hf_")
    )
    if has_model_prefix:
        _, model_id = player_type.split("_", 1)
        if BACKEND_SYSTEM_AVAILABLE and model_id in HUGGINGFACE_MODELS:
            return {"type": "llm", "model": model_id}
    elif player_type == "llm":
        if player_model in HUGGINGFACE_MODELS and BACKEND_SYSTEM_AVAILABLE:
            return {"type": "llm", "model": player_model}

    return {"type": "random"}
268
+
269
+
270
def create_player_config() -> GameArenaConfig:
    """Assemble the games, player types and models offered by the UI.

    Returns:
        A mapping with the playable games (without the aggregated entry),
        the player configuration (type keys, their display labels and the
        known model names), a status text about the backend and the
        backend-availability flag.
    """
    available_games = get_available_games(include_aggregated=False)

    # Collect models seen in DBs (for charts/labels)
    database_models = [model for _, _, model in iter_agent_databases()]

    player_types = ["random_bot"]
    player_type_display = {"random_bot": "Random Bot"}

    if BACKEND_SYSTEM_AVAILABLE:
        for model_key in HUGGINGFACE_MODELS:
            key = f"hf_{model_key}"
            player_types.append(key)
            tag = model_key.split("/")[-1]
            player_type_display[key] = f"HuggingFace: {tag}"

    # A model can be both a built-in demo model and present in a database;
    # dict.fromkeys drops such duplicates while keeping first-seen order.
    all_models = list(dict.fromkeys([*HUGGINGFACE_MODELS, *database_models]))

    model_info = (
        "HuggingFace transformer models integrated with backend system."
        if BACKEND_SYSTEM_AVAILABLE
        else "Backend system not available - limited functionality."
    )

    return {
        "available_games": available_games,
        "player_config": {
            "player_types": player_types,
            "player_type_display": player_type_display,
            "available_models": all_models,
        },
        "model_info": model_info,
        "backend_available": BACKEND_SYSTEM_AVAILABLE,
    }
304
+
305
+ # -----------------------------------------------------------------------------
306
+ # Main game entry
307
+ # -----------------------------------------------------------------------------
308
+
309
+
310
def play_game(
    game_name: str,
    player1_type: str,
    player2_type: str,
    player1_model: str | None = None,
    player2_model: str | None = None,
    rounds: int = 1,
) -> str:
    """Run a game between two configured players and return its log.

    Args:
        game_name: Game to play; the placeholder ``"No Games Found"``
            short-circuits with an explanatory message.
        player1_type: Player-type key or its display label for player 1.
        player2_type: Player-type key or its display label for player 2.
        player1_model: Optional model name for player 1.
        player2_model: Optional model name for player 2.
        rounds: Number of rounds to play.

    Returns:
        The result produced by the game runner, or an error message if
        the runner could not be imported or raised.
    """
    if game_name == "No Games Found":
        return "No games available. Please add game databases."

    log.info(
        "Starting game: %s | P1=%s(%s) P2=%s(%s) rounds=%d",
        game_name,
        player1_type,
        player1_model,
        player2_type,
        player2_model,
        rounds,
    )

    # The UI may hand over display labels instead of keys; map them back.
    config = create_player_config()
    display_to_key = {
        v: k for k, v in config["player_config"]["player_type_display"].items()
    }
    player1_type = display_to_key.get(player1_type, player1_type)
    player2_type = display_to_key.get(player2_type, player2_type)

    try:
        # IMPORTANT: rename your local folder to 'ui/'
        from ui.gradio_config_generator import (
            run_game_with_existing_infrastructure,
        )

        return run_game_with_existing_infrastructure(
            game_name=game_name,
            player1_type=player1_type,
            player2_type=player2_type,
            player1_model=player1_model,
            player2_model=player2_model,
            rounds=rounds,
            seed=42,
        )
    except Exception as e:
        # Keep the UI alive, but record the traceback: the short message
        # returned to the user is not enough to debug a failed run.
        log.exception("Game simulation failed")
        return f"Error during game simulation: {e}"
359
+
360
+
361
def extract_leaderboard_stats(game_name: str) -> pd.DataFrame:
    """Build the leaderboard table for one game or for all games.

    Args:
        game_name: A game name, or ``"Aggregated Performance"`` to combine
            the results of every game.

    Returns:
        One row per non-random agent database with the columns listed in
        ``LEADERBOARD_COLUMNS``. Databases that cannot be queried are
        skipped with a warning; with no usable database an empty frame
        with those columns is returned.
    """
    aggregated = game_name == "Aggregated Performance"
    # The same optional game filter is applied to every query so that all
    # columns of a row describe the same selection.
    where_game = "" if aggregated else " WHERE game_name = ?"
    and_game = "" if aggregated else " AND game_name = ?"
    params = () if aggregated else (game_name,)

    all_stats = []
    for db_file, agent_type, model_name in iter_agent_databases():
        conn = sqlite3.connect(db_file)
        try:
            df = pd.read_sql_query(
                "SELECT COUNT(DISTINCT episode) AS games_played, "
                "SUM(reward) AS total_rewards FROM game_results"
                + where_game,
                conn,
                params=params or None,
            )
            avg_time = conn.execute(
                "SELECT AVG(generation_time) FROM moves" + where_game,
                params,
            ).fetchone()[0] or 0
            wins_vs_random = conn.execute(
                "SELECT COUNT(*) FROM game_results "
                "WHERE opponent = 'random_None' AND reward > 0" + and_game,
                params,
            ).fetchone()[0] or 0
            total_vs_random = conn.execute(
                "SELECT COUNT(*) FROM game_results "
                "WHERE opponent = 'random_None'" + and_game,
                params,
            ).fetchone()[0] or 0
        except Exception as exc:
            # One malformed (e.g. freshly uploaded) database must not take
            # the whole leaderboard down.
            log.warning("Skipping %s in leaderboard: %s", db_file, exc)
            continue
        finally:
            conn.close()

        vs_random_rate = (
            wins_vs_random / total_vs_random * 100
            if total_vs_random > 0
            else 0
        )

        df.insert(0, "agent_name", model_name)
        df.insert(1, "agent_type", agent_type)
        df["# games"] = df["games_played"]
        # NOTE(review): the halving is kept from the original code;
        # presumably rewards are recorded twice per episode - confirm.
        df["total rewards"] = (
            df["total_rewards"].fillna(0).astype(float) / 2
        )
        df["avg_generation_time (sec)"] = round(float(avg_time), 3)
        df["win vs_random (%)"] = round(vs_random_rate, 2)
        # Simple proxy: the table reuses the win rate against random.
        df["win-rate"] = df["win vs_random (%)"]

        all_stats.append(df)

    if not all_stats:
        return pd.DataFrame(columns=LEADERBOARD_COLUMNS)

    leaderboard_df = pd.concat(all_stats, ignore_index=True)
    # Keep only the published columns, in their published order.
    return leaderboard_df[LEADERBOARD_COLUMNS]
429
+
430
+ # -----------------------------------------------------------------------------
431
+ # Simple plotting helpers
432
+ # -----------------------------------------------------------------------------
433
+
434
+
435
def create_bar_plot(
    data: pd.DataFrame,
    x_col: str,
    y_col: str,
    title: str,
    x_label: str,
    y_label: str,
) -> gr.BarPlot:
    """Create a ``gr.BarPlot`` of ``y_col`` against ``x_col`` of ``data``.

    ``title``, ``x_label`` and ``y_label`` are forwarded unchanged to the
    plot component.
    """
    plot_options = {
        "value": data,
        "x": x_col,
        "y": y_col,
        "title": title,
        "x_label": x_label,
        "y_label": y_label,
    }
    return gr.BarPlot(**plot_options)
451
+
452
+ # -----------------------------------------------------------------------------
453
+ # Upload handler (save .db files to scripts/results/)
454
+ # -----------------------------------------------------------------------------
455
+
456
+
457
def handle_db_upload(files: list[gr.File]) -> str:
    """Move uploaded ``.db`` files into the results directory.

    Args:
        files: The uploaded items, or ``None``/empty when nothing was
            selected.

    Returns:
        ``"Uploaded: <names>"`` listing the stored file names, or
        ``"No files uploaded."`` when nothing was stored.
    """
    import shutil  # only needed here; kept local to the upload path

    ensure_results_dir()
    saved = []
    for f in files or []:
        # NOTE(review): depending on the Gradio version the items appear to
        # be either plain path strings or objects exposing ``.name`` -
        # both are accepted here.
        source = Path(getattr(f, "name", f))
        # The browser-side file filter is not a guarantee; only databases
        # belong in the results directory.
        if source.suffix.lower() != ".db":
            log.warning("Ignoring non-database upload: %s", source.name)
            continue
        # Only the base name is used, so the upload cannot leave db_dir.
        dest = db_dir / source.name
        # shutil.move also works when the temporary upload directory is on
        # another filesystem, where Path.replace() raises OSError.
        shutil.move(str(source), str(dest))
        saved.append(dest.name)
    return (
        f"Uploaded: {', '.join(saved)}" if saved else "No files uploaded."
    )
467
+
468
+ # -----------------------------------------------------------------------------
469
+ # UI
470
+ # -----------------------------------------------------------------------------
471
# Build the whole Gradio UI once at import time; `interface` is the object
# served by the runtime. (A previously duplicated, empty gr.Blocks()
# context that was immediately overwritten has been removed.)
with gr.Blocks() as interface:
    with gr.Tab("Game Arena"):
        config = create_player_config()

        gr.Markdown("# LLM Game Arena")
        gr.Markdown("Play games against LLMs or watch LLMs compete!")
        gr.Markdown(
            f"> **🤖 Available AI Players**: {config['model_info']}\n"
            "> Local transformer models run with Hugging Face transformers. "
            "No API tokens required!"
        )

        with gr.Row():
            game_dropdown = gr.Dropdown(
                choices=config["available_games"],
                label="Select a Game",
                value=(
                    config["available_games"][0]
                    if config["available_games"]
                    else "No Games Found"
                ),
            )
            rounds_slider = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Number of Rounds",
            )

        def player_selector_block(label: str):
            """Create the type and model dropdowns for one player."""
            gr.Markdown(f"### {label}")
            # Gradio dropdown tuples are (displayed name, value): show the
            # friendly label, pass the internal key to the callbacks.
            choices_pairs = [
                (config["player_config"]["player_type_display"][key], key)
                for key in config["player_config"]["player_types"]
            ]
            dd_type = gr.Dropdown(
                choices=choices_pairs,
                label=f"{label} Type",
                value=choices_pairs[0][1],
            )
            dd_model = gr.Dropdown(
                choices=config["player_config"]["available_models"],
                label=f"{label} Model (if LLM)",
                visible=False,
            )
            return dd_type, dd_model

        with gr.Row():
            p1_type, p1_model = player_selector_block("Player 1")
            p2_type, p2_model = player_selector_block("Player 2")

        def _vis(player_type: str):
            """Show the model dropdown only for LLM player types."""
            is_llm = bool(player_type) and (
                player_type == "llm"
                or player_type.startswith(("llm_", "hf_"))
            )
            return gr.update(visible=is_llm)

        p1_type.change(_vis, inputs=p1_type, outputs=p1_model)
        p2_type.change(_vis, inputs=p2_type, outputs=p2_model)

        play_button = gr.Button("🎮 Start Game", variant="primary")
        game_output = gr.Textbox(
            label="Game Log",
            lines=20,
            placeholder="Game results will appear here...",
        )

        play_button.click(
            play_game,
            inputs=[
                game_dropdown,
                p1_type,
                p2_type,
                p1_model,
                p2_model,
                rounds_slider,
            ],
            outputs=[game_output],
        )

    with gr.Tab("Leaderboard"):
        gr.Markdown(
            "# LLM Model Leaderboard\n"
            "Track performance across different games!"
        )
        leaderboard_game_dropdown = gr.Dropdown(
            choices=get_available_games(),
            label="Select Game",
            value="Aggregated Performance",
        )
        leaderboard_table = gr.Dataframe(
            value=extract_leaderboard_stats("Aggregated Performance"),
            headers=LEADERBOARD_COLUMNS,
            interactive=False,
        )
        refresh_btn = gr.Button("🔄 Refresh")

        def _update_leaderboard(game: str) -> pd.DataFrame:
            """Recompute the leaderboard for the selected game."""
            return extract_leaderboard_stats(game)

        leaderboard_game_dropdown.change(
            _update_leaderboard,
            inputs=[leaderboard_game_dropdown],
            outputs=[leaderboard_table],
        )
        refresh_btn.click(
            _update_leaderboard,
            inputs=[leaderboard_game_dropdown],
            outputs=[leaderboard_table],
        )

        gr.Markdown("### Upload new `.db` result files")
        db_files = gr.Files(file_count="multiple", file_types=[".db"])
        upload_btn = gr.Button("⬆️ Upload to results/")
        upload_status = gr.Markdown()

        upload_btn.click(
            handle_db_upload, inputs=[db_files], outputs=[upload_status]
        )

    with gr.Tab("Metrics Dashboard"):
        gr.Markdown(
            "# 📊 Metrics Dashboard\n"
            "Visual summaries of LLM performance across games."
        )
        # Computed once while the UI is built; these plots are static.
        metrics_df = extract_leaderboard_stats("Aggregated Performance")

        with gr.Row():
            create_bar_plot(
                data=metrics_df,
                x_col="agent_name",
                y_col="win vs_random (%)",
                title="Win Rate vs Random Bot",
                x_label="LLM Model",
                y_label="Win Rate (%)",
            )

        with gr.Row():
            create_bar_plot(
                data=metrics_df,
                x_col="agent_name",
                y_col="avg_generation_time (sec)",
                title="Average Generation Time",
                x_label="LLM Model",
                y_label="Time (sec)",
            )

        with gr.Row():
            gr.Dataframe(
                value=metrics_df,
                label="Performance Summary",
                interactive=False,
            )

    with gr.Tab("Analysis of LLM Reasoning"):
        gr.Markdown(
            "# 🧠 Analysis of LLM Reasoning\n"
            "Insights into move legality and decision behavior."
        )
        illegal_df = extract_illegal_moves_summary()

        with gr.Row():
            create_bar_plot(
                data=illegal_df,
                x_col="agent_name",
                y_col="illegal_moves",
                title="Illegal Moves by Model",
                x_label="LLM Model",
                y_label="# of Illegal Moves",
            )

        with gr.Row():
            gr.Dataframe(
                value=illegal_df,
                label="Illegal Move Summary",
                interactive=False,
            )

    with gr.Tab("About"):
        gr.Markdown(
            """
            # About Game Reasoning Arena

            This app analyzes and visualizes LLM performance in games.

            - **Game Arena**: Play games vs. LLMs or watch LLM vs. LLM
            - **Leaderboard**: Performance statistics across games
            - **Metrics Dashboard**: Visual summaries
            - **Reasoning Analysis**: Illegal moves & behavior

            **Data**: SQLite databases in `scripts/results/`.
            """
        )
674
+
675
+ # Local run only. On Spaces, the runtime will serve `interface` automatically.
676
if __name__ == "__main__":
    # Listen on all interfaces; with server_port=None Gradio chooses the
    # port itself. The API page is hidden.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": None,
        "show_api": False,
    }
    interface.launch(**launch_options)