ed-donner commited on
Commit
683d749
·
1 Parent(s): c908a5c

Added leaderboard with DB connectivity

Browse files
Files changed (12) hide show
  1. README.md +50 -0
  2. app.py +6 -0
  3. arena/board.py +63 -9
  4. arena/board_view.py +5 -1
  5. arena/c4.py +132 -22
  6. arena/game.py +56 -4
  7. arena/llm.py +54 -64
  8. arena/player.py +31 -7
  9. arena/record.py +178 -0
  10. connect.png +0 -0
  11. prototype.ipynb +0 -0
  12. requirements.txt +2 -1
README.md CHANGED
@@ -15,3 +15,53 @@ short_description: Arena for playing Four-in-a-row between LLMs
15
  # Four-in-a-row Arena
16
 
17
  ### A battleground for pitting LLMs against each other in the classic board game
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Four-in-a-row Arena
16
 
17
  ### A battleground for pitting LLMs against each other in the classic board game
18
+
19
+ ![Connect](connect.png)
20
+
21
+ It has been great fun making this Arena and watching LLMs duke it out!
22
+
23
+ Quick links:
24
+ - The [GitHub repo](https://github.com/ed-donner/connect) for the code
25
+ - The [HuggingFace Spaces](https://huggingface.co/spaces/ed-donner/connect) where it's running
26
+ - My [LinkedIn](https://www.linkedin.com/in/eddonner/) - I love connecting!
27
+
28
+ If you'd like to learn more about this:
29
+ - I have a best-selling intensive 8-week [Mastering LLM engineering](https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/?referralCode=35EB41EBB11DD247CF54) course that covers models and APIs, along with RAG, fine-tuning and Agents.
30
+ - I'm running a number of [Live Events](https://www.oreilly.com/search/?q=author%3A%20%22Ed%20Donner%22) with O'Reilly and Pearson
31
+
32
+ ## Installing the code
33
+
34
+ 1. Clone the repo with `git clone https://github.com/ed-donner/connect.git`
35
+ 2. Change to the project directory with `cd connect`
36
+ 3. Create a python virtualenv with `python -m venv venv`
37
+ 4. Activate your environment with either `venv\Scripts\activate` on Windows, or `source venv/bin/activate` on Mac/Linux
38
+ 5. Then run `pip install -r requirements.txt` to install the packages
39
+
40
+ ## Setting up your API keys
41
+
42
+ Please create a file with the exact name `.env` in the project root directory (connect).
43
+
44
+ You would typically use Notepad (Windows) or nano (Mac) for this.
45
+
46
+ If you're not familiar with setting up a .env file this way, ask ChatGPT! It will give much more eloquent instructions than me. 😂
47
+
48
+ Your .env file should contain the following; add whichever keys you would like to use.
49
+
50
+ ```
51
+ OPENAI_API_KEY=sk-proj-...
52
+ ANTHROPIC_API_KEY=sk-ant-...
53
+ DEEPSEEK_API_KEY=sk...
54
+ GROQ_API_KEY=...
55
+ ```
56
+
57
+ ## Optional - using Ollama
58
+
59
+ You can run Ollama locally, and the Arena will connect to run local models.
60
+ 1. Download and install Ollama from https://ollama.com noting that on a PC you might need to have administrator permissions for the install to work properly
61
+ 2. On a PC, start a Command prompt / Powershell (Press Win + R, type `cmd`, and press Enter). On a Mac, start a Terminal (Applications > Utilities > Terminal).
62
+ 3. Run `ollama run llama3.2` or for smaller machines try `ollama run llama3.2:1b`
63
+ 4. If this doesn't work, you may need to run `ollama serve` in another Powershell (Windows) or Terminal (Mac), and try step 3 again
64
+
65
+
66
+
67
+
app.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  from arena.c4 import make_display
2
  from dotenv import load_dotenv
3
 
 
1
+ """
2
+ The main entry-point for the Spaces application
3
+ Create a Gradio app and launch it
4
+ """
5
+
6
+
7
  from arena.c4 import make_display
8
  from dotenv import load_dotenv
9
 
arena/board.py CHANGED
@@ -1,4 +1,5 @@
1
  from arena.board_view import to_svg
 
2
 
3
  RED = 1
4
  YELLOW = -1
@@ -10,8 +11,15 @@ cols = "ABCDEFG"
10
 
11
 
12
  class Board:
 
 
 
13
 
14
  def __init__(self):
 
 
 
 
15
  self.cells = [[0 for _ in range(7)] for _ in range(6)]
16
  self.player = RED
17
  self.winner = EMPTY
@@ -20,6 +28,9 @@ class Board:
20
  self.latest_x, self.latest_y = -1, -1
21
 
22
  def __repr__(self):
 
 
 
23
  result = ""
24
  for y in range(6):
25
  for x in range(7):
@@ -29,6 +40,9 @@ class Board:
29
  return result
30
 
31
  def message(self):
 
 
 
32
  if self.winner and self.forfeit:
33
  return f"{show[self.winner]} wins after an illegal move by {show[-1*self.winner]}\n"
34
  elif self.winner:
@@ -39,16 +53,24 @@ class Board:
39
  return f"{show[self.player]} to play\n"
40
 
41
  def html(self):
 
 
 
42
  result = '<div style="text-align: center;font-size:24px">'
43
  result += self.__repr__().replace("\n", "<br/>")
44
  result += "</div>"
45
  return result
46
 
47
  def svg(self):
48
- """Convert the board state to an SVG representation"""
 
 
49
  return to_svg(self)
50
 
51
  def json(self):
 
 
 
52
  result = "{\n"
53
  result += ' "Column names": ["A", "B", "C", "D", "E", "F", "G"],\n'
54
  for y in range(6):
@@ -60,6 +82,9 @@ class Board:
60
  return result
61
 
62
  def alternative(self):
 
 
 
63
  result = "ABCDEFG\n"
64
  for y in range(6):
65
  for x in range(7):
@@ -67,19 +92,32 @@ class Board:
67
  result += "\n"
68
  return result
69
 
70
- def height(self, x):
 
 
 
71
  height = 0
72
  while height < 6 and self.cells[height][x] != EMPTY:
73
  height += 1
74
  return height
75
 
76
- def legal_moves(self):
 
 
 
77
  return [cols[x] for x in range(7) if self.height(x) < 6]
78
 
79
- def illegal_moves(self):
 
 
 
80
  return [cols[x] for x in range(7) if self.height(x) == 6]
81
 
82
- def winning_line(self, x, y, dx, dy):
 
 
 
 
83
  color = self.cells[y][x]
84
  for pointer in range(1, 4):
85
  xp = x + dx * pointer
@@ -88,20 +126,33 @@ class Board:
88
  return EMPTY
89
  return color
90
 
91
- def winning_cell(self, x, y):
 
 
 
 
 
 
92
  for dx, dy in ((0, 1), (1, 1), (1, 0), (1, -1)):
93
  if winner := self.winning_line(x, y, dx, dy):
94
  return winner
95
  return EMPTY
96
 
97
- def wins(self):
 
 
 
 
98
  for y in range(6):
99
  for x in range(7):
100
  if winner := self.winning_cell(x, y):
101
  return winner
102
  return EMPTY
103
 
104
- def move(self, x):
 
 
 
105
  y = self.height(x)
106
  self.cells[y][x] = self.player
107
  self.latest_x, self.latest_y = x, y
@@ -113,5 +164,8 @@ class Board:
113
  self.player = -1 * self.player
114
  return self
115
 
116
- def is_active(self):
 
 
 
117
  return not self.winner and not self.draw
 
1
  from arena.board_view import to_svg
2
+ from typing import List
3
 
4
  RED = 1
5
  YELLOW = -1
 
11
 
12
 
13
  class Board:
14
+ """
15
+ A class to represent a Four-in-the-row Board
16
+ """
17
 
18
  def __init__(self):
19
+ """
20
+ Initialize this instance, starting with empty cells, RED to play
21
+ The latest x,y is used to track the most recent move, so it animates on the display
22
+ """
23
  self.cells = [[0 for _ in range(7)] for _ in range(6)]
24
  self.player = RED
25
  self.winner = EMPTY
 
28
  self.latest_x, self.latest_y = -1, -1
29
 
30
  def __repr__(self):
31
+ """
32
+ A visual representation
33
+ """
34
  result = ""
35
  for y in range(6):
36
  for x in range(7):
 
40
  return result
41
 
42
  def message(self):
43
+ """
44
+ A summary of the status
45
+ """
46
  if self.winner and self.forfeit:
47
  return f"{show[self.winner]} wins after an illegal move by {show[-1*self.winner]}\n"
48
  elif self.winner:
 
53
  return f"{show[self.player]} to play\n"
54
 
55
  def html(self):
56
+ """
57
+ Return an HTML representation
58
+ """
59
  result = '<div style="text-align: center;font-size:24px">'
60
  result += self.__repr__().replace("\n", "<br/>")
61
  result += "</div>"
62
  return result
63
 
64
  def svg(self):
65
+ """
66
+ Return an SVG representation
67
+ """
68
  return to_svg(self)
69
 
70
  def json(self):
71
+ """
72
+ Return a json representation
73
+ """
74
  result = "{\n"
75
  result += ' "Column names": ["A", "B", "C", "D", "E", "F", "G"],\n'
76
  for y in range(6):
 
82
  return result
83
 
84
  def alternative(self):
85
+ """
86
+ An alternative representation, used in prompting so that the LLM sees this 2 ways
87
+ """
88
  result = "ABCDEFG\n"
89
  for y in range(6):
90
  for x in range(7):
 
92
  result += "\n"
93
  return result
94
 
95
+ def height(self, x: int) -> int:
96
+ """
97
+ Return the height of the given column
98
+ """
99
  height = 0
100
  while height < 6 and self.cells[height][x] != EMPTY:
101
  height += 1
102
  return height
103
 
104
+ def legal_moves(self) -> List[str]:
105
+ """
106
+ Return the names of columns that are not full
107
+ """
108
  return [cols[x] for x in range(7) if self.height(x) < 6]
109
 
110
+ def illegal_moves(self) -> List[str]:
111
+ """
112
+ Return the names of columns that are full
113
+ """
114
  return [cols[x] for x in range(7) if self.height(x) == 6]
115
 
116
+ def winning_line(self, x: int, y: int, dx: int, dy: int) -> int:
117
+ """
118
+ Return RED or YELLOW if this cell is the start of a 4 in the row going in the direction dx, dy
119
+ Or EMPTY if not
120
+ """
121
  color = self.cells[y][x]
122
  for pointer in range(1, 4):
123
  xp = x + dx * pointer
 
126
  return EMPTY
127
  return color
128
 
129
+ def winning_cell(self, x: int, y: int) -> int:
130
+ """
131
+ Return RED or YELLOW if this cell is the start of a 4 in the row
132
+ Or EMPTY if not
133
+ For performance reasons, only look in 4 of the possible 8 directions,
134
+ (because this test will run on both sides of the 4-in-a-row)
135
+ """
136
  for dx, dy in ((0, 1), (1, 1), (1, 0), (1, -1)):
137
  if winner := self.winning_line(x, y, dx, dy):
138
  return winner
139
  return EMPTY
140
 
141
+ def wins(self) -> int:
142
+ """
143
+ Return RED or YELLOW if there is a 4-in-a-row of that color on the board
144
+ Or EMPTY if not
145
+ """
146
  for y in range(6):
147
  for x in range(7):
148
  if winner := self.winning_cell(x, y):
149
  return winner
150
  return EMPTY
151
 
152
+ def move(self, x: int):
153
+ """
154
+ Make a move in the given column
155
+ """
156
  y = self.height(x)
157
  self.cells[y][x] = self.player
158
  self.latest_x, self.latest_y = x, y
 
164
  self.player = -1 * self.player
165
  return self
166
 
167
+ def is_active(self) -> bool:
168
+ """
169
+ Return true if the game has not yet ended
170
+ """
171
  return not self.winner and not self.draw
arena/board_view.py CHANGED
@@ -3,7 +3,11 @@ YELLOW = -1
3
  EMPTY = 0
4
 
5
  def to_svg(board):
6
- """Convert the board state to an SVG representation"""
 
 
 
 
7
  svg = '''
8
  <div style="display: flex; justify-content: center;">
9
  <svg width="450" height="420" viewBox="0 0 450 420">
 
3
  EMPTY = 0
4
 
5
  def to_svg(board):
6
+ """
7
+ Create an SVG representation of the board, with the latest piece dropping down via SVG
8
+ I must confess that this function was written almost entirely by Claude; done in 15 mins,
9
+ when it would have taken me a couple of hours. Amazing!
10
+ """
11
  svg = '''
12
  <div style="display: flex; justify-content: center;">
13
  <svg width="450" height="420" viewBox="0 0 450 420">
arena/c4.py CHANGED
@@ -4,7 +4,13 @@ from arena.llm import LLM
4
  import gradio as gr
5
 
6
 
7
- css = "footer{display:none !important}"
 
 
 
 
 
 
8
 
9
  js = """
10
  function refresh() {
@@ -18,20 +24,70 @@ function refresh() {
18
  """
19
 
20
 
21
- def message_html(game):
 
 
 
22
  return (
23
  f'<div style="text-align: center;font-size:18px">{game.board.message()}</div>'
24
  )
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def load_callback(red_llm, yellow_llm):
 
 
 
28
  game = Game(red_llm, yellow_llm)
29
  enabled = gr.Button(interactive=True)
30
  message = message_html(game)
31
- return game, game.board.svg(), message, "", "", enabled, enabled, enabled
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
 
34
  def move_callback(game):
 
 
 
35
  game.move()
36
  message = message_html(game)
37
  if_active = gr.Button(interactive=game.board.is_active())
@@ -47,8 +103,13 @@ def move_callback(game):
47
 
48
 
49
  def run_callback(game):
 
 
 
 
50
  enabled = gr.Button(interactive=True)
51
  disabled = gr.Button(interactive=False)
 
52
  message = message_html(game)
53
  yield game, game.board.svg(), message, game.thoughts(RED), game.thoughts(
54
  YELLOW
@@ -59,26 +120,39 @@ def run_callback(game):
59
  yield game, game.board.svg(), message, game.thoughts(RED), game.thoughts(
60
  YELLOW
61
  ), disabled, disabled, disabled
 
62
  yield game, game.board.svg(), message, game.thoughts(RED), game.thoughts(
63
  YELLOW
64
  ), disabled, disabled, enabled
65
 
66
 
67
  def model_callback(player_name, game, new_model_name):
 
 
 
68
  player = game.players[player_name]
69
  player.switch_model(new_model_name)
70
  return game
71
 
72
 
73
  def red_model_callback(game, new_model_name):
 
 
 
74
  return model_callback(RED, game, new_model_name)
75
 
76
 
77
  def yellow_model_callback(game, new_model_name):
 
 
 
78
  return model_callback(YELLOW, game, new_model_name)
79
 
80
 
81
  def player_section(name, default):
 
 
 
82
  all_model_names = LLM.all_model_names()
83
  with gr.Row():
84
  gr.HTML(f'<div style="text-align: center;font-size:18px">{name} Player</div>')
@@ -94,6 +168,9 @@ def player_section(name, default):
94
 
95
 
96
  def make_display():
 
 
 
97
  with gr.Blocks(
98
  title="C4 Battle",
99
  css=css,
@@ -103,31 +180,60 @@ def make_display():
103
 
104
  game = gr.State()
105
 
106
- with gr.Row():
107
- gr.HTML(
108
- '<div style="text-align: center;font-size:24px">Four-in-a-row LLM Showdown</div>'
109
- )
110
- with gr.Row():
111
- with gr.Column(scale=1):
112
- red_thoughts, red_dropdown = player_section("Red", "gpt-4o-mini")
113
- with gr.Column(scale=2):
114
  with gr.Row():
115
- message = gr.HTML(
116
- '<div style="text-align: center;font-size:18px">The Board</div>'
117
  )
118
- with gr.Row():
119
- board_display = gr.HTML()
120
  with gr.Row():
121
  with gr.Column(scale=1):
122
- move_button = gr.Button("Next move")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  with gr.Column(scale=1):
124
- run_button = gr.Button("Run game", variant="primary")
 
 
 
 
125
  with gr.Column(scale=1):
126
- reset_button = gr.Button("Start Over", variant="stop")
127
- with gr.Column(scale=1):
128
- yellow_thoughts, yellow_dropdown = player_section(
129
- "Yellow", "claude-3-5-sonnet-latest"
130
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  blocks.load(
133
  load_callback,
@@ -191,4 +297,8 @@ def make_display():
191
  ],
192
  )
193
 
 
 
 
 
194
  return blocks
 
4
  import gradio as gr
5
 
6
 
7
+ css = """
8
+ .dataframe-fix .table-wrap {
9
+ min-height: 800px;
10
+ max-height: 800px;
11
+ }
12
+ footer{display:none !important}
13
+ """
14
 
15
  js = """
16
  function refresh() {
 
24
  """
25
 
26
 
27
+ def message_html(game) -> str:
28
+ """
29
+ Return the message for the top of the UI
30
+ """
31
  return (
32
  f'<div style="text-align: center;font-size:18px">{game.board.message()}</div>'
33
  )
34
 
35
 
36
+ def format_records_for_table(games):
37
+ """
38
+ Turn the results objects into a list of lists for the Gradio Dataframe
39
+ """
40
+ return [
41
+ [
42
+ game.when,
43
+ game.red_player,
44
+ game.yellow_player,
45
+ "Red" if game.red_won else "Yellow" if game.yellow_won else "Draw",
46
+ ]
47
+ for game in reversed(games)
48
+ ]
49
+
50
+
51
+ def format_ratings_for_table(ratings):
52
+ """
53
+ Turn the ratings into a List of Lists for the Gradio Dataframe
54
+ """
55
+ items = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
56
+ return [[item[0], int(round(item[1]))] for item in items]
57
+
58
+
59
  def load_callback(red_llm, yellow_llm):
60
+ """
61
+ Callback called when the game is started. Create a new Game object for the state.
62
+ """
63
  game = Game(red_llm, yellow_llm)
64
  enabled = gr.Button(interactive=True)
65
  message = message_html(game)
66
+ return (
67
+ game,
68
+ game.board.svg(),
69
+ message,
70
+ "",
71
+ "",
72
+ enabled,
73
+ enabled,
74
+ enabled,
75
+ )
76
+
77
+
78
+ def leaderboard_callback(game):
79
+ """
80
+ Callback called when the user switches to the Leaderboard tab. Load in the results.
81
+ """
82
+ records_df = format_records_for_table(Game.get_games())
83
+ ratings_df = format_ratings_for_table(Game.get_ratings())
84
+ return records_df, ratings_df
85
 
86
 
87
  def move_callback(game):
88
+ """
89
+ Callback called when the user clicks to do a single move.
90
+ """
91
  game.move()
92
  message = message_html(game)
93
  if_active = gr.Button(interactive=game.board.is_active())
 
103
 
104
 
105
  def run_callback(game):
106
+ """
107
+ Callback called when the user runs an entire game. Reset the board, run the game, store results.
108
+ Yield interim results so the UI updates.
109
+ """
110
  enabled = gr.Button(interactive=True)
111
  disabled = gr.Button(interactive=False)
112
+ game.reset()
113
  message = message_html(game)
114
  yield game, game.board.svg(), message, game.thoughts(RED), game.thoughts(
115
  YELLOW
 
120
  yield game, game.board.svg(), message, game.thoughts(RED), game.thoughts(
121
  YELLOW
122
  ), disabled, disabled, disabled
123
+ game.record()
124
  yield game, game.board.svg(), message, game.thoughts(RED), game.thoughts(
125
  YELLOW
126
  ), disabled, disabled, enabled
127
 
128
 
129
  def model_callback(player_name, game, new_model_name):
130
+ """
131
+ Callback when the user changes the model
132
+ """
133
  player = game.players[player_name]
134
  player.switch_model(new_model_name)
135
  return game
136
 
137
 
138
  def red_model_callback(game, new_model_name):
139
+ """
140
+ Callback when red model is changed
141
+ """
142
  return model_callback(RED, game, new_model_name)
143
 
144
 
145
  def yellow_model_callback(game, new_model_name):
146
+ """
147
+ Callback when yellow model is changed
148
+ """
149
  return model_callback(YELLOW, game, new_model_name)
150
 
151
 
152
  def player_section(name, default):
153
+ """
154
+ Create the left and right sections of the UI
155
+ """
156
  all_model_names = LLM.all_model_names()
157
  with gr.Row():
158
  gr.HTML(f'<div style="text-align: center;font-size:18px">{name} Player</div>')
 
168
 
169
 
170
  def make_display():
171
+ """
172
+ The Gradio UI to show the Game, with event handlers
173
+ """
174
  with gr.Blocks(
175
  title="C4 Battle",
176
  css=css,
 
180
 
181
  game = gr.State()
182
 
183
+ with gr.Tabs():
184
+ with gr.TabItem("Game"):
185
+
 
 
 
 
 
186
  with gr.Row():
187
+ gr.HTML(
188
+ '<div style="text-align: center;font-size:24px">Four-in-a-row LLM Showdown</div>'
189
  )
 
 
190
  with gr.Row():
191
  with gr.Column(scale=1):
192
+ red_thoughts, red_dropdown = player_section(
193
+ "Red", "gpt-4o-mini"
194
+ )
195
+ with gr.Column(scale=2):
196
+ with gr.Row():
197
+ message = gr.HTML(
198
+ '<div style="text-align: center;font-size:18px">The Board</div>'
199
+ )
200
+ with gr.Row():
201
+ board_display = gr.HTML()
202
+ with gr.Row():
203
+ with gr.Column(scale=1):
204
+ move_button = gr.Button("Next move")
205
+ with gr.Column(scale=1):
206
+ run_button = gr.Button("Run game", variant="primary")
207
+ with gr.Column(scale=1):
208
+ reset_button = gr.Button("Start Over", variant="stop")
209
  with gr.Column(scale=1):
210
+ yellow_thoughts, yellow_dropdown = player_section(
211
+ "Yellow", "claude-3-5-sonnet-latest"
212
+ )
213
+ with gr.TabItem("Leaderboard") as leaderboard_tab:
214
+ with gr.Row():
215
  with gr.Column(scale=1):
216
+ ratings_df = gr.Dataframe(
217
+ headers=["Player", "ELO"],
218
+ label="Ratings",
219
+ column_widths=[2, 1],
220
+ wrap=True,
221
+ col_count=2,
222
+ row_count=10,
223
+ max_height=800,
224
+ elem_classes=["dataframe-fix"],
225
+ )
226
+ with gr.Column(scale=2):
227
+ results_df = gr.Dataframe(
228
+ headers=["When", "Red Player", "Yellow Player", "Winner"],
229
+ label="Game History",
230
+ column_widths=[2, 2, 2, 1],
231
+ wrap=True,
232
+ col_count=4,
233
+ row_count=10,
234
+ max_height=800,
235
+ elem_classes=["dataframe-fix"],
236
+ )
237
 
238
  blocks.load(
239
  load_callback,
 
297
  ],
298
  )
299
 
300
+ leaderboard_tab.select(
301
+ leaderboard_callback, inputs=[game], outputs=[results_df, ratings_df]
302
+ )
303
+
304
  return blocks
arena/game.py CHANGED
@@ -1,26 +1,78 @@
1
- from arena.board import Board, RED, YELLOW, EMPTY, pieces
2
  from arena.player import Player
 
 
 
3
 
4
 
5
  class Game:
 
 
 
6
 
7
- def __init__(self, model_red, model_yellow):
 
 
 
8
  self.board = Board()
9
  self.players = {
10
  RED: Player(model_red, RED),
11
  YELLOW: Player(model_yellow, YELLOW),
12
  }
13
 
 
 
 
 
 
 
14
  def move(self):
 
 
 
15
  self.players[self.board.player].move(self.board)
16
 
17
- def is_active(self):
 
 
 
18
  return self.board.is_active()
19
 
20
- def thoughts(self, player):
 
 
 
21
  return self.players[player].thoughts()
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def run(self):
 
 
 
24
  while self.is_active():
25
  self.move()
26
  print(self.board)
 
1
+ from arena.board import Board, RED, YELLOW
2
  from arena.player import Player
3
+ from arena.record import get_games, Result, record_game, ratings
4
+ from datetime import datetime
5
+ from typing import List
6
 
7
 
8
  class Game:
9
+ """
10
+ A Game consists of a Board and 2 players
11
+ """
12
 
13
+ def __init__(self, model_red: str, model_yellow: str):
14
+ """
15
+ Initialize this Game; a new board, and new Player objects
16
+ """
17
  self.board = Board()
18
  self.players = {
19
  RED: Player(model_red, RED),
20
  YELLOW: Player(model_yellow, YELLOW),
21
  }
22
 
23
+ def reset(self):
24
+ """
25
+ Restart the game by resetting the board; keep players the same
26
+ """
27
+ self.board = Board()
28
+
29
  def move(self):
30
+ """
31
+ Make the next move. Delegate to the current player to make a move on this board.
32
+ """
33
  self.players[self.board.player].move(self.board)
34
 
35
+ def is_active(self) -> bool:
36
+ """
37
+ Return true if the game hasn't yet ended
38
+ """
39
  return self.board.is_active()
40
 
41
+ def thoughts(self, player) -> str:
42
+ """
43
+ Return the inner thoughts of the given player
44
+ """
45
  return self.players[player].thoughts()
46
 
47
+ @staticmethod
48
+ def get_games() -> List:
49
+ """
50
+ Return all the games stored in the db
51
+ """
52
+ return get_games()
53
+
54
+ @staticmethod
55
+ def get_ratings():
56
+ """
57
+ Return the ELO ratings of all players
58
+ """
59
+ return ratings()
60
+
61
+ def record(self):
62
+ """
63
+ Store the results of this game in the DB
64
+ """
65
+ red_player = self.players[RED].llm.model_name
66
+ yellow_player = self.players[YELLOW].llm.model_name
67
+ red_won = self.board.winner == RED
68
+ yellow_won = self.board.winner == YELLOW
69
+ result = Result(red_player, yellow_player, red_won, yellow_won, datetime.now())
70
+ record_game(result)
71
+
72
  def run(self):
73
+ """
74
+ If being used outside gradio; move and print in a loop
75
+ """
76
  while self.is_active():
77
  self.move()
78
  print(self.board)
arena/llm.py CHANGED
@@ -48,7 +48,11 @@ class LLM(ABC):
48
  return result
49
 
50
  def protected_send(self, system: str, user: str, max_tokens: int = 3000) -> str:
51
- retries = 5
 
 
 
 
52
  while retries:
53
  retries -= 1
54
  try:
@@ -61,9 +65,27 @@ class LLM(ABC):
61
  return "{}"
62
 
63
  def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
64
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
- def api_model_name(self):
 
 
 
67
  if " " in self.model_name:
68
  return self.model_name.split(" ")[0]
69
  else:
@@ -83,6 +105,10 @@ class LLM(ABC):
83
 
84
  @classmethod
85
  def all_model_names(cls) -> List[str]:
 
 
 
 
86
  models = list(cls.model_map().keys())
87
  allowed = os.getenv("MODELS")
88
  if allowed:
@@ -153,28 +179,10 @@ class GPT(LLM):
153
  super().__init__(model_name, temperature)
154
  self.client = OpenAI()
155
 
156
- def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
157
- """
158
- Send a message to GPT
159
- :param system: the context in which this message is to be taken
160
- :param user: the prompt
161
- :param max_tokens: max number of tokens to generate
162
- :return: the response from the AI
163
- """
164
- response = self.client.chat.completions.create(
165
- model=self.api_model_name(),
166
- messages=[
167
- {"role": "system", "content": system},
168
- {"role": "user", "content": user},
169
- ],
170
- response_format={"type": "json_object"},
171
- )
172
- return response.choices[0].message.content
173
-
174
 
175
  class O1(LLM):
176
  """
177
- A class to act as an interface to the remote AI, in this case GPT
178
  """
179
 
180
  model_names = ["o1-mini"]
@@ -188,7 +196,7 @@ class O1(LLM):
188
 
189
  def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
190
  """
191
- Send a message to GPT
192
  :param system: the context in which this message is to be taken
193
  :param user: the prompt
194
  :param max_tokens: max number of tokens to generate
@@ -206,7 +214,7 @@ class O1(LLM):
206
 
207
  class O3(LLM):
208
  """
209
- A class to act as an interface to the remote AI, in this case GPT
210
  """
211
 
212
  model_names = ["o3-mini"]
@@ -225,7 +233,7 @@ class O3(LLM):
225
 
226
  def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
227
  """
228
- Send a message to GPT
229
  :param system: the context in which this message is to be taken
230
  :param user: the prompt
231
  :param max_tokens: max number of tokens to generate
@@ -241,6 +249,25 @@ class O3(LLM):
241
  return response.choices[0].message.content
242
 
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  class Ollama(LLM):
245
  """
246
  A class to act as an interface to the remote AI, in this case Ollama via the OpenAI client
@@ -250,7 +277,7 @@ class Ollama(LLM):
250
 
251
  def __init__(self, model_name: str, temperature: float):
252
  """
253
- Create a new instance of the OpenAI client
254
  """
255
  super().__init__(model_name, temperature)
256
  self.client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
@@ -296,25 +323,6 @@ class DeepSeekAPI(LLM):
296
  api_key=deepseek_api_key, base_url="https://api.deepseek.com"
297
  )
298
 
299
- def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
300
- """
301
- Send a message to DeepSeek
302
- :param system: the context in which this message is to be taken
303
- :param user: the prompt
304
- :param max_tokens: max number of tokens to generate
305
- :return: the response from the AI
306
- """
307
-
308
- response = self.client.chat.completions.create(
309
- model=self.api_model_name(),
310
- messages=[
311
- {"role": "system", "content": system},
312
- {"role": "user", "content": user},
313
- ],
314
- )
315
- reply = response.choices[0].message.content
316
- return reply
317
-
318
 
319
  class DeepSeekLocal(LLM):
320
  """
@@ -367,25 +375,7 @@ class GroqAPI(LLM):
367
 
368
  def __init__(self, model_name: str, temperature: float):
369
  """
370
- Create a new instance of the OpenAI client
371
  """
372
  super().__init__(model_name, temperature)
373
  self.client = Groq()
374
-
375
- def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
376
- """
377
- Send a message to GPT
378
- :param system: the context in which this message is to be taken
379
- :param user: the prompt
380
- :param max_tokens: max number of tokens to generate
381
- :return: the response from the AI
382
- """
383
- response = self.client.chat.completions.create(
384
- model=self.api_model_name(),
385
- messages=[
386
- {"role": "system", "content": system},
387
- {"role": "user", "content": user},
388
- ],
389
- response_format={"type": "json_object"},
390
- )
391
- return response.choices[0].message.content
 
48
  return result
49
 
50
  def protected_send(self, system: str, user: str, max_tokens: int = 3000) -> str:
51
+ """
52
+ Wrap the send call in an exception handler, giving the LLM 3 chances in total, in case
53
+ of overload errors. If it fails 3 times, then it forfeits!
54
+ """
55
+ retries = 3
56
  while retries:
57
  retries -= 1
58
  try:
 
65
  return "{}"
66
 
67
  def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
68
+ """
69
+ Send a message to the model - this default implementation follows the OpenAI API structure
70
+ :param system: the context in which this message is to be taken
71
+ :param user: the prompt
72
+ :param max_tokens: max number of tokens to generate
73
+ :return: the response from the AI
74
+ """
75
+ response = self.client.chat.completions.create(
76
+ model=self.api_model_name(),
77
+ messages=[
78
+ {"role": "system", "content": system},
79
+ {"role": "user", "content": user},
80
+ ],
81
+ response_format={"type": "json_object"},
82
+ )
83
+ return response.choices[0].message.content
84
 
85
+ def api_model_name(self) -> str:
86
+ """
87
+ Return the actual model_name to be used in the call to the API; strip out anything after a space
88
+ """
89
  if " " in self.model_name:
90
  return self.model_name.split(" ")[0]
91
  else:
 
105
 
106
  @classmethod
107
  def all_model_names(cls) -> List[str]:
108
+ """
109
+ Return a list of all the model names supported.
110
+ Use the ones specified in the model_map, but also check if there's an env variable set that restricts the models
111
+ """
112
  models = list(cls.model_map().keys())
113
  allowed = os.getenv("MODELS")
114
  if allowed:
 
179
  super().__init__(model_name, temperature)
180
  self.client = OpenAI()
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
  class O1(LLM):
184
  """
185
+ A class to act as an interface to the remote AI, in this case O1
186
  """
187
 
188
  model_names = ["o1-mini"]
 
196
 
197
  def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
198
  """
199
+ Send a message to O1
200
  :param system: the context in which this message is to be taken
201
  :param user: the prompt
202
  :param max_tokens: max number of tokens to generate
 
214
 
215
  class O3(LLM):
216
  """
217
+ A class to act as an interface to the remote AI, in this case O3
218
  """
219
 
220
  model_names = ["o3-mini"]
 
233
 
234
  def _send(self, system: str, user: str, max_tokens: int = 3000) -> str:
235
  """
236
+ Send a message to O3
237
  :param system: the context in which this message is to be taken
238
  :param user: the prompt
239
  :param max_tokens: max number of tokens to generate
 
249
  return response.choices[0].message.content
250
 
251
 
252
+ class Gemini(LLM):
253
+ """
254
+ A class to act as an interface to the remote AI, in this case Gemini
255
+ """
256
+
257
+ model_names = ["gemini-2.0-flash", "gemini-1.5-flash"]
258
+
259
+ def __init__(self, model_name: str, temperature: float):
260
+ """
261
+ Create a new instance of the OpenAI client
262
+ """
263
+ super().__init__(model_name, temperature)
264
+ google_api_key = os.getenv("GOOGLE_API_KEY")
265
+ self.client = OpenAI(
266
+ api_key=google_api_key,
267
+ base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
268
+ )
269
+
270
+
271
  class Ollama(LLM):
272
  """
273
  A class to act as an interface to the remote AI, in this case Ollama via the OpenAI client
 
277
 
278
  def __init__(self, model_name: str, temperature: float):
279
  """
280
+ Create a new instance of the OpenAI client for Ollama
281
  """
282
  super().__init__(model_name, temperature)
283
  self.client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
 
323
  api_key=deepseek_api_key, base_url="https://api.deepseek.com"
324
  )
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
 
327
  class DeepSeekLocal(LLM):
328
  """
 
375
 
376
  def __init__(self, model_name: str, temperature: float):
377
  """
378
+ Create a new instance of the Groq client
379
  """
380
  super().__init__(model_name, temperature)
381
  self.client = Groq()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
arena/player.py CHANGED
@@ -5,8 +5,15 @@ import random
5
 
6
 
7
  class Player:
8
-
9
- def __init__(self, model, color):
 
 
 
 
 
 
 
10
  self.color = color
11
  self.model = model
12
  self.llm = LLM.create(self.model)
@@ -15,7 +22,10 @@ class Player:
15
  self.opportunities = ""
16
  self.strategy = ""
17
 
18
- def system(self, board, legal_moves, illegal_moves):
 
 
 
19
  return f"""You are playing the board game Connect 4.
20
  Players take turns to drop counters into one of 7 columns A, B, C, D, E, F, G.
21
  The winner is the first player to get 4 counters in a row in any direction.
@@ -33,7 +43,10 @@ You should respond in JSON according to this spec:
33
 
34
  You must pick one of these letters for your move_column: {legal_moves}{illegal_moves}"""
35
 
36
- def user(self, board, legal_moves, illegal_moves):
 
 
 
37
  return f"""It is your turn to make a move as {pieces[self.color]}.
38
  Here is the current board, with row 1 at the bottom of the board:
39
 
@@ -78,8 +91,10 @@ Now make your decision.
78
  You must pick one of these letters for your move_column: {legal_moves}{illegal_moves}
79
  """
80
 
81
- def process_move(self, reply, board):
82
- print(reply)
 
 
83
  try:
84
  if len(reply) == 3 and reply[0] == "{" and reply[2] == "}":
85
  reply = f'{{"move_column": "{reply[1]}"}}'
@@ -100,6 +115,9 @@ You must pick one of these letters for your move_column: {legal_moves}{illegal_m
100
  board.winner = -1 * board.player
101
 
102
  def move(self, board):
 
 
 
103
  legal_moves = ", ".join(board.legal_moves())
104
  if illegal := board.illegal_moves():
105
  illegal_moves = (
@@ -114,6 +132,9 @@ You must pick one of these letters for your move_column: {legal_moves}{illegal_m
114
  self.process_move(reply, board)
115
 
116
  def thoughts(self):
 
 
 
117
  result = '<div style="text-align: left;font-size:14px"><br/>'
118
  result += f"<b>Evaluation:</b><br/>{self.evaluation}<br/><br/>"
119
  result += f"<b>Threats:</b><br/>{self.threats}<br/><br/>"
@@ -122,5 +143,8 @@ You must pick one of these letters for your move_column: {legal_moves}{illegal_m
122
  result += "</div>"
123
  return result
124
 
125
- def switch_model(self, new_model_name):
 
 
 
126
  self.llm = LLM.create(new_model_name)
 
5
 
6
 
7
  class Player:
8
+ """
9
+ This class represents one AI player in the game, and is responsible for managing the prompts
10
+ Delegating to an LLM instance to connect to the LLM
11
+ """
12
+
13
+ def __init__(self, model: str, color: int):
14
+ """
15
+ Set up this instance for the given model and player color
16
+ """
17
  self.color = color
18
  self.model = model
19
  self.llm = LLM.create(self.model)
 
22
  self.opportunities = ""
23
  self.strategy = ""
24
 
25
+ def system(self, board, legal_moves: str, illegal_moves: str) -> str:
26
+ """
27
+ Return the system prompt for this move
28
+ """
29
  return f"""You are playing the board game Connect 4.
30
  Players take turns to drop counters into one of 7 columns A, B, C, D, E, F, G.
31
  The winner is the first player to get 4 counters in a row in any direction.
 
43
 
44
  You must pick one of these letters for your move_column: {legal_moves}{illegal_moves}"""
45
 
46
+ def user(self, board, legal_moves: str, illegal_moves: str) -> str:
47
+ """
48
+ Return the user prompt for this move
49
+ """
50
  return f"""It is your turn to make a move as {pieces[self.color]}.
51
  Here is the current board, with row 1 at the bottom of the board:
52
 
 
91
  You must pick one of these letters for your move_column: {legal_moves}{illegal_moves}
92
  """
93
 
94
+ def process_move(self, reply: str, board):
95
+ """
96
+ Interpret the reply and make the move; if the move is illegal, then the current player loses
97
+ """
98
  try:
99
  if len(reply) == 3 and reply[0] == "{" and reply[2] == "}":
100
  reply = f'{{"move_column": "{reply[1]}"}}'
 
115
  board.winner = -1 * board.player
116
 
117
  def move(self, board):
118
+ """
119
+ Have the underlying LLM make a move, and process the result
120
+ """
121
  legal_moves = ", ".join(board.legal_moves())
122
  if illegal := board.illegal_moves():
123
  illegal_moves = (
 
132
  self.process_move(reply, board)
133
 
134
  def thoughts(self):
135
+ """
136
+ Return HTML to describe the inner thoughts
137
+ """
138
  result = '<div style="text-align: left;font-size:14px"><br/>'
139
  result += f"<b>Evaluation:</b><br/>{self.evaluation}<br/><br/>"
140
  result += f"<b>Threats:</b><br/>{self.threats}<br/><br/>"
 
143
  result += "</div>"
144
  return result
145
 
146
+ def switch_model(self, new_model_name: str):
147
+ """
148
+ Change the underlying LLM to the new model
149
+ """
150
  self.llm = LLM.create(new_model_name)
arena/record.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import math
4
+ from datetime import datetime
5
+ from typing import List, Dict
6
+ from dataclasses import dataclass, asdict
7
+ from pymongo import MongoClient
8
+ from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
9
+
10
+
11
+ @dataclass
12
+ class Result:
13
+ red_player: str
14
+ yellow_player: str
15
+ red_won: bool
16
+ yellow_won: bool
17
+ when: datetime
18
+
19
+
20
+ COLLECTION = "connect"
21
+
22
+
23
+ def _get_collection():
24
+ """Helper function to get MongoDB collection with error handling"""
25
+ try:
26
+ mongo_uri = os.getenv("MONGO_URI")
27
+ if mongo_uri:
28
+ client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
29
+ # Quick check if we can actually connect
30
+ client.admin.command("ismaster")
31
+ db = client.outsmart
32
+ return db[COLLECTION]
33
+ except (ConnectionFailure, ServerSelectionTimeoutError):
34
+ return None
35
+
36
+
37
+ def record_game(result: Result) -> bool:
38
+ """
39
+ Store the results in the database, if database is available.
40
+ Returns True if successful, False if database is unavailable.
41
+ """
42
+ collection = _get_collection()
43
+ if collection is None:
44
+ return False
45
+
46
+ # Convert Result object to dictionary for MongoDB storage
47
+ game_dict = asdict(result)
48
+
49
+ try:
50
+ collection.insert_one(game_dict)
51
+ return True
52
+ except Exception as e:
53
+ logging.error("Failed to record a game in the database")
54
+ logging.exception(e)
55
+ return False
56
+
57
+
58
+ def get_games() -> List[Result]:
59
+ """
60
+ Return all games in the order that they were played.
61
+ Returns empty list if database is unavailable.
62
+ """
63
+ collection = _get_collection()
64
+ if collection is None:
65
+ return []
66
+
67
+ try:
68
+ # Sort by _id to maintain insertion order
69
+ games = collection.find().sort("_id", 1)
70
+
71
+ # Convert MongoDB documents back to Result objects
72
+ results = []
73
+ for game in games:
74
+ # Remove MongoDB's _id field
75
+ game.pop("_id", None)
76
+ results.append(Result(**game))
77
+
78
+ return results
79
+ except Exception as e:
80
+ logging.error("Error getting games")
81
+ logging.exception(e)
82
+ return []
83
+
84
+
85
+ class EloCalculator:
86
+ def __init__(self, k_factor: float = 32, default_rating: int = 1000):
87
+ """
88
+ Initialize the ELO calculator.
89
+
90
+ Args:
91
+ k_factor: Determines how much ratings change after each game
92
+ default_rating: Starting rating for new players
93
+ """
94
+ self.k_factor = k_factor
95
+ self.default_rating = default_rating
96
+ self.ratings: Dict[str, float] = {}
97
+
98
+ def get_player_rating(self, player: str) -> float:
99
+ """Get a player's current rating, or default if they're new."""
100
+ return self.ratings.get(player, self.default_rating)
101
+
102
+ def calculate_expected_score(self, rating_a: float, rating_b: float) -> float:
103
+ """
104
+ Calculate the expected score (win probability) for player A against player B.
105
+ Uses the ELO formula: 1 / (1 + 10^((ratingB - ratingA)/400))
106
+ """
107
+ return 1 / (1 + math.pow(10, (rating_b - rating_a) / 400))
108
+
109
+ def update_ratings(
110
+ self, player_a: str, player_b: str, score_a: float, score_b: float
111
+ ) -> None:
112
+ """
113
+ Update ratings for two players based on their game outcome.
114
+
115
+ Args:
116
+ player_a: Name of first player
117
+ player_b: Name of second player
118
+ score_a: Actual score for player A (1 for win, 0.5 for draw, 0 for loss)
119
+ score_b: Actual score for player B (1 for win, 0.5 for draw, 0 for loss)
120
+ """
121
+ rating_a = self.get_player_rating(player_a)
122
+ rating_b = self.get_player_rating(player_b)
123
+
124
+ expected_a = self.calculate_expected_score(rating_a, rating_b)
125
+ expected_b = 1 - expected_a
126
+
127
+ # Update ratings using the ELO formula: R' = R + K * (S - E)
128
+ # where R is the current rating, K is the k-factor,
129
+ # S is the actual score, and E is the expected score
130
+ new_rating_a = rating_a + self.k_factor * (score_a - expected_a)
131
+ new_rating_b = rating_b + self.k_factor * (score_b - expected_b)
132
+
133
+ self.ratings[player_a] = new_rating_a
134
+ self.ratings[player_b] = new_rating_b
135
+
136
+
137
+ def calculate_elo_ratings(
138
+ results: List[Result], exclude_self_play: bool = True
139
+ ) -> Dict[str, float]:
140
+ """
141
+ Calculate final ELO ratings for all players based on a list of game results.
142
+
143
+ Args:
144
+ results: List of game results, sorted by date
145
+ exclude_self_play: If True, skip games where a player plays against themselves
146
+
147
+ Returns:
148
+ Dictionary mapping player names to their final ELO ratings
149
+ """
150
+ calculator = EloCalculator()
151
+
152
+ for result in results:
153
+ # Skip self-play games if requested
154
+ if exclude_self_play and result.red_player == result.yellow_player:
155
+ continue
156
+
157
+ # Convert game result to ELO scores (1 for win, 0.5 for draw, 0 for loss)
158
+ if result.red_won and not result.yellow_won:
159
+ red_score, yellow_score = 1.0, 0.0
160
+ elif result.yellow_won and not result.red_won:
161
+ red_score, yellow_score = 0.0, 1.0
162
+ else:
163
+ # Draw (including double-win or double-loss cases)
164
+ red_score, yellow_score = 0.5, 0.5
165
+
166
+ calculator.update_ratings(
167
+ result.red_player, result.yellow_player, red_score, yellow_score
168
+ )
169
+
170
+ return calculator.ratings
171
+
172
+
173
+ def ratings() -> Dict[str, float]:
174
+ """
175
+ Return the ELO ratings from all prior games in the DB
176
+ """
177
+ games = get_games()
178
+ return calculate_elo_ratings(games)
connect.png ADDED
prototype.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -7,4 +7,5 @@ gradio
7
  google.generativeai
8
  anthropic
9
  groq
10
- black
 
 
7
  google.generativeai
8
  anthropic
9
  groq
10
+ black
11
+ pymongo