Upload ChessBot Chess model

Browse files

Files changed (5) hide show

__pycache__/modeling_chessbot.cpython-311.pyc +0 -0
config.json +0 -4
model.safetensors +2 -2
modeling_chessbot.py +392 -141
usage_example.py +17 -9

__pycache__/modeling_chessbot.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/modeling_chessbot.cpython-311.pyc and b/__pycache__/modeling_chessbot.cpython-311.pyc differ

config.json CHANGED Viewed

@@ -1,14 +1,10 @@
 {
-  "architectures": [
-    "ChessBotModel"
-  ],
   "d_ff": 736,
   "d_model": 512,
   "max_position_embeddings": 64,
   "model_type": "chessbot",
   "num_heads": 8,
   "num_layers": 10,
-  "torch_dtype": "float32",
   "transformers_version": "4.53.1",
   "vocab_size": 1929
 }

 {
   "d_ff": 736,
   "d_model": 512,
   "max_position_embeddings": 64,
   "model_type": "chessbot",
   "num_heads": 8,
   "num_layers": 10,
   "transformers_version": "4.53.1",
   "vocab_size": 1929
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:18bfb31a333bcc46e2d747315626a030855f913c1e3b129ee08d8d979659fd14
-size 122277600

 version https://git-lfs.github.com/spec/v1
+oid sha256:274e6c174ae963a3ad25960fb50de368c9a8fe937719d6d78d7ab55c262ae2c1
+size 126985096

modeling_chessbot.py CHANGED Viewed

@@ -1,15 +1,23 @@
 """
-Standalone ChessBot Model for HuggingFace Hub
-Contains all necessary code to run the model without external dependencies
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
 from transformers.modeling_outputs import BaseModelOutput
-import chess
-import numpy as np
 from typing import Optional, Tuple
 import math
@@ -32,124 +40,66 @@ class ChessBotConfig(PretrainedConfig):
         max_position_embeddings: int = 64,
         **kwargs,
     ):
         self.num_layers = num_layers
         self.d_model = d_model
         self.d_ff = d_ff
         self.num_heads = num_heads
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
-        super().__init__(**kwargs)
-# Attention modules
-class RelativeMultiHeadAttention2(nn.Module):
     """
-    Relative Multi-Head Attention mechanism
     """
-    def __init__(self, d_model: int = 512, num_heads: int = 16, dropout_p: float = 0.1):
-        super(RelativeMultiHeadAttention2, self).__init__()
-        assert d_model % num_heads == 0
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.d_head = int(d_model / num_heads)
-        self.query_proj = nn.Linear(d_model, d_model)
-        self.key_proj = nn.Linear(d_model, d_model)
-        self.value_proj = nn.Linear(d_model, d_model)
-        self.pos_proj = nn.Linear(d_model, d_model, bias=False)
-        self.dropout = nn.Dropout(p=dropout_p)
-        self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
-        self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
-        torch.nn.init.xavier_uniform_(self.u_bias)
-        torch.nn.init.xavier_uniform_(self.v_bias)
-        self.out_proj = nn.Linear(d_model, d_model)
-    def forward(self, query, key, value, pos_embedding, mask=None):
-        batch_size = value.size(0)
-        query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
-        key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
-        value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
-        pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)
-        query = query.permute(0, 2, 1, 3)
-        query_with_u_bias = query + self.u_bias.unsqueeze(1)
-        query_with_v_bias = query + self.v_bias.unsqueeze(1)
-        content_score = torch.matmul(query_with_u_bias, key.transpose(-1, -2))
-        pos_score = torch.matmul(query_with_v_bias, pos_embedding.permute(0, 2, 3, 1))
-        pos_score = self._compute_relative_positional_encoding(pos_score)
-        score = (content_score + pos_score) / math.sqrt(self.d_head)
-        if mask is not None:
-            score.masked_fill_(mask, -float('inf'))
-        attn = F.softmax(score, -1)
-        attn = self.dropout(attn)
-        context = torch.matmul(attn, value).transpose(1, 2)
-        context = context.contiguous().view(batch_size, -1, self.d_model)
-        return self.out_proj(context)
-    def _compute_relative_positional_encoding(self, pos_score):
-        batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
-        zeros = pos_score.new_zeros(batch_size, num_heads, seq_length1, 1)
-        padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
-        padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
-        pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
-        return pos_score
-# Utility functions
-def fen_to_tensor(fen: str):
-    """Convert FEN string to tensor representation"""
     board = chess.Board(fen)
-    P = 19  # 12 planes for pieces + 1 for side to play + 1 for en passant + 4 for castling + 1 for 50-move rule
-    tensor = np.zeros((8, 8, P), dtype=np.float32)
     piece_map = {
         'P': 0, 'N': 1, 'B': 2, 'R': 3, 'Q': 4, 'K': 5,  # White pieces
         'p': 6, 'n': 7, 'b': 8, 'r': 9, 'q': 10, 'k': 11  # Black pieces
     }
-    # Populate piece planes
-    for square, piece in board.piece_map().items():
-        rank, file = divmod(square, 8)
-        plane = piece_map[piece.symbol()]
-        tensor[7 - rank, file, plane] = 1.0  # Flip rank to align with standard board representation
-    # Side to play plane
-    tensor[:, :, 12] = 1.0 if board.turn == chess.WHITE else 0.0
-    # En passant plane
-    if board.ep_square is not None:
-        rank, file = divmod(board.ep_square, 8)
-        tensor[7 - rank, file, 13] = 1.0
-    # Castling rights planes (4 total: white kingside, white queenside, black kingside, black queenside)
-    tensor[:, :, 14] = 1.0 if board.has_kingside_castling_rights(chess.WHITE) else 0.0
-    tensor[:, :, 15] = 1.0 if board.has_queenside_castling_rights(chess.WHITE) else 0.0
-    tensor[:, :, 16] = 1.0 if board.has_kingside_castling_rights(chess.BLACK) else 0.0
-    tensor[:, :, 17] = 1.0 if board.has_queenside_castling_rights(chess.BLACK) else 0.0
-    # 50-move rule plane (normalized to [0,1])
-    tensor[:, :, 18] = min(board.halfmove_clock / 100.0, 1.0)
     return tensor
-# Policy index (chess moves vocabulary)
 policy_index = [
     "a1b1", "a1c1", "a1d1", "a1e1", "a1f1", "a1g1", "a1h1", "a1a2", "a1b2",
     "a1c2", "a1a3", "a1b3", "a1c3", "a1a4", "a1d4", "a1a5", "a1e5", "a1a6",
@@ -370,6 +320,68 @@ policy_index = [
     "<thinking>","</thinking>","end_variation","end","padding_token"
 ]
 # Model components
 class MaGating(nn.Module):
     def __init__(self, d_model):
@@ -425,48 +437,40 @@ class AbsolutePositionalEncoder(nn.Module):
 class ValueHead(nn.Module):
     def __init__(self, d_model):
         super().__init__()
-        self.linear1 = nn.Linear(d_model, d_model)
-        self.linear2 = nn.Linear(d_model, d_model)
-        self.linear3 = nn.Linear(d_model, 3)
-        self.gelu = nn.GELU()
-        self.layernorm1 = nn.LayerNorm(d_model)
-        self.layernorm2 = nn.LayerNorm(d_model)
     def forward(self, x):
-        x = x.mean(dim=-2)
-        x = self.linear1(x)
-        x = self.gelu(x)
-        x = self.layernorm1(x)
-        x = self.linear2(x)
-        x = self.gelu(x)
-        x = self.layernorm2(x)
-        x = self.linear3(x)
         return x
 class ValueHeadQ(nn.Module):
     def __init__(self, d_model):
         super().__init__()
-        self.linear1 = nn.Linear(d_model, d_model)
-        self.linear2 = nn.Linear(d_model, d_model)
-        self.linear3 = nn.Linear(d_model, 3)
-        self.gelu = nn.GELU()
-        self.layernorm1 = nn.LayerNorm(d_model)
-        self.layernorm2 = nn.LayerNorm(d_model)
     def forward(self, x):
-        x = x.mean(dim=-2)
-        x = self.linear1(x)
-        x = self.gelu(x)
-        x = self.layernorm1(x)
-        x = self.linear2(x)
-        x = self.gelu(x)
-        x = self.layernorm2(x)
-        x = self.linear3(x)
         return x
-# Main model class
 class ChessBotPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
@@ -491,19 +495,19 @@ class ChessBotPreTrainedModel(PreTrainedModel):
 class ChessBotModel(ChessBotPreTrainedModel):
     """
-    HuggingFace compatible ChessBot Chess model
     """
     def __init__(self, config):
         super().__init__(config)
         self.config = config
-        # Initialize the same components as the original BT4 model
         self.is_thinking_model = False
         self.d_model = config.d_model
         self.num_layers = config.num_layers
-        # Model layers
         self.layers = nn.ModuleList([
             EncoderLayer(config.d_model, config.d_ff, config.num_heads)
             for _ in range(config.num_layers)
@@ -523,11 +527,90 @@ class ChessBotModel(ChessBotPreTrainedModel):
         # Initialize weights
         self.post_init()
     def forward(self, input_ids, attention_mask=None, compute_loss=False):
         """
-        Forward pass compatible with Hugging Face interface
         """
-        x = input_ids
         b, seq_len, _, _, emb = x.size()
         x = x.view(b * seq_len, 64, emb)
@@ -537,9 +620,8 @@ class ChessBotModel(ChessBotPreTrainedModel):
         x = self.ma_gating(x)
         pos_enc = self.positional(x)
-        for layer in self.layers:
-            x = layer(x, pos_enc)
         value_h = self.value_head(x)
         value_h = value_h.view(b, seq_len, 3)
@@ -561,12 +643,23 @@ class ChessBotModel(ChessBotPreTrainedModel):
         policy = self.policy_head(policy_attn_logits)
         return BaseModelOutput(
             last_hidden_state=x,
             hidden_states=None,
             attentions=None,
         ), policy, value_h, value_h_q
     def get_move_from_fen_no_thinking(self, fen, T=1, device="cuda", force_legal=True, return_probs=False):
         """
         Get a move from FEN string without thinking
@@ -627,11 +720,169 @@ class ChessBotModel(ChessBotPreTrainedModel):
         return selected_move
 # Register the configuration and model with transformers
 AutoConfig.register("chessbot", ChessBotConfig)
 AutoModel.register(ChessBotConfig, ChessBotModel)
-# For backward compatibility, create aliases
 ChessBot = ChessBotModel
-BT4Model = ChessBotModel  # Keep for backward compatibility

 """
+Standalone ChessBot Chess Model
+This file contains all the necessary code to run the ChessBot model
+without requiring the HFChessRL package installation.
+Requirements:
+- torch>=2.0.0
+- transformers>=4.30.0
+- python-chess>=1.10.0
+- numpy>=1.21.0
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import numpy as np
+import chess
 from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
 from transformers.modeling_outputs import BaseModelOutput
 from typing import Optional, Tuple
 import math
         max_position_embeddings: int = 64,
         **kwargs,
     ):
+        super().__init__(**kwargs)
         self.num_layers = num_layers
         self.d_model = d_model
         self.d_ff = d_ff
         self.num_heads = num_heads
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
+# FEN encoding function
+def fen_to_tensor(fen: str):
     """
+    Convert FEN string to tensor representation for the model.
+    Returns a float32 numpy array of shape (8, 8, 19), indexed [row, col, channel].
+    Rows are flipped relative to python-chess square numbering (row = 7 - square // 8).
+    Channel layout:
+        0-5   white P, N, B, R, Q, K (1.0 on the occupied square)
+        6-11  black p, n, b, r, q, k (1.0 on the occupied square)
+        12    all ones when White is to move
+        13    all ones when Black is to move
+        14-17 all ones for castling rights: white kingside, white queenside,
+              black kingside, black queenside
+        18    1.0 on the en passant square, if there is one
     """
     board = chess.Board(fen)
+    tensor = np.zeros((8, 8, 19), dtype=np.float32)
+    # Piece mapping
     piece_map = {
         'P': 0, 'N': 1, 'B': 2, 'R': 3, 'Q': 4, 'K': 5,  # White pieces
         'p': 6, 'n': 7, 'b': 8, 'r': 9, 'q': 10, 'k': 11  # Black pieces
     }
+    # Fill piece positions
+    for square in chess.SQUARES:
+        piece = board.piece_at(square)
+        if piece:
+            row = 7 - (square // 8)  # Flip vertically for proper orientation
+            col = square % 8
+            tensor[row, col, piece_map[piece.symbol()]] = 1.0
+    # Add metadata channels
+    # Channel 12: White to move
+    if board.turn == chess.WHITE:
+        tensor[:, :, 12] = 1.0
+    # Channel 13: Black to move
+    if board.turn == chess.BLACK:
+        tensor[:, :, 13] = 1.0
+    # Castling rights
+    if board.has_kingside_castling_rights(chess.WHITE):
+        tensor[:, :, 14] = 1.0
+    if board.has_queenside_castling_rights(chess.WHITE):
+        tensor[:, :, 15] = 1.0
+    if board.has_kingside_castling_rights(chess.BLACK):
+        tensor[:, :, 16] = 1.0
+    if board.has_queenside_castling_rights(chess.BLACK):
+        tensor[:, :, 17] = 1.0
+    # En passant
+    if board.ep_square is not None:
+        ep_row = 7 - (board.ep_square // 8)
+        ep_col = board.ep_square % 8
+        tensor[ep_row, ep_col, 18] = 1.0
     return tensor
+# Complete policy index: all 1929 vocabulary entries (UCI moves followed by the special tokens)
 policy_index = [
     "a1b1", "a1c1", "a1d1", "a1e1", "a1f1", "a1g1", "a1h1", "a1a2", "a1b2",
     "a1c2", "a1a3", "a1b3", "a1c3", "a1a4", "a1d4", "a1a5", "a1e5", "a1a6",
     "<thinking>","</thinking>","end_variation","end","padding_token"
 ]
+# Attention mechanism
+class RelativeMultiHeadAttention2(nn.Module):
+    """
+    Multi-head attention whose score adds a relative-position term to the content term.
+    The score is ((query + u_bias) . key) + shifted((query + v_bias) . pos_proj(pos_embedding)),
+    where u_bias and v_bias are learned per-head vectors of size d_head.
+    Args:
+        d_model: Model width; must be divisible by num_heads.
+        num_heads: Number of attention heads.
+        dropout_p: Dropout probability applied to the attention weights.
+    """
+    def __init__(self, d_model: int = 512, num_heads: int = 16, dropout_p: float = 0.1):
+        super().__init__()
+        assert d_model % num_heads == 0
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.d_head = d_model // num_heads
+        # NOTE: scores are scaled by sqrt(d_model), not sqrt(d_head);
+        # presumably this has to match how the checkpoint was trained - confirm before changing.
+        self.sqrt_dim = math.sqrt(d_model)
+        self.query_proj = nn.Linear(d_model, d_model)
+        self.key_proj = nn.Linear(d_model, d_model)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.pos_proj = nn.Linear(d_model, d_model)
+        self.out_proj = nn.Linear(d_model, d_model)
+        # Per-head biases added to the query for the content (u) and position (v) scores;
+        # allocated uninitialized, then filled by Xavier-uniform init just below.
+        self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
+        self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head))
+        torch.nn.init.xavier_uniform_(self.u_bias)
+        torch.nn.init.xavier_uniform_(self.v_bias)
+        self.dropout = nn.Dropout(dropout_p)
+    def forward(self, query, key, value, pos_embedding, mask=None):
+        """
+        Compute attention output of shape (batch, length, d_model).
+        Args:
+            query, key, value: assumed (batch, length, d_model) - TODO confirm with callers.
+            pos_embedding: positional embedding fed through pos_proj; assumed
+                (batch, length, d_model) - TODO confirm with callers.
+            mask: optional boolean mask; a head axis is inserted at dim 1 and
+                positions where it is True are filled with -1e9 before softmax.
+        """
+        batch_size = value.size(0)
+        # query / pos_embedding stay (batch, length, heads, d_head);
+        # key / value are permuted to (batch, heads, length, d_head).
+        query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
+        key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
+        value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
+        pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head)
+        # Both score tensors are (batch, heads, query_length, key_or_position_length).
+        content_score = torch.matmul((query + self.u_bias).transpose(1, 2), key.transpose(2, 3))
+        pos_score = torch.matmul((query + self.v_bias).transpose(1, 2), pos_embedding.permute(0, 2, 3, 1))
+        pos_score = self._compute_relative_positional_encoding(pos_score)
+        score = (content_score + pos_score) / self.sqrt_dim
+        if mask is not None:
+            mask = mask.unsqueeze(1)
+            # In-place fill: masked positions get a large negative score.
+            score.masked_fill_(mask, -1e9)
+        attn = F.softmax(score, -1)
+        attn = self.dropout(attn)
+        # Merge heads back: (batch, heads, length, d_head) -> (batch, length, d_model).
+        context = torch.matmul(attn, value).transpose(1, 2)
+        context = context.contiguous().view(batch_size, -1, self.d_model)
+        return self.out_proj(context)
+    def _compute_relative_positional_encoding(self, pos_score):
+        """
+        Re-align the position scores by padding, reshaping and slicing.
+        A zero column is prepended on the last axis, the result is viewed with the
+        last two sizes swapped (seq_length2 + 1, seq_length1), the first row is
+        dropped, and the remainder is viewed back to the input shape.
+        NOTE(review): this looks like the Transformer-XL relative-shift trick - confirm.
+        """
+        batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
+        zeros = pos_score.new_zeros(batch_size, num_heads, seq_length1, 1)
+        padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
+        padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
+        pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
+        return pos_score
 # Model components
 class MaGating(nn.Module):
     def __init__(self, d_model):
 class ValueHead(nn.Module):
+    """
+    Value head mapping encoder output to 3 logits per position.
+    Each token is projected from d_model to 128 features, all tokens of a sample are
+    flattened together, then reduced 128*64 -> 128 -> 3. Because dense2 expects
+    128*64 inputs, each sample must contain exactly 64 tokens.
+    """
     def __init__(self, d_model):
         super().__init__()
+        self.dense1 = nn.Linear(d_model, 128)
+        self.dense2 = nn.Linear(128*64, 128)
+        self.dense3 = nn.Linear(128, 3)
     def forward(self, x):
+        # x is unpacked as a 3-D tensor; only the leading (batch) size is used.
+        b, _, _ = x.size()
+        x = self.dense1(x)
+        x = F.gelu(x)
+        # Flatten the token and feature axes into one vector per sample.
+        x = x.view(b, -1)
+        x = self.dense2(x)
+        x = F.gelu(x)
+        # Raw logits; no softmax is applied here.
+        x = self.dense3(x)
         return x
 class ValueHeadQ(nn.Module):
+    """
+    Q value head with the same layer structure as ValueHead: d_model -> 128 per token,
+    flatten, 128*64 -> 128 -> 3 logits.
+    Because dense2 expects 128*64 inputs, each sample must contain exactly 64 tokens.
+    """
     def __init__(self, d_model):
         super().__init__()
+        self.dense1 = nn.Linear(d_model, 128)
+        self.dense2 = nn.Linear(128*64, 128)
+        self.dense3 = nn.Linear(128, 3)
     def forward(self, x):
+        # x is unpacked as a 3-D tensor; only the leading (batch) size is used.
+        b, _, _ = x.size()
+        x = self.dense1(x)
+        x = F.gelu(x)
+        # Flatten the token and feature axes into one vector per sample.
+        x = x.view(b, -1)
+        x = self.dense2(x)
+        x = F.gelu(x)
+        # Raw logits; callers apply softmax where probabilities are needed.
+        x = self.dense3(x)
         return x
+# Main HuggingFace compatible model class
 class ChessBotPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
 class ChessBotModel(ChessBotPreTrainedModel):
     """
+    HuggingFace compatible ChessBot Chess model with ALL original functionality
     """
     def __init__(self, config):
         super().__init__(config)
         self.config = config
+        # Initialize exactly like the original BT4 model
         self.is_thinking_model = False
         self.d_model = config.d_model
         self.num_layers = config.num_layers
+        # Model layers - same as original
         self.layers = nn.ModuleList([
             EncoderLayer(config.d_model, config.d_ff, config.num_heads)
             for _ in range(config.num_layers)
         # Initialize weights
         self.post_init()
+    @classmethod
+    def from_pretrained(cls, model_path, **kwargs):
+        """
+        Load a pretrained model from a local directory (HuggingFace compatible).
+        Args:
+            model_path: Directory holding an optional config.json and either
+                pytorch_model.bin or model.safetensors.
+            **kwargs: Accepted for call compatibility; not used by this loader.
+        Returns:
+            The model with weights loaded on CPU, switched to evaluation mode.
+        Raises:
+            FileNotFoundError: If neither weight file exists in model_path.
+            ImportError: If the weights are .safetensors and safetensors is not installed.
+        """
+        import os
+        import warnings
+        # Load config, falling back to the default configuration when config.json is absent
+        if os.path.exists(os.path.join(model_path, "config.json")):
+            config = ChessBotConfig.from_pretrained(model_path)
+        else:
+            config = ChessBotConfig()
+        # Create model instance
+        model = cls(config)
+        # Locate the weights; pytorch_model.bin takes precedence when both files exist
+        candidates = (os.path.join(model_path, name) for name in ("pytorch_model.bin", "model.safetensors"))
+        model_file = next((path for path in candidates if os.path.exists(path)), None)
+        if model_file is None:
+            raise FileNotFoundError(f"No model file found in {model_path}")
+        if model_file.endswith('.safetensors'):
+            # Keep the try body to the import so only a missing package is reported this way
+            try:
+                from safetensors import safe_open
+            except ImportError as err:
+                raise ImportError("safetensors library is required to load .safetensors files. Install with: pip install safetensors") from err
+            with safe_open(model_file, framework="pt", device="cpu") as f:
+                state_dict = {key: f.get_tensor(key) for key in f.keys()}
+        else:
+            # SECURITY: .bin checkpoints are pickle files and can execute code when loaded.
+            # weights_only=True restricts unpickling to tensors and plain containers,
+            # which is all a state dict needs.
+            state_dict = torch.load(model_file, map_location="cpu", weights_only=True)
+        # strict=False tolerates key mismatches, so report them instead of hiding them
+        load_result = model.load_state_dict(state_dict, strict=False)
+        if load_result.missing_keys or load_result.unexpected_keys:
+            warnings.warn(
+                f"Checkpoint {model_file} does not match the model exactly: "
+                f"missing keys {list(load_result.missing_keys)}, "
+                f"unexpected keys {list(load_result.unexpected_keys)}"
+            )
+        # Match transformers' from_pretrained: return the model in eval mode so dropout
+        # is disabled for inference; call model.train() before fine-tuning.
+        model.eval()
+        return model
+    def save_pretrained(self, save_directory, safe_serialization=False):
+        """
+        Write the config and the model weights into save_directory.
+        Args:
+            save_directory: Target directory; created if it does not exist.
+            safe_serialization: When True, write model.safetensors; if safetensors
+                cannot be imported, a warning is printed and pytorch_model.bin is
+                written instead. When False, write pytorch_model.bin.
+        """
+        import os
+        os.makedirs(save_directory, exist_ok=True)
+        # Config goes first, next to the weights
+        self.config.save_pretrained(save_directory)
+        bin_path = os.path.join(save_directory, "pytorch_model.bin")
+        if not safe_serialization:
+            torch.save(self.state_dict(), bin_path)
+            return
+        try:
+            from safetensors.torch import save_file
+            save_file(self.state_dict(), os.path.join(save_directory, "model.safetensors"))
+        except ImportError:
+            print("⚠ Warning: safetensors not available, falling back to pytorch_model.bin")
+            torch.save(self.state_dict(), bin_path)
     def forward(self, input_ids, attention_mask=None, compute_loss=False):
         """
+        Forward pass compatible with both HuggingFace interface and original interface
         """
+        # Handle both HF interface (input_ids) and original interface (tuple)
+        if isinstance(input_ids, tuple):
+            inp = input_ids
+            x = inp[0]
+            compute_loss = compute_loss or len(inp) > 1
+        else:
+            x = input_ids
+            inp = (x,)
         b, seq_len, _, _, emb = x.size()
         x = x.view(b * seq_len, 64, emb)
         x = self.ma_gating(x)
         pos_enc = self.positional(x)
+        for i in range(self.num_layers):
+            x = self.layers[i](x, pos_enc)
         value_h = self.value_head(x)
         value_h = value_h.view(b, seq_len, 3)
         policy = self.policy_head(policy_attn_logits)
+        if compute_loss:
+            targets = inp[1]
+            true_values = inp[3]
+            q_values = inp[4]
+            loss_policy = F.cross_entropy(policy.view(-1, policy.size(-1)), targets.view(-1), ignore_index=1928)
+            z = torch.argmax(true_values, dim=-1)
+            loss_value = F.cross_entropy(value_h.view(-1, value_h.size(-1)), z.view(-1), ignore_index=3)
+            value_h_q = torch.softmax(value_h_q, dim=-1)
+            loss_q = F.mse_loss(value_h_q.view(-1, value_h_q.size(-1)), q_values.view(-1, 3))
+            return policy, value_h, loss_policy, loss_value, loss_q, targets, z
         return BaseModelOutput(
             last_hidden_state=x,
             hidden_states=None,
             attentions=None,
         ), policy, value_h, value_h_q
     def get_move_from_fen_no_thinking(self, fen, T=1, device="cuda", force_legal=True, return_probs=False):
         """
         Get a move from FEN string without thinking
         return selected_move
+    def get_position_value(self, fen, device="cuda"):
+        """
+        Evaluate a single FEN position with the Q value head.
+        Args:
+            fen: FEN string of the position
+            device: Device on which the input tensor is placed
+        Returns:
+            Softmax over the value_head_q logits with singleton dimensions removed,
+            documented as [black_win_prob, draw_prob, white_win_prob]
+        """
+        board_planes = torch.from_numpy(fen_to_tensor(fen)).to(device).to(torch.float32)
+        board_planes = board_planes.view(1, 1, 8, 8, 19)
+        with torch.no_grad():
+            # Same trunk as the main forward pass: embed the 64 squares, gate,
+            # run the encoder layers, then read the Q value head
+            b, seq_len, _, _, emb = board_planes.size()
+            hidden = self.linear1(board_planes.view(b * seq_len, 64, emb))
+            hidden = self.layernorm1(F.gelu(hidden))
+            hidden = self.ma_gating(hidden)
+            pos_enc = self.positional(hidden)
+            for layer_idx in range(self.num_layers):
+                hidden = self.layers[layer_idx](hidden, pos_enc)
+            q_logits = self.value_head_q(hidden).view(b, seq_len, 3)
+            q_probs = torch.softmax(q_logits, dim=-1)
+        # Drop the batch and sequence dimensions
+        return q_probs.squeeze()
+    def get_batch_position_values(self, fens, device="cuda"):
+        """
+        Evaluate several FEN positions in one pass through the Q value head.
+        Args:
+            fens: List of FEN strings
+            device: Device on which the input tensors are placed
+        Returns:
+            Tensor of shape [len(fens), 3]; each row is a softmax over the value_head_q
+            logits, documented as [black_win_prob, draw_prob, white_win_prob].
+            An empty [0, 3] tensor is returned when fens is empty.
+        """
+        if len(fens) == 0:
+            return torch.empty(0, 3, device=device)
+        # One [8, 8, 19] float32 tensor per position
+        boards = [
+            torch.from_numpy(fen_to_tensor(fen)).to(device).to(torch.float32)
+            for fen in fens
+        ]
+        # [batch_size, 8, 8, 19] -> [batch_size, 1, 8, 8, 19]
+        batch_x = torch.stack(boards, dim=0).unsqueeze(1)
+        with torch.no_grad():
+            b, seq_len, _, _, emb = batch_x.size()
+            hidden = self.linear1(batch_x.view(b * seq_len, 64, emb))
+            hidden = self.layernorm1(F.gelu(hidden))
+            hidden = self.ma_gating(hidden)
+            pos_enc = self.positional(hidden)
+            for layer_idx in range(self.num_layers):
+                hidden = self.layers[layer_idx](hidden, pos_enc)
+            q_logits = self.value_head_q(hidden).view(b, seq_len, 3)
+            q_probs = torch.softmax(q_logits, dim=-1)
+        # Remove the sequence dimension, keep the batch dimension
+        return q_probs.squeeze(1)
+    def calculate_move_values(self, fen, device="cuda"):
+        """
+        Score every legal move by evaluating the position it leads to, in one batch.
+        The score is taken from the mover's point of view:
+        white_win_prob - black_win_prob when White is to move, the negation otherwise.
+        Args:
+            fen: FEN string of the position to move from
+            device: Device passed on to get_batch_position_values
+        Returns:
+            (legal_moves, player_values): the list of chess.Move objects and a 1-D
+            tensor with one score per move, in the same order. With no legal moves
+            this is ([], empty tensor).
+        """
+        board = chess.Board()
+        board.set_fen(fen)
+        white_to_move = board.turn == chess.WHITE
+        legal_moves = list(board.legal_moves)
+        if len(legal_moves) == 0:
+            return [], torch.empty(0, device=device)
+        # Play each move, record the resulting FEN, then take the move back
+        successor_fens = []
+        for candidate in legal_moves:
+            board.push(candidate)
+            successor_fens.append(board.fen())
+            board.pop()
+        # Single batched inference over all successor positions
+        successor_probs = self.get_batch_position_values(successor_fens, device)
+        # Column 0 = black_win_prob, column 1 = draw_prob, column 2 = white_win_prob
+        white_minus_black = successor_probs[:,2]-successor_probs[:,0]
+        # Flip the sign when Black is the side choosing the move
+        player_values = white_minus_black if white_to_move else -white_minus_black
+        return legal_moves, player_values
+    def get_best_move_value(self, fen, T=1, device="cuda", return_probs=False):
+        """
+        Pick a move by comparing the values of the positions each legal move leads to.
+        Args:
+            fen: FEN string of the position (either side to move)
+            T: Sampling temperature; T == 0 picks the highest-valued move, otherwise a
+                move is sampled from softmax(move_values / T)
+            device: Device to run computations on
+            return_probs: Whether to also return the distribution over moves
+        Returns:
+            move: UCI string of the selected move
+            probs (only if return_probs=True): dict mapping each legal move's UCI string
+                to its probability (one-hot on the chosen move when T == 0)
+        Raises:
+            ValueError: If the position has no legal moves
+        """
+        legal_moves, move_values = self.calculate_move_values(fen, device)
+        if len(legal_moves) == 0:
+            raise ValueError("No legal moves available")
+        greedy = T == 0
+        if greedy:
+            # Highest-valued move wins outright
+            best_idx = torch.argmax(move_values)
+            selected_move = legal_moves[best_idx]
+        else:
+            # Temperature-scaled softmax over the move values, then one sample
+            sampling_probs = F.softmax(move_values / T, dim=0)
+            sampled_idx = torch.multinomial(sampling_probs, num_samples=1)
+            selected_move = legal_moves[sampled_idx.item()]
+        move_uci = selected_move.uci()
+        if not return_probs:
+            return move_uci
+        if greedy:
+            # One-hot distribution on the chosen move
+            probs = torch.zeros_like(move_values)
+            probs[best_idx] = 1.0
+        else:
+            probs = F.softmax(move_values / T, dim=0)
+        # Key the distribution by UCI string, in legal-move order
+        move_dict = {move.uci(): probs[i].item() for i, move in enumerate(legal_moves)}
+        return move_uci, move_dict
 # Register the configuration and model with transformers
 AutoConfig.register("chessbot", ChessBotConfig)
 AutoModel.register(ChessBotConfig, ChessBotModel)
+# For backward compatibility
 ChessBot = ChessBotModel
+BT4Model = ChessBotModel

usage_example.py CHANGED Viewed

@@ -10,25 +10,33 @@ This model can be used without installing any external packages except:
 import torch
 import sys
-sys.path.append("./")  # Add the model directory to path
 from modeling_chessbot import ChessBotModel, ChessBotConfig
 # Load the model
 config = ChessBotConfig()
-model = ChessBotModel.from_pretrained("./")
-# Alternative: You can also try AutoModel (may require additional setup)
-# from transformers import AutoModel
-# model = AutoModel.from_pretrained("./", trust_remote_code=True)
 # Example usage
 fen = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
-# Get the best move
-move = model.get_move_from_fen_no_thinking(fen, T=0.1, device=device)
-print(f"Best move: {move}")
 # Get move probabilities
 probs = model.get_move_from_fen_no_thinking(fen, T=0.1, device=device, return_probs=True)

 import torch
 import sys
+import os
+# Get the directory of this script (the model directory)
+model_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(model_dir)  # Add the model directory to path
 from modeling_chessbot import ChessBotModel, ChessBotConfig
 # Load the model
 config = ChessBotConfig()
+model = ChessBotModel.from_pretrained(model_dir)
 # Example usage
 fen = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = model.to(device)
+# Get the best move using policy
+policy_move = model.get_move_from_fen_no_thinking(fen, T=0.1, device=device)
+print(f"Policy-based move: {policy_move}")
+# Get the best move using value analysis
+value_move = model.get_best_move_value(fen, T=0.1, device=device)
+print(f"Value-based move: {value_move}")
+# Get position evaluation
+position_value = model.get_position_value(fen, device=device)
+print(f"Position value [black_win, draw, white_win]: {position_value}")
 # Get move probabilities
 probs = model.get_move_from_fen_no_thinking(fen, T=0.1, device=device, return_probs=True)