Upload 8 files
- core/config.py +44 -0
- core/embedding.py +32 -0
- core/mamba.py +81 -0
- core/mamba_swarm_integration.py +323 -0
- core/model.py +106 -0
- core/preprocess.py +54 -0
- core/stateSpace.py +71 -0
- core/tokenizer.py +63 -0
core/config.py
ADDED
@@ -0,0 +1,44 @@
# =============================================================================
# core/config.py
# =============================================================================
import torch
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class MambaConfig:
    # Model architecture
    vocab_size: int = 50257
    d_model: int = 1024
    n_layers: int = 12
    d_inner: int = 2048
    d_state: int = 16
    d_conv: int = 4
    dt_rank: Optional[int] = None
    bias: bool = False
    conv_bias: bool = True

    # Training
    max_seq_len: int = 2048
    batch_size: int = 8
    learning_rate: float = 1e-4
    weight_decay: float = 0.1
    warmup_steps: int = 1000
    max_steps: int = 100000

    # Swarm specific
    num_specialists: int = 100
    specialist_domains: Optional[List[str]] = None
    shared_embedding: bool = True
    hierarchical_sharing: bool = True

    # Hardware
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    dtype: torch.dtype = torch.float16

    def __post_init__(self):
        if self.dt_rank is None:
            self.dt_rank = max(16, self.d_model // 16)
        if self.specialist_domains is None:
            self.specialist_domains = [f"domain_{i}" for i in range(self.num_specialists)]
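A minimal usage sketch for the config above (values are illustrative, not from the upload): constructing MambaConfig fills in dt_rank and specialist_domains via __post_init__.

# Usage sketch, assuming core/config.py is importable as core.config
from core.config import MambaConfig

config = MambaConfig(d_model=512, n_layers=8)
print(config.dt_rank)                  # max(16, 512 // 16) == 32
print(len(config.specialist_domains))  # 100 placeholder domain names
print(config.device, config.dtype)     # "cuda" or "cpu", torch.float16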
core/embedding.py
ADDED
@@ -0,0 +1,32 @@
# =============================================================================
# core/embedding.py
# =============================================================================
import torch
import torch.nn as nn
import math
from core.config import MambaConfig

class MambaEmbedding(nn.Module):
    def __init__(self, config: MambaConfig):
        super().__init__()
        self.config = config

        # Token embeddings (no positional encoding needed for Mamba)
        self.token_embedding = nn.Embedding(
            config.vocab_size,
            config.d_model,
            dtype=config.dtype
        )

        # Initialize embeddings
        nn.init.normal_(self.token_embedding.weight, std=0.02)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input_ids: [batch_size, seq_len]
        Returns:
            embeddings: [batch_size, seq_len, d_model]
        """
        embeddings = self.token_embedding(input_ids)
        return embeddings
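A short usage sketch for MambaEmbedding; the float32 dtype override and the tensor shapes below are illustrative choices to keep the example CPU-friendly, not defaults from the upload.

# Usage sketch for MambaEmbedding
import torch
from core.config import MambaConfig
from core.embedding import MambaEmbedding

config = MambaConfig(d_model=256, dtype=torch.float32)
embedding = MambaEmbedding(config)
input_ids = torch.randint(0, config.vocab_size, (2, 16))
print(embedding(input_ids).shape)  # torch.Size([2, 16, 256])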
core/mamba.py
ADDED
@@ -0,0 +1,81 @@
# =============================================================================
# core/mamba.py
# =============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from core.stateSpace import StateSpaceModel
from utils.conv_layer import Mamba1DConv

class RMSNorm(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        norm = x.norm(dim=-1, keepdim=True) * (x.shape[-1] ** -0.5)
        return x / (norm + self.eps) * self.weight

class MambaBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        # Projections
        self.in_proj = nn.Linear(config.d_model, config.d_inner * 2, bias=config.bias)
        self.out_proj = nn.Linear(config.d_inner, config.d_model, bias=config.bias)

        # Convolution for local context
        self.conv1d = Mamba1DConv(config.d_inner, config.d_conv, config.conv_bias)

        # State space model
        self.ssm = StateSpaceModel(
            d_inner=config.d_inner,
            d_state=config.d_state,
            dt_rank=config.dt_rank,
            bias=config.bias
        )

        # Activation
        self.act = F.silu

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, seq_len, d_model]
        Returns:
            output: [batch, seq_len, d_model]
        """
        batch_size, seq_len, d_model = x.shape

        # Input projection
        xz = self.in_proj(x)  # [batch, seq_len, 2*d_inner]
        x, z = xz.chunk(2, dim=-1)  # Each [batch, seq_len, d_inner]

        # Apply convolution
        x = self.act(self.conv1d(x))

        # Apply state space model
        y = self.ssm(x)

        # Apply gating with z
        y = y * self.act(z)

        # Output projection
        output = self.out_proj(y)

        return output

class MambaLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.norm = RMSNorm(config.d_model)
        self.mamba_block = MambaBlock(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm architecture
        residual = x
        x = self.norm(x)
        x = self.mamba_block(x)
        return x + residual
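Mamba1DConv is imported from utils/conv_layer.py, which is not part of this upload. As a point of reference only, a depthwise causal 1D convolution over the sequence dimension would look roughly like the sketch below; the class name and signature are assumptions inferred from the call site above, not the actual utils implementation.

import torch
import torch.nn as nn

class DepthwiseCausalConv1d(nn.Module):
    # Hypothetical stand-in for utils.conv_layer.Mamba1DConv
    def __init__(self, d_inner: int, d_conv: int, bias: bool = True):
        super().__init__()
        self.conv = nn.Conv1d(
            d_inner, d_inner,
            kernel_size=d_conv,
            groups=d_inner,       # depthwise: one filter per channel
            padding=d_conv - 1,   # left padding supplies causal context
            bias=bias,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [batch, seq_len, d_inner]; convolve along the sequence dimension
        seq_len = x.shape[1]
        x = x.transpose(1, 2)             # [batch, d_inner, seq_len]
        x = self.conv(x)[:, :, :seq_len]  # trim the right overhang to stay causal
        return x.transpose(1, 2)          # back to [batch, seq_len, d_inner]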
core/mamba_swarm_integration.py
ADDED
@@ -0,0 +1,323 @@
#!/usr/bin/env python3
"""
Mamba Encoder Swarm - Integration with Existing Mamba Implementation
Uses your existing Mamba components as building blocks for the swarm architecture
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Optional, Tuple

# Import your existing Mamba components
from core.config import MambaConfig
from core.model import MambaModel
from core.mamba import MambaLayer, RMSNorm
from core.embedding import MambaEmbedding

class SwarmRouter(nn.Module):
    """
    Routes input tokens to different encoder instances
    This is the NEW component that enables the swarm architecture
    """

    def __init__(self, d_model: int, num_encoders: int, routing_strategy: str = "learned"):
        super().__init__()
        self.d_model = d_model
        self.num_encoders = num_encoders
        self.routing_strategy = routing_strategy

        if routing_strategy == "learned":
            # Neural router that learns optimal token distribution
            self.router_network = nn.Sequential(
                nn.Linear(d_model, d_model // 2),
                nn.SiLU(),
                nn.Linear(d_model // 2, num_encoders),
                nn.Softmax(dim=-1)
            )

        # Load balancing coefficient
        self.load_balance_coef = 0.01

    def forward(self, x: torch.Tensor) -> Tuple[List[torch.Tensor], torch.Tensor, torch.Tensor]:
        """
        Route tokens to encoder instances

        Args:
            x: [batch, seq_len, d_model]

        Returns:
            encoder_inputs: List of inputs for each encoder
            routing_weights: Weights for aggregation [batch, seq_len, num_encoders]
            load_balance_loss: Loss term for training
        """
        batch_size, seq_len, d_model = x.shape

        if self.routing_strategy == "learned":
            # Learn routing patterns
            routing_logits = self.router_network(x)  # [batch, seq_len, num_encoders]
            routing_weights = F.gumbel_softmax(routing_logits, tau=1.0, hard=False)

            # Load balancing loss to encourage equal usage
            avg_routing = routing_weights.mean(dim=[0, 1])
            load_balance_loss = self.load_balance_coef * torch.var(avg_routing)

        else:  # Round-robin for simplicity
            seq_indices = torch.arange(seq_len, device=x.device)
            encoder_ids = seq_indices % self.num_encoders
            routing_weights = F.one_hot(encoder_ids, self.num_encoders).float()
            routing_weights = routing_weights.unsqueeze(0).expand(batch_size, -1, -1)
            load_balance_loss = torch.tensor(0.0, device=x.device)

        # Create weighted inputs for each encoder
        encoder_inputs = []
        for i in range(self.num_encoders):
            weight = routing_weights[:, :, i:i+1]  # [batch, seq_len, 1]
            encoder_input = x * weight
            encoder_inputs.append(encoder_input)

        return encoder_inputs, routing_weights, load_balance_loss

class SwarmAggregator(nn.Module):
    """
    Aggregates outputs from all encoder instances
    This is the NEW component that combines swarm outputs
    """

    def __init__(self, d_model: int, num_encoders: int):
        super().__init__()
        self.d_model = d_model
        self.num_encoders = num_encoders

        # Attention-based aggregation
        self.attention = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=8,
            batch_first=True
        )

        # Output processing
        self.norm = RMSNorm(d_model)
        self.output_proj = nn.Linear(d_model, d_model)

    def forward(self, encoder_outputs: List[torch.Tensor], routing_weights: torch.Tensor) -> torch.Tensor:
        """
        Aggregate encoder outputs using learned attention

        Args:
            encoder_outputs: List of [batch, seq_len, d_model] tensors
            routing_weights: [batch, seq_len, num_encoders]

        Returns:
            aggregated: [batch, seq_len, d_model]
        """
        batch_size, seq_len, d_model = encoder_outputs[0].shape

        # Stack and weight encoder outputs
        stacked = torch.stack(encoder_outputs, dim=2)  # [batch, seq_len, num_encoders, d_model]
        routing_expanded = routing_weights.unsqueeze(-1)  # [batch, seq_len, num_encoders, 1]
        weighted = stacked * routing_expanded

        # Initial aggregation
        initial = weighted.sum(dim=2)  # [batch, seq_len, d_model]

        # Attention-based refinement
        encoder_sequence = stacked.view(batch_size, seq_len * self.num_encoders, d_model)
        refined, _ = self.attention(initial, encoder_sequence, encoder_sequence)

        # Final processing
        output = self.output_proj(refined)
        output = self.norm(output + initial)  # Residual connection

        return output

class MambaEncoderSwarmModel(nn.Module):
    """
    Complete Swarm Model using your existing Mamba components

    Architecture:
    1. Use your MambaEmbedding for input processing
    2. NEW: Router distributes tokens to encoder swarm
    3. Use your MambaLayer instances as shared encoders
    4. NEW: Aggregator combines encoder outputs
    5. Use your MambaLayer instances for decoder
    6. Use your existing LM head for output
    """

    def __init__(self, config: MambaConfig, num_encoders: int = 8, routing_strategy: str = "learned"):
        super().__init__()
        self.config = config
        self.num_encoders = num_encoders

        # Use your existing embedding
        self.embedding = MambaEmbedding(config)

        # NEW: Swarm components
        self.router = SwarmRouter(config.d_model, num_encoders, routing_strategy)

        # Shared encoder (using your MambaLayer)
        # All encoder instances will use this same layer (weight sharing!)
        self.shared_encoder_layer = MambaLayer(config)

        # NEW: Aggregator
        self.aggregator = SwarmAggregator(config.d_model, num_encoders)

        # Decoder layers (using your MambaLayer)
        self.decoder_layers = nn.ModuleList([
            MambaLayer(config) for _ in range(config.n_layers)
        ])

        # Use your existing components
        self.norm_f = RMSNorm(config.d_model)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids: torch.Tensor, targets: torch.Tensor = None):
        """
        Forward pass through swarm architecture

        Args:
            input_ids: [batch, seq_len]
            targets: [batch, seq_len] (optional, for training)

        Returns:
            if targets is None: logits [batch, seq_len, vocab_size]
            else: (logits, loss, load_balance_loss)
        """
        # 1. Embedding (using your existing component)
        x = self.embedding(input_ids)  # [batch, seq_len, d_model]

        # 2. Route to encoder swarm
        encoder_inputs, routing_weights, load_balance_loss = self.router(x)

        # 3. Process through shared encoder instances
        encoder_outputs = []
        for encoder_input in encoder_inputs:
            # Each instance uses the SAME shared_encoder_layer (weight sharing!)
            encoder_output = self.shared_encoder_layer(encoder_input)
            encoder_outputs.append(encoder_output)

        # 4. Aggregate encoder outputs
        x = self.aggregator(encoder_outputs, routing_weights)

        # 5. Process through decoder (using your existing layers)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x)

        # 6. Final processing (using your existing components)
        x = self.norm_f(x)
        logits = self.lm_head(x)  # [batch, seq_len, vocab_size]

        if targets is not None:
            # Compute loss
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1),
                ignore_index=-100
            )
            return logits, loss, load_balance_loss

        return logits

    def generate(self, input_ids: torch.Tensor, max_new_tokens: int = 100,
                 temperature: float = 1.0, top_k: int = None):
        """Generate using swarm architecture"""
        self.eval()

        for _ in range(max_new_tokens):
            with torch.no_grad():
                logits = self.forward(input_ids)
                logits = logits[:, -1, :] / temperature

                if top_k is not None:
                    v, _ = torch.topk(logits, top_k)
                    logits[logits < v[:, [-1]]] = -float('Inf')

                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=1)

        return input_ids

    def get_num_params(self):
        """Get number of parameters"""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

def create_swarm_from_existing_config(config: MambaConfig, num_encoders: int = 8) -> MambaEncoderSwarmModel:
    """
    Create swarm model using your existing configuration
    """
    swarm_model = MambaEncoderSwarmModel(config, num_encoders, routing_strategy="learned")

    num_params = swarm_model.get_num_params()
    print(f"🚀 Swarm model created with {num_params:,} parameters ({num_params/1e6:.1f}M)")
    print(f"📊 Using {num_encoders} encoder instances with shared weights")

    return swarm_model

def compare_architectures(config: MambaConfig):
    """
    Compare standard Mamba vs Swarm architecture
    """
    print("🔍 Architecture Comparison")
    print("=" * 50)

    # Standard model (your existing)
    standard_model = MambaModel(config)
    standard_params = standard_model.get_num_params()

    # Swarm model (new architecture)
    swarm_model = create_swarm_from_existing_config(config, num_encoders=8)
    swarm_params = swarm_model.get_num_params()

    print(f"📈 Standard Mamba: {standard_params:,} parameters ({standard_params/1e6:.1f}M)")
    print(f"🔥 Swarm Mamba: {swarm_params:,} parameters ({swarm_params/1e6:.1f}M)")
    print(f"💡 Parameter overhead: {((swarm_params - standard_params) / standard_params * 100):.1f}%")

    return standard_model, swarm_model

if __name__ == "__main__":
    # Test with your existing config
    from core.config import MambaConfig

    # Create a test config
    config = MambaConfig(
        vocab_size=50257,
        d_model=512,
        n_layers=8,
        d_state=16,
        d_conv=4,
        bias=False
    )

    print("🧪 Testing Swarm Integration")
    print("=" * 40)

    # Compare architectures
    standard_model, swarm_model = compare_architectures(config)

    # Test forward pass
    batch_size, seq_len = 2, 32
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))

    # Test standard model
    with torch.no_grad():
        standard_logits = standard_model(input_ids)
    print(f"✅ Standard model output: {standard_logits.shape}")

    # Test swarm model
    with torch.no_grad():
        swarm_logits = swarm_model(input_ids)
    print(f"✅ Swarm model output: {swarm_logits.shape}")

    print(f"\n🎉 Both architectures working! Ready to train the swarm.")
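Because the swarm forward pass returns (logits, loss, load_balance_loss) when targets are supplied, a single training step might combine the two loss terms as sketched below. This is only a sketch: it assumes the utils modules referenced above (conv_layer, selective_scan) are available, the float32 override and the random batch are placeholders, and in practice targets would be the input shifted by one token.

# Training-step sketch for the swarm model
import torch
from core.config import MambaConfig
from core.mamba_swarm_integration import create_swarm_from_existing_config

config = MambaConfig(d_model=256, n_layers=4, dtype=torch.float32)
model = create_swarm_from_existing_config(config, num_encoders=4)
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate,
                              weight_decay=config.weight_decay)

input_ids = torch.randint(0, config.vocab_size, (2, 32))  # placeholder batch
targets = input_ids.clone()                               # use shifted next-token targets in practice

logits, lm_loss, lb_loss = model(input_ids, targets)
total_loss = lm_loss + lb_loss   # load-balance term is already scaled by load_balance_coef
total_loss.backward()
optimizer.step()
optimizer.zero_grad()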
core/model.py
ADDED
@@ -0,0 +1,106 @@
# =============================================================================
# core/model.py
# =============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from core.config import MambaConfig
from core.embedding import MambaEmbedding
from core.mamba import MambaLayer, RMSNorm

class MambaModel(nn.Module):
    def __init__(self, config: MambaConfig):
        super().__init__()
        self.config = config

        # Embeddings
        self.embedding = MambaEmbedding(config)

        # Mamba layers
        self.layers = nn.ModuleList([
            MambaLayer(config) for _ in range(config.n_layers)
        ])

        # Final normalization
        self.norm_f = RMSNorm(config.d_model)

        # Language modeling head
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Tie weights with embedding if specified
        if hasattr(config, 'tie_word_embeddings') and config.tie_word_embeddings:
            self.lm_head.weight = self.embedding.token_embedding.weight

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids: torch.Tensor, targets: torch.Tensor = None):
        """
        Args:
            input_ids: [batch, seq_len]
            targets: [batch, seq_len] (optional, for training)
        Returns:
            if targets is None: logits [batch, seq_len, vocab_size]
            else: (logits, loss)
        """
        # Get embeddings
        x = self.embedding(input_ids)  # [batch, seq_len, d_model]

        # Apply Mamba layers
        for layer in self.layers:
            x = layer(x)

        # Final normalization
        x = self.norm_f(x)

        # Language modeling head
        logits = self.lm_head(x)  # [batch, seq_len, vocab_size]

        if targets is not None:
            # Compute loss
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1),
                ignore_index=-100
            )
            return logits, loss

        return logits

    def generate(self, input_ids: torch.Tensor, max_new_tokens: int = 100,
                 temperature: float = 1.0, top_k: int = None):
        """Generate text autoregressively"""
        self.eval()

        for _ in range(max_new_tokens):
            with torch.no_grad():
                # Get logits for last token
                logits = self.forward(input_ids)
                logits = logits[:, -1, :] / temperature

                # Apply top-k filtering
                if top_k is not None:
                    v, _ = torch.topk(logits, top_k)
                    logits[logits < v[:, [-1]]] = -float('Inf')

                # Sample next token
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

                # Append to sequence
                input_ids = torch.cat([input_ids, next_token], dim=1)

        return input_ids

    def get_num_params(self):
        """Get number of parameters"""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
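A sampling sketch with the standard MambaModel above; it assumes the utils modules referenced earlier are present, and the config values and random prompt are illustrative.

# Usage sketch: top-k sampling with MambaModel.generate
import torch
from core.config import MambaConfig
from core.model import MambaModel

config = MambaConfig(d_model=256, n_layers=4, dtype=torch.float32)
model = MambaModel(config)

prompt_ids = torch.randint(0, config.vocab_size, (1, 8))  # stands in for tokenized text
generated = model.generate(prompt_ids, max_new_tokens=16, temperature=0.8, top_k=50)
print(generated.shape)  # torch.Size([1, 24]) -> prompt plus 16 new tokens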
core/preprocess.py
ADDED
@@ -0,0 +1,54 @@
# =============================================================================
# core/preprocess.py
# =============================================================================
import re
import unicodedata
from core.config import MambaConfig
from typing import List, Dict, Any

class TextPreprocessor:
    def __init__(self, config: MambaConfig):
        self.config = config
        self.max_length = config.max_seq_len

    def clean_text(self, text: str) -> str:
        """Basic text cleaning"""
        # Normalize unicode
        text = unicodedata.normalize('NFKC', text)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove control characters except newlines and tabs
        text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)

        return text.strip()

    def chunk_text(self, text: str, chunk_size: int = None) -> List[str]:
        """Split text into chunks for distributed processing"""
        if chunk_size is None:
            chunk_size = self.max_length // 2

        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            if current_length + len(word) + 1 > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = len(word)
            else:
                current_chunk.append(word)
                current_length += len(word) + 1

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        """Preprocess a batch of texts"""
        return [self.clean_text(text) for text in texts]
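Note that chunk_text budgets by characters, not tokens; a quick sketch of its behavior (the input string and chunk size are arbitrary examples):

# Usage sketch for TextPreprocessor.chunk_text
from core.config import MambaConfig
from core.preprocess import TextPreprocessor

pre = TextPreprocessor(MambaConfig())
text = "word " * 3000                    # arbitrary long input
chunks = pre.chunk_text(text, chunk_size=200)
print(len(chunks), max(len(c) for c in chunks))  # each chunk stays near 200 characters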
core/stateSpace.py
ADDED
@@ -0,0 +1,71 @@
# =============================================================================
# core/stateSpace.py
# =============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from utils.selective_scan import selective_scan_fn

class StateSpaceModel(nn.Module):
    def __init__(self, d_inner: int, d_state: int = 16, dt_rank: int = None, bias: bool = False):
        super().__init__()
        self.d_inner = d_inner
        self.d_state = d_state
        self.dt_rank = dt_rank if dt_rank is not None else max(16, d_inner // 16)

        # State space parameters
        self.A_log = nn.Parameter(torch.randn(d_inner, d_state))
        self.D = nn.Parameter(torch.ones(d_inner))

        # Projection layers
        self.x_proj = nn.Linear(d_inner, self.dt_rank + d_state * 2, bias=False)
        self.dt_proj = nn.Linear(self.dt_rank, d_inner, bias=True)

        # Initialize parameters
        self._init_parameters()

    def _init_parameters(self):
        # Initialize A with negative values for stability
        nn.init.uniform_(self.A_log, -4.0, -1.0)

        # Initialize dt_proj bias to encourage large dt values
        dt_init_std = self.dt_rank**-0.5
        with torch.no_grad():
            self.dt_proj.bias.uniform_(-dt_init_std, dt_init_std)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, seq_len, d_inner]
        Returns:
            y: [batch, seq_len, d_inner]
        """
        batch_size, seq_len, d_inner = x.shape

        # Project x to get delta, B, C
        x_dbl = self.x_proj(x)  # [batch, seq_len, dt_rank + 2*d_state]

        delta, B, C = torch.split(
            x_dbl,
            [self.dt_rank, self.d_state, self.d_state],
            dim=-1
        )

        # Project delta to d_inner
        delta = self.dt_proj(delta)  # [batch, seq_len, d_inner]

        # Get A matrix (ensure it's negative for stability)
        A = -torch.exp(self.A_log)  # [d_inner, d_state]

        # Apply selective scan
        y = selective_scan_fn(
            u=x,
            delta=delta,
            A=A,
            B=B,
            C=C,
            D=self.D,
            delta_softplus=True
        )

        return y
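utils/selective_scan.py is not included in this upload. For orientation, a naive reference implementation of the recurrence implied by the call site above is sketched here; it uses a Python loop over the sequence for clarity and is not the optimized kernel, and the function name and argument shapes are assumptions drawn from StateSpaceModel.forward.

import torch
import torch.nn.functional as F

def naive_selective_scan(u, delta, A, B, C, D, delta_softplus=True):
    # Shapes follow the call in StateSpaceModel.forward:
    # u, delta: [b, l, d]   A: [d, n]   B, C: [b, l, n]   D: [d]
    if delta_softplus:
        delta = F.softplus(delta)
    b, l, d = u.shape
    n = A.shape[-1]
    h = u.new_zeros(b, d, n)  # hidden state per channel
    ys = []
    for t in range(l):
        dA = torch.exp(delta[:, t, :, None] * A)                           # [b, d, n] discretized A
        dBu = delta[:, t, :, None] * B[:, t, None, :] * u[:, t, :, None]   # [b, d, n] discretized B*u
        h = dA * h + dBu
        ys.append((h * C[:, t, None, :]).sum(-1))                          # [b, d] readout
    return torch.stack(ys, dim=1) + u * D                                  # [b, l, d] plus skip term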
core/tokenizer.py
ADDED
@@ -0,0 +1,63 @@
# =============================================================================
# core/tokenizer.py
# =============================================================================
from transformers import AutoTokenizer
import torch
from core.config import MambaConfig
from typing import List, Dict, Union

class MambaTokenizer:
    def __init__(self, config: MambaConfig, tokenizer_name: str = "gpt2"):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Add special tokens if needed
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.vocab_size = len(self.tokenizer)

    def encode(self, text: str, max_length: int = None) -> Dict[str, torch.Tensor]:
        """Encode text to token ids"""
        if max_length is None:
            max_length = self.config.max_seq_len

        encoded = self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"]
        }

    def encode_batch(self, texts: List[str], max_length: int = None) -> Dict[str, torch.Tensor]:
        """Encode batch of texts"""
        if max_length is None:
            max_length = self.config.max_seq_len

        encoded = self.tokenizer(
            texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"]
        }

    def decode(self, token_ids: torch.Tensor, skip_special_tokens: bool = True) -> str:
        """Decode token ids to text"""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def decode_batch(self, token_ids: torch.Tensor, skip_special_tokens: bool = True) -> List[str]:
        """Decode batch of token ids"""
        return self.tokenizer.batch_decode(token_ids, skip_special_tokens=skip_special_tokens)
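A round-trip sketch for MambaTokenizer; it downloads the GPT-2 vocabulary on first use, and the example strings and max_length are arbitrary.

# Usage sketch: encode and decode with MambaTokenizer
from core.config import MambaConfig
from core.tokenizer import MambaTokenizer

tokenizer = MambaTokenizer(MambaConfig())
batch = tokenizer.encode_batch(["Hello world", "State space models are fast."], max_length=16)
print(batch["input_ids"].shape)                 # torch.Size([2, 16]), padded with the EOS token
print(tokenizer.decode(batch["input_ids"][0]))  # "Hello world" (padding stripped)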