piyushgrover committed
Commit 189668a · verified · 1 Parent(s): 8d780f0

Upload model.py

Files changed (1)
  1. model.py +221 -178
model.py CHANGED
@@ -1,202 +1,245 @@
-# Solving for residual std scaling issue
-import os
-import math
-import time
-import inspect
-from dataclasses import dataclass
 import torch
 import torch.nn as nn
-from torch.nn import functional as F
-
-
-class CausalSelfAttention(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        assert config.n_embd % config.n_head == 0
-        # key, query, value projections for all heads, but in a batch
-        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
-        # output projection
-        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
-        self.c_proj.NANGPT_SCALE_INIT = 1
-        # regularization
-        self.n_head = config.n_head
-        self.n_embd = config.n_embd
-        self.register_buffer("bias",
-                             torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size,
-                                                                                               config.block_size))
-
-    def forward(self, x):
-        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
-        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
-        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
-        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
-        qkv = self.c_attn(x)
-        q, k, v = qkv.split(self.n_embd, dim=2)
-        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-
-        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
-        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
-        att = F.softmax(att, dim=-1)
-        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
-
-        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
         # output projection
-        y = self.c_proj(y)
         return y
-
-
-class MLP(nn.Module):
-
-    def __init__(self, config):
         super().__init__()
-        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
-        self.gelu = nn.GELU(approximate='tanh')
-        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
-        self.c_proj.NANOGPT_SCALE_INIT = 1
-
     def forward(self, x):
-        x = self.c_fc(x)
-        x = self.gelu(x)
-        x = self.c_proj(x)
-        return x
-
-
-class Block(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        self.ln_1 = nn.LayerNorm(config.n_embd)
-        self.attn = CausalSelfAttention(config)
-        self.ln_2 = nn.LayerNorm(config.n_embd)
-        self.mlp = MLP(config)
-
-    def forward(self, x):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-@dataclass
-class GPTConfig:
-    block_size: int = 1024 # max sequence length
-    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
-    n_layer: int = 12 # number of layers
-    n_head: int = 8 # number of heads
-    n_embd: int = 768 # embedding dimension
-
-
-class GPT(nn.Module):
-
     def __init__(self, config):
-        super().__init__()
-        self.config = config
-
-        self.transformer = nn.ModuleDict(dict(
-            wte=nn.Embedding(config.vocab_size, config.n_embd),
-            wpe=nn.Embedding(config.block_size, config.n_embd),
-            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
-            ln_f=nn.LayerNorm(config.n_embd),
-        ))
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-
-        # weight sharing
-        self.transformer.wte.weight = self.lm_head.weight
-
-        # weight initialization
-        self.apply(self._init_weights)
-
-    def _init_weights(self, module):
-        if isinstance(module, nn.Linear):
-            std = 0.02
-            if hasattr(module, 'NANGPT_SCALE_INIT'):
-                std *= (2 * self.config.n_layer) ** -0.5
-            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-    def forward(self, idx, targets=None):
-        # idx is of shape (B, T)
-        B, T = idx.size()
-        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
-        # forward the token and posisition embeddings
-        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
-        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
-        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
-        x = tok_emb + pos_emb
-        # forward the blocks of the transformer
-        for block in self.transformer.h:
-            x = block(x)
-        # forward the final layernorm and the classifier
-        x = self.transformer.ln_f(x)
-        logits = self.lm_head(x) # (B, T, vocab_size)
-        loss = None
-        if targets is not None:
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
-        return logits, loss
-
-    @classmethod
-    def from_pretrained(cls, model_type):
-        """Loads pretrained GPT-2 model weights from huggingface"""
-        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-        from transformers import GPT2LMHeadModel
-        print("loading weights from pretrained gpt: %s" % model_type)
-
-        # n_layer, n_head and n_embd are determined from model_type
-        config_args = {
-            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
-            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-        }[model_type]
-        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-        # create a from-scratch initialized minGPT model
-        config = GPTConfig(**config_args)
-        model = GPT(config)
-        sd = model.state_dict()
-        sd_keys = sd.keys()
-        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
-
-        # init a huggingface/transformers model
-        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
-        sd_hf = model_hf.state_dict()
-
-        # copy while ensuring all of the parameters are aligned and match in names and shapes
-        sd_keys_hf = sd_hf.keys()
-        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
-        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
-        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
-        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
-        # this means that we have to transpose these weights when we import them
-        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
-        for k in sd_keys_hf:
-            if any(k.endswith(w) for w in transposed):
-                # special treatment for the Conv1D weights we need to transpose
-                assert sd_hf[k].shape[::-1] == sd[k].shape
-                with torch.no_grad():
-                    sd[k].copy_(sd_hf[k].t())
-            else:
-                # vanilla copy over the other parameters
-                assert sd_hf[k].shape == sd[k].shape
-                with torch.no_grad():
-                    sd[k].copy_(sd_hf[k])
-
-        return model
-
-    def generate(self, input_tensor, max_length, EOS_TOKEN_ID=50256):
-        output_ids = input_tensor # Start with input
-        self.eval()
-        for _ in range(max_length - input_tensor.size(1)):
-            logits = self(input_tensor) # Forward pass
-            if isinstance(logits, tuple):
-                logits = logits[0]
-            next_token = torch.argmax(logits[:, -1, :], dim=-1) # Get the next token
-            input_tensor = torch.cat([input_tensor, next_token.unsqueeze(0)], dim=1)
-            if next_token.item() == EOS_TOKEN_ID: # Stop if end-of-sequence token is generated
-                break
-        return input_tensor
+# model.py
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
+from transformers.models.llama.modeling_llama import (
+    LlamaRotaryEmbedding,
+    LlamaRMSNorm,
+)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+class CausalAttention(nn.Module):
+    def __init__(self, hidden_size, num_attention_heads, num_key_value_heads):
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = hidden_size // num_attention_heads
+
+        self.num_key_value_groups = num_attention_heads // num_key_value_heads
+        self.scaling = self.head_dim ** -0.5
+        #self.attention_dropout = attention_dropout
+        self.is_causal = True
+
+        # Query, Key, Value projections
+        self.q_proj = nn.Linear(hidden_size, self.head_dim * num_attention_heads, bias=False)
+        self.k_proj = nn.Linear(hidden_size, self.head_dim * num_key_value_heads, bias=False)
+        self.v_proj = nn.Linear(hidden_size, self.head_dim * num_key_value_heads, bias=False)
+        self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False)
+
+    def forward(self, hidden_states, attention_mask=None, position_embeddings=None):
+        batch, seq_len = hidden_states.shape[:-1]
+        hidden_shape = (batch, seq_len, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        y = F.scaled_dot_product_attention(query_states,
+                                           key_states,
+                                           value_states,
+                                           is_causal=True,
+                                           enable_gqa=True)  # Flash attention
+
+        y = y.transpose(1, 2).contiguous().view(batch, seq_len, self.hidden_size)  # re-assemble all head outputs side by side
         # output projection
+        y = self.o_proj(y)
         return y
+
+
+class MLP(nn.Module):  ###Inspired from LLamaMLP
+    def __init__(self, hidden_size, num_attention_heads, num_key_value_heads, intermediate_size, eps, activation_fn):
         super().__init__()
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = activation_fn
+
     def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, hidden_size, num_attention_heads, num_key_value_heads, intermediate_size, eps, activation_fn):
+        super(TransformerBlock, self).__init__()
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = hidden_size // num_attention_heads
+        assert self.head_dim * num_attention_heads == hidden_size, "Hidden size must be divisible by the number of attention heads."
+        assert self.hidden_size % self.num_key_value_heads == 0, "hidden_size must be divisible by num_key_value_heads"
+
+        self.layer_norm_1 = LlamaRMSNorm(self.hidden_size, eps=eps)
+
+        self.attn = CausalAttention(hidden_size, num_attention_heads, num_key_value_heads)
+
+        # Feedforward layer
+        self.feed_forward = MLP(hidden_size, num_attention_heads, num_key_value_heads, intermediate_size, eps, activation_fn)
+        self.layer_norm_2 = LlamaRMSNorm(self.hidden_size, eps=eps)
+
+    def forward(self, hidden_states, attention_mask=None, position_embeddings=None):
+        # Layer normalization
+        residual = hidden_states
+        hidden_states = self.layer_norm_1(hidden_states)
+
+        '''
+        # Query projection
+        query = self.query_proj(hidden_states)
+        query = query.view(hidden_states.size(0), hidden_states.size(1), self.num_attention_heads,
+                           self.head_dim).transpose(1, 2)
+
+        # Key and Value projections with shared num_key_value_heads
+        key = self.key_proj(hidden_states)
+        value = self.value_proj(hidden_states)
+
+        key = key.view(hidden_states.size(0), hidden_states.size(1), self.num_key_value_heads,
+                       self.head_dim).transpose(1, 2)
+        value = value.view(hidden_states.size(0), hidden_states.size(1), self.num_key_value_heads,
+                           self.head_dim).transpose(1, 2)

+        # Expand keys and values to match num_attention_heads
+        key = key.repeat_interleave(self.num_attention_heads // self.num_key_value_heads, dim=1)
+        value = value.repeat_interleave(self.num_attention_heads // self.num_key_value_heads, dim=1)
+
+        # Apply rotary embeddings to query and key
+        cos, sin = position_embeddings
+        query, key = apply_rotary_pos_emb(query, key, cos, sin)
+
+        # Scaled dot-product attention
+        attention_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, is_causal=True)
+
+        # Reshape back to [batch_size, seq_length, hidden_size]
+        attention_output = attention_output.transpose(1, 2).contiguous().view(hidden_states.size(0), -1,
+                                                                              self.hidden_size)
+
+        # Output projection
+        attention_output = self.out_proj(attention_output)
+        '''
+        attention_output = self.attn(hidden_states, position_embeddings=position_embeddings)
+
+        # Residual connection
+        hidden_states = residual + attention_output
+
+        # Feedforward layer
+        residual = hidden_states
+
+        # Feed-forward
+        hidden_states = self.layer_norm_2(hidden_states)
+        feed_forward_output = self.feed_forward(hidden_states)
+
+        hidden_states = residual + feed_forward_output
+
+        return hidden_states
+
+
+class SmollM(nn.Module):
     def __init__(self, config):
+        super(SmollM, self).__init__()
+        self.vocab_size = config['vocab_size']
+        self.hidden_size = config['hidden_size']
+        self.num_hidden_layers = config['num_hidden_layers']
+        self.num_attention_heads = config['num_attention_heads']
+        self.num_key_value_heads = config['num_key_value_heads']
+        self.max_position_embeddings = config['max_position_embeddings']
+        self.intermediate_size = config['intermediate_size']
+        self.initializer_range = config['initializer_range']
+        self.eps = config['rms_norm_eps']
+
+        self.head_dim = self.hidden_size // self.num_attention_heads
+
+        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
+        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim)
+
+        self.layers = nn.ModuleList([
+            TransformerBlock(
+                hidden_size=self.hidden_size,
+                num_attention_heads=self.num_attention_heads,
+                num_key_value_heads=self.num_key_value_heads,
+                intermediate_size=self.intermediate_size,
+                eps=self.eps,
+                activation_fn=F.silu  # Activation function specified in config
+            ) for _ in range(self.num_hidden_layers)
+        ])
+
+        self.layer_norm = LlamaRMSNorm(self.hidden_size, eps=self.eps)
+
+        # Language modeling head
+        self.lm_head = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
+
+        # Share weights between embedding and lm_head
+        self.lm_head.weight = self.embedding.weight
+
+        self._init_weights()
+
+    def forward(self, input_ids, attention_mask=None):
+        batch_size, seq_length = input_ids.size()
+        position_ids = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
+
+        embeddings = self.embedding(input_ids)
+
+        hidden_states = embeddings
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings)
+
+        hidden_states = self.layer_norm(hidden_states)
+        logits = self.lm_head(hidden_states)
+        return logits
+
+    def _init_weights(self):
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.normal_(module.weight, mean=0.0, std=self.initializer_range)
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+            elif isinstance(module, nn.Embedding):
+                nn.init.normal_(module.weight, mean=0.0, std=self.initializer_range)
+            elif isinstance(module, nn.LayerNorm):
+                nn.init.constant_(module.bias, 0)
+                nn.init.constant_(module.weight, 1.0)
+
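
For reference, a minimal usage sketch of the uploaded SmollM module follows. The config keys are exactly the ones SmollM.__init__ reads; the values below are illustrative placeholders (roughly SmolLM-135M-sized) and are not taken from this repository. The sketch also assumes a PyTorch build whose F.scaled_dot_product_attention supports enable_gqa (2.5+) and a transformers version whose LlamaRotaryEmbedding still accepts a head dimension directly, as model.py above expects.

# Hypothetical usage sketch -- config values are illustrative, not from this repo.
import torch

from model import SmollM  # assumes the file above is saved as model.py

config = {
    "vocab_size": 49152,             # set to match your tokenizer
    "hidden_size": 576,
    "num_hidden_layers": 30,
    "num_attention_heads": 9,
    "num_key_value_heads": 3,        # GQA: 3 KV heads shared across 9 query heads
    "max_position_embeddings": 2048,
    "intermediate_size": 1536,
    "initializer_range": 0.02,
    "rms_norm_eps": 1e-5,
}

model = SmollM(config)
model.eval()

input_ids = torch.randint(0, config["vocab_size"], (1, 16))  # (batch, seq_len)

with torch.no_grad():
    logits = model(input_ids)  # (1, 16, vocab_size)

# Greedy continuation, mirroring the argmax loop of the removed GPT.generate
for _ in range(8):
    with torch.no_grad():
        next_id = model(input_ids)[:, -1, :].argmax(dim=-1, keepdim=True)
    input_ids = torch.cat([input_ids, next_id], dim=1)

print(logits.shape, input_ids.shape)

Note that lm_head and embedding share one weight tensor (weight tying), so the greedy loop above runs against randomly initialized parameters unless a trained state dict is loaded first.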