space update
- .gitignore +1 -0
- app.py +132 -32
- requirements.txt +2 -1
- v1/__init__.py +0 -0
- v1/tokenizer.json +66 -0
- v1/usta_causal_attention.py +33 -0
- v1/usta_decoder_block.py +31 -0
- v1/usta_embedding.py +49 -0
- v1/usta_layer_norm.py +18 -0
- v1/usta_mlp.py +36 -0
- v1/usta_model.py +52 -0
- v1/usta_multi_head_attention.py +22 -0
- v1/usta_multi_head_attention_old.py +26 -0
- v1/usta_self_attention.py +22 -0
- v1/usta_tokenizer.py +48 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
app.py
CHANGED
@@ -1,11 +1,79 @@
+import os
+
 import gradio as gr
-
+import torch
+
+from v1.usta_model import UstaModel
+from v1.usta_tokenizer import UstaTokenizer
 
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
+# Load the model and tokenizer
+def load_model():
+    try:
+        u_tokenizer = UstaTokenizer("v1/tokenizer.json")
+
+        # Model parameters - adjust these to match your trained model
+        context_length = 32
+        vocab_size = len(u_tokenizer.vocab)
+        embedding_dim = 12
+        num_heads = 4
+        num_layers = 8
+
+        # Load the model
+        u_model = UstaModel(
+            vocab_size=vocab_size,
+            embedding_dim=embedding_dim,
+            num_heads=num_heads,
+            context_length=context_length,
+            num_layers=num_layers
+        )
+
+        # Load the trained weights if available
+        model_path = "v1/u_model.pth"
+
+        if not os.path.exists(model_path):
+            # Download the model file from GitHub
+            try:
+                print("📥 Downloading model weights from GitHub...")
+                import requests
+                url = "https://github.com/malibayram/llm-from-scratch/raw/main/u_model.pth"
+                response = requests.get(url)
+                response.raise_for_status()  # Raise an exception for bad status codes
+
+                # Create v1 directory if it doesn't exist
+                os.makedirs("v1", exist_ok=True)
+
+                with open(model_path, "wb") as f:
+                    f.write(response.content)
+                print("✅ Model weights downloaded successfully!")
+            except Exception as e:
+                print(f"❌ Failed to download model weights: {e}")
+                print("Using random initialization.")
+
+        if os.path.exists(model_path):
+            try:
+                u_model.load_state_dict(torch.load(model_path, map_location="cpu"))
+                u_model.eval()
+                print("✅ Model weights loaded successfully!")
+            except Exception as e:
+                print(f"⚠️ Warning: Could not load trained weights: {e}")
+                print("Using random initialization.")
+        else:
+            print(f"⚠️ Model file not found at {model_path}. Using random initialization.")
+
+        return u_model, u_tokenizer
+
+    except Exception as e:
+        print(f"❌ Error loading model: {e}")
+        raise e
+
+# Initialize model and tokenizer globally
+try:
+    model, tokenizer = load_model()
+    print("🚀 UstaModel and tokenizer initialized successfully!")
+except Exception as e:
+    print(f"❌ Failed to initialize model: {e}")
+    model, tokenizer = None, None
 
 def respond(
     message,
@@ -15,30 +83,49 @@ def respond(
     temperature,
     top_p,
 ):
- … (old lines 18-39 removed; their content is not shown in this view)
+    """
+    Generate a response using the UstaModel
+    """
+    if model is None or tokenizer is None:
+        yield "Sorry, the UstaModel is not available. Please try again later."
+        return
+
+    try:
+        # For UstaModel, we'll use the message directly (ignoring system_message for now)
+        # since it's a simpler model focused on geographical knowledge
+
+        # Encode the input message
+        tokens = tokenizer.encode(message)
+
+        # Make sure we don't exceed context length
+        if len(tokens) > 25:  # Leave some room for generation
+            tokens = tokens[-25:]
+
+        # Generate response
+        with torch.no_grad():
+            # Use max_tokens parameter, but cap it at a reasonable limit for this model
+            actual_max_tokens = min(max_tokens, 32 - len(tokens))
+            generated_tokens = model.generate(tokens, actual_max_tokens)
+
+        # Decode the generated tokens
+        response = tokenizer.decode(generated_tokens)
+
+        # Clean up the response (remove the original input)
+        original_text = tokenizer.decode(tokens.tolist())
+        if response.startswith(original_text):
+            response = response[len(original_text):]
+
+        # Clean up any unwanted tokens
+        response = response.replace("<unk>", "").replace("<pad>", "").strip()
+
+        if not response:
+            response = "I'm not sure how to respond to that with my geographical knowledge."
+
+        # Yield the response (to maintain compatibility with streaming interface)
         yield response
-
+
+    except Exception as e:
+        yield f"Sorry, I encountered an error: {str(e)}"
 
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -46,19 +133,32 @@ For information on how to customize the ChatInterface, peruse the gradio docs: h
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
-        gr.Textbox(
-
-
+        gr.Textbox(
+            value="You are Usta, a geographical knowledge assistant trained from scratch.",
+            label="System message",
+            info="Note: This model focuses on geographical knowledge (countries, capitals, cities)"
+        ),
+        gr.Slider(minimum=1, maximum=30, value=20, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=2.0, value=1.0, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
            value=0.95,
             step=0.05,
             label="Top-p (nucleus sampling)",
+            info="Note: This parameter is not used by UstaModel but kept for interface compatibility"
         ),
     ],
+    title="🤖 Usta Model Chat",
+    description="Chat with a custom transformer language model built from scratch! This model specializes in geographical knowledge including countries, capitals, and cities.",
+    examples=[
+        "the capital of france",
+        "tell me about spain",
+        "what is the capital of united states",
+        "paris is in",
+        "germany and its capital"
+    ]
 )
 
-
 if __name__ == "__main__":
     demo.launch()
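
A quick way to exercise the new respond() generator outside of Gradio is a sketch like the one below. The parameter names between message and temperature are collapsed in this diff view; the stock ChatInterface template order (history, system_message, max_tokens) is assumed here, so treat the call signature as hypothetical. Importing app also runs the global load_model(), which may attempt the GitHub download; without v1/u_model.pth the model keeps random weights, so this only checks the wiring, not output quality.

# hypothetical smoke test; assumes respond(message, history, system_message, max_tokens, temperature, top_p)
from app import respond

for chunk in respond("the capital of france", [], "You are Usta.", 20, 1.0, 0.95):
    print(chunk)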
requirements.txt
CHANGED
@@ -1 +1,2 @@
-
+torch>=2.7.1
+requests>=2.32.4
v1/__init__.py
ADDED
File without changes
v1/tokenizer.json
ADDED
@@ -0,0 +1,66 @@
+{
+  "the": 0,
+  "capital": 1,
+  "of": 2,
+  "united": 3,
+  "state": 4,
+  "is": 5,
+  "not": 6,
+  "london": 7,
+  "france": 8,
+  "paris": 9,
+  "and": 10,
+  "berlin": 11,
+  "germany": 12,
+  "rome": 13,
+  "in": 14,
+  "italy": 15,
+  "madrid": 16,
+  "spain": 17,
+  "lisbon": 18,
+  "portugal": 19,
+  "kingdom": 20,
+  "washington": 21,
+  "although": 22,
+  "these": 23,
+  "place": 24,
+  "are": 25,
+  "often": 26,
+  "mention": 27,
+  "together": 28,
+  "each": 29,
+  "country": 30,
+  "has": 31,
+  "its": 32,
+  "own": 33,
+  "identity": 34,
+  "any": 35,
+  "european": 36,
+  "city": 37,
+  "remain": 38,
+  "important": 39,
+  "with": 40,
+  "a": 41,
+  "rich": 42,
+  "history": 43,
+  "culture": 44,
+  "europe": 45,
+  "made": 46,
+  "many": 47,
+  "unique": 48,
+  "world": 49,
+  "while": 50,
+  "known": 51,
+  "for": 52,
+  "art": 53,
+  "fashion": 54,
+  "famous": 55,
+  "they": 56,
+  "ed": 57,
+  "s": 58,
+  ".": 59,
+  ",": 60,
+  " ": 61,
+  "<unk>": 62,
+  "<pad>": 63
+}
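
Since app.py derives vocab_size from this file, a small sanity check (a sketch, run from the Space root) can confirm the ids are dense and the special tokens are present:

import json

with open("v1/tokenizer.json") as f:
    vocab = json.load(f)

# 64 entries with ids 0..63 and no gaps, including the tokens the app strips out
assert sorted(vocab.values()) == list(range(len(vocab)))
assert "<unk>" in vocab and "<pad>" in vocab and " " in vocab
print(len(vocab), "tokens")  # 64 tokens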
v1/usta_causal_attention.py
ADDED
@@ -0,0 +1,33 @@
+import torch
+import torch.nn as nn
+
+
+class UstaCausalAttention(nn.Module):
+    def __init__(self, embedding_dim, output_dim, context_length, dropout_rate=0):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+
+        self.q_weights = nn.Linear(embedding_dim, output_dim, bias=False)
+        self.k_weights = nn.Linear(embedding_dim, output_dim, bias=False)
+        self.v_weights = nn.Linear(embedding_dim, output_dim, bias=False)
+        self.dropout = nn.Dropout(dropout_rate)
+        self.register_buffer("mask", torch.tril(torch.ones(context_length, context_length)))
+        self.context_length = context_length
+
+    def forward(self, x):
+        number_of_tokens = x.shape[0]
+        # truncate the input to the model's context length
+        x = x[:self.context_length]
+        q = self.q_weights(x)
+        k = self.k_weights(x)
+        v = self.v_weights(x)
+
+        attention_scores = q @ k.T
+        attention_scores = attention_scores.masked_fill_(
+            self.mask.bool()[:number_of_tokens, :number_of_tokens] == 0, -torch.inf
+        )
+        attention_scores = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
+        attention_scores = self.dropout(attention_scores)
+
+        return attention_scores @ v
+
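
The module operates on one unbatched sequence of shape (tokens, embedding_dim): scores are q @ k.T, positions above the diagonal are filled with -inf before the scaled softmax, so each token attends only to itself and earlier tokens. A minimal usage sketch:

import torch
from v1.usta_causal_attention import UstaCausalAttention

torch.manual_seed(0)
attn = UstaCausalAttention(embedding_dim=12, output_dim=12, context_length=32)
x = torch.randn(5, 12)   # 5 tokens, no batch dimension
print(attn(x).shape)     # torch.Size([5, 12])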
v1/usta_decoder_block.py
ADDED
@@ -0,0 +1,31 @@
+import torch.nn as nn
+
+from .usta_layer_norm import UstaLayerNorm
+from .usta_mlp import UstaMLP
+from .usta_multi_head_attention import UstaMultiHeadAttention
+
+
+class UstaDecoderBlock(nn.Module):
+    def __init__(self, embedding_dim, num_heads, context_length):
+        super().__init__()
+
+        self.self_attention = UstaMultiHeadAttention(embedding_dim, embedding_dim, context_length, num_heads, dropout_rate=0.5)
+        self.norm1 = UstaLayerNorm(embedding_dim)
+        self.mlp = UstaMLP(embedding_dim, embedding_dim)
+        self.norm2 = UstaLayerNorm(embedding_dim)
+
+    def forward(self, x):
+        res = self.norm1(x)
+
+        x = self.self_attention(x)
+        x = self.norm1(x)
+
+        x = x + res
+
+        res = self.norm2(x)
+        x = self.mlp(x)
+        x = self.norm2(x)
+
+        x = x + res
+
+        return x
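
Note the block applies each norm both to the residual branch and to the sublayer output, a variant of the usual pre-norm arrangement. A minimal shape-check sketch; eval() disables the 0.5 attention dropout so the pass is deterministic:

import torch
from v1.usta_decoder_block import UstaDecoderBlock

torch.manual_seed(0)
block = UstaDecoderBlock(embedding_dim=12, num_heads=4, context_length=32)
block.eval()
x = torch.randn(5, 12)
print(block(x).shape)  # torch.Size([5, 12])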
v1/usta_embedding.py
ADDED
@@ -0,0 +1,49 @@
+import torch
+import torch.nn as nn
+
+
+def get_rotary_position_encoding(input: torch.Tensor, base=10000, device="cpu"):
+    context_length, dimension = input.shape
+
+    assert dimension % 2 == 0, "dimension must be even"
+
+    half_dimension = dimension // 2
+
+    freqs_indices = torch.arange(0, half_dimension, device=device, dtype=torch.float32)
+
+    freqs = 1.0 / (base ** (freqs_indices / dimension))
+
+    positions = torch.arange(0, context_length, device=device, dtype=torch.float32).unsqueeze(1)
+
+    angles = positions * freqs
+
+    sin_angles = torch.sin(angles)
+    cos_angles = torch.cos(angles)
+
+    input_even = input[:, :dimension // 2]  # first half of the features
+    input_odd = input[:, dimension // 2:]   # second half of the features
+
+    input_even_rotated = input_even * cos_angles - input_odd * sin_angles
+    input_odd_rotated = input_even * sin_angles + input_odd * cos_angles
+
+    input_rotated = torch.empty_like(input)
+
+    input_rotated[:, :dimension // 2] = input_even_rotated
+    input_rotated[:, dimension // 2:] = input_odd_rotated
+
+    return input_rotated
+
+
+class UstaEmbedding(nn.Module):
+    def __init__(self, vocab_size, embedding_dim, context_length):
+        super().__init__()
+        # a learned position embedding is kept below for reference only;
+        # it is not used in the forward pass (rotary encoding is used instead)
+        # self.pos_embedding = nn.Embedding(context_length, embedding_dim)
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.get_pos = get_rotary_position_encoding
+
+    def forward(self, x):
+        x = self.embedding(x)
+        x = self.get_pos(x)
+        return x
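
get_rotary_position_encoding pairs feature i with feature i + dimension/2 and rotates each pair by a position-dependent angle. Two properties of a rotation are easy to verify: each position's norm is preserved, and position 0 (angle 0) is left unchanged. A short sketch:

import torch
from v1.usta_embedding import get_rotary_position_encoding

torch.manual_seed(0)
x = torch.randn(4, 12)  # (context_length, dimension)
rotated = get_rotary_position_encoding(x)

print(torch.allclose(rotated.norm(dim=-1), x.norm(dim=-1), atol=1e-5))  # True
print(torch.allclose(rotated[0], x[0]))  # True: position 0 gets angle 0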
v1/usta_layer_norm.py
ADDED
@@ -0,0 +1,18 @@
+import torch
+import torch.nn as nn
+
+
+class UstaLayerNorm(nn.Module):
+    def __init__(self, embedding_dim, eps=1e-5):
+        super().__init__()
+        self.eps = eps
+
+        self.weight = nn.Parameter(torch.ones(embedding_dim))
+
+
+    def forward(self, x):
+        mean = x.mean(dim=-1, keepdim=True)
+        variance = x.var(dim=-1, keepdim=True, unbiased=False)
+        normalized_x = (x - mean) / torch.sqrt(variance + self.eps)
+        return self.weight * normalized_x
+
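
This is layer normalization with a learnable scale but no bias: each feature vector is shifted to zero mean and unit variance, then multiplied by weight. With the default all-ones weight that is directly observable:

import torch
from v1.usta_layer_norm import UstaLayerNorm

torch.manual_seed(0)
norm = UstaLayerNorm(embedding_dim=12)
x = torch.randn(5, 12) * 3 + 7
y = norm(x)
print(y.mean(dim=-1).abs().max())            # ~0
print(y.var(dim=-1, unbiased=False).mean())  # ~1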
v1/usta_mlp.py
ADDED
@@ -0,0 +1,36 @@
+import torch
+import torch.nn as nn
+
+
+class GELU(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return 0.5 * x * (
+            1 + torch.tanh(
+                torch.sqrt(torch.tensor(2 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))
+            )
+        )
+
+class UstaMLP(nn.Module):
+    def __init__(self, embedding_dim, hidden_dim):
+        super().__init__()
+
+        self.gate_proj = nn.Linear(embedding_dim, hidden_dim)
+        self.up_proj = nn.Linear(embedding_dim, hidden_dim)
+        self.down_proj = nn.Linear(hidden_dim, embedding_dim)
+        self.gelu = GELU()
+
+    def forward(self, x):
+        # equivalent form using torch.nn.functional: gate = self.gate_proj(x)
+        #   gate = F.gelu(gate, approximate="tanh")
+        #   up = self.up_proj(x)
+        #   fuse = gate * up
+        #   outputs = self.down_proj(fuse)
+        gate = self.gate_proj(x)
+        gate = self.gelu(gate)
+        up = self.up_proj(x)
+        fuse = gate * up
+        outputs = self.down_proj(fuse)
+        return outputs
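
The hand-rolled GELU is the tanh approximation, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), which should agree with PyTorch's built-in approximate variant to float32 precision:

import torch
import torch.nn.functional as F
from v1.usta_mlp import GELU

x = torch.linspace(-3, 3, 7)
print(torch.allclose(GELU()(x), F.gelu(x, approximate="tanh"), atol=1e-6))  # True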
v1/usta_model.py
ADDED
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+
+from .usta_decoder_block import UstaDecoderBlock
+from .usta_embedding import UstaEmbedding
+
+
+class UstaModel(nn.Module):
+    def __init__(self, vocab_size, embedding_dim, num_heads, context_length, num_layers):
+        super().__init__()
+
+        self.embedding = UstaEmbedding(vocab_size, embedding_dim, context_length)
+        self.layers = nn.Sequential(
+            *[UstaDecoderBlock(embedding_dim, num_heads, context_length) for _ in range(num_layers)]
+        )
+
+        self.lm_head = nn.Linear(embedding_dim, vocab_size)
+
+    def forward(self, x: torch.Tensor):
+        x = self.embedding(x)  # dictionary meaning of the tokens (words)
+
+        x = self.layers(x)
+        x = self.lm_head(x)
+
+        return x
+
+
+    """ out = u_model(torch.tensor(new_tokens))
+
+    probs = torch.softmax(out[-1], dim=-1)
+    max_prob, max_index = torch.max(probs, dim=-1)
+    max_prob, max_index, probs
+    """
+
+    def generate(self, x: torch.Tensor, max_new_tokens: int):  # TODO: top_k, top_p, temperature
+        tokens = x.detach().cpu().numpy().tolist()
+
+        for _ in range(max_new_tokens):
+            out = self.forward(x)
+            probs = torch.softmax(out[-1], dim=-1)
+            _, max_index = torch.max(probs, dim=-1)
+            tokens.append(max_index.item())
+            if max_index == 59 or len(tokens) > 32:  # token 59 (".") doubles as <eos>; 32 is the max context length
+                break
+
+            x = torch.tensor(tokens)
+
+        return tokens
+
+
+
+
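
Generation is plain greedy decoding: run the forward pass, take the argmax of the last position's logits, append, repeat. A usage sketch with the same hyperparameters app.py uses; without v1/u_model.pth the weights are random, so the text is arbitrary:

import torch
from v1.usta_model import UstaModel
from v1.usta_tokenizer import UstaTokenizer

tok = UstaTokenizer("v1/tokenizer.json")
model = UstaModel(vocab_size=len(tok.vocab), embedding_dim=12, num_heads=4,
                  context_length=32, num_layers=8)
model.eval()  # the decoder blocks use dropout_rate=0.5, so eval() matters

tokens = tok.encode("the capital of france")
with torch.no_grad():
    print(tok.decode(model.generate(tokens, max_new_tokens=5)))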
v1/usta_multi_head_attention.py
ADDED
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+
+class UstaMultiHeadAttention(nn.Module):
+    def __init__(self, embedding_dim, output_dim, context_length, num_heads, dropout_rate=0):
+        super().__init__()
+
+        self.context_length = context_length
+
+        self.multi_head_attention = nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_rate)
+        self.projection = nn.Linear(embedding_dim, output_dim)
+
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1).bool())
+
+    def forward(self, x):
+        number_of_tokens = x.shape[0]
+        x = x[:self.context_length]
+        attention_mask = self.mask[:number_of_tokens, :number_of_tokens]
+        out, _ = self.multi_head_attention(x, x, x, attn_mask=attention_mask)
+        out = self.projection(out)
+        return out
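
Note the mask convention is the opposite of UstaCausalAttention's: for nn.MultiheadAttention a boolean attn_mask marks blocked positions with True, hence triu(diagonal=1). The module also accepts unbatched (tokens, embedding_dim) input. A short sketch:

import torch
from v1.usta_multi_head_attention import UstaMultiHeadAttention

torch.manual_seed(0)
mha = UstaMultiHeadAttention(embedding_dim=12, output_dim=12, context_length=32, num_heads=4)
mha.eval()
x = torch.randn(5, 12)
print(mha(x).shape)      # torch.Size([5, 12])
print(mha.mask[:3, :3])  # True above the diagonal = future positions hidden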
v1/usta_multi_head_attention_old.py
ADDED
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from .usta_causal_attention import UstaCausalAttention
+
+
+class UstaMultiHeadAttention(nn.Module):
+    def __init__(self, embedding_dim, output_dim, context_length, num_heads, dropout_rate=0):
+        super().__init__()
+
+        self.heads = nn.ModuleList(
+            [UstaCausalAttention(embedding_dim, output_dim, context_length, dropout_rate) for _ in range(num_heads)]
+        )
+
+        self.projection = nn.Linear(embedding_dim, output_dim)
+
+    def forward(self, x):
+        attention_outs = []
+        for head in self.heads:
+            head_out = head(x)
+            attention_outs.append(head_out)
+        # NOTE: concat yields num_heads * output_dim features, so the projection
+        # above only matches when embedding_dim == num_heads * output_dim
+        attention_out = torch.cat(attention_outs, dim=1)
+
+        return self.projection(attention_out)
v1/usta_self_attention.py
ADDED
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+
+class UstaSelfAttention(nn.Module):
+    def __init__(self, embedding_dim, output_dim):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+
+        self.q_weights = nn.Linear(embedding_dim, output_dim, bias=False)
+        self.k_weights = nn.Linear(embedding_dim, output_dim, bias=False)
+        self.v_weights = nn.Linear(embedding_dim, output_dim, bias=False)
+
+    def forward(self, x):
+        q = self.q_weights(x)
+        k = self.k_weights(x)
+        v = self.v_weights(x)
+
+        attention_scores = q @ k.T
+        attention_weights = torch.softmax(attention_scores / k.shape[-1] ** 0.5, dim=1)
+        return attention_weights @ v
+
v1/usta_tokenizer.py
ADDED
@@ -0,0 +1,48 @@
+import json
+
+import torch
+
+
+class UstaTokenizer:
+    def __init__(self, vocab_file):
+        with open(vocab_file, "r") as f:
+            self.vocab = json.load(f)
+        self.reverse_vocab = {v: k for k, v in self.vocab.items()}
+
+    def encode(self, text):
+        tokens = []
+
+        for word in text.split():
+            i = 0
+            # greedy longest match, e.g. "states":
+            #   "state" => 4
+            #   "s"     => 58
+            while i < len(word):
+                found_match = False
+                for j in range(len(word), i, -1):
+                    sub_word = word[i:j]
+                    if sub_word in self.vocab:
+                        tokens.append(self.vocab[sub_word])
+                        i = j
+                        found_match = True
+                        break
+                if not found_match:
+                    tokens.append(self.vocab["<unk>"])
+                    i += 1
+            tokens.append(self.vocab[" "])
+
+        tokens.pop()  # drop the trailing space token
+        return torch.tensor(tokens)
+
+    def tokenize(self, text):
+        token_ids = self.encode(text)
+        # token_ids from tensor to list
+        token_ids = token_ids.detach().numpy().tolist()
+
+        return [self.reverse_vocab[id] for id in token_ids]
+
+    def decode(self, ids):
+        text = ""
+        for id in ids:
+            text += self.reverse_vocab[id]
+        return text
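
encode is greedy longest-match: for each whitespace-separated word it takes the longest vocabulary prefix, falls back to <unk> one character at a time, and joins words with the explicit " " token (the final tokens.pop() removes the trailing one). With the vocabulary above:

from v1.usta_tokenizer import UstaTokenizer

tok = UstaTokenizer("v1/tokenizer.json")
print(tok.tokenize("the capital of states"))
# ['the', ' ', 'capital', ' ', 'of', ' ', 'state', 's']
print(tok.encode("united kingdom"))  # tensor([ 3, 61, 20])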