Update app.py
app.py CHANGED
@@ -1,114 +1,50 @@
+import os
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
 import torch.nn as nn
-from torch.export import Dim
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from huggingface_hub import login
-
-def calculate_self_attention(Q, K, V, masked_fill=None, scale=None, epsilon=1e-9):
-    """
-    Args:
-        Q, K, V: [B, H, T_q, D] or [B, H, T_k, D]
-        masked_fill: Optional additive mask of shape [B, 1, T_q, T_k] or [B, H, T_q, T_k]
-        scale: Optional scaling factor (default: sqrt(D))
-        epsilon: Small constant for numerical stability
-    """
-    print("Q dtype torch:", Q.dtype)
-    print("K dtype torch:", K.dtype)
-    print("V dtype torch:", V.dtype)
-
-    B, H, T_q, D = Q.shape
-    T_k = K.shape[2] # number of key tokens
-    scale = scale or np.sqrt(D)
-
-    log(f"Q: {np.sum(Q):.4f}, K: {np.sum(K):.4f}, V: {np.sum(V):.4f}")
-
-    # Step 1: Raw attention logits
-    QK_output = np.matmul(Q, K.transpose(0, 1, 3, 2)) # [B, H, T_q, T_k]
-    logits_unmasked = QK_output / scale
-    print(f"QK_output--- shape: {QK_output.shape}, value: {np.sum(QK_output):.4f}")
-    print(f"logits_unmasked--- shape: {logits_unmasked.shape}, value: {np.sum(logits_unmasked):.4f}")
-
-    ###########################################################################
-    def _nz_stats(name, arr, tol=1e-12):
-        total = arr.size
-        zeros = np.count_nonzero(np.abs(arr) < tol)
-        nonzeros = total - zeros
-        pct = (nonzeros / total) * 100
-        print(f"{name}: nonzeros={nonzeros} ({pct:.2f}%), zeros={zeros}")
-
-    # Debug: non-zero stats
-    _nz_stats("Q: ", Q)
-    _nz_stats("K: ", K)
-    _nz_stats("V: ", V)
-    _nz_stats("QK_output: ", QK_output)
-    ###########################################################################
-
-    # Step 2: Softmax over unmasked logits (for debugging or interpretability)
-    A = np.exp(logits_unmasked - np.max(logits_unmasked, axis=-1, keepdims=True))
-    A = A / (np.sum(A, axis=-1, keepdims=True) + epsilon)
-    log(f"A (unmasked attention weights) --- shape: {A.shape}, value: {np.sum(A):.4f}")
-
-    # Step 3: Apply additive attention mask (optional)
-    masked_fill = None
-    if masked_fill is not None:
-        logits_masked = logits_unmasked + masked_fill # [B, H, T, T] + [B, 1, T, T]
-        log(f"masked_fill--- minimum: {np.min(masked_fill)}, maximum: {np.max(masked_fill)}")
-    else:
-        logits_masked = logits_unmasked.copy()
-    log(f"logits_masked --- shape: {logits_masked.shape}, value: {np.sum(logits_masked):.4f}")
-
-    # Step 4: Softmax over masked logits
-    A_masked = np.exp(logits_masked - np.max(logits_masked, axis=-1, keepdims=True))
-    A_masked = A_masked / (np.sum(A_masked, axis=-1, keepdims=True) + epsilon)
-    log(f"A_masked (masked attention weights)--- shape: {A_masked.shape}, value: {np.sum(A_masked):.4f}")
-
-    # Step 5: Compute attention output using masked weights
-    attention_output = np.matmul(A_masked, V) # [B, H, T_q, D]
-    log(f"attention_output (using A_masked)--- shape: {attention_output.shape}, value: {np.sum(attention_output):.4f}")
 
 def main():
-
-
-    auth_token
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Get Hugging Face token from environment variable
+    auth_token = os.environ.get("HF_TOKEN")
+    if auth_token is None:
+        raise ValueError("Please set your Hugging Face token in the environment variable HF_TOKEN")
+
+    # Model ID
+    model_id = "google/gemma-3-1b-it"
+
+    # Device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Wrapper class
+    class GemmaWrapper(nn.Module):
+        def __init__(self, model_id, token):
+            super().__init__()
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.float32,
+                use_auth_token=token
+            ).to(device).eval()
+
+        def forward(self, input_ids, attention_mask):
+            return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=False).logits
+
+    # Load model and tokenizer
+    model = GemmaWrapper(model_id, auth_token)
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=auth_token)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Example input
+    sentences = ["Hello"]
+    tokens = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
+    input_ids = tokens["input_ids"].to(device)
+    attention_mask = tokens["attention_mask"].to(device)
+
+    # Forward pass
+    with torch.no_grad():
+        logits = model(input_ids=input_ids, attention_mask=attention_mask)
+
+    print("Logits shape:", logits.shape)
+    print("Sample logits:", logits[0, :5, :5]) # show small slice
 
 if __name__ == "__main__":
     main()
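Note on the removed code: the deleted calculate_self_attention helper implemented standard scaled dot-product attention, but as written it operated on NumPy arrays through np and called a log() helper, neither of which was imported or defined in app.py, and it reset masked_fill to None before checking it, so the mask branch never ran. For reference only, here is a minimal self-contained PyTorch sketch of the same computation; the function name and the additive_mask argument are illustrative and not part of this Space.

import math
import torch

def scaled_dot_product_attention(Q, K, V, additive_mask=None):
    # Q, K, V: [B, H, T, D]; additive_mask broadcastable to [B, H, T_q, T_k]
    scores = Q @ K.transpose(-2, -1) / math.sqrt(Q.shape[-1])  # [B, H, T_q, T_k]
    if additive_mask is not None:
        scores = scores + additive_mask
    weights = torch.softmax(scores, dim=-1)  # attention weights sum to 1 over keys
    return weights @ V  # [B, H, T_q, D]

# Tiny smoke test with random tensors
B, H, T, D = 1, 2, 4, 8
Q, K, V = (torch.randn(B, H, T, D) for _ in range(3))
out = scaled_dot_product_attention(Q, K, V)
print(out.shape)  # torch.Size([1, 2, 4, 8])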
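Note on the new code: main() passes use_auth_token= to from_pretrained, which recent transformers releases deprecate in favor of token=. If the Space's environment warns about this, a drop-in substitution along the following lines should work; this is a sketch under that assumption, not part of the commit.

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

auth_token = os.environ.get("HF_TOKEN")
model_id = "google/gemma-3-1b-it"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Same load as in main(), but with the newer `token=` argument, which replaces
# the deprecated `use_auth_token=` in recent transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    token=auth_token,
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
tokenizer.pad_token = tokenizer.eos_token

Either way, with HF_TOKEN set (for example as a Space secret), running app.py prints a logits tensor of shape [batch_size, sequence_length, vocab_size] for the example prompt.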