Yanisadel committed
Commit 93927ba · 1 Parent(s): 57943d9

Upload model

chatNT.py CHANGED
@@ -747,7 +747,8 @@ class TorchRotaryEmbedding(torch.nn.Module):
         """
         # Create the inverse frequency based on theta and dim
         inv_freq = 1.0 / (
-            self.theta ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim)
+            self.theta
+            ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim)
         )
 
         # Compute sinusoidal input using the broadcasting
@@ -759,7 +760,9 @@ class TorchRotaryEmbedding(torch.nn.Module):
         sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos()
 
         # Allocate a tensor for the final sin-cos values
-        sincos = torch.zeros((self.max_seq_len, self.dim), dtype=torch.float32, device=device)
+        sincos = torch.zeros(
+            (self.max_seq_len, self.dim), dtype=torch.float32, device=device
+        )
 
         # Fill the sincos tensor with sin and cos values
         sentinel = self.dim // 2 + self.dim % 2
@@ -827,7 +830,7 @@ class TorchRotaryEmbedding(torch.nn.Module):
         if self.sincos_cache is None:
             device = k.device
             self.sincos_cache = self._create_sinusoidal_positions(device=device)
-
+
         batch_size, seq_len, num_heads, head_dim = k.shape
 
         # Generate position ids
@@ -839,7 +842,7 @@ class TorchRotaryEmbedding(torch.nn.Module):
         position_ids += positions
 
         # Retrieve sincos values using the position_ids
-        sincos = self.sincos_cache[position_ids]
+        sincos = self.sincos_cache[position_ids]  # type: ignore
 
         # Split sincos into sin_pos and cos_pos
         sincos = torch.chunk(sincos, 2, dim=-1)
@@ -975,7 +978,9 @@ class TorchGptDecoder(nn.Module):
         self, embeddings: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> torch.Tensor:
         if attention_mask is None:
-            attention_mask = build_causal_attention_mask(1, embeddings.shape[1], device=embeddings.device)
+            attention_mask = build_causal_attention_mask(
+                1, embeddings.shape[1], device=embeddings.device
+            )
         for layer in self.layers:
             embeddings = layer(embeddings, attention_mask)
 
@@ -985,7 +990,9 @@ class TorchGptDecoder(nn.Module):
         self, token_ids: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> dict[str, torch.Tensor]:
         if attention_mask is None:
-            attention_mask = build_causal_attention_mask(1, token_ids.shape[1], device=token_ids.device)
+            attention_mask = build_causal_attention_mask(
+                1, token_ids.shape[1], device=token_ids.device
+            )
 
         tokens_embeddings = self.token_embed(token_ids)
 
@@ -1127,7 +1134,9 @@ def get_activation_fn(activation_name: str): # type: ignore
     return activations.get(activation_name, nn.functional.relu)
 
 
-def build_causal_attention_mask(batch_size: int, seq_len: int, device: torch.device) -> torch.Tensor:
+def build_causal_attention_mask(
+    batch_size: int, seq_len: int, device: torch.device
+) -> torch.Tensor:
     """
     Builds a batch of causal masks of shape (batch_size, 1, seq_len, seq_len) to feed
     to an attention layer.
@@ -1218,14 +1227,16 @@ class RotaryEmbeddingBis(torch.nn.Module):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         if self.rescaling_factor is None:
             inv_freq = 1.0 / (
-                self.upper_freq ** (torch.arange(0, self.dim, 2, device=q.device).float() / self.dim)
+                self.upper_freq
+                ** (torch.arange(0, self.dim, 2, device=q.device).float() / self.dim)
             )
         else:
             updated_base = self.upper_freq * (
                 self.rescaling_factor ** (self.dim / (self.dim - 2))
             )
             inv_freq = 1.0 / (
-                updated_base ** (torch.arange(0, self.dim, 2, device=q.device).float() / self.dim)
+                updated_base
+                ** (torch.arange(0, self.dim, 2, device=q.device).float() / self.dim)
             )
 
         self._cos_cached, self._sin_cached = self._compute_cos_sin_tables(
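The chatNT.py edits above are formatting-only line wraps (the rotary inverse-frequency expressions of the form inv_freq = 1.0 / theta ** (2i / dim), the sincos allocation, and the build_causal_attention_mask calls and signature); behaviour is unchanged. For orientation, here is a minimal sketch of a causal mask builder that matches the signature and docstring shown in the diff. The body is an illustrative assumption, not the actual implementation shipped in chatNT.py:

import torch


def build_causal_attention_mask(
    batch_size: int, seq_len: int, device: torch.device
) -> torch.Tensor:
    # Lower-triangular (causal) mask: position i may attend only to positions <= i.
    # NOTE: illustrative sketch; the real body lives in chatNT.py.
    mask = torch.tril(
        torch.ones((seq_len, seq_len), dtype=torch.bool, device=device)
    )
    # Broadcast to (batch_size, 1, seq_len, seq_len) as stated in the docstring.
    return mask[None, None, :, :].expand(batch_size, 1, seq_len, seq_len)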
config.json CHANGED
@@ -80,6 +80,6 @@
     "use_gradient_checkpointing": false
   },
   "seq_token_id": 32000,
-  "torch_dtype": "float32",
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.41.1"
 }
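config.json now declares the checkpoint dtype as bfloat16 instead of float32, matching the uploaded weight shards. A minimal loading sketch, assuming the repository is consumed through transformers with its bundled custom code (AutoModel and the repo id are placeholders, not confirmed by this commit):

from transformers import AutoModel

# torch_dtype="auto" picks up the "bfloat16" declared in config.json;
# trust_remote_code is assumed because the modeling code ships in chatNT.py.
model = AutoModel.from_pretrained(
    "<repo-id>",  # placeholder
    torch_dtype="auto",
    trust_remote_code=True,
)
print(next(model.parameters()).dtype)  # expected: torch.bfloat16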
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcde43f92cb4fba555a67426956b0c0c0b1e9ac7b04a7d6744580e4729cfd9e3
+size 4998275550
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:187615f3a8661430364e2e824d5b0a0363c9cf5b3d8512f33c44015b0be27343
+size 4890784808
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:916b86538557669e3a74c00d4d58ae44e494c4439aba8c2d6ee51baf05f62ebe
+size 4985672264
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8524670292b2f477cd558fd76b3372840949dadd0b0a6c386519b05a82faebe6
+size 1212565848
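The four weight shards are committed as Git LFS pointers; each pointer records only the blob's sha256 oid and byte size. A small, hypothetical verification sketch for a downloaded shard, reusing the values from the model-00001-of-00004.safetensors pointer above:

import hashlib
from pathlib import Path

# Expected values copied from the LFS pointer for model-00001-of-00004.safetensors.
EXPECTED_SHA256 = "fcde43f92cb4fba555a67426956b0c0c0b1e9ac7b04a7d6744580e4729cfd9e3"
EXPECTED_SIZE = 4998275550

shard = Path("model-00001-of-00004.safetensors")
assert shard.stat().st_size == EXPECTED_SIZE, "size mismatch"

digest = hashlib.sha256()
with shard.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_SHA256, "sha256 mismatch"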
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff