Upload model
- chatNT.py +20 -9
- config.json +1 -1
- model-00001-of-00004.safetensors +3 -0
- model-00002-of-00004.safetensors +3 -0
- model-00003-of-00004.safetensors +3 -0
- model-00004-of-00004.safetensors +3 -0
- model.safetensors.index.json +0 -0
chatNT.py
CHANGED
@@ -747,7 +747,8 @@ class TorchRotaryEmbedding(torch.nn.Module):
         """
         # Create the inverse frequency based on theta and dim
         inv_freq = 1.0 / (
-            self.theta ** (torch.arange(0, self.dim, 2).float() / self.dim)
+            self.theta
+            ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim)
         )
 
         # Compute sinusoidal input using the broadcasting
@@ -759,7 +760,9 @@ class TorchRotaryEmbedding(torch.nn.Module):
         sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos()
 
         # Allocate a tensor for the final sin-cos values
-        sincos = torch.zeros((self.max_seq_len, self.dim), dtype=torch.float32)
+        sincos = torch.zeros(
+            (self.max_seq_len, self.dim), dtype=torch.float32, device=device
+        )
 
         # Fill the sincos tensor with sin and cos values
         sentinel = self.dim // 2 + self.dim % 2
@@ -827,7 +830,7 @@ class TorchRotaryEmbedding(torch.nn.Module):
         if self.sincos_cache is None:
             device = k.device
             self.sincos_cache = self._create_sinusoidal_positions(device=device)
-
+
         batch_size, seq_len, num_heads, head_dim = k.shape
 
         # Generate position ids
@@ -839,7 +842,7 @@ class TorchRotaryEmbedding(torch.nn.Module):
         position_ids += positions
 
         # Retrieve sincos values using the position_ids
-        sincos = self.sincos_cache[position_ids]
+        sincos = self.sincos_cache[position_ids]  # type: ignore
 
         # Split sincos into sin_pos and cos_pos
         sincos = torch.chunk(sincos, 2, dim=-1)
@@ -975,7 +978,9 @@ class TorchGptDecoder(nn.Module):
         self, embeddings: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> torch.Tensor:
         if attention_mask is None:
-            attention_mask = build_causal_attention_mask(1, embeddings.shape[1])
+            attention_mask = build_causal_attention_mask(
+                1, embeddings.shape[1], device=embeddings.device
+            )
         for layer in self.layers:
             embeddings = layer(embeddings, attention_mask)
 
@@ -985,7 +990,9 @@ class TorchGptDecoder(nn.Module):
         self, token_ids: torch.Tensor, attention_mask: torch.Tensor = None
     ) -> dict[str, torch.Tensor]:
         if attention_mask is None:
-            attention_mask = build_causal_attention_mask(1, token_ids.shape[1])
+            attention_mask = build_causal_attention_mask(
+                1, token_ids.shape[1], device=token_ids.device
+            )
 
         tokens_embeddings = self.token_embed(token_ids)
 
@@ -1127,7 +1134,9 @@ def get_activation_fn(activation_name: str): # type: ignore
     return activations.get(activation_name, nn.functional.relu)
 
 
-def build_causal_attention_mask(batch_size: int, seq_len: int) -> torch.Tensor:
+def build_causal_attention_mask(
+    batch_size: int, seq_len: int, device: torch.device
+) -> torch.Tensor:
     """
     Builds a batch of causal masks of shape (batch_size, 1, seq_len, seq_len) to feed
     to an attention layer.
@@ -1218,14 +1227,16 @@ class RotaryEmbeddingBis(torch.nn.Module):
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         if self.rescaling_factor is None:
             inv_freq = 1.0 / (
-                self.upper_freq ** (torch.arange(0, self.dim, 2).float() / self.dim)
+                self.upper_freq
+                ** (torch.arange(0, self.dim, 2, device=q.device).float() / self.dim)
             )
         else:
             updated_base = self.upper_freq * (
                 self.rescaling_factor ** (self.dim / (self.dim - 2))
             )
             inv_freq = 1.0 / (
-                updated_base ** (torch.arange(0, self.dim, 2).float() / self.dim)
+                updated_base
+                ** (torch.arange(0, self.dim, 2, device=q.device).float() / self.dim)
             )
 
         self._cos_cached, self._sin_cached = self._compute_cos_sin_tables(
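All of these edits follow one pattern: the rotary sin/cos cache and the default causal attention mask are now allocated directly on the device of the incoming tensors instead of implicitly on CPU. The sketch below illustrates that pattern using the two signatures visible in the diff; it is a simplified stand-in written for this note, not the exact bodies in chatNT.py (the sin/cos fill layout and the boolean mask dtype in particular are assumptions).

```python
import torch


def build_causal_attention_mask(
    batch_size: int, seq_len: int, device: torch.device
) -> torch.Tensor:
    # Causal mask of shape (batch_size, 1, seq_len, seq_len), created on `device`
    # so it lands next to the activations that will consume it.
    mask = torch.tril(
        torch.ones((seq_len, seq_len), dtype=torch.bool, device=device)
    )
    return mask[None, None, :, :].expand(batch_size, 1, seq_len, seq_len)


def create_sinusoidal_positions(
    max_seq_len: int, dim: int, theta: float, device: torch.device
) -> torch.Tensor:
    # Stand-alone analogue of TorchRotaryEmbedding._create_sinusoidal_positions:
    # every intermediate tensor is allocated on `device`, which is what the patch
    # adds. Placing sin in the first half and cos in the second is an assumption
    # made for illustration.
    inv_freq = 1.0 / (
        theta ** (torch.arange(0, dim, 2, device=device).float() / dim)
    )
    positions = torch.arange(max_seq_len, dtype=torch.float32, device=device)
    sinusoid_inp = torch.einsum("i,j->ij", positions, inv_freq)
    sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos()
    sincos = torch.zeros((max_seq_len, dim), dtype=torch.float32, device=device)
    sentinel = dim // 2 + dim % 2
    sincos[:, :sentinel] = sin
    sincos[:, sentinel:] = cos
    return sincos
```

Building these tensors on the right device up front avoids device-mismatch errors (or silent host-to-device copies) when the model runs on an accelerator.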
config.json
CHANGED
@@ -80,6 +80,6 @@
     "use_gradient_checkpointing": false
   },
   "seq_token_id": 32000,
-  "torch_dtype": "
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.41.1"
 }
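The only config change is the declared checkpoint dtype, now bfloat16. Below is a hedged loading sketch that honours it; the repo id `InstaDeepAI/ChatNT` and the `trust_remote_code=True` flag (needed because the modelling code ships as `chatNT.py`) are assumptions about how this checkpoint is typically consumed.

```python
import torch
from transformers import AutoModel

# Load the sharded safetensors checkpoint in the dtype declared in config.json.
model = AutoModel.from_pretrained(
    "InstaDeepAI/ChatNT",          # assumed Hub repo id; adjust to your copy
    torch_dtype=torch.bfloat16,    # matches "torch_dtype": "bfloat16"
    trust_remote_code=True,        # custom modelling code lives in chatNT.py
)
```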
model-00001-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcde43f92cb4fba555a67426956b0c0c0b1e9ac7b04a7d6744580e4729cfd9e3
+size 4998275550
model-00002-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:187615f3a8661430364e2e824d5b0a0363c9cf5b3d8512f33c44015b0be27343
+size 4890784808
model-00003-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:916b86538557669e3a74c00d4d58ae44e494c4439aba8c2d6ee51baf05f62ebe
+size 4985672264
model-00004-of-00004.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8524670292b2f477cd558fd76b3372840949dadd0b0a6c386519b05a82faebe6
+size 1212565848
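The four shard entries above are Git LFS pointer files: `oid` is the SHA-256 of the actual shard and `size` its length in bytes. A small sketch for verifying a downloaded shard against those values (local file names are assumed to match the repo paths):

```python
import hashlib
import os


def verify_shard(path: str, expected_sha256: str, expected_size: int) -> bool:
    # Compare a local file against the oid/size recorded in its LFS pointer.
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256


print(
    verify_shard(
        "model-00001-of-00004.safetensors",
        "fcde43f92cb4fba555a67426956b0c0c0b1e9ac7b04a7d6744580e4729cfd9e3",
        4998275550,
    )
)
```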
model.safetensors.index.json
CHANGED
The diff for this file is too large to render. See the raw diff.
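`model.safetensors.index.json` is the standard sharded-checkpoint index: a `metadata.total_size` field plus a `weight_map` from parameter name to the shard file that stores it, which is why its diff is too large to show inline. A quick inspection sketch, assuming the index has been downloaded alongside the shards:

```python
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# How many tensors each of the four shards holds, and the declared total size.
print(Counter(index["weight_map"].values()))
print("total_size (bytes):", index["metadata"]["total_size"])
```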