InstaDeepAI
/

ChatNT

Text Generation

feature-extraction

Model card | Files and versions | Community

Yanisadel committed on Apr 4

Commit

b54a5dd

·

1 Parent(s): 64c0358

Update chatNT.py

Files changed (1) hide show

chatNT.py +9 -5

chatNT.py CHANGED Viewed

@@ -736,9 +736,9 @@ class TorchRotaryEmbedding(torch.nn.Module):
         self.max_seq_len = config.max_seq_len
         self.dim = config.dim
         self.theta = config.theta
-        self.sincos_cache = self._create_sinusoidal_positions()
-    def _create_sinusoidal_positions(self) -> torch.Tensor:
         """
         Create the sines and cosines for the RoPE.
@@ -747,19 +747,19 @@ class TorchRotaryEmbedding(torch.nn.Module):
         """
         # Create the inverse frequency based on theta and dim
         inv_freq = 1.0 / (
-            self.theta ** (torch.arange(0, self.dim, 2).float() / self.dim)
         )
         # Compute sinusoidal input using the broadcasting
         sinusoid_inp = torch.einsum(
-            "i,j->ij", torch.arange(self.max_seq_len).float(), inv_freq
         )
         # Apply sin and cos to the sinusoidal input
         sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos()
         # Allocate a tensor for the final sin-cos values
-        sincos = torch.zeros((self.max_seq_len, self.dim), dtype=torch.float32)
         # Fill the sincos tensor with sin and cos values
         sentinel = self.dim // 2 + self.dim % 2
@@ -824,6 +824,10 @@ class TorchRotaryEmbedding(torch.nn.Module):
         Returns:
             RoPE embeddings for the keys and values.
         """
         batch_size, seq_len, num_heads, head_dim = k.shape
         # Generate position ids

         self.max_seq_len = config.max_seq_len
         self.dim = config.dim
         self.theta = config.theta
+        self.sincos_cache = None
+    def _create_sinusoidal_positions(self, device: torch.device) -> torch.Tensor:
         """
         Create the sines and cosines for the RoPE.
         """
         # Create the inverse frequency based on theta and dim
         inv_freq = 1.0 / (
+            self.theta ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim)
         )
         # Compute sinusoidal input using the broadcasting
         sinusoid_inp = torch.einsum(
+            "i,j->ij", torch.arange(self.max_seq_len, device=device).float(), inv_freq
         )
         # Apply sin and cos to the sinusoidal input
         sin, cos = sinusoid_inp.sin(), sinusoid_inp.cos()
         # Allocate a tensor for the final sin-cos values
+        sincos = torch.zeros((self.max_seq_len, self.dim), dtype=torch.float32, device=device)
         # Fill the sincos tensor with sin and cos values
         sentinel = self.dim // 2 + self.dim % 2
         Returns:
             RoPE embeddings for the keys and values.
         """
+        if self.sincos_cache is None:
+            device = k.device
+            self.sincos_cache = self._create_sinusoidal_positions(device=device)
         batch_size, seq_len, num_heads, head_dim = k.shape
         # Generate position ids