add sanity checks
Files changed:
- README.md: +2 -2
- custom_generate/generate.py: +51 -31
README.md
CHANGED
@@ -21,8 +21,8 @@ This implementation should match the `SinkCache` class present in `transformers`
 
 
 ## Additional Arguments
-- `window_length` (`int`, defaults to
-- `num_sink_tokens` (`int`, defaults to
+- `window_length` (`int`, *optional*, defaults to 256): The length of the context window.
+- `num_sink_tokens` (`int`, *optional*, defaults to 4): The number of sink tokens. See the original paper for more information.
 
 
 ## Output Type changes
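For reference, a minimal sketch of how these two documented arguments would be forwarded when calling this repository through `transformers`' `custom_generate` mechanism (assuming transformers v4.52+; the checkpoint is reused from the removed docstring example below, and the repository id is a placeholder):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
inputs = tokenizer("My name is Qwen2", return_tensors="pt")

# Extra kwargs such as `window_length` and `num_sink_tokens` are passed through to the
# custom `generate` function defined in custom_generate/generate.py.
outputs = model.generate(
    **inputs,
    custom_generate="<user-or-org>/<this-repo>",  # placeholder repository id
    trust_remote_code=True,
    window_length=256,
    num_sink_tokens=4,
    max_new_tokens=32,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```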
custom_generate/generate.py
CHANGED
@@ -1,11 +1,18 @@
 import torch
 from typing import Any, Dict, List, Optional, Tuple
-from transformers.utils import logging
-from transformers.cache_utils import Cache
 
-
+from transformers import Cache, GenerationConfig
 
 
+UNSUPPORTED_GENERATION_ARGS = [
+    "cache_implementation",  # cache-related arguments, here we always use SinkCache
+    "cache_config",
+    "return_legacy_cache",
+    "num_beams",  # beam search (and cousin techniques) are not supported
+    "compile_config",  # SinkCache doesn't support torch.compile
+    "assistant_model",  # it also doesn't support speculative decoding
+]
+
 class SinkCache(Cache):
     """
     A cache that as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model to
@@ -15,28 +22,13 @@ class SinkCache(Cache):
     It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
     `[batch_size, num_heads, seq_len, head_dim]`.
 
+    This class was copied from transformers 4.52.0, with minor modifications.
+
     Parameters:
         window_length (`int`):
             The length of the context window.
         num_sink_tokens (`int`):
             The number of sink tokens. See the original paper for more information.
-
-    Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
-
-        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
-
-        >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt")
-
-        >>> # Prepare a cache class and pass it to model's forward
-        >>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
-        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
-        >>> outputs.past_key_values # access cache filled with key/values from generation
-        SinkCache()
-        ```
     """
 
     def __init__(self, window_length: int, num_sink_tokens: int) -> None:
@@ -48,7 +40,6 @@ class SinkCache(Cache):
         self.cos_sin_rerotation_cache = {}
         self._cos_cache = None
         self._sin_cache = None
-        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
 
     @staticmethod
     def _rotate_half(x):
@@ -86,8 +77,6 @@ class SinkCache(Cache):
 
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         """Returns the sequence length of the cached states. A layer index can be optionally passed."""
-        # TODO: deprecate this function in favor of `cache_position`
-        # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length
         if len(self.key_cache) <= layer_idx:
             return 0
         return self.key_cache[layer_idx].shape[-2]
@@ -130,10 +119,6 @@ class SinkCache(Cache):
         partial_rotation_size = cache_kwargs.get("partial_rotation_size")
         using_rope = cos is not None and sin is not None
 
-        # Update the number of seen tokens
-        if layer_idx == 0:
-            self._seen_tokens += key_states.shape[-2]
-
         # Update the sin/cos cache, which holds sin/cos values for all possible positions
         if using_rope and layer_idx == 0:
             # BC: some models still pass `sin`/`cos` with 2 dims. In those models, they are the full sin/cos. Remove
@@ -194,17 +179,52 @@ class SinkCache(Cache):
 
 
 def generate(model, window_length=256, num_sink_tokens=4, **kwargs):
-
-
+    """Custom generate function for SinkCache.
+
+    Args:
+        model (`PreTrainedModel`):
+            The model to generate from.
+        window_length (`int`, *optional*, defaults to 256):
+            The length of the context window.
+        num_sink_tokens (`int`, *optional*, defaults to 4):
+            The number of sink tokens. See the original paper for more information.
+    """
+    # 1. General sanity checks
+    # 1.a. A few arguments are not allowed, especially arguments that control caches.
+    generation_config = kwargs.get("generation_config")
+    default_global_generation_config = GenerationConfig()
+    default_model_generation_config = model.generation_config
+    for arg in UNSUPPORTED_GENERATION_ARGS:
+        has_custom_gen_config_arg = (
+            generation_config is not None
+            # = and not (match global default or match model-specific default)
+            and not (
+                getattr(default_model_generation_config, arg) == getattr(generation_config, arg)
+                or getattr(default_global_generation_config, arg) == getattr(generation_config, arg)
+            )
+        )
+        if arg in kwargs or has_custom_gen_config_arg:
+            raise ValueError(
+                f"`{arg}` is set, but it's not supported in this custom generate function. List of "
+                f"unsupported arguments: {UNSUPPORTED_GENERATION_ARGS}"
+            )
+
+    # 1.b. The model must be decoder-only
+    if model.config.is_encoder_decoder:
+        raise ValueError("This custom generate function only works with decoder-only models")
+
+    # 1.c. compatibility with transformers 4.52: we must pop `custom_generate` from kwargs, otherwise it will result
+    # in an infinite loop when we call `model.generate`. This is solved in transformers 4.53.
     kwargs.pop("custom_generate", None)
 
-    #
+    # 2. Generate with SinkCache
+    # 2.a. prepare the cache, if it was not passed.
     past_key_values = kwargs.pop("past_key_values", None)
     if past_key_values is None:
         past_key_values = SinkCache(window_length=window_length, num_sink_tokens=num_sink_tokens)
     elif not isinstance(past_key_values, SinkCache):
         raise ValueError(f"`past_key_values` must be a `SinkCache` instance, got a {type(past_key_values)} instance")
 
-    # generate with the cache
+    # 2.b. generate with the cache
     generation_outputs = model.generate(**kwargs, past_key_values=past_key_values, use_cache=True)
     return generation_outputs
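To make the effect of the new sanity checks concrete, here is a small, hedged sketch that calls this file's `generate` helper directly (the checkpoint is the one from the removed docstring example; the import path assumes custom_generate/generate.py is importable as a local module):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Assumes custom_generate/generate.py is importable, e.g. as a local package.
from custom_generate.generate import SinkCache, generate

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
inputs = tokenizer("My name is Qwen2", return_tensors="pt")

# OK: a SinkCache is built internally from `window_length` / `num_sink_tokens`.
generate(model, window_length=256, num_sink_tokens=4, **inputs, max_new_tokens=16)

# OK: an explicit SinkCache instance may also be passed.
cache = SinkCache(window_length=256, num_sink_tokens=4)
generate(model, past_key_values=cache, **inputs, max_new_tokens=16)

# Rejected: `num_beams` is listed in UNSUPPORTED_GENERATION_ARGS (check 1.a on the kwargs).
try:
    generate(model, num_beams=2, **inputs, max_new_tokens=16)
except ValueError as err:
    print(err)

# Also rejected: a GenerationConfig whose `num_beams` differs from both the global default
# and the model's own default triggers the same check (1.a on `generation_config`).
try:
    generate(model, generation_config=GenerationConfig(num_beams=2), **inputs, max_new_tokens=16)
except ValueError as err:
    print(err)
```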