manueldeprada (HF Staff) committed
Commit 0cb565f · verified · 1 Parent(s): 9e6802a

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.ruff_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
# Automatically created by ruff.
*
.ruff_cache/0.12.8/7010951691598163845 ADDED
Binary file (149 Bytes).
 
.ruff_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1 @@
Signature: 8a477f597d28d172789f06886806bc55
README.md ADDED
@@ -0,0 +1,118 @@
---
library_name: transformers
tags:
- custom_generate
---

## Description

Constrained Beam Search extends standard beam search by allowing you to enforce lexical or phrasal constraints in the generated output. This is useful when you know certain words or phrases must appear (e.g., translation dictionaries, product names, slot values), or when multiple outputs are equally probable but only some are desirable for your use case.

Unlike ordinary beam search, constrained beam search steers generation so that the required subsequences appear somewhere in the final output while still balancing fluency.

---

## Why it's difficult

Beam search generates token by token and scores candidates locally. Forcing a phrase like "is fast" to appear somewhere requires the search to plan several steps ahead and decide when to insert the constrained tokens without breaking fluency. The problem becomes more complex with multiple constraints, optional alternatives, or ordering requirements.

Constrained beam search solves this by:
- Injecting constraint-progressing tokens among the regular high-probability candidates
- Grouping beams into banks according to how much of the constraints they have satisfied
- Selecting beams round-robin across banks to balance fluency and constraint satisfaction (sketched below)
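
The selection step can be pictured as follows. This is a toy sketch of the round-robin idea only, not the repository's implementation (which lives in `custom_generate/beam_search.py`); a beam's "bank" roughly counts how many constraint tokens it has already completed:

```python
# Illustrative sketch: round-robin beam selection across constraint "banks".
from collections import defaultdict


def select_round_robin(candidates, num_beams):
    """candidates: list of (bank, score, beam_id); a higher bank means more constraint progress."""
    by_bank = defaultdict(list)
    for bank, score, beam_id in candidates:
        by_bank[bank].append((score, beam_id))
    for bank in by_bank:
        by_bank[bank].sort(reverse=True)  # best-scoring candidates first within each bank
    selected = []
    max_rank = max(len(group) for group in by_bank.values())
    for rank in range(max_rank):
        # take the rank-th best candidate from each bank, most-progressed bank first
        for bank in sorted(by_bank, reverse=True):
            if rank < len(by_bank[bank]) and len(selected) < num_beams:
                selected.append(by_bank[bank][rank][1])
    return selected


print(select_round_robin([(2, -1.0, "a"), (2, -1.5, "b"), (1, -0.9, "c"), (0, -0.2, "d")], num_beams=3))
# -> ['a', 'c', 'd']: one candidate per bank, most constraint progress first,
#    even though 'd' has the best raw score
```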

---

## Base model

* [Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B)

---

## Model compatibility

- Encoder-decoder and decoder-only transformer models

---

## Additional Arguments

- `constraints` (list[Constraint]): Advanced constraints, e.g., `PhrasalConstraint`, `DisjunctiveConstraint` (see Example 3 below)
- `force_words_ids` (list[list[int]] | list[list[list[int]]]): Simple way to specify words/phrases or disjunctive sets
- `num_beams` (int): Beam width
- Other standard beam arguments: `length_penalty`, `early_stopping`, `num_return_sequences`, `max_length`

Notes:
- Constrained decoding is incompatible with sampling: set `do_sample=False`
- Tokenize constraints without adding special tokens (illustrated below)
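
A quick illustration of the second note (a sketch only; the exact ids depend on the tokenizer):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-base")

# Constraints should contain only the word's own token ids ...
print(tok("Sie", add_special_tokens=False).input_ids)

# ... not the special tokens the tokenizer normally appends (T5 adds </s> by default),
# which could never be satisfied naturally in the middle of a generated sentence.
print(tok("Sie").input_ids)
```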

---

## Example 1: Forcing a word (formal German translation)

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

encoder_input_str = "translate English to German: How old are you?"
input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids

force_words = ["Sie"]
force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids

outputs = model.generate(
    input_ids,
    custom_generate="transformers-community/constrained-beam-search",
    force_words_ids=force_words_ids,
    num_beams=5,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
    trust_remote_code=True,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Expected to contain the forced word: `Wie alt sind Sie?`

---

## Example 2: Disjunctive constraints (choose any of several forms)

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

force_word = "scared"
force_flexible = ["scream", "screams", "screaming", "screamed"]

force_words_ids = [
    tokenizer([force_word], add_prefix_space=True, add_special_tokens=False).input_ids,
    tokenizer(force_flexible, add_prefix_space=True, add_special_tokens=False).input_ids,
]

starting_text = ["The soldiers", "The child"]
input_ids = tokenizer(starting_text, return_tensors="pt").input_ids

outputs = model.generate(
    input_ids,
    custom_generate="transformers-community/constrained-beam-search",
    force_words_ids=force_words_ids,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
    trust_remote_code=True,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(tokenizer.decode(outputs[1], skip_special_tokens=True))
```

Outputs will include the mandatory word and at least one from the flexible set.
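
---

## Example 3: Using `Constraint` objects directly

The `constraints` argument accepts `Constraint` objects such as `PhrasalConstraint` and `DisjunctiveConstraint` (defined in this repository's `custom_generate/beam_constraints.py`). The snippet below is a minimal sketch of Example 1 rewritten with a `PhrasalConstraint`; it assumes `PhrasalConstraint` is still exported at the top level of your installed `transformers` version (otherwise build the constraint from this repository's `beam_constraints` module):

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PhrasalConstraint  # assumption: available in your transformers version

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

input_ids = tokenizer("translate English to German: How old are you?", return_tensors="pt").input_ids

# Build a phrasal constraint from the tokenized target word (no special tokens).
constraint = PhrasalConstraint(tokenizer("Sie", add_special_tokens=False).input_ids)

outputs = model.generate(
    input_ids,
    custom_generate="transformers-community/constrained-beam-search",
    constraints=[constraint],
    num_beams=5,
    remove_invalid_values=True,
    trust_remote_code=True,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```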
config.json ADDED
@@ -0,0 +1,30 @@
{
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.56.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
custom_generate/beam_constraints.py ADDED
@@ -0,0 +1,524 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional
3
+
4
+
5
+ class Constraint(ABC):
6
+ r"""Abstract base class for all constraints that can be applied during generation.
7
+ It must define how the constraint can be satisfied.
8
+
9
+ All classes that inherit Constraint must follow the requirement that
10
+
11
+ ```py
12
+ completed = False
13
+ while not completed:
14
+ _, completed = constraint.update(constraint.advance())
15
+ ```
16
+
17
+ will always terminate (halt).
18
+ """
19
+
20
+ def __init__(self):
21
+ # test for the above condition
22
+ self.test()
23
+
24
+ def test(self):
25
+ """
26
+ Tests whether this constraint has been properly defined.
27
+ """
28
+ counter = 0
29
+ completed = False
30
+ while not completed:
31
+ if counter == 1:
32
+ self.reset()
33
+ advance = self.advance()
34
+ if not self.does_advance(advance):
35
+ raise Exception(
36
+ "Custom Constraint is not defined correctly. self.does_advance(self.advance()) must be true."
37
+ )
38
+
39
+ stepped, completed, reset = self.update(advance)
40
+ counter += 1
41
+
42
+ if counter > 10000:
43
+ raise Exception("update() does not fulfill the constraint.")
44
+
45
+ if self.remaining() != 0:
46
+ raise Exception("Custom Constraint is not defined correctly.")
47
+
48
+ @abstractmethod
49
+ def advance(self):
50
+ """
51
+ When called, returns the token(s) that would take this constraint one step closer to being fulfilled.
52
+
53
+ Return:
54
+ token_ids (Union[int, list[int], None]):
55
+ - A single token ID (int) that advances the constraint, or
56
+ - A list of token IDs that could advance the constraint
57
+ - None if the constraint is completed or cannot be advanced
58
+ """
59
+ raise NotImplementedError(
60
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
61
+ )
62
+
63
+ @abstractmethod
64
+ def does_advance(self, token_id: int):
65
+ """
66
+ Reads in a token and returns whether it creates progress.
67
+ """
68
+ raise NotImplementedError(
69
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
70
+ )
71
+
72
+ @abstractmethod
73
+ def update(self, token_id: int):
74
+ """
75
+ Reads in a token and returns booleans that indicate the progress made by it. This function will update the
76
+ state of this object, unlike `does_advance(self, token_id: int)`.
77
+
78
+ This isn't to test whether a certain token will advance the progress; it's to update its state as if it has
79
+ been generated. This becomes important if token_id != desired token (refer to else statement in
80
+ PhrasalConstraint)
81
+
82
+ Args:
83
+ token_id(`int`):
84
+ The id of a newly generated token in the beam search.
85
+ Return:
86
+ stepped(`bool`):
87
+ Whether this constraint has become one step closer to being fulfilled.
88
+ completed(`bool`):
89
+ Whether this constraint has been completely fulfilled by this token being generated.
90
+ reset (`bool`):
91
+ Whether this constraint has reset its progress by this token being generated.
92
+ """
93
+ raise NotImplementedError(
94
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
95
+ )
96
+
97
+ @abstractmethod
98
+ def reset(self):
99
+ """
100
+ Resets the state of this constraint to its initialization. We would call this in cases where the fulfillment of
101
+ a constraint is interrupted by an unwanted token.
102
+ """
103
+ raise NotImplementedError(
104
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
105
+ )
106
+
107
+ @abstractmethod
108
+ def remaining(self):
109
+ """
110
+ Returns the number of remaining steps of `advance()` in order to complete this constraint.
111
+ """
112
+ raise NotImplementedError(
113
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
114
+ )
115
+
116
+ @abstractmethod
117
+ def copy(self, stateful=False):
118
+ """
119
+ Creates a new instance of this constraint.
120
+
121
+ Args:
122
+ stateful(`bool`): Whether to also copy the constraint's current state, not just its definition.
123
+
124
+ Return:
125
+ constraint(`Constraint`): The same constraint as the one being called from.
126
+ """
127
+ raise NotImplementedError(
128
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
129
+ )
130
+
131
+
132
+ class PhrasalConstraint(Constraint):
133
+ r"""
134
+ [`Constraint`] enforcing that an ordered sequence of tokens is included in the output.
135
+
136
+ Args:
137
+ token_ids (`list[int]`):
138
+ The ids of the tokens that must be generated by the output, in order.
139
+ """
140
+
141
+ def __init__(self, token_ids: list[int]):
142
+ super(Constraint, self).__init__()
143
+
144
+ if not isinstance(token_ids, list) or len(token_ids) == 0:
145
+ raise ValueError(f"`token_ids` has to be a non-empty list, but is {token_ids}.")
146
+ if any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids):
147
+ raise ValueError(f"`token_ids` has to be a list of positive integers, but is {token_ids}.")
148
+
149
+ self.token_ids = token_ids
150
+
151
+ self.seqlen = len(self.token_ids)
152
+ self.fulfilled_idx = -1 # the index of the currently fulfilled step
153
+ self.completed = False
154
+
155
+ def advance(self):
156
+ if self.completed:
157
+ return None
158
+ return self.token_ids[self.fulfilled_idx + 1]
159
+
160
+ def does_advance(self, token_id: int):
161
+ if not isinstance(token_id, int):
162
+ raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}")
163
+
164
+ if self.completed:
165
+ return False
166
+
167
+ return token_id == self.token_ids[self.fulfilled_idx + 1]
168
+
169
+ def update(self, token_id: int):
170
+ if not isinstance(token_id, int):
171
+ raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}")
172
+
173
+ stepped = False
174
+ completed = False
175
+ reset = False
176
+
177
+ if self.does_advance(token_id):
178
+ self.fulfilled_idx += 1
179
+ stepped = True
180
+ if self.fulfilled_idx == (self.seqlen - 1):
181
+ completed = True
182
+ self.completed = completed
183
+ else:
184
+ # failed to make progress.
185
+ reset = True
186
+ self.reset()
187
+ return stepped, completed, reset
188
+
189
+ def reset(self):
190
+ self.completed = False
191
+ self.fulfilled_idx = 0
192
+
193
+ def remaining(self):
194
+ return self.seqlen - (self.fulfilled_idx + 1)
195
+
196
+ def copy(self, stateful=False):
197
+ new_constraint = PhrasalConstraint(self.token_ids)
198
+
199
+ if stateful:
200
+ new_constraint.seqlen = self.seqlen
201
+ new_constraint.fulfilled_idx = self.fulfilled_idx
202
+ new_constraint.completed = self.completed
203
+
204
+ return new_constraint
205
+
206
+
207
+ class DisjunctiveTrie:
208
+ def __init__(self, nested_token_ids: list[list[int]], no_subsets=True):
209
+ r"""
210
+ A helper class that builds a trie with the words represented in `nested_token_ids`.
211
+ """
212
+ self.max_height = max([len(one) for one in nested_token_ids])
213
+
214
+ root = {}
215
+ for token_ids in nested_token_ids:
216
+ level = root
217
+ for tidx, token_id in enumerate(token_ids):
218
+ if token_id not in level:
219
+ level[token_id] = {}
220
+
221
+ level = level[token_id]
222
+
223
+ if no_subsets and self.has_subsets(root, nested_token_ids):
224
+ raise ValueError(
225
+ "Each list in `nested_token_ids` can't be a complete subset of another list, but is"
226
+ f" {nested_token_ids}."
227
+ )
228
+
229
+ self.trie = root
230
+
231
+ def next_tokens(self, current_seq):
232
+ """
233
+ The next possible tokens that will progress the trie, given the current sequence of tokens in `current_seq`.
234
+ """
235
+ start = self.trie
236
+
237
+ for current_token in current_seq:
238
+ start = start[current_token]
239
+
240
+ next_tokens = list(start.keys())
241
+
242
+ return next_tokens
243
+
244
+ def reached_leaf(self, current_seq):
245
+ next_tokens = self.next_tokens(current_seq)
246
+
247
+ return len(next_tokens) == 0
248
+
249
+ def count_leaves(self, root):
250
+ next_nodes = list(root.values())
251
+ if len(next_nodes) == 0:
252
+ return 1
253
+ else:
254
+ return sum([self.count_leaves(nn) for nn in next_nodes])
255
+
256
+ def has_subsets(self, trie, nested_token_ids):
257
+ """
258
+ Returns whether # of leaves == # of words. Otherwise some word is a subset of another.
259
+ """
260
+ leaf_count = self.count_leaves(trie)
261
+ return len(nested_token_ids) != leaf_count
262
+
263
+
264
+ class DisjunctiveConstraint(Constraint):
265
+ r"""
266
+ A special [`Constraint`] that is fulfilled by fulfilling just one of several constraints.
267
+
268
+ Args:
269
+ nested_token_ids (`list[list[int]]`):
270
+ A list of words, where each word is a list of ids. This constraint is fulfilled by generating just one from
271
+ the list of words.
272
+ """
273
+
274
+ def __init__(self, nested_token_ids: list[list[int]]):
275
+ super(Constraint, self).__init__()
276
+
277
+ if not isinstance(nested_token_ids, list) or len(nested_token_ids) == 0:
278
+ raise ValueError(f"`nested_token_ids` has to be a non-empty list, but is {nested_token_ids}.")
279
+ if any(not isinstance(token_ids, list) for token_ids in nested_token_ids):
280
+ raise ValueError(f"`nested_token_ids` has to be a list of lists, but is {nested_token_ids}.")
281
+ if any(
282
+ any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids)
283
+ for token_ids in nested_token_ids
284
+ ):
285
+ raise ValueError(
286
+ f"Each list in `nested_token_ids` has to be a list of positive integers, but is {nested_token_ids}."
287
+ )
288
+
289
+ self.trie = DisjunctiveTrie(nested_token_ids)
290
+ self.token_ids = nested_token_ids
291
+
292
+ self.seqlen = self.trie.max_height
293
+ self.current_seq = []
294
+ self.completed = False
295
+
296
+ def advance(self):
297
+ token_list = self.trie.next_tokens(self.current_seq)
298
+
299
+ if len(token_list) == 0:
300
+ return None
301
+ else:
302
+ return token_list
303
+
304
+ def does_advance(self, token_id: int):
305
+ if not isinstance(token_id, int):
306
+ raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}")
307
+
308
+ next_tokens = self.trie.next_tokens(self.current_seq)
309
+
310
+ return token_id in next_tokens
311
+
312
+ def update(self, token_id: int):
313
+ if not isinstance(token_id, int):
314
+ raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}")
315
+
316
+ stepped = False
317
+ completed = False
318
+ reset = False
319
+
320
+ if self.does_advance(token_id):
321
+ self.current_seq.append(token_id)
322
+ stepped = True
323
+ else:
324
+ reset = True
325
+ self.reset()
326
+
327
+ completed = self.trie.reached_leaf(self.current_seq)
328
+ self.completed = completed
329
+
330
+ return stepped, completed, reset
331
+
332
+ def reset(self):
333
+ self.completed = False
334
+ self.current_seq = []
335
+
336
+ def remaining(self):
337
+ if self.completed:
338
+ # since this can be completed without reaching max height
339
+ return 0
340
+ else:
341
+ return self.seqlen - len(self.current_seq)
342
+
343
+ def copy(self, stateful=False):
344
+ new_constraint = DisjunctiveConstraint(self.token_ids)
345
+
346
+ if stateful:
347
+ new_constraint.seqlen = self.seqlen
348
+ new_constraint.current_seq = self.current_seq
349
+ new_constraint.completed = self.completed
350
+
351
+ return new_constraint
352
+
353
+
354
+ class ConstraintListState:
355
+ r"""
356
+ A class for beam scorers to track their progress through a list of constraints.
357
+
358
+ Args:
359
+ constraints (`list[Constraint]`):
360
+ A list of [`Constraint`] objects that must be fulfilled by the beam scorer.
361
+ """
362
+
363
+ def __init__(self, constraints: list[Constraint]):
364
+ self.constraints = constraints
365
+
366
+ # max # of steps required to fulfill a given constraint
367
+ self.max_seqlen = max([c.seqlen for c in constraints])
368
+ self.n_constraints = len(constraints)
369
+ self.completed = False
370
+
371
+ self.init_state()
372
+
373
+ def init_state(self):
374
+ self.complete_constraints = []
375
+ self.inprogress_constraint = None
376
+ self.pending_constraints = [constraint.copy(stateful=False) for constraint in self.constraints]
377
+
378
+ def get_bank(self):
379
+ add = 0
380
+ if self.inprogress_constraint:
381
+ # extra points for having a constraint mid-fulfilled
382
+ add += self.max_seqlen - self.inprogress_constraint.remaining()
383
+
384
+ return (len(self.complete_constraints) * self.max_seqlen) + add
385
+
386
+ def advance(self):
387
+ """The list of tokens to generate such that we can make progress.
388
+ By "list" we don't mean the list of token that will fully fulfill a constraint.
389
+
390
+ Given constraints `c_i = {t_ij | j == # of tokens}`, If we're not in the middle of progressing through a
391
+ specific constraint `c_i`, we return:
392
+
393
+ `[t_k1 for k in indices of unfulfilled constraints]`
394
+
395
+ If we are in the middle of a constraint, then we return:
396
+ `[t_ij]`, where `i` is the index of the inprogress constraint, `j` is the next step for the constraint.
397
+
398
+ Though we don't care which constraint is fulfilled first, if we are in the progress of fulfilling a constraint,
399
+ that's the only one we'll return.
400
+ """
401
+ token_list = []
402
+ if self.inprogress_constraint is None:
403
+ for constraint in self.pending_constraints: # "pending" == "unfulfilled yet"
404
+ advance = constraint.advance()
405
+ if isinstance(advance, int):
406
+ token_list.append(advance)
407
+ elif isinstance(advance, list):
408
+ token_list.extend(advance)
409
+ else:
410
+ advance = self.inprogress_constraint.advance()
411
+ if isinstance(advance, int):
412
+ token_list.append(advance)
413
+ elif isinstance(advance, list):
414
+ token_list.extend(advance)
415
+
416
+ if len(token_list) == 0:
417
+ return None
418
+ else:
419
+ return token_list
420
+
421
+ def reset(self, token_ids: Optional[list[int]]):
422
+ """
423
+ token_ids: the tokens generated thus far to reset the state of the progress through constraints.
424
+ """
425
+ self.init_state()
426
+
427
+ if token_ids is not None:
428
+ for token in token_ids:
429
+ # completes or steps **one** constraint
430
+ complete, stepped = self.add(token)
431
+
432
+ # the entire list of constraints are fulfilled
433
+ if self.completed:
434
+ break
435
+
436
+ def add(self, token_id: int):
437
+ if not isinstance(token_id, int):
438
+ raise TypeError(f"`token_id` should be an `int`, but is `{token_id}`.")
439
+
440
+ complete, stepped = False, False
441
+
442
+ if self.completed:
443
+ complete = True
444
+ stepped = False
445
+ return complete, stepped
446
+
447
+ if self.inprogress_constraint is not None:
448
+ # In the middle of fulfilling a constraint. If the `token_id` *does* make incremental progress on the current
449
+ # job, simply update the state
450
+
451
+ stepped, complete, reset = self.inprogress_constraint.update(token_id)
452
+ if reset:
453
+ # 1. If the next token breaks the progress, then we must restart.
454
+ # e.g. constraint = "I love pies" and sequence so far is "I love" but `token_id` == "books".
455
+
456
+ # But that doesn't mean we self.init_state(), since we only reset the state for this particular
457
+ # constraint, not the full list of constraints.
458
+
459
+ self.pending_constraints.append(self.inprogress_constraint.copy(stateful=False))
460
+ self.inprogress_constraint = None
461
+
462
+ if complete:
463
+ # 2. If the next token completes the constraint, move it to completed list, set
464
+ # inprogress to None. If there are no pending constraints either, then this full list of constraints
465
+ # is complete.
466
+
467
+ self.complete_constraints.append(self.inprogress_constraint)
468
+ self.inprogress_constraint = None
469
+
470
+ if len(self.pending_constraints) == 0:
471
+ # we're done!
472
+ self.completed = True
473
+
474
+ else:
475
+ # Not in the middle of fulfilling a constraint. So does this `token_id` help us step towards any of our list
476
+ # of constraints?
477
+
478
+ for cidx, pending_constraint in enumerate(self.pending_constraints):
479
+ if pending_constraint.does_advance(token_id):
480
+ stepped, complete, reset = pending_constraint.update(token_id)
481
+
482
+ if not stepped:
483
+ raise Exception(
484
+ "`constraint.update(token_id)` is not yielding incremental progress, "
485
+ "even though `constraint.does_advance(token_id)` is true."
486
+ )
487
+
488
+ if complete:
489
+ self.complete_constraints.append(pending_constraint)
490
+ self.inprogress_constraint = None
491
+
492
+ if not complete and stepped:
493
+ self.inprogress_constraint = pending_constraint
494
+
495
+ if complete or stepped:
496
+ # If we made any progress at all, then it's at least not a "pending constraint".
497
+
498
+ self.pending_constraints = (
499
+ self.pending_constraints[:cidx] + self.pending_constraints[cidx + 1 :]
500
+ )
501
+
502
+ if len(self.pending_constraints) == 0 and self.inprogress_constraint is None:
503
+ # If there's no longer any pending after this and no inprogress either, then we must be
504
+ # complete.
505
+
506
+ self.completed = True
507
+
508
+ break # prevent accidentally stepping through multiple constraints with just one token.
509
+
510
+ return complete, stepped
511
+
512
+ def copy(self, stateful=True):
513
+ new_state = ConstraintListState(self.constraints) # we actually never touch self.constraints objects
514
+ # throughout this process. So it's at initialization state.
515
+
516
+ if stateful:
517
+ new_state.complete_constraints = [
518
+ constraint.copy(stateful=True) for constraint in self.complete_constraints
519
+ ]
520
+ if self.inprogress_constraint is not None:
521
+ new_state.inprogress_constraint = self.inprogress_constraint.copy(stateful=True)
522
+ new_state.pending_constraints = [constraint.copy() for constraint in self.pending_constraints]
523
+
524
+ return new_state
custom_generate/beam_search.py ADDED
@@ -0,0 +1,716 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The HuggingFace Inc. team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from abc import ABC, abstractmethod
17
+ from collections import UserDict
18
+ from typing import Optional, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from transformers.utils import add_start_docstrings
24
+ from .beam_constraints import Constraint, ConstraintListState
25
+
26
+
27
+ PROCESS_INPUTS_DOCSTRING = r"""
28
+ Args:
29
+ input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
30
+ Indices of input sequence tokens in the vocabulary.
31
+
32
+ Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
33
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
34
+
35
+ [What are input IDs?](../glossary#input-ids)
36
+ next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
37
+ Current scores of the top `2 * num_beams` non-finished beam hypotheses.
38
+ next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
39
+ `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
40
+ next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
41
+ Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
42
+ pad_token_id (`int`, *optional*):
43
+ The id of the *padding* token.
44
+ eos_token_id (`Union[int, list[int]]`, *optional*):
45
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
46
+ beam_indices (`torch.LongTensor`, *optional*):
47
+ Beam indices indicating to which beam hypothesis each token correspond.
48
+ group_index (`int`, *optional*):
49
+ The index of the group of beams. Used with [`~PreTrainedModel.group_beam_search`].
50
+
51
+ Return:
52
+ `UserDict`: A dictionary composed of the fields as defined above:
53
+
54
+ - **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated scores of all
55
+ non-finished beams.
56
+ - **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens to be added
57
+ to the non-finished beam_hypotheses.
58
+ - **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
59
+ indicating to which beam the next tokens shall be added.
60
+
61
+ """
62
+
63
+ FINALIZE_INPUTS_DOCSTRING = r"""
64
+ Args:
65
+ input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
66
+ Indices of input sequence tokens in the vocabulary.
67
+
68
+ Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
69
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
70
+
71
+ [What are input IDs?](../glossary#input-ids)
72
+ final_beam_scores (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
73
+ The final scores of all non-finished beams.
74
+ final_beam_tokens (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
75
+ The last tokens to be added to the non-finished beam_hypotheses.
76
+ final_beam_indices (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
77
+ The beam indices indicating to which beam the `final_beam_tokens` shall be added.
78
+ pad_token_id (`int`, *optional*):
79
+ The id of the *padding* token.
80
+ eos_token_id (`Union[int, list[int]]`, *optional*):
81
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
82
+
83
+ Return:
84
+ `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated sequences.
85
+ The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished early
86
+ due to the `eos_token_id`.
87
+
88
+ """
89
+
90
+
91
+ class BeamScorer(ABC):
92
+ """
93
+ Abstract base class for all beam scorers that are used for [`~PreTrainedModel.beam_search`] and
94
+ [`~PreTrainedModel.beam_sample`].
95
+ """
96
+
97
+ @abstractmethod
98
+ @add_start_docstrings(PROCESS_INPUTS_DOCSTRING)
99
+ def process(
100
+ self,
101
+ input_ids: torch.LongTensor,
102
+ next_scores: torch.FloatTensor,
103
+ next_tokens: torch.LongTensor,
104
+ next_indices: torch.LongTensor,
105
+ **kwargs,
106
+ ) -> tuple[torch.Tensor]:
107
+ raise NotImplementedError("This is an abstract method.")
108
+
109
+ @abstractmethod
110
+ @add_start_docstrings(FINALIZE_INPUTS_DOCSTRING)
111
+ def finalize(
112
+ self,
113
+ input_ids: torch.LongTensor,
114
+ next_scores: torch.FloatTensor,
115
+ next_tokens: torch.LongTensor,
116
+ next_indices: torch.LongTensor,
117
+ max_length: int,
118
+ **kwargs,
119
+ ) -> torch.LongTensor:
120
+ raise NotImplementedError("This is an abstract method.")
121
+
122
+ class ConstrainedBeamSearchScorer(BeamScorer):
123
+ r"""
124
+ [`BeamScorer`] implementing constrained beam search decoding.
125
+
126
+
127
+ Args:
128
+ batch_size (`int`):
129
+ Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
130
+ num_beams (`int`):
131
+ Number of beams for beam search.
132
+ constraints (`list[Constraint]`):
133
+ A list of positive constraints represented as `Constraint` objects that must be fulfilled in the generation
134
+ output. For more information, the documentation of [`Constraint`] should be read.
135
+ device (`torch.device`):
136
+ Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
137
+ allocated.
138
+ length_penalty (`float`, *optional*, defaults to 1.0):
139
+ Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
140
+ the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
141
+ likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
142
+ `length_penalty` < 0.0 encourages shorter sequences.
143
+ do_early_stopping (`bool` or `str`, *optional*, defaults to `False`):
144
+ Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
145
+ `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an
146
+ heuristic is applied and the generation stops when is it very unlikely to find better candidates;
147
+ `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical
148
+ beam search algorithm).
149
+ num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
150
+ The number of beam hypotheses that shall be returned upon calling
151
+ [`~transformers.BeamSearchScorer.finalize`].
152
+ num_beam_groups (`int`, *optional*, defaults to 1):
153
+ Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
154
+ See [this paper](https://huggingface.co/papers/1610.02424) for more details.
155
+ max_length (`int`, *optional*):
156
+ The maximum length of the sequence to be generated.
157
+ """
158
+
159
+ def __init__(
160
+ self,
161
+ batch_size: int,
162
+ num_beams: int,
163
+ constraints: list[Constraint],
164
+ device: torch.device,
165
+ length_penalty: Optional[float] = 1.0,
166
+ do_early_stopping: Optional[Union[bool, str]] = False,
167
+ num_beam_hyps_to_keep: Optional[int] = 1,
168
+ num_beam_groups: Optional[int] = 1,
169
+ max_length: Optional[int] = None,
170
+ ):
171
+ self.num_beams = num_beams
172
+ self.device = device
173
+ self.length_penalty = length_penalty
174
+ self.do_early_stopping = do_early_stopping
175
+ self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
176
+ self.num_beam_groups = num_beam_groups
177
+ self.group_size = self.num_beams // self.num_beam_groups
178
+ self.constraints = constraints
179
+
180
+ self._is_init = False
181
+ self._beam_hyps = [
182
+ BeamHypotheses(
183
+ num_beams=self.num_beams,
184
+ length_penalty=self.length_penalty,
185
+ early_stopping=self.do_early_stopping,
186
+ max_length=max_length,
187
+ )
188
+ for _ in range(batch_size)
189
+ ]
190
+ self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device)
191
+
192
+ if not isinstance(num_beams, int) or num_beams <= 1:
193
+ raise ValueError(
194
+ f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1,"
195
+ " one should make use of `greedy_search` instead."
196
+ )
197
+
198
+ if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0):
199
+ raise ValueError(
200
+ "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be"
201
+ f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}."
202
+ )
203
+
204
+ @property
205
+ def is_done(self) -> bool:
206
+ return self._done.all()
207
+
208
+ def make_constraint_states(self, n):
209
+ return [ConstraintListState([constraint.copy() for constraint in self.constraints]) for _ in range(n)]
210
+
211
+ def check_completes_constraints(self, sequence):
212
+ new_state = self.make_constraint_states(1)[0]
213
+ new_state.reset(sequence)
214
+ return new_state.completed
215
+
216
+ def process(
217
+ self,
218
+ input_ids: torch.LongTensor,
219
+ next_scores: torch.FloatTensor,
220
+ next_tokens: torch.LongTensor,
221
+ next_indices: torch.LongTensor,
222
+ scores_for_all_vocab: torch.FloatTensor,
223
+ pad_token_id: Optional[Union[int, torch.Tensor]] = None,
224
+ eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
225
+ beam_indices: Optional[torch.LongTensor] = None,
226
+ decoder_prompt_len: Optional[int] = 0,
227
+ ) -> tuple[torch.Tensor]:
228
+ r"""
229
+ Args:
230
+ input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
231
+ Indices of input sequence tokens in the vocabulary.
232
+
233
+ Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
234
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
235
+
236
+ [What are input IDs?](../glossary#input-ids)
237
+ next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
238
+ Current scores of the top `2 * num_beams` non-finished beam hypotheses.
239
+ next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
240
+ `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
241
+ next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
242
+ Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
243
+ scores_for_all_vocab (`torch.FloatTensor` of shape `(batch_size * num_beams, vocab_size)`):
244
+ The scores of all tokens in the vocabulary for each of the beam hypotheses.
245
+ pad_token_id (`int`, *optional*):
246
+ The id of the *padding* token.
247
+ eos_token_id (`Union[int, list[int]]`, *optional*):
248
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
249
+ beam_indices (`torch.LongTensor`, *optional*):
250
+ Beam indices indicating to which beam hypothesis each token correspond.
251
+ decoder_prompt_len (`int`, *optional*):
252
+ The length of prompt that is included in the input to decoder.
253
+ Return:
254
+ `UserDict`: A dictionary composed of the fields as defined above:
255
+
256
+ - **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated scores of
257
+ all
258
+ non-finished beams.
259
+
260
+ - **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens to be
261
+ added
262
+ to the non-finished beam_hypotheses.
263
+ - **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
264
+ indicating to which beam the next tokens shall be added.
265
+ """
266
+
267
+ # add up to the length which the next_scores is calculated on (including decoder prompt)
268
+ cur_len = input_ids.shape[-1] + 1
269
+ batch_size = len(self._beam_hyps)
270
+ if batch_size != (input_ids.shape[0] // self.group_size):
271
+ if self.num_beam_groups > 1:
272
+ raise ValueError(
273
+ f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam "
274
+ f"size of {self.group_size} is expected by the beam scorer."
275
+ )
276
+ else:
277
+ raise ValueError(
278
+ f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of "
279
+ f"{self.group_size} is expected by the beam scorer."
280
+ )
281
+
282
+ device = input_ids.device
283
+
284
+ next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device)
285
+ next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device)
286
+ next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device)
287
+
288
+ if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
289
+ if isinstance(eos_token_id, int):
290
+ eos_token_id = [eos_token_id]
291
+ eos_token_id = torch.tensor(eos_token_id)
292
+
293
+ for batch_idx, beam_hyp in enumerate(self._beam_hyps):
294
+ if self._done[batch_idx]:
295
+ if self.num_beams < len(beam_hyp):
296
+ raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated")
297
+ if eos_token_id is None or pad_token_id is None:
298
+ raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined")
299
+ # pad the batch
300
+ next_beam_scores[batch_idx, :] = 0
301
+ next_beam_tokens[batch_idx, :] = pad_token_id
302
+ next_beam_indices[batch_idx, :] = 0
303
+ continue
304
+
305
+ # next tokens for this sentence.
306
+ beam_idx = 0
307
+ for beam_token_rank, (next_token, next_score, next_index) in enumerate(
308
+ zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
309
+ ):
310
+ batch_beam_idx = batch_idx * self.group_size + next_index
311
+ # add to generated hypotheses if end of sentence
312
+ if (eos_token_id is not None) and (next_token.item() in eos_token_id):
313
+ # if beam_token does not belong to top num_beams tokens, it should not be added
314
+ is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
315
+ if is_beam_token_worse_than_top_num_beams:
316
+ continue
317
+
318
+ completes_constraint = self.check_completes_constraints(input_ids[batch_beam_idx].tolist())
319
+ if completes_constraint:
320
+ if beam_indices is not None:
321
+ beam_index = beam_indices[batch_beam_idx]
322
+ beam_index = beam_index + (batch_beam_idx,)
323
+ else:
324
+ beam_index = None
325
+
326
+ beam_hyp.add(
327
+ input_ids[batch_beam_idx].clone(),
328
+ next_score.item(),
329
+ beam_indices=beam_index,
330
+ generated_len=cur_len - decoder_prompt_len,
331
+ )
332
+ else:
333
+ # add next predicted token since it is not eos_token
334
+ next_beam_scores[batch_idx, beam_idx] = next_score
335
+ next_beam_tokens[batch_idx, beam_idx] = next_token
336
+ next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
337
+ beam_idx += 1
338
+
339
+ # once the beam for next step is full, don't add more tokens to it.
340
+ if beam_idx == self.group_size:
341
+ break
342
+
343
+ new_scores, new_tokens, new_indices = self.step_sentence_constraint(
344
+ batch_idx,
345
+ input_ids,
346
+ scores_for_all_vocab,
347
+ next_beam_scores[batch_idx],
348
+ next_beam_tokens[batch_idx],
349
+ next_beam_indices[batch_idx],
350
+ )
351
+
352
+ next_beam_scores[batch_idx] = new_scores
353
+ next_beam_tokens[batch_idx] = new_tokens
354
+ next_beam_indices[batch_idx] = new_indices
355
+
356
+ if beam_idx < self.group_size:
357
+ raise ValueError(
358
+ f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:"
359
+ f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected."
360
+ )
361
+
362
+ # Check if we are done so that we can save a pad step if all(done)
363
+ self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
364
+ next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len
365
+ )
366
+
367
+ return UserDict(
368
+ {
369
+ "next_beam_scores": next_beam_scores.view(-1),
370
+ "next_beam_tokens": next_beam_tokens.view(-1),
371
+ "next_beam_indices": next_beam_indices.view(-1),
372
+ }
373
+ )
374
+
375
+ def step_sentence_constraint(
376
+ self,
377
+ batch_idx: int,
378
+ input_ids: torch.LongTensor,
379
+ vocab_scores: torch.FloatTensor,
380
+ sent_beam_scores: torch.FloatTensor,
381
+ sent_beam_tokens: torch.LongTensor,
382
+ sent_beam_indices: torch.LongTensor,
383
+ push_progress: bool = False,
384
+ ):
385
+ # sent_beam_tokens are the next {num_beams} number of tokens that are under consideration for this beam
386
+ # (candidate next tokens)
387
+
388
+ # 1. Adding "advance_tokens"
389
+ # using ConstraintStateList.advance(), we propose new tokens to be added into this "candidate list" that will
390
+ # advance us in fulfilling the constraints.
391
+
392
+ # 2. Selecting best candidates such that we end up with highest probable candidates
393
+ # that fulfill our constraints.
394
+
395
+ orig_len = sent_beam_indices.size(0)
396
+ device = sent_beam_indices.device
397
+
398
+ # initialize states
399
+ topk_contraint_states = self.make_constraint_states(orig_len)
400
+ advance_constraint_states = self.make_constraint_states(orig_len)
401
+
402
+ sidx, eidx = batch_idx * orig_len, (batch_idx + 1) * orig_len
403
+ this_batch_input_ids = input_ids[sidx:eidx]
404
+ this_batch_token_scores = vocab_scores[sidx:eidx]
405
+ full_hypotheses = torch.cat((input_ids[sent_beam_indices], sent_beam_tokens.unsqueeze(-1)), dim=-1)
406
+
407
+ # need to make new hypothesis that advance the constraints
408
+ track_new = {
409
+ "new_seqs": full_hypotheses.tolist(),
410
+ "new_states": [],
411
+ "new_indices": [],
412
+ "new_tokens": [],
413
+ "new_scores": [],
414
+ }
415
+ for seq_idx, pre_seq in enumerate(this_batch_input_ids):
416
+ # pre_seq = ith sequence generated before this step.
417
+
418
+ # input_ids -> (topk) generic beam search best model next tokens
419
+ # -> (advance) constraints forcing the next token
420
+ # either way, we need to sort them into "banks" later, so store a "ConstraintListState" for all types of
421
+ # hypotheses.
422
+
423
+ topk_state = topk_contraint_states[seq_idx]
424
+ topk_state.reset(full_hypotheses[seq_idx].tolist())
425
+
426
+ advance_state = advance_constraint_states[seq_idx]
427
+ advance_state.reset(pre_seq.tolist())
428
+
429
+ if not advance_state.completed:
430
+ advance_tokens = torch.tensor(advance_state.advance(), dtype=torch.long, device=device)
431
+ for advance_token in advance_tokens:
432
+ # since adding each `advance_token` leads to a different hypothesis, create new state instance.
433
+ new_state = advance_state.copy(stateful=True)
434
+ new_state.add(advance_token.tolist())
435
+
436
+ advance_seq = torch.cat((pre_seq, advance_token.unsqueeze(0)), -1).tolist()
437
+ if advance_seq not in track_new["new_seqs"]:
438
+ # prevent duplicates, which are basically bound to happen in this process.
439
+ track_new["new_seqs"].append(advance_seq)
440
+ track_new["new_indices"].append(sidx + seq_idx) # idx -> global idx across all the batches
441
+ track_new["new_tokens"].append(advance_token)
442
+ track_new["new_scores"].append(this_batch_token_scores[seq_idx].take(advance_token))
443
+ track_new["new_states"].append(new_state)
444
+ elif push_progress:
445
+ # Basically, `sent_beam_indices` often chooses very little among `input_ids` the generated sequences that
446
+ # actually fulfill our constraints. For example, let constraints == ["loves pies"] and
447
+
448
+ # pre_seq_1 = "The child loves pies and" pre_seq_2 = "The child plays in the playground and"
449
+
450
+ # Without this step, if `sent_beam_indices` is something like [1,1], then
451
+ # 1. `pre_seq_1` won't be added to the list of (topk) hypothesis since it's not in the indices and
452
+ # 2. it won't be added to the list of (advance) hypothesis since it's completed already. (this is
453
+ # the else part of `if constraints_completed[seq_idx]`)
454
+ # 3. it ends up simply getting removed from consideration.
455
+
456
+ # #3 might be fine and actually desired, since it's likely that it's a low-probability output anyways,
457
+ # especially if it's not in the list of `sent_beam_indices`. But this often leads to lengthened beam
458
+ # search times, since completed sequences keep getting removed after all this effort for constrained
459
+ # generation.
460
+
461
+ # Here, we basically take `pre_seq_1` and to "push" it into the considered list of hypotheses, by simply
462
+ # appending the next likely token in the vocabulary and adding it to the list of hypotheses.
463
+
464
+ new_score, new_token = torch.max(this_batch_token_scores[seq_idx], 0) # some next probable token
465
+ advance_seq = torch.cat((pre_seq, new_token.unsqueeze(0)), -1)
466
+
467
+ advance_state = advance_constraint_states[seq_idx]
468
+
469
+ advance_seq = advance_seq.tolist()
470
+
471
+ advance_state.reset(advance_seq)
472
+ if advance_seq not in track_new["new_seqs"]:
473
+ # but still don't want to have duplicates
474
+ track_new["new_seqs"].append(advance_seq)
475
+ track_new["new_indices"].append(seq_idx)
476
+ track_new["new_tokens"].append(new_token)
477
+ track_new["new_scores"].append(new_score)
478
+ track_new["new_states"].append(advance_state)
479
+
480
+ if len(track_new["new_indices"]) > 0:
481
+ new_indices = torch.tensor(track_new["new_indices"], device=device)
482
+ new_tokens = torch.stack(track_new["new_tokens"]).to(device)
483
+ new_scores = torch.stack(track_new["new_scores"]).to(device)
484
+
485
+ all_states = topk_contraint_states + track_new["new_states"]
486
+ all_tokens = torch.cat((sent_beam_tokens, new_tokens), -1)
487
+ all_scores = torch.cat((sent_beam_scores, new_scores), -1)
488
+ all_banks = torch.tensor([one.get_bank() for one in all_states], device=device)
489
+
490
+ zipped = all_banks * 100 + all_scores
491
+ indices = zipped.sort(descending=True).indices
492
+ sorted_banks = all_banks[indices]
493
+
494
+ # Then we end up with {sorted among bank C}, {sorted among bank C-1}, ..., {sorted among bank 0}
495
+
496
+ counter = -1
497
+ cur_bank = sorted_banks[0]
498
+ increments = []
499
+ for bank in sorted_banks:
500
+ if bank == cur_bank:
501
+ counter += 1
502
+ else:
503
+ counter = 0
504
+ cur_bank = bank
505
+ increments.append(counter)
506
+ rearrangers = torch.tensor(np.argsort(increments, kind="mergesort"))
507
+
508
+ indices = indices[rearrangers][:orig_len]
509
+
510
+ sent_beam_scores = all_scores[indices]
511
+ sent_beam_tokens = all_tokens[indices]
512
+ sent_beam_indices = torch.cat((sent_beam_indices, new_indices))[indices]
513
+
514
+ return sent_beam_scores, sent_beam_tokens, sent_beam_indices
515
+
516
+ def finalize(
517
+ self,
518
+ input_ids: torch.LongTensor,
519
+ final_beam_scores: torch.FloatTensor,
520
+ final_beam_tokens: torch.LongTensor,
521
+ final_beam_indices: torch.LongTensor,
522
+ max_length: int,
523
+ pad_token_id: Optional[Union[int, torch.Tensor]] = None,
524
+ eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None,
525
+ beam_indices: Optional[torch.LongTensor] = None,
526
+ decoder_prompt_len: Optional[int] = 0,
527
+ ) -> tuple[torch.LongTensor]:
528
+ batch_size = len(self._beam_hyps)
529
+
530
+ if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor):
531
+ if isinstance(eos_token_id, int):
532
+ eos_token_id = [eos_token_id]
533
+ eos_token_id = torch.tensor(eos_token_id)
534
+
535
+ # finalize all open beam hypotheses and add to generated hypotheses
536
+ for batch_idx, beam_hyp in enumerate(self._beam_hyps):
537
+ if self._done[batch_idx]:
538
+ continue
539
+
540
+ # all open beam hypotheses are added to the beam hypothesis
541
+ # beam hypothesis class automatically keeps the best beams
542
+
543
+ ids_collect = []
544
+ for beam_id in range(self.num_beams):
545
+ batch_beam_idx = batch_idx * self.num_beams + beam_id
546
+ final_score = final_beam_scores[batch_beam_idx].item()
547
+ final_tokens = input_ids[batch_beam_idx]
548
+
549
+ completes_constraint = self.check_completes_constraints(final_tokens.tolist())
550
+ if completes_constraint:
551
+ beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
552
+ generated_len = final_tokens.shape[-1] - decoder_prompt_len
553
+ beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
554
+ ids_collect.append(beam_id)
555
+
556
+ # due to overly complex constraints or other factors, sometimes we can't guarantee a successful
557
+ # generation. In these cases we simply return the highest scoring outputs.
558
+ if len(ids_collect) < self.num_beam_hyps_to_keep:
559
+ for beam_id in range(self.num_beams):
560
+ if beam_id not in ids_collect:
561
+ batch_beam_idx = batch_idx * self.num_beams + beam_id
562
+ final_score = final_beam_scores[batch_beam_idx].item()
563
+ final_tokens = input_ids[batch_beam_idx]
564
+ generated_len = final_tokens.shape[-1] - decoder_prompt_len
565
+ beam_hyp.add(final_tokens, final_score, generated_len=generated_len)
566
+ if len(ids_collect) >= self.num_beam_hyps_to_keep:
567
+ break
568
+
569
+ # select the best hypotheses
570
+ sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
571
+ best = []
572
+ best_indices = []
573
+ best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32)
574
+
575
+ # retrieve best hypotheses
576
+ for i, beam_hyp in enumerate(self._beam_hyps):
577
+ sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
578
+ for j in range(self.num_beam_hyps_to_keep):
579
+ best_hyp_tuple = sorted_hyps.pop()
580
+ best_score = best_hyp_tuple[0]
581
+ best_hyp = best_hyp_tuple[1]
582
+ best_index = best_hyp_tuple[2]
583
+ sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
584
+
585
+ # append to lists
586
+ best.append(best_hyp)
587
+
588
+ # append indices to list
589
+ best_indices.append(best_index)
590
+
591
+ best_scores[i * self.num_beam_hyps_to_keep + j] = best_score
592
+
593
+ # prepare for adding eos
594
+ sent_lengths_max = sent_lengths.max().item() + 1
595
+
596
+ sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max
597
+ decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
598
+
599
+ if len(best_indices) > 0 and best_indices[0] is not None:
600
+ indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
601
+ else:
602
+ indices = None
603
+
604
+ # shorter batches are padded if needed
605
+ if sent_lengths.min().item() != sent_lengths.max().item():
606
+ if pad_token_id is None:
607
+ raise ValueError("`pad_token_id` has to be defined")
608
+ decoded.fill_(pad_token_id)
609
+
610
+ if indices is not None:
611
+ indices.fill_(-1)
612
+
613
+ # fill with hypotheses and eos_token_id if the latter fits in
614
+ for i, (hypo, best_idx) in enumerate(zip(best, best_indices)):
615
+ decoded[i, : sent_lengths[i]] = hypo
616
+
617
+ if indices is not None:
618
+ indices[i, : len(best_idx)] = torch.tensor(best_idx)
619
+
620
+ if sent_lengths[i] < sent_max_len:
621
+ # inserting only the first eos_token_id
622
+ decoded[i, sent_lengths[i]] = eos_token_id[0]
623
+
624
+ return UserDict(
625
+ {
626
+ "sequences": decoded,
627
+ "sequence_scores": best_scores,
628
+ "beam_indices": indices,
629
+ }
630
+ )
+
+
+ class BeamHypotheses:
+     def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool, max_length: Optional[int] = None):
+         """
+         Initialize n-best list of hypotheses.
+         """
+         self.length_penalty = length_penalty
+         self.early_stopping = early_stopping
+         self.max_length = max_length
+         self.num_beams = num_beams
+         self.beams = []
+         self.worst_score = 1e9
+
+         if not isinstance(self.early_stopping, bool) and self.max_length is None:
+             raise ValueError(
+                 "When `do_early_stopping` is set to a string, `max_length` must be defined. Ensure it is passed to the"
+                 " BeamScorer class instance at initialization time."
+             )
+
+     def __len__(self):
+         """
+         Number of hypotheses in the list.
+         """
+         return len(self.beams)
+
+     def add(
+         self,
+         hyp: torch.LongTensor,
+         sum_logprobs: float,
+         beam_indices: Optional[torch.LongTensor] = None,
+         generated_len: Optional[int] = None,
+     ):
+         """
+         Add a new hypothesis to the list.
+         """
+         if generated_len is not None:
+             score = sum_logprobs / (generated_len**self.length_penalty)
+         # This 'else' case exists for retrocompatibility
+         else:
+             score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
+
+         if len(self) < self.num_beams or score > self.worst_score:
+             self.beams.append((score, hyp, beam_indices))
+             if len(self) > self.num_beams:
+                 sorted_next_scores = sorted([(s, idx) for idx, (s, _, _) in enumerate(self.beams)])
+                 del self.beams[sorted_next_scores[0][1]]
+                 self.worst_score = sorted_next_scores[1][0]
+             else:
+                 self.worst_score = min(score, self.worst_score)
+
+     def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: Optional[int] = 0) -> bool:
+         """
+         If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
+         one in the heap, then we are done with this sentence.
+         """
+
+         if len(self) < self.num_beams:
+             return False
+
+         # `True`: stop as soon as at least `num_beams` hypotheses are finished
+         if self.early_stopping is True:
+             return True
+         # `False`: heuristic -- compute best possible score from `cur_len`, even though it is not entirely accurate
+         # when `length_penalty` is positive. See the discussion below for more details.
+         # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
+         elif self.early_stopping is False:
+             highest_attainable_score = best_sum_logprobs / (cur_len - decoder_prompt_len) ** self.length_penalty
+             ret = self.worst_score >= highest_attainable_score
+             return ret
+         # `"never"`: compute the best possible score, depending on the sign of `length_penalty`
+         else:
+             # `length_penalty` > 0.0 -> max denominator is obtained from `max_length`, not from `cur_len` -> min
+             # abs(`highest_attainable_score`) is obtained -> `highest_attainable_score` is negative, hence we obtain
+             # its max this way
+             if self.length_penalty > 0.0:
+                 if self.max_length <= decoder_prompt_len:
+                     raise ValueError("max_length is not larger than decoder prompt length")
+                 highest_attainable_score = (
+                     best_sum_logprobs / (self.max_length - decoder_prompt_len) ** self.length_penalty
+                 )
+             # the opposite logic applies here (max `highest_attainable_score` from `cur_len`)
+             else:
+                 highest_attainable_score = best_sum_logprobs / (cur_len - decoder_prompt_len) ** self.length_penalty
+             ret = self.worst_score >= highest_attainable_score
+             return ret
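
The score used by `BeamHypotheses.add` above is `sum_logprobs / generated_len ** length_penalty`. A minimal, self-contained sketch (with illustrative numbers, not model outputs) of how the penalty trades off short against long hypotheses:

```python
# Illustrative only: the length-penalized score used when adding beam hypotheses.
def beam_score(sum_logprobs: float, generated_len: int, length_penalty: float) -> float:
    return sum_logprobs / (generated_len ** length_penalty)

short_hyp = (-3.0, 4)   # (sum of log-probs, generated length); same per-token quality as the long one
long_hyp = (-9.0, 12)

for lp in (0.5, 1.0, 2.0):
    print(f"length_penalty={lp}: short={beam_score(*short_hyp, lp):.3f}  long={beam_score(*long_hyp, lp):.3f}")
# length_penalty < 1.0 favors the short hypothesis, > 1.0 the long one; at 1.0 they tie here.
```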
custom_generate/generate.py ADDED
@@ -0,0 +1,337 @@
+ from typing import Union
+ import torch
+ from transformers import LogitsProcessorList, StoppingCriteriaList, GenerationConfig
+ from transformers.generation.utils import (
+     GenerateBeamOutput,
+     GenerationMixin,
+     GenerateBeamDecoderOnlyOutput,
+     GenerateBeamEncoderDecoderOutput,
+ )
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ import logging
+
+ from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint
+ from .beam_search import ConstrainedBeamSearchScorer
+
+ logger = logging.getLogger(__name__)
+
+
+ def _constrained_beam_search(
+     model,
+     input_ids: torch.LongTensor,
+     logits_processor: LogitsProcessorList,
+     stopping_criteria: StoppingCriteriaList,
+     generation_config: GenerationConfig,
+     synced_gpus: bool,
+     **model_kwargs,
+ ) -> Union[GenerateBeamOutput, torch.LongTensor]:
+     r"""
+     Generates sequences of token ids for models with a language modeling head using **constrained beam search
+     decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+     Parameters:
+         input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
+             The sequence used as a prompt for the generation.
+         logits_processor (`LogitsProcessorList`):
+             An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+             used to modify the prediction scores of the language modeling head applied at each generation step.
+         stopping_criteria (`StoppingCriteriaList`):
+             An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+             used to tell if the generation loop should stop.
+         generation_config ([`~generation.GenerationConfig`]):
+             The generation configuration to be used as parametrization of the decoding method.
+         synced_gpus (`bool`):
+             Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+             `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+         model_kwargs:
+             Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+             an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+     Return:
+         [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
+         `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+         [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+         `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
+         `model.config.is_encoder_decoder=True`.
+     """
+     if generation_config.constraints is not None or generation_config.force_words_ids is not None:
+         constrained_wrong_parameter_msg = (
+             "one of `constraints`, `force_words_ids` is not `None`, triggering constrained beam search. "
+             "However, `{flag_name}` is set to `{flag_value}`, which is incompatible with this generation "
+             "mode. Set `constraints` and `force_words_ids` to `None` or unset `{flag_name}` to continue."
+         )
+         if generation_config.do_sample is True:
+             raise ValueError(
+                 constrained_wrong_parameter_msg.format(flag_name="do_sample", flag_value=generation_config.do_sample)
+             )
+
+     final_constraints = []
+     if generation_config.constraints is not None:
+         final_constraints = generation_config.constraints
+
+     if generation_config.force_words_ids is not None:
+
+         def typeerror():
+             raise ValueError(
+                 "`force_words_ids` has to either be a `list[list[list[int]]]` or `list[list[int]]` "
+                 f"of positive integers, but is {generation_config.force_words_ids}."
+             )
+
+         if (
+             not isinstance(generation_config.force_words_ids, list)
+             or len(generation_config.force_words_ids) == 0
+         ):
+             typeerror()
+
+         for word_ids in generation_config.force_words_ids:
+             if isinstance(word_ids[0], list):
+                 if not isinstance(word_ids, list) or len(word_ids) == 0:
+                     typeerror()
+                 if any(not isinstance(token_ids, list) for token_ids in word_ids):
+                     typeerror()
+                 if any(
+                     any((not isinstance(token_id, int) or token_id < 0) for token_id in token_ids)
+                     for token_ids in word_ids
+                 ):
+                     typeerror()
+
+                 constraint = DisjunctiveConstraint(word_ids)
+             else:
+                 if not isinstance(word_ids, list) or len(word_ids) == 0:
+                     typeerror()
+                 if any((not isinstance(token_id, int) or token_id < 0) for token_id in word_ids):
+                     typeerror()
+
+                 constraint = PhrasalConstraint(word_ids)
+             final_constraints.append(constraint)
+
+     # `input_ids` is documented above as already expanded to (batch_size * num_beams, sequence_length)
+     batch_size = input_ids.shape[0] // generation_config.num_beams
+
+     # define beam scorer
+     constrained_beam_scorer = ConstrainedBeamSearchScorer(
+         constraints=final_constraints,
+         batch_size=batch_size,
+         num_beams=generation_config.num_beams,
+         device=input_ids.device,
+         length_penalty=generation_config.length_penalty,
+         do_early_stopping=generation_config.early_stopping,
+         num_beam_hyps_to_keep=generation_config.num_return_sequences,
+         max_length=generation_config.max_length,
+     )
+     # init values
+     pad_token_id = generation_config._pad_token_tensor
+     eos_token_id = generation_config._eos_token_tensor
+     output_attentions = generation_config.output_attentions
+     output_hidden_states = generation_config.output_hidden_states
+     output_scores = generation_config.output_scores
+     output_logits = generation_config.output_logits
+     return_dict_in_generate = generation_config.return_dict_in_generate
+
+     batch_size = len(constrained_beam_scorer._beam_hyps)
+     num_beams = constrained_beam_scorer.num_beams
+
+     batch_beam_size, cur_len = input_ids.shape[:2]
+     model_kwargs = model._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
+
+     if num_beams * batch_size != batch_beam_size:
+         raise ValueError(
+             f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+         )
+
+     # init attention / hidden states / scores tuples
+     scores = () if (return_dict_in_generate and output_scores) else None
+     raw_logits = () if (return_dict_in_generate and output_logits) else None
+     beam_indices = (
+         tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
+     )
+     decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+     cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+     decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+     # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+     if return_dict_in_generate and model.config.is_encoder_decoder:
+         encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+         encoder_hidden_states = (
+             model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+         )
+
+     # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
+     # of the first beam are considered to avoid sampling the exact same tokens across all beams.
+     beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+     beam_scores[:, 1:] = -1e9
+     beam_scores = beam_scores.view((batch_size * num_beams,))
+
+     this_peer_finished = False
+
+     decoder_prompt_len = input_ids.shape[1]  # record the prompt length of decoder
+     while model._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
+         model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+         # prepare variable output controls (note: some models won't accept all output controls)
+         model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
+         model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
+
+         outputs = model(**model_inputs, return_dict=True)
+
+         # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
+         model_kwargs = model._update_model_kwargs_for_generation(
+             outputs,
+             model_kwargs,
+             is_encoder_decoder=model.config.is_encoder_decoder,
+         )
+         if synced_gpus and this_peer_finished:
+             cur_len = cur_len + 1
+             continue
+
+         # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
+         # (the clone itself is always small)
+         # .float() is needed to retain precision for later logits manipulations
+         next_token_logits = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=input_ids.device)
+         next_token_scores = nn.functional.log_softmax(
+             next_token_logits, dim=-1
+         )  # (batch_size * num_beams, vocab_size)
+
+         next_token_scores_processed = logits_processor(input_ids, next_token_scores)
+
+         next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
+             next_token_scores_processed
+         )
+
+         scores_for_all_vocab = next_token_scores.clone()
+
+         # Store scores, attentions and hidden_states when required
+         if return_dict_in_generate:
+             if output_scores:
+                 scores += (next_token_scores,)
+             if output_logits:
+                 raw_logits += (next_token_logits,)
+             if output_attentions:
+                 decoder_attentions += (
+                     (outputs.decoder_attentions,) if model.config.is_encoder_decoder else (outputs.attentions,)
+                 )
+                 if model.config.is_encoder_decoder:
+                     cross_attentions += (outputs.cross_attentions,)
+
+             if output_hidden_states:
+                 decoder_hidden_states += (
+                     (outputs.decoder_hidden_states,)
+                     if model.config.is_encoder_decoder
+                     else (outputs.hidden_states,)
+                 )
+
+         # reshape for beam search
+         vocab_size = next_token_scores.shape[-1]
+         next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
+
+         # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
+         n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
+         next_token_scores, next_tokens = torch.topk(
+             next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
+         )
+
+         next_indices = (next_tokens / vocab_size).long()
+         next_tokens = next_tokens % vocab_size
+
+         # stateless
+         beam_outputs = constrained_beam_scorer.process(
+             input_ids,
+             next_token_scores,
+             next_tokens,
+             next_indices,
+             scores_for_all_vocab,
+             pad_token_id=pad_token_id,
+             eos_token_id=eos_token_id,
+             beam_indices=beam_indices,
+             decoder_prompt_len=decoder_prompt_len,
+         )
+         beam_scores = beam_outputs["next_beam_scores"]
+         beam_next_tokens = beam_outputs["next_beam_tokens"]
+         beam_idx = beam_outputs["next_beam_indices"]
+
+         input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+
+         # This is needed to properly delete outputs.logits which may be very large for first iteration
+         # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
+         # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory
+         # (that way the memory peak does not include outputs.logits)
+         del outputs
+
+         # NOTE: we need to check if `model._reorder_cache` exists for special models like RAG, RecurrentGemma etc.
+         if model_kwargs.get("past_key_values", None) is not None:
+             if hasattr(model, "_reorder_cache"):
+                 model_kwargs["past_key_values"] = model._reorder_cache(model_kwargs["past_key_values"], beam_idx)
+             else:
+                 model_kwargs["past_key_values"].reorder_cache(beam_idx)
+
+         if return_dict_in_generate and output_scores:
+             beam_indices = tuple(beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))
+
+         # increase cur_len
+         cur_len = cur_len + 1
+
+         if constrained_beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
+             this_peer_finished = True
+
+     sequence_outputs = constrained_beam_scorer.finalize(
+         input_ids,
+         beam_scores,
+         next_tokens,
+         next_indices,
+         pad_token_id=pad_token_id,
+         eos_token_id=eos_token_id,
+         max_length=stopping_criteria.max_length,
+         beam_indices=beam_indices,
+         decoder_prompt_len=decoder_prompt_len,
+     )
+
+     if return_dict_in_generate:
+         if not output_scores:
+             sequence_outputs["sequence_scores"] = None
+         if model.config.is_encoder_decoder:
+             return GenerateBeamEncoderDecoderOutput(
+                 sequences=sequence_outputs["sequences"],
+                 sequences_scores=sequence_outputs["sequence_scores"],
+                 scores=scores,
+                 logits=raw_logits,
+                 beam_indices=sequence_outputs["beam_indices"],
+                 encoder_attentions=encoder_attentions,
+                 encoder_hidden_states=encoder_hidden_states,
+                 decoder_attentions=decoder_attentions,
+                 cross_attentions=cross_attentions,
+                 decoder_hidden_states=decoder_hidden_states,
+                 past_key_values=model_kwargs.get("past_key_values"),
+             )
+         else:
+             return GenerateBeamDecoderOnlyOutput(
+                 sequences=sequence_outputs["sequences"],
+                 sequences_scores=sequence_outputs["sequence_scores"],
+                 scores=scores,
+                 logits=raw_logits,
+                 beam_indices=sequence_outputs["beam_indices"],
+                 attentions=decoder_attentions,
+                 hidden_states=decoder_hidden_states,
+                 past_key_values=model_kwargs.get("past_key_values"),
+             )
+     else:
+         return sequence_outputs["sequences"]
+
+
+ def generate(model, *args, **kwargs):
+     """Custom generate function for constrained beam search decoding.
+
+     Args:
+         model (`PreTrainedModel`):
+             The model to generate from.
+         num_beams (`int`): The number of beams to use for beam search.
+         constraints (`list[Constraint]`, *optional*):
+             Custom constraints that can be added to the generation to ensure that the output will contain the use
+             of certain tokens as defined by `Constraint` objects, in the most sensible way possible.
+         force_words_ids (`list[list[int]]` or `list[list[list[int]]]`, *optional*):
+             List of token ids that must be generated. If given a `list[list[int]]`, this is treated as a simple
+             list of words that must be included, the opposite of `bad_words_ids`. If given `list[list[list[int]]]`,
+             this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081),
+             where one can allow different forms of each word.
+         length_penalty (`float`): The length penalty to use for beam search.
+         early_stopping (`bool`): Whether to stop beam search when sufficient beams have finished.
+         num_return_sequences (`int`): The number of sequences to return.
+         max_length (`int`): The maximum length of the generated sequence.
+     """
+     generation_outputs = GenerationMixin.generate(
+         model, *args, custom_generate=_constrained_beam_search, **kwargs
+     )
+     return generation_outputs
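
A hedged usage sketch for the entry point above. The repo id passed to `custom_generate` is a placeholder for this repository's Hub id, the prompt and constraint words are illustrative, and loading a custom decoding loop from the Hub requires `trust_remote_code=True`.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("The weather today is", return_tensors="pt")

# constraint phrases are tokenized without special tokens
force_words_ids = tokenizer(["sunny"], add_special_tokens=False).input_ids

outputs = model.generate(
    **inputs,
    custom_generate="<this-repo-id>",  # placeholder: the Hub id of this repository
    trust_remote_code=True,
    num_beams=4,
    do_sample=False,  # the loop above raises if sampling is enabled
    force_words_ids=force_words_ids,
    max_new_tokens=30,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```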
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.6,
+   "top_k": 20,
+   "top_p": 0.95,
+   "transformers_version": "4.56.0"
+ }
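
These defaults enable sampling (`do_sample: true` with temperature/top-k/top-p), while the constrained loop in `custom_generate/generate.py` raises if `do_sample` is true. A minimal sketch of overriding the shipped defaults at call time, assuming the base model id:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("Qwen/Qwen3-0.6B")
gen_cfg.do_sample = False        # required: constrained beam search is incompatible with sampling
gen_cfg.num_beams = 4
gen_cfg.num_return_sequences = 1
print(gen_cfg.do_sample, gen_cfg.num_beams)  # False 4
```

The overridden config can then be passed along as `generate(..., generation_config=gen_cfg)`.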
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f47f71177f32bcd101b7573ec9171e6a57f4f4d31148d38e382306f42996874b
+ size 1503300328
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = '' %}\n    {%- endif %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
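
A small sketch of preparing constraint ids with this tokenizer (`Qwen2Tokenizer`, eos `<|im_end|>`, pad `<|endoftext|>` per the config above). Phrases are encoded with `add_special_tokens=False` so only the phrase itself is forced; the example words are illustrative:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

# list[list[int]]: every phrase must appear somewhere in the output
phrase_ids = tokenizer(["very fast"], add_special_tokens=False).input_ids

# list[list[list[int]]]: any one surface form per group satisfies the constraint
disjunctive_ids = [tokenizer(["rain", "raining"], add_special_tokens=False).input_ids]

print(phrase_ids)
print(disjunctive_ids)
print(tokenizer.eos_token, tokenizer.pad_token)  # <|im_end|> <|endoftext|>
```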
vocab.json ADDED
The diff for this file is too large to render. See raw diff