Commit: a1c704a
Parent(s): c3931e9
Update with h2oGPT hash 05d3ad444971c24fb021ea80c27f867c7a953699

Files changed:
- client_test.py +4 -2
- finetune.py +60 -10
- generate.py +98 -83
- gradio_runner.py +26 -10
- prompter.py +6 -5
- requirements.txt +1 -1
- stopping.py +49 -6
    	
client_test.py
CHANGED

@@ -53,13 +53,16 @@ def get_client():


 def test_client_basic():
+    return run_client_basic(instruction_nochat='Who are you?', prompt_type='human_bot')
+
+
+def run_client_basic(instruction_nochat, prompt_type):
     instruction = ''  # only for chat=True
     iinput = ''  # only for chat=True
     context = ''
     # streaming output is supported, loops over and outputs each generation in streaming mode
     # but leave stream_output=False for simple input/output mode
     stream_output = False
-    prompt_type = 'human_bot'
     temperature = 0.1
     top_p = 0.75
     top_k = 40
@@ -73,7 +76,6 @@ def test_client_basic():
     do_sample = True
     # only these 2 below used if pass chat=False
     chat = False
-    instruction_nochat = "Who are you?"
     iinput_nochat = ''

     args = [instruction,
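Side note (not part of the commit): the test above is now a thin wrapper, so other questions or prompt types can be exercised without duplicating the setup. A minimal sketch, assuming client_test.py is importable and the server it targets is running:

    # hypothetical usage of the run_client_basic helper introduced above
    from client_test import run_client_basic

    # same call the refactored test_client_basic makes
    run_client_basic(instruction_nochat='Who are you?', prompt_type='human_bot')
    # reuse the helper with a different question
    run_client_basic(instruction_nochat='What is Driverless AI?', prompt_type='human_bot')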
    	
finetune.py
CHANGED

@@ -28,6 +28,8 @@ class PromptType(Enum):
     instruct_vicuna = 7
     instruct_with_end = 8
     human_bot_orig = 9
+    prompt_answer = 10
+    open_assistant = 11


 prompt_type_to_model_name = {
@@ -46,6 +48,14 @@ prompt_type_to_model_name = {
         'philschmid/flan-t5-base-samsum',
         'gpt2',
         'distilgpt2',
+        'mosaicml/mpt-7b-storywriter',
+        'mosaicml/mpt-7b-instruct',  # internal code handles instruct
+        'mosaicml/mpt-7b-chat',  # NC, internal code handles instruct
+    ],
+    'prompt_answer': [
+        'h2oai/h2ogpt-gm-oasst1-en-1024-20b',
+        'h2oai/h2ogpt-gm-oasst1-en-1024-12b',
+        'h2oai/h2ogpt-gm-oasst1-multilang-1024-20b',
     ],
     'instruct': [],
     'instruct_with_end': ['databricks/dolly-v2-12b'],
@@ -61,14 +71,12 @@ prompt_type_to_model_name = {
     'simple_instruct': ['t5-small', 't5-large', 'google/flan-t5', 'google/flan-t5-xxl', 'google/flan-ul2'],
     'instruct_vicuna': ['AlekseyKorshuk/vicuna-7b'],
     'human_bot_orig': ['togethercomputer/GPT-NeoXT-Chat-Base-20B'],
+    "open_assistant": ['OpenAssistant/oasst-sft-7-llama-30b-xor', 'oasst-sft-7-llama-30b'],
 }

 inv_prompt_type_to_model_name = {v.strip(): k for k, l in prompt_type_to_model_name.items() for v in l}
 inv_prompt_type_to_model_lower = {v.strip().lower(): k for k, l in prompt_type_to_model_name.items() for v in l}

-human = '<human>:'
-bot = "<bot>:"
-
 prompt_types_strings = []
 for p in PromptType:
     prompt_types_strings.extend([p.name])
@@ -277,8 +285,13 @@ def train(
                 layer_norm_names=["layer_norm", "layernorm"],  # keep all layer norms in higher precision
             )

-    from peft import LoraConfig, get_peft_model, set_peft_model_state_dict
-
+    from peft import LoraConfig, get_peft_model, set_peft_model_state_dict
+    try:
+        from peft import utils
+        lora_mappings = utils.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
+    except AttributeError:
+        from peft import mapping
+        lora_mappings = mapping.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
     lora_mappings['distilgpt2'] = ["c_attn"]

     if lora_weights:
@@ -730,10 +743,10 @@ def generate_and_tokenize_prompt(data_point, prompt_type=None, train_on_inputs=F
     assert prompt_type is not None
     assert cutoff_len is not None
     assert tokenizer is not None
-    full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
+    full_prompt, _, _, _ = generate_prompt(data_point, prompt_type, False, False)
     tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
     if not train_on_inputs:
-        user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
+        user_prompt, _, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
         tokenized_user_prompt = tokenize(user_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
         user_prompt_len = len(tokenized_user_prompt["input_ids"])
         if add_eos_token:
@@ -752,9 +765,11 @@ def get_prompt(prompt_type, chat, context, reduced):
     if prompt_type in [-1, "-1", "plain"]:
         promptA = promptB = PreInstruct = PreInput = PreResponse = ''
         terminate_response = []
+        chat_sep = ''
     elif prompt_type == 'simple_instruct':
         promptA = promptB = PreInstruct = PreInput = PreResponse = None
         terminate_response = []
+        chat_sep = '\n'
     elif prompt_type in [0, "0", "instruct"] or prompt_type in [7, "7", "instruct_with_end"]:
         promptA = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
         promptB = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n' if not (chat and reduced) else ''
@@ -774,6 +789,7 @@ def get_prompt(prompt_type, chat, context, reduced):
             terminate_response = ['### End']
         else:
             terminate_response = None
+        chat_sep = '\n'
     elif prompt_type in [1, "1", "quality"]:
         promptA = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction as applied on the Input.\n' if not (chat and reduced) else ''
         promptB = 'Write a detailed high-quality, accurate, fair, Response with about 100 words by following the Instruction.\n' if not (chat and reduced) else ''
@@ -790,7 +806,10 @@ def get_prompt(prompt_type, chat, context, reduced):
 ### Response:
 """
         terminate_response = None
+        chat_sep = '\n'
     elif prompt_type in [2, "2", "human_bot", 9, "9", "human_bot_orig"]:
+        human = '<human>:'
+        bot = "<bot>:"
         if reduced or context or prompt_type in [2, "2", "human_bot"]:
             preprompt = ''
         else:
@@ -819,6 +838,7 @@ Current Time: {}
             PreResponse = bot

         terminate_response = [start, PreResponse]
+        chat_sep = '\n'
     elif prompt_type in [3, "3", "dai_faq"]:
         promptA = ''
         promptB = 'Answer the following Driverless AI question.\n'
@@ -833,11 +853,13 @@ Current Time: {}
 ### Driverless AI documentation answer:
 """
         terminate_response = ['\n\n']
+        chat_sep = terminate_response
     elif prompt_type in [5, "5", "summarize"]:
         promptA = promptB = PreInput = ''
         PreInstruct = '## Main Text\n\n'
         PreResponse = '\n\n## Summary\n\n'
         terminate_response = None
+        chat_sep = '\n'
     elif prompt_type in [6, "6", "instruct_vicuna"]:
         promptA = promptB = "A chat between a curious human and an artificial intelligence assistant. " \
             "The assistant gives helpful, detailed, and polite answers to the human's questions." if not (chat and reduced) else ''
@@ -852,10 +874,37 @@ Current Time: {}
 ### Assistant:
 """
         terminate_response = ['### Human:']  # but only allow terminate after prompt is found correctly, else can't terminate
+        chat_sep = '\n'
+    elif prompt_type in [10, "10", "prompt_answer"]:
+        preprompt = ''
+        prompt_tokens = "<|prompt|>"
+        answer_tokens = "<|answer|>"
+        start = prompt_tokens
+        promptB = promptA = '%s%s' % (preprompt, start)
+        PreInstruct = ""
+        PreInput = None
+        PreResponse = answer_tokens
+        eos = '<|endoftext|>'  # neox eos
+        terminate_response = [start, PreResponse, eos]
+        chat_sep = eos
+    elif prompt_type in [11, "11", "open_assistant"]:
+        # From added_tokens.json
+        preprompt = ''
+        prompt_tokens = "<|prompter|>"
+        answer_tokens = "<|assistant|>"
+        start = prompt_tokens
+        promptB = promptA = '%s%s' % (preprompt, start)
+        PreInstruct = ""
+        PreInput = None
+        PreResponse = answer_tokens
+        pend = "<|prefix_end|>"
+        eos = "</s>"
+        terminate_response = [start, PreResponse, pend, eos]
+        chat_sep = eos
     else:
         raise RuntimeError("No such prompt_type=%s" % prompt_type)

-    return promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response
+    return promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response, chat_sep


 def generate_prompt(data_point, prompt_type, chat, reduced):
@@ -867,7 +916,8 @@ def generate_prompt(data_point, prompt_type, chat, reduced):
     output = data_point.get('output')
     prompt_type = data_point.get('prompt_type', prompt_type)
     assert prompt_type in prompt_types, "Bad prompt type: %s" % prompt_type
-    promptA, promptB, PreInstruct, PreInput, PreResponse, terminate_response = get_prompt(prompt_type, chat, context, reduced)
+    promptA, promptB, PreInstruct, PreInput, PreResponse, \
+        terminate_response, chat_sep = get_prompt(prompt_type, chat, context, reduced)

     prompt = context if not reduced else ''

@@ -919,7 +969,7 @@ def generate_prompt(data_point, prompt_type, chat, reduced):
     if output:
         prompt += f"""{output}"""

-    return prompt, pre_response, terminate_response
+    return prompt, pre_response, terminate_response, chat_sep


 def inject_newline(prompt_type, prompt):
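As a side note (illustrative only, not code from this commit): with the values chosen in the new get_prompt branches above, the two new prompt types simply wrap the instruction in their special tokens, and chat turns are separated by the model's EOS string rather than a newline:

    # sketch based on the token strings set in get_prompt above
    def render(prompt_token, answer_token, instruction):
        # promptA/promptB reduce to the prompt token; PreResponse is the answer token
        return f"{prompt_token}{instruction}{answer_token}"

    print(render("<|prompt|>", "<|answer|>", "Who are you?"))
    # '<|prompt|>Who are you?<|answer|>'        (prompt_answer; chat_sep = '<|endoftext|>')
    print(render("<|prompter|>", "<|assistant|>", "Who are you?"))
    # '<|prompter|>Who are you?<|assistant|>'   (open_assistant; chat_sep = '</s>')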
    	
        generate.py
    CHANGED
    
    | 
         @@ -9,7 +9,7 @@ from datetime import datetime 
     | 
|
| 9 | 
         
             
            import filelock
         
     | 
| 10 | 
         
             
            import psutil
         
     | 
| 11 | 
         | 
| 12 | 
         
            -
            from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread
         
     | 
| 13 | 
         | 
| 14 | 
         
             
            SEED = 1236
         
     | 
| 15 | 
         
             
            set_seed(SEED)
         
     | 
| 
         @@ -22,13 +22,13 @@ import pandas as pd 
     | 
|
| 22 | 
         
             
            import fire
         
     | 
| 23 | 
         
             
            import torch
         
     | 
| 24 | 
         
             
            from peft import PeftModel
         
     | 
| 25 | 
         
            -
            from transformers import GenerationConfig,  
     | 
| 26 | 
         
             
            from accelerate import init_empty_weights, infer_auto_device_map
         
     | 
| 27 | 
         | 
| 28 | 
         
             
            from prompter import Prompter
         
     | 
| 29 | 
         | 
| 30 | 
         
            -
            from finetune import get_loaders, example_data_points, generate_prompt,  
     | 
| 31 | 
         
            -
            from stopping import  
     | 
| 32 | 
         | 
| 33 | 
         
             
            eval_extra_columns = ['prompt', 'response', 'score']
         
     | 
| 34 | 
         | 
| 
         @@ -62,6 +62,7 @@ def main( 
     | 
|
| 62 | 
         
             
                    local_files_only: bool = False,
         
     | 
| 63 | 
         
             
                    resume_download: bool = True,
         
     | 
| 64 | 
         
             
                    use_auth_token: Union[str, bool] = False,
         
     | 
| 
         | 
|
| 65 | 
         | 
| 66 | 
         
             
                    src_lang: str = "English",
         
     | 
| 67 | 
         
             
                    tgt_lang: str = "Russian",
         
     | 
| 
         @@ -124,6 +125,7 @@ def main( 
     | 
|
| 124 | 
         
             
                :param local_files_only: whether to only use local files instead of doing to HF for models
         
     | 
| 125 | 
         
             
                :param resume_download: whether to resume downloads from HF for models
         
     | 
| 126 | 
         
             
                :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
         
     | 
| 
         | 
|
| 127 | 
         
             
                :param src_lang: source languages to include if doing translation (None = all)
         
     | 
| 128 | 
         
             
                :param tgt_lang: target languages to include if doing translation (None = all)
         
     | 
| 129 | 
         
             
                :param gradio: whether to enable gradio, or to enable benchmark mode
         
     | 
| 
         @@ -168,15 +170,22 @@ def main( 
     | 
|
| 168 | 
         | 
| 169 | 
         
             
                if is_public:
         
     | 
| 170 | 
         
             
                    input_lines = 1  # ensure set, for ease of use
         
     | 
| 171 | 
         
            -
                    temperature = 0.2
         
     | 
| 172 | 
         
            -
                    top_p = 0.85
         
     | 
| 173 | 
         
            -
                    top_k = 70
         
     | 
| 174 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 175 | 
         
             
                    if is_low_mem:
         
     | 
| 176 | 
         
            -
                         
     | 
| 177 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 178 | 
         
             
                    else:
         
     | 
| 179 | 
         
            -
                        base_model = 'h2oai/h2ogpt-oasst1-512-20b'
         
     | 
| 180 | 
         
             
                if is_low_mem:
         
     | 
| 181 | 
         
             
                    load_8bit = True
         
     | 
| 182 | 
         
             
                if is_hf:
         
     | 
| 
         @@ -229,6 +238,11 @@ def main( 
     | 
|
| 229 | 
         
             
                                        do_sample,
         
     | 
| 230 | 
         
             
                                        )
         
     | 
| 231 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 232 | 
         
             
                if not gradio:
         
     | 
| 233 | 
         
             
                    if eval_sharegpt_prompts_only > 0:
         
     | 
| 234 | 
         
             
                        # override default examples with shareGPT ones for human-level eval purposes only
         
     | 
| 
         @@ -416,7 +430,11 @@ def get_device(): 
     | 
|
| 416 | 
         | 
| 417 | 
         
             
            def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
         
     | 
| 418 | 
         
             
                                   gpu_id=0,
         
     | 
| 419 | 
         
            -
                                   use_auth_token=False 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 420 | 
         
             
                """
         
     | 
| 421 | 
         
             
                Ensure model gets on correct device
         
     | 
| 422 | 
         
             
                :param base_model:
         
     | 
| 
         @@ -426,29 +444,47 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward 
     | 
|
| 426 | 
         
             
                :param reward_type:
         
     | 
| 427 | 
         
             
                :param gpu_id:
         
     | 
| 428 | 
         
             
                :param use_auth_token:
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 429 | 
         
             
                :return:
         
     | 
| 430 | 
         
             
                """
         
     | 
| 431 | 
         
             
                with init_empty_weights():
         
     | 
| 432 | 
         
             
                    from transformers import AutoConfig
         
     | 
| 433 | 
         
            -
                    config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token 
     | 
| 434 | 
         
            -
             
     | 
| 435 | 
         
            -
             
     | 
| 436 | 
         
            -
             
     | 
| 437 | 
         
            -
             
     | 
| 438 | 
         
            -
             
     | 
| 439 | 
         
            -
             
     | 
| 440 | 
         
            -
             
     | 
| 441 | 
         
            -
             
     | 
| 442 | 
         
            -
                     
     | 
| 443 | 
         
            -
             
     | 
| 444 | 
         
            -
             
     | 
| 445 | 
         
            -
             
     | 
| 446 | 
         
            -
                     
     | 
| 447 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 448 | 
         
             
                        dtype=torch.float16 if load_half else torch.float32,
         
     | 
| 449 | 
         
             
                    )
         
     | 
| 450 | 
         
            -
                     
     | 
| 451 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 452 | 
         | 
| 453 | 
         
             
                n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
         
     | 
| 454 | 
         | 
| 
         @@ -472,11 +508,13 @@ def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward 
     | 
|
| 472 | 
         
             
                if load_in_8bit or not load_half:
         
     | 
| 473 | 
         
             
                    model = model_loader.from_pretrained(
         
     | 
| 474 | 
         
             
                        base_model,
         
     | 
| 
         | 
|
| 475 | 
         
             
                        **model_kwargs,
         
     | 
| 476 | 
         
             
                    )
         
     | 
| 477 | 
         
             
                else:
         
     | 
| 478 | 
         
             
                    model = model_loader.from_pretrained(
         
     | 
| 479 | 
         
             
                        base_model,
         
     | 
| 
         | 
|
| 480 | 
         
             
                        **model_kwargs,
         
     | 
| 481 | 
         
             
                    ).half()
         
     | 
| 482 | 
         
             
                return model
         
     | 
| 
         @@ -495,6 +533,7 @@ def get_model( 
     | 
|
| 495 | 
         
             
                    local_files_only: bool = False,
         
     | 
| 496 | 
         
             
                    resume_download: bool = True,
         
     | 
| 497 | 
         
             
                    use_auth_token: Union[str, bool] = False,
         
     | 
| 
         | 
|
| 498 | 
         
             
                    compile: bool = True,
         
     | 
| 499 | 
         
             
                    **kwargs,
         
     | 
| 500 | 
         
             
            ):
         
     | 
| 
         @@ -513,6 +552,7 @@ def get_model( 
     | 
|
| 513 | 
         
             
                :param local_files_only: use local files instead of from HF
         
     | 
| 514 | 
         
             
                :param resume_download: resume downloads from HF
         
     | 
| 515 | 
         
             
                :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
         
     | 
| 
         | 
|
| 516 | 
         
             
                :param compile: whether to compile torch model
         
     | 
| 517 | 
         
             
                :param kwargs:
         
     | 
| 518 | 
         
             
                :return:
         
     | 
| 
         @@ -531,7 +571,8 @@ def get_model( 
     | 
|
| 531 | 
         
             
                )
         
     | 
| 532 | 
         | 
| 533 | 
         
             
                from transformers import AutoConfig
         
     | 
| 534 | 
         
            -
                config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token 
     | 
| 
         | 
|
| 535 | 
         
             
                llama_type_from_config = 'llama' in str(config).lower()
         
     | 
| 536 | 
         
             
                llama_type_from_name = "llama" in base_model.lower()
         
     | 
| 537 | 
         
             
                llama_type = llama_type_from_config or llama_type_from_name
         
     | 
| 
         @@ -548,6 +589,7 @@ def get_model( 
     | 
|
| 548 | 
         
             
                                                                 local_files_only=local_files_only,
         
     | 
| 549 | 
         
             
                                                                 resume_download=resume_download,
         
     | 
| 550 | 
         
             
                                                                 use_auth_token=use_auth_token,
         
     | 
| 
         | 
|
| 551 | 
         
             
                                                                 )
         
     | 
| 552 | 
         
             
                else:
         
     | 
| 553 | 
         
             
                    tokenizer = tokenizer_loader
         
     | 
| 
         @@ -563,13 +605,18 @@ def get_model( 
     | 
|
| 563 | 
         
             
                    model_kwargs = dict(local_files_only=local_files_only,
         
     | 
| 564 | 
         
             
                                        torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
         
     | 
| 565 | 
         
             
                                        resume_download=resume_download,
         
     | 
| 566 | 
         
            -
                                        use_auth_token=use_auth_token 
     | 
| 567 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 568 | 
         
             
                        model_kwargs.update(dict(load_in_8bit=load_8bit,
         
     | 
| 569 | 
         
             
                                                 device_map={"": 0} if load_8bit and device == 'cuda' else "auto",
         
     | 
| 570 | 
         
             
                                                 ))
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 571 | 
         
             
                    if 'OpenAssistant/reward-model'.lower() in base_model.lower():
         
     | 
| 572 | 
         
            -
                        # could put on other GPUs
         
     | 
| 573 | 
         
             
                        model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
         
     | 
| 574 | 
         
             
                        model_kwargs.pop('torch_dtype', None)
         
     | 
| 575 | 
         | 
| 
         @@ -577,7 +624,10 @@ def get_model( 
     | 
|
| 577 | 
         
             
                        with torch.device(device):
         
     | 
| 578 | 
         
             
                            if infer_devices:
         
     | 
| 579 | 
         
             
                                model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
         
     | 
| 580 | 
         
            -
                                                           gpu_id=gpu_id, 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 581 | 
         
             
                            else:
         
     | 
| 582 | 
         
             
                                if load_half and not load_8bit:
         
     | 
| 583 | 
         
             
                                    model = model_loader.from_pretrained(
         
     | 
| 
         @@ -599,6 +649,7 @@ def get_model( 
     | 
|
| 599 | 
         
             
                            local_files_only=local_files_only,
         
     | 
| 600 | 
         
             
                            resume_download=resume_download,
         
     | 
| 601 | 
         
             
                            use_auth_token=use_auth_token,
         
     | 
| 
         | 
|
| 602 | 
         
             
                            device_map={"": 0} if device == 'cuda' else {"": 'cpu'},  # seems to be required
         
     | 
| 603 | 
         
             
                        )
         
     | 
| 604 | 
         
             
                    else:
         
     | 
| 
         @@ -614,6 +665,7 @@ def get_model( 
     | 
|
| 614 | 
         
             
                                local_files_only=local_files_only,
         
     | 
| 615 | 
         
             
                                resume_download=resume_download,
         
     | 
| 616 | 
         
             
                                use_auth_token=use_auth_token,
         
     | 
| 
         | 
|
| 617 | 
         
             
                                device_map="auto",
         
     | 
| 618 | 
         
             
                            )
         
     | 
| 619 | 
         
             
                            if load_half:
         
     | 
| 
         @@ -782,49 +834,7 @@ def evaluate( 
     | 
|
| 782 | 
         
             
                if chat:
         
     | 
| 783 | 
         
             
                    # override, ignore user change
         
     | 
| 784 | 
         
             
                    num_return_sequences = 1
         
     | 
| 785 | 
         
            -
                 
     | 
| 786 | 
         
            -
                    if prompt_type == 'human_bot':
         
     | 
| 787 | 
         
            -
                        # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
         
     | 
| 788 | 
         
            -
                        # stopping only starts once output is beyond prompt
         
     | 
| 789 | 
         
            -
                        # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
         
     | 
| 790 | 
         
            -
                        stop_words = [human, bot, '\n' + human, '\n' + bot]
         
     | 
| 791 | 
         
            -
                        encounters = [1, 2]
         
     | 
| 792 | 
         
            -
                    elif prompt_type == 'instruct_vicuna':
         
     | 
| 793 | 
         
            -
                        # even below is not enough, generic strings and many ways to encode
         
     | 
| 794 | 
         
            -
                        stop_words = [
         
     | 
| 795 | 
         
            -
                            '### Human:',
         
     | 
| 796 | 
         
            -
                            """
         
     | 
| 797 | 
         
            -
            ### Human:""",
         
     | 
| 798 | 
         
            -
                            """
         
     | 
| 799 | 
         
            -
            ### Human:
         
     | 
| 800 | 
         
            -
            """,
         
     | 
| 801 | 
         
            -
                            '### Assistant:',
         
     | 
| 802 | 
         
            -
                            """
         
     | 
| 803 | 
         
            -
            ### Assistant:""",
         
     | 
| 804 | 
         
            -
                            """
         
     | 
| 805 | 
         
            -
            ### Assistant:
         
     | 
| 806 | 
         
            -
            """,
         
     | 
| 807 | 
         
            -
                        ]
         
     | 
| 808 | 
         
            -
                        encounters = [1, 2]
         
     | 
| 809 | 
         
            -
                    else:
         
     | 
| 810 | 
         
            -
                        # some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
         
     | 
| 811 | 
         
            -
                        stop_words = ['### End']
         
     | 
| 812 | 
         
            -
                        encounters = [1]
         
     | 
| 813 | 
         
            -
                    stop_words_ids = [
         
     | 
| 814 | 
         
            -
| 814 | -                     tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
| 815 | -                 # handle single token case
| 816 | -                 stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
| 817 | -                 stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
| 818 | -                 # avoid padding in front of tokens
| 819 | -                 if tokenizer.pad_token:
| 820 | -                     stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
| 821 | -                 # handle fake \n added
| 822 | -                 stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
| 823 | -                 # build stopper
| 824 | -                 stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device)])
| 825 | -             else:
| 826 | -                 stopping_criteria = StoppingCriteriaList()
| 827 | -
| 828 |               # help to avoid errors like:
| 829 |               # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
| 830 |               # RuntimeError: expected scalar type Half but found Float

@@ -903,7 +913,10 @@ def evaluate(
| 903 |                   prompt = inputs_decoded
| 904 |               elif inputs_decoded_raw == prompt:
| 905 |                   # some models specify special tokens that are part of normal prompt, so can't skip them
| 906 | -
| 907 |                   decoder = decoder_raw
| 908 |               else:
| 909 |                   print("WARNING: Special characters in prompt", flush=True)

@@ -1046,6 +1059,7 @@ def get_generate_params(model_lower, chat,
| 1046 |
| 1047 |     if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
| 1048 |         prompt_type = inv_prompt_type_to_model_lower[model_lower]
| 1049 |
| 1050 |     # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
| 1051 |     if show_examples is None:

@@ -1104,7 +1118,8 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
| 1104 |             placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
| 1105 |         placeholder_input = ""
| 1106 |         if model_lower:
| 1107 | -
| 1108 |         else:
| 1109 |             prompt_type = ''
| 1110 |         examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",

@@ -1133,9 +1148,9 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
| 1133 |         num_return_sequences = min(num_beams, num_return_sequences or 1)
| 1134 |         do_sample = False if do_sample is None else do_sample
| 1135 |     else:
| 1136 | -       temperature = 0.
| 1137 | -       top_p = 0.
| 1138 | -       top_k =
| 1139 |         if chat:
| 1140 |             num_beams = num_beams or 1
| 1141 |         else:

@@ -1143,7 +1158,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
| 1143 |         max_new_tokens = max_new_tokens or 256
| 1144 |         repetition_penalty = repetition_penalty or 1.07
| 1145 |         num_return_sequences = min(num_beams, num_return_sequences or 1)
| 1146 | -       do_sample =
| 1147 |     # doesn't include chat, instruction_nochat, iinput_nochat, added later
| 1148 |     params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
| 1149 |                    early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
| 9 |   import filelock
| 10 |  import psutil
| 11 |
| 12 | + from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread, get_githash
| 13 |
| 14 |  SEED = 1236
| 15 |  set_seed(SEED)

| 22 |  import fire
| 23 |  import torch
| 24 |  from peft import PeftModel
| 25 | + from transformers import GenerationConfig, AutoModel, TextIteratorStreamer
| 26 |  from accelerate import init_empty_weights, infer_auto_device_map
| 27 |
| 28 |  from prompter import Prompter
| 29 |
| 30 | + from finetune import get_loaders, example_data_points, generate_prompt, inv_prompt_type_to_model_lower
| 31 | + from stopping import get_stopping
| 32 |
| 33 |  eval_extra_columns = ['prompt', 'response', 'score']
| 34 |

| 62 |          local_files_only: bool = False,
| 63 |          resume_download: bool = True,
| 64 |          use_auth_token: Union[str, bool] = False,
| 65 | +        trust_remote_code: Union[str, bool] = True,
| 66 |
| 67 |          src_lang: str = "English",
| 68 |          tgt_lang: str = "Russian",

| 125 |     :param local_files_only: whether to only use local files instead of doing to HF for models
| 126 |     :param resume_download: whether to resume downloads from HF for models
| 127 |     :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
| 128 | +   :param trust_remote_code: whether to use trust any code needed for HF model
| 129 |     :param src_lang: source languages to include if doing translation (None = all)
| 130 |     :param tgt_lang: target languages to include if doing translation (None = all)
| 131 |     :param gradio: whether to enable gradio, or to enable benchmark mode

| 170 |
| 171 |     if is_public:
| 172 |         input_lines = 1  # ensure set, for ease of use
| 173 | +       temperature = 0.2 if temperature is None else temperature
| 174 | +       top_p = 0.85 if top_p is None else top_p
| 175 | +       top_k = 70 if top_k is None else top_k
| 176 | +       if is_hf:
| 177 | +           do_sample = True if do_sample is None else do_sample
| 178 | +       else:
| 179 | +           # by default don't sample, too chatty
| 180 | +           do_sample = False if do_sample is None else do_sample
| 181 | +
| 182 |         if is_low_mem:
| 183 | +           if not base_model:
| 184 | +               base_model = 'h2oai/h2ogpt-oasst1-512-12b'
| 185 | +               # don't set load_8bit if passed base_model, doesn't always work so can't just override
| 186 | +               load_8bit = True
| 187 |         else:
| 188 | +           base_model = 'h2oai/h2ogpt-oasst1-512-20b' if not base_model else base_model
| 189 |     if is_low_mem:
| 190 |         load_8bit = True
| 191 |     if is_hf:

| 238 |                         do_sample,
| 239 |                         )
| 240 |
| 241 | +   locals_dict = locals()
| 242 | +   locals_print = '\n'.join(['%s: %s' % (k, v) for k, v in locals_dict.items()])
| 243 | +   print(f"Generating model with params:\n{locals_print}", flush=True)
| 244 | +   print("Command: %s\nHash: %s" % (str(' '.join(sys.argv)), get_githash()), flush=True)
| 245 | +
| 246 |     if not gradio:
| 247 |         if eval_sharegpt_prompts_only > 0:
| 248 |             # override default examples with shareGPT ones for human-level eval purposes only

| 430 |
| 431 | def get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
| 432 |                        gpu_id=0,
| 433 | +                      use_auth_token=False,
| 434 | +                      trust_remote_code=True,
| 435 | +                      triton_attn=False,
| 436 | +                      long_sequence=True,
| 437 | +                      ):
| 438 |     """
| 439 |     Ensure model gets on correct device
| 440 |     :param base_model:

| 444 |     :param reward_type:
| 445 |     :param gpu_id:
| 446 |     :param use_auth_token:
| 447 | +   :param trust_remote_code:
| 448 | +   :param triton_attn:
| 449 | +   :param long_sequence:
| 450 |     :return:
| 451 |     """
| 452 |     with init_empty_weights():
| 453 |         from transformers import AutoConfig
| 454 | +       config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
| 455 | +                                           trust_remote_code=trust_remote_code)
| 456 | +       if triton_attn and 'mpt-' in base_model.lower():
| 457 | +           config.attn_config['attn_impl'] = 'triton'
| 458 | +       if long_sequence:
| 459 | +           if 'mpt-7b-storywriter' in base_model.lower():
| 460 | +               config.update({"max_seq_len": 83968})
| 461 | +           if 'mosaicml/mpt-7b-chat' in base_model.lower():
| 462 | +               config.update({"max_seq_len": 4096})
| 463 | +       if issubclass(config.__class__, tuple(AutoModel._model_mapping.keys())):
| 464 | +           model = AutoModel.from_config(
| 465 | +               config,
| 466 | +           )
| 467 | +       else:
| 468 | +           # can't infer
| 469 | +           model = None
| 470 | +
| 471 | +   if model is not None:
| 472 | +       # NOTE: Can specify max_memory={0: max_mem, 1: max_mem}, to shard model
| 473 | +       # NOTE: Some models require avoiding sharding some layers,
| 474 | +       # then would pass no_split_module_classes and give list of those layers.
| 475 | +       device_map = infer_auto_device_map(
| 476 | +           model,
| 477 |             dtype=torch.float16 if load_half else torch.float32,
| 478 |         )
| 479 | +       if hasattr(model, 'model'):
| 480 | +           device_map_model = infer_auto_device_map(
| 481 | +               model.model,
| 482 | +               dtype=torch.float16 if load_half else torch.float32,
| 483 | +           )
| 484 | +           device_map.update(device_map_model)
| 485 | +       print('device_map: %s' % device_map, flush=True)
| 486 | +   else:
| 487 | +       device_map = "auto"
| 488 |
| 489 |     n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0
| 490 |

| 508 |     if load_in_8bit or not load_half:
| 509 |         model = model_loader.from_pretrained(
| 510 |             base_model,
| 511 | +           config=config,
| 512 |             **model_kwargs,
| 513 |         )
| 514 |     else:
| 515 |         model = model_loader.from_pretrained(
| 516 |             base_model,
| 517 | +           config=config,
| 518 |             **model_kwargs,
| 519 |         ).half()
| 520 |     return model

| 533 |         local_files_only: bool = False,
| 534 |         resume_download: bool = True,
| 535 |         use_auth_token: Union[str, bool] = False,
| 536 | +       trust_remote_code: bool = True,
| 537 |         compile: bool = True,
| 538 |         **kwargs,
| 539 | ):

| 552 |     :param local_files_only: use local files instead of from HF
| 553 |     :param resume_download: resume downloads from HF
| 554 |     :param use_auth_token: assumes user did on CLI `huggingface-cli login` to access private repo
| 555 | +   :param trust_remote_code: trust code needed by model
| 556 |     :param compile: whether to compile torch model
| 557 |     :param kwargs:
| 558 |     :return:

| 571 |     )
| 572 |
| 573 |     from transformers import AutoConfig
| 574 | +   config = AutoConfig.from_pretrained(base_model, use_auth_token=use_auth_token,
| 575 | +                                       trust_remote_code=trust_remote_code)
| 576 |     llama_type_from_config = 'llama' in str(config).lower()
| 577 |     llama_type_from_name = "llama" in base_model.lower()
| 578 |     llama_type = llama_type_from_config or llama_type_from_name

| 589 |                                                  local_files_only=local_files_only,
| 590 |                                                  resume_download=resume_download,
| 591 |                                                  use_auth_token=use_auth_token,
| 592 | +                                                trust_remote_code=trust_remote_code,
| 593 |                                                  )
| 594 |     else:
| 595 |         tokenizer = tokenizer_loader

| 605 |         model_kwargs = dict(local_files_only=local_files_only,
| 606 |                             torch_dtype=torch.float16 if device == 'cuda' else torch.float32,
| 607 |                             resume_download=resume_download,
| 608 | +                           use_auth_token=use_auth_token,
| 609 | +                           trust_remote_code=trust_remote_code,
| 610 | +                           )
| 611 | +       if 'mbart-' not in base_model.lower() and 'mpt-' not in base_model.lower():
| 612 |             model_kwargs.update(dict(load_in_8bit=load_8bit,
| 613 |                                      device_map={"": 0} if load_8bit and device == 'cuda' else "auto",
| 614 |                                      ))
| 615 | +       if 'mpt-' in base_model.lower() and gpu_id >= 0:
| 616 | +           model_kwargs.update(dict(device_map={"": gpu_id} if device == 'cuda' else "cpu"))
| 617 | +
| 618 |         if 'OpenAssistant/reward-model'.lower() in base_model.lower():
| 619 | +           # FIXME: could put on other GPUs
| 620 |             model_kwargs['device_map'] = {"": 0} if device == 'cuda' else {"": 'cpu'}
| 621 |             model_kwargs.pop('torch_dtype', None)
| 622 |

| 624 |         with torch.device(device):
| 625 |             if infer_devices:
| 626 |                 model = get_non_lora_model(base_model, model_loader, load_half, model_kwargs, reward_type,
| 627 | +                                          gpu_id=gpu_id,
| 628 | +                                          use_auth_token=use_auth_token,
| 629 | +                                          trust_remote_code=trust_remote_code,
| 630 | +                                          )
| 631 |             else:
| 632 |                 if load_half and not load_8bit:
| 633 |                     model = model_loader.from_pretrained(

| 649 |             local_files_only=local_files_only,
| 650 |             resume_download=resume_download,
| 651 |             use_auth_token=use_auth_token,
| 652 | +           trust_remote_code=trust_remote_code,
| 653 |             device_map={"": 0} if device == 'cuda' else {"": 'cpu'},  # seems to be required
| 654 |         )
| 655 |     else:

| 665 |                 local_files_only=local_files_only,
| 666 |                 resume_download=resume_download,
| 667 |                 use_auth_token=use_auth_token,
| 668 | +               trust_remote_code=trust_remote_code,
| 669 |                 device_map="auto",
| 670 |             )
| 671 |             if load_half:

| 834 |     if chat:
| 835 |         # override, ignore user change
| 836 |         num_return_sequences = 1
| 837 | +   stopping_criteria = get_stopping(prompt_type, tokenizer, device)
| 838 |     # help to avoid errors like:
| 839 |     # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
| 840 |     # RuntimeError: expected scalar type Half but found Float

| 913 |                 prompt = inputs_decoded
| 914 |             elif inputs_decoded_raw == prompt:
| 915 |                 # some models specify special tokens that are part of normal prompt, so can't skip them
| 916 | +               inputs_decoded = prompt = inputs_decoded_raw
| 917 | +               decoder = decoder_raw
| 918 | +           elif inputs_decoded_raw.replace("<unk> ", "").replace("<unk>", "").replace('\n', ' ').replace(' ', '') == prompt.replace('\n', ' ').replace(' ', ''):
| 919 | +               inputs_decoded = prompt = inputs_decoded_raw
| 920 |                 decoder = decoder_raw
| 921 |             else:
| 922 |                 print("WARNING: Special characters in prompt", flush=True)

| 1059 |
| 1060 |    if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
| 1061 |        prompt_type = inv_prompt_type_to_model_lower[model_lower]
| 1062 | +      print("Auto-selecting prompt_type=%s for %s" % (prompt_type, model_lower), flush=True)
| 1063 |
| 1064 |    # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
| 1065 |    if show_examples is None:

| 1118 |            placeholder_instruction = "Give detailed answer for whether Einstein or Newton is smarter."
| 1119 |        placeholder_input = ""
| 1120 |        if model_lower:
| 1121 | +          # default is plain, because might relly upon trust_remote_code to handle prompting
| 1122 | +          prompt_type = prompt_type or 'plain'
| 1123 |        else:
| 1124 |            prompt_type = ''
| 1125 |        examples += [[summarize_example1, 'Summarize' if prompt_type not in ['plain', 'instruct_simple'] else '', "",

| 1148 |        num_return_sequences = min(num_beams, num_return_sequences or 1)
| 1149 |        do_sample = False if do_sample is None else do_sample
| 1150 |    else:
| 1151 | +      temperature = 0.1 if temperature is None else temperature
| 1152 | +      top_p = 0.75 if top_p is None else top_p
| 1153 | +      top_k = 40 if top_k is None else top_k
| 1154 |        if chat:
| 1155 |            num_beams = num_beams or 1
| 1156 |        else:

| 1158 |        max_new_tokens = max_new_tokens or 256
| 1159 |        repetition_penalty = repetition_penalty or 1.07
| 1160 |        num_return_sequences = min(num_beams, num_return_sequences or 1)
| 1161 | +      do_sample = False if do_sample is None else do_sample
| 1162 |    # doesn't include chat, instruction_nochat, iinput_nochat, added later
| 1163 |    params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
| 1164 |                   early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
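Most of the generate.py changes above thread trust_remote_code through model loading and special-case MPT models (triton attention, longer max_seq_len). A minimal standalone sketch of that loading path, assuming AutoModelForCausalLM as the loader and the mosaicml/mpt-7b-chat checkpoint named in the diff (not the repo's exact load_model code):

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    base_model = 'mosaicml/mpt-7b-chat'  # example checkpoint from the diff

    # trust_remote_code lets the model's custom modeling code load; the diff then
    # overrides MPT-specific config fields before instantiating the model
    config = AutoConfig.from_pretrained(base_model, trust_remote_code=True)
    if 'mpt-' in base_model.lower():
        config.attn_config['attn_impl'] = 'triton'  # only if triton kernels are installed
        config.update({"max_seq_len": 4096})        # value the diff uses for mpt-7b-chat

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        config=config,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map={"": 0} if torch.cuda.is_available() else {"": 'cpu'},
    )

The repo goes further: get_non_lora_model builds the model on empty weights with init_empty_weights and pre-computes a device_map via infer_auto_device_map, while this sketch leans on from_pretrained's device_map instead.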
    	
        gradio_runner.py
    CHANGED
    
@@ -5,6 +5,7 @@ import os
| 5 |   import sys
| 6 |
| 7 |   from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
| 8 |   from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
| 9 |       ping
| 10 |  from finetune import prompt_type_to_model_name, prompt_types_strings, generate_prompt, inv_prompt_type_to_model_lower

@@ -49,6 +50,7 @@ def go_gradio(**kwargs):
| 49 |                      """
| 50 |     else:
| 51 |         description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
| 52 |     description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
| 53 |
| 54 |     if kwargs['verbose']:

@@ -389,6 +391,7 @@ def go_gradio(**kwargs):
| 389 |            .then(close_admin, inputs=admin_pass_textbox, outputs=admin_row, queue=False)
| 390 |
| 391 |         # Get inputs to evaluate()
| 392 |         all_kwargs = kwargs.copy()
| 393 |         all_kwargs.update(locals())
| 394 |         inputs_list = get_inputs_list(all_kwargs, kwargs['model_lower'])

@@ -516,9 +519,12 @@ def go_gradio(**kwargs):
| 516 |             :return:
| 517 |             """
| 518 |             args_list = list(args)
| 519 | -           user_message = args_list[
| 520 | -           input1 = args_list[
| 521 | -           context1 = args_list[
| 522 |             if input1 and not user_message.endswith(':'):
| 523 |                 user_message1 = user_message + ":" + input1
| 524 |             elif input1:

@@ -528,6 +534,8 @@ def go_gradio(**kwargs):
| 528 |             if sanitize_user_prompt:
| 529 |                 from better_profanity import profanity
| 530 |                 user_message1 = profanity.censor(user_message1)
| 531 |             if user_message1 in ['']:
| 532 |                 # e.g. when user just hits enter in textbox,
| 533 |                 # else will have <human>: <bot>: on single line, which seems to be "ok" for LLM but not usual

@@ -559,7 +567,8 @@ def go_gradio(**kwargs):
| 559 |             :param retry:
| 560 |             :return:
| 561 |             """
| 562 | -
| 563 |             history = args_list[-1]  # model_state is -2
| 564 |             if retry and history:
| 565 |                 history.pop()

@@ -580,12 +589,18 @@ def go_gradio(**kwargs):
| 580 |                 context1 = ''
| 581 |                 for histi in range(len(history) - 1):
| 582 |                     data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
| 583 | -
| 584 | -
| 585 | -
| 586 | -
| 587 | -
| 588 | -
| 589 |             args_list[0] = instruction1  # override original instruction with history from user
| 590 |             # only include desired chat history
| 591 |             args_list[2] = context1[-kwargs['chat_history']:]

@@ -767,6 +782,7 @@ def go_gradio(**kwargs):
| 767 |                 lora_weights = no_lora_str
| 768 |                 return [None, None, None, model_name], model_name, lora_weights, prompt_type_old
| 769 |
| 770 |             all_kwargs1 = all_kwargs.copy()
| 771 |             all_kwargs1['base_model'] = model_name.strip()
| 772 |             all_kwargs1['load_8bit'] = load_8bit

| 5 |   import sys
| 6 |
| 7 |   from gradio_themes import H2oTheme, SoftTheme, get_h2o_title, get_simple_title, get_dark_js
| 8 | + from prompter import Prompter
| 9 |   from utils import get_githash, flatten_list, zip_data, s3up, clear_torch_cache, get_torch_allocated, system_info_print, \
| 10 |      ping
| 11 |  from finetune import prompt_type_to_model_name, prompt_types_strings, generate_prompt, inv_prompt_type_to_model_lower

| 50 |                      """
| 51 |     else:
| 52 |         description = "For more information, visit our GitHub pages: [h2oGPT](https://github.com/h2oai/h2ogpt) and [H2O LLM Studio](https://github.com/h2oai/h2o-llmstudio)<br>"
| 53 | +   description += "If this host is busy, try [gpt.h2o.ai 20B](https://gpt.h2o.ai) and [30B](http://gpu.hopto.org) and [HF Spaces1 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot) and [HF Spaces2 12B](https://huggingface.co/spaces/h2oai/h2ogpt-chatbot2)<br>"
| 54 |     description += """<p>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md)</p>"""
| 55 |
| 56 |     if kwargs['verbose']:

| 391 |            .then(close_admin, inputs=admin_pass_textbox, outputs=admin_row, queue=False)
| 392 |
| 393 |         # Get inputs to evaluate()
| 394 | +       # don't deepcopy, can contain model itself
| 395 |         all_kwargs = kwargs.copy()
| 396 |         all_kwargs.update(locals())
| 397 |         inputs_list = get_inputs_list(all_kwargs, kwargs['model_lower'])

| 519 |             :return:
| 520 |             """
| 521 |             args_list = list(args)
| 522 | +           user_message = args_list[eval_func_param_names.index('instruction')]  # chat only
| 523 | +           input1 = args_list[eval_func_param_names.index('iinput')]  # chat only
| 524 | +           context1 = args_list[eval_func_param_names.index('context')]
| 525 | +           prompt_type1 = args_list[eval_func_param_names.index('prompt_type')]
| 526 | +           chat1 = args_list[eval_func_param_names.index('chat')]
| 527 | +           stream_output1 = args_list[eval_func_param_names.index('stream_output')]
| 528 |             if input1 and not user_message.endswith(':'):
| 529 |                 user_message1 = user_message + ":" + input1
| 530 |             elif input1:

| 534 |             if sanitize_user_prompt:
| 535 |                 from better_profanity import profanity
| 536 |                 user_message1 = profanity.censor(user_message1)
| 537 | +           # FIXME: WIP to use desired seperator when user enters nothing
| 538 | +           prompter = Prompter(prompt_type1, debug=kwargs['debug'], chat=chat1, stream_output=stream_output1)
| 539 |             if user_message1 in ['']:
| 540 |                 # e.g. when user just hits enter in textbox,
| 541 |                 # else will have <human>: <bot>: on single line, which seems to be "ok" for LLM but not usual

| 567 |             :param retry:
| 568 |             :return:
| 569 |             """
| 570 | +           # don't deepcopy, can contain model itself
| 571 | +           args_list = list(args).copy()
| 572 |             history = args_list[-1]  # model_state is -2
| 573 |             if retry and history:
| 574 |                 history.pop()

| 589 |                 context1 = ''
| 590 |                 for histi in range(len(history) - 1):
| 591 |                     data_point = dict(instruction=history[histi][0], input='', output=history[histi][1])
| 592 | +                   prompt, pre_response, terminate_response, chat_sep = generate_prompt(data_point, prompt_type1,
| 593 | +                                                                                        chat1, reduced=True)
| 594 | +                   # md -> back to text, maybe not super improtant if model trained enough
| 595 | +                   prompt = prompt.replace('<br>', chat_sep)
| 596 | +                   context1 += prompt
| 597 | +                   if not context1.endswith(chat_sep):
| 598 | +                       context1 += chat_sep
| 599 | +
| 600 | +               _, pre_response, terminate_response, chat_sep = generate_prompt({}, prompt_type1, chat1,
| 601 | +                                                                               reduced=True)
| 602 | +               if context1 and not context1.endswith(chat_sep):
| 603 | +                   context1 += chat_sep  # ensure if terminates abruptly, then human continues on next line
| 604 |             args_list[0] = instruction1  # override original instruction with history from user
| 605 |             # only include desired chat history
| 606 |             args_list[2] = context1[-kwargs['chat_history']:]

| 782 |                 lora_weights = no_lora_str
| 783 |                 return [None, None, None, model_name], model_name, lora_weights, prompt_type_old
| 784 |
| 785 | +           # don't deepcopy, can contain model itself
| 786 |             all_kwargs1 = all_kwargs.copy()
| 787 |             all_kwargs1['base_model'] = model_name.strip()
| 788 |             all_kwargs1['load_8bit'] = load_8bit
     | 
    	
        prompter.py
    CHANGED
    
    | 
         @@ -6,7 +6,8 @@ class Prompter(object): 
     | 
|
| 6 | 
         
             
                             allowed_repeat_line_length=10):
         
     | 
| 7 | 
         
             
                    self.prompt_type = prompt_type
         
     | 
| 8 | 
         
             
                    data_point = dict(instruction='', input='', output='')
         
     | 
| 9 | 
         
            -
                    _, self.pre_response, self.terminate_response 
     | 
| 
         | 
|
| 10 | 
         
             
                    self.debug = debug
         
     | 
| 11 | 
         
             
                    self.chat = chat
         
     | 
| 12 | 
         
             
                    self.stream_output = stream_output
         
     | 
| 
         @@ -15,7 +16,7 @@ class Prompter(object): 
     | 
|
| 15 | 
         | 
| 16 | 
         
             
                def generate_prompt(self, data_point):
         
     | 
| 17 | 
         
             
                    reduced = False
         
     | 
| 18 | 
         
            -
                    prompt, _, _ = generate_prompt(data_point, self.prompt_type, self.chat, reduced)
         
     | 
| 19 | 
         
             
                    if self.debug:
         
     | 
| 20 | 
         
             
                        print("prompt: ", prompt, flush=True)
         
     | 
| 21 | 
         
             
                    self.prompt = prompt
         
     | 
| 
         @@ -25,12 +26,12 @@ class Prompter(object): 
     | 
|
| 25 | 
         
             
                    if isinstance(outputs, str):
         
     | 
| 26 | 
         
             
                        outputs = [outputs]
         
     | 
| 27 | 
         
             
                    if self.debug:
         
     | 
| 28 | 
         
            -
                        print("output 
     | 
| 29 | 
         
             
                    if prompt is not None:
         
     | 
| 30 | 
         
             
                        self.prompt = prompt
         
     | 
| 31 | 
         | 
| 32 | 
         
             
                    def clean_response(response):
         
     | 
| 33 | 
         
            -
                        meaningless_words = ['<pad>', '</s>', '<|endoftext|>' 
     | 
| 34 | 
         
             
                        for word in meaningless_words:
         
     | 
| 35 | 
         
             
                            response = response.replace(word, "")
         
     | 
| 36 | 
         
             
                        if sanitize_bot_response:
         
     | 
| 
         @@ -103,5 +104,5 @@ class Prompter(object): 
     | 
|
| 103 | 
         
             
                    # join all outputs, only one extra new line between outputs
         
     | 
| 104 | 
         
             
                    output = '\n'.join(outputs)
         
     | 
| 105 | 
         
             
                    if self.debug:
         
     | 
| 106 | 
         
            -
                        print("outputclean 
     | 
| 107 | 
         
             
                    return output
         
     | 
| 
         | 
|
| 6 | 
         
             
                             allowed_repeat_line_length=10):
         
     | 
| 7 | 
         
             
                    self.prompt_type = prompt_type
         
     | 
| 8 | 
         
             
                    data_point = dict(instruction='', input='', output='')
         
     | 
| 9 | 
         
            +
                    _, self.pre_response, self.terminate_response, self.chat_sep = \
         
     | 
| 10 | 
         
            +
                        generate_prompt(data_point, prompt_type, chat, False)
         
     | 
| 11 | 
         
             
                    self.debug = debug
         
     | 
| 12 | 
         
             
                    self.chat = chat
         
     | 
| 13 | 
         
             
                    self.stream_output = stream_output
         
     | 
| 
         | 
|
| 16 | 
         | 
| 17 | 
         
             
                def generate_prompt(self, data_point):
         
     | 
| 18 | 
         
             
                    reduced = False
         
     | 
| 19 | 
         
            +
                    prompt, _, _, _ = generate_prompt(data_point, self.prompt_type, self.chat, reduced)
         
     | 
| 20 | 
         
             
                    if self.debug:
         
     | 
| 21 | 
         
             
                        print("prompt: ", prompt, flush=True)
         
     | 
| 22 | 
         
             
                    self.prompt = prompt
         
     | 
| 
         | 
|
| 26 | 
         
             
                    if isinstance(outputs, str):
         
     | 
| 27 | 
         
             
                        outputs = [outputs]
         
     | 
| 28 | 
         
             
                    if self.debug:
         
     | 
| 29 | 
         
            +
                        print("output:\n", '\n\n'.join(outputs), flush=True)
         
     | 
| 30 | 
         
             
                    if prompt is not None:
         
     | 
| 31 | 
         
             
                        self.prompt = prompt
         
     | 
| 32 | 
         | 
| 33 | 
         
             
                    def clean_response(response):
         
     | 
| 34 | 
         
            +
                        meaningless_words = ['<pad>', '</s>', '<|endoftext|>']
         
     | 
| 35 | 
         
             
                        for word in meaningless_words:
         
     | 
| 36 | 
         
             
                            response = response.replace(word, "")
         
     | 
| 37 | 
         
             
                        if sanitize_bot_response:
         
     | 
| 
         | 
|
| 104 | 
         
             
                    # join all outputs, only one extra new line between outputs
         
     | 
| 105 | 
         
             
                    output = '\n'.join(outputs)
         
     | 
| 106 | 
         
             
                    if self.debug:
         
     | 
| 107 | 
         
            +
                        print("outputclean:\n", '\n\n'.join(outputs), flush=True)
         
     | 
| 108 | 
         
             
                    return output
         
     | 
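The clean_response helper in the prompter.py diff strips decoder special tokens (and, when sanitize_bot_response is set, profanity) before text reaches the UI. A simplified standalone version of that idea; the repo's exact cleanup differs in details such as repeated-line handling (see allowed_repeat_line_length):

    def clean_response(response, sanitize_bot_response=False):
        # strip decoder special tokens that carry no meaning for the user
        meaningless_words = ['<pad>', '</s>', '<|endoftext|>']
        for word in meaningless_words:
            response = response.replace(word, "")
        if sanitize_bot_response:
            # same optional dependency the Gradio app imports for its profanity filter
            from better_profanity import profanity
            response = profanity.censor(response)
        return response.strip()

    print(clean_response("I am h2oGPT.</s><|endoftext|>"))  # -> I am h2oGPT.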
    	
        requirements.txt
    CHANGED
    
@@ -19,7 +19,7 @@ pandas==2.0.0
| 19 |   matplotlib==3.7.1
| 20 |   loralib==0.1.1
| 21 |   bitsandbytes==0.38.1
| 22 | - git+https://github.com/huggingface/peft.git@
| 22 | + git+https://github.com/huggingface/peft.git@098962fa6515f2e4fe83a757f5995d3ffbb1c373
| 23 |   transformers==4.28.1
| 24 |   tokenizers==0.13.3
| 25 |   APScheduler==3.10.1
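The stopping.py diff below consolidates the stop-word handling that was removed from generate.py into a get_stopping() factory built on StoppingCriteriaSub. A simplified, self-contained sketch of the same mechanism (the repo's version additionally counts per-phrase encounters, which this sketch omits):

    import torch
    from transformers import StoppingCriteria, StoppingCriteriaList

    class StopOnTokens(StoppingCriteria):
        """Stop generation once the tail of the sequence matches any stop phrase."""
        def __init__(self, stops):
            super().__init__()
            self.stops = stops

        def __call__(self, input_ids, scores, **kwargs):
            return any(torch.equal(stop.to(input_ids.device), input_ids[0][-len(stop):])
                       for stop in self.stops)

    def simple_get_stopping(tokenizer, stop_words=('<human>:', '<bot>:')):
        # tokenize each stop phrase; keep the single-token case as a 1-D tensor
        stop_ids = [tokenizer(w, return_tensors='pt')['input_ids'].squeeze() for w in stop_words]
        stop_ids = [x if x.dim() > 0 else x.unsqueeze(0) for x in stop_ids]
        return StoppingCriteriaList([StopOnTokens(stop_ids)])

    # usage: model.generate(**inputs, stopping_criteria=simple_get_stopping(tokenizer))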
    	
        stopping.py
    CHANGED
    
@@ -1,10 +1,5 @@
| 1 | - import traceback
| 2 | - from queue import Queue
| 3 | - from threading import Thread
| 4 | - import collections.abc
| 5 | -
| 6 |   import torch
| 7 | - from transformers import StoppingCriteria
| 8 |
| 9 |
| 10 |  class StoppingCriteriaSub(StoppingCriteria):

@@ -21,7 +16,55 @@ class StoppingCriteriaSub(StoppingCriteria):
| 21 |            if torch.all((stop == input_ids[0][-len(stop):])).item():
| 22 |                self.num_stops[stopi] += 1
| 23 |                if self.num_stops[stopi] >= self.encounters[stopi % len(self.encounters)]:
| 24 |                    return True
| 25 |        # print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
| 26 |        # print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
| 27 |        return False

| 1 |   import torch
| 2 | + from transformers import StoppingCriteria, StoppingCriteriaList
| 3 |
| 4 |
| 5 |   class StoppingCriteriaSub(StoppingCriteria):

| 16 |           if torch.all((stop == input_ids[0][-len(stop):])).item():
| 17 |               self.num_stops[stopi] += 1
| 18 |               if self.num_stops[stopi] >= self.encounters[stopi % len(self.encounters)]:
| 19 | +                 # print("Stopped", flush=True)
| 20 |                   return True
| 21 |       # print("Tokens: %s" % input_ids[0].cpu().numpy(), flush=True)
| 22 |       # print("Stop Tokens: %s" % [x.cpu().numpy() for x in self.stops], flush=True)
| 23 |       return False
| 24 | +
| 25 | +
| 26 | + def get_stopping(prompt_type, tokenizer, device, human='<human>:', bot="<bot>:"):
| 27 | +     if prompt_type in ['human_bot', 'instruct_vicuna', 'instruct_with_end']:
| 28 | +         if prompt_type == 'human_bot':
| 29 | +             # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
| 30 | +             # stopping only starts once output is beyond prompt
| 31 | +             # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
| 32 | +             stop_words = [human, bot, '\n' + human, '\n' + bot]
| 33 | +             encounters = [1, 2]
| 34 | +         elif prompt_type == 'instruct_vicuna':
         
     | 
| 35 | 
         
            +
                        # even below is not enough, generic strings and many ways to encode
         
     | 
| 36 | 
         
            +
                        stop_words = [
         
     | 
| 37 | 
         
            +
                            '### Human:',
         
     | 
| 38 | 
         
            +
                            """
         
     | 
| 39 | 
         
            +
            ### Human:""",
         
     | 
| 40 | 
         
            +
                            """
         
     | 
| 41 | 
         
            +
            ### Human:
         
     | 
| 42 | 
         
            +
            """,
         
     | 
| 43 | 
         
            +
                            '### Assistant:',
         
     | 
| 44 | 
         
            +
                            """
         
     | 
| 45 | 
         
            +
            ### Assistant:""",
         
     | 
| 46 | 
         
            +
                            """
         
     | 
| 47 | 
         
            +
            ### Assistant:
         
     | 
| 48 | 
         
            +
            """,
         
     | 
| 49 | 
         
            +
                        ]
         
     | 
| 50 | 
         
            +
                        encounters = [1, 2]
         
     | 
| 51 | 
         
            +
                    else:
         
     | 
| 52 | 
         
            +
                        # some instruct prompts have this as end, doesn't hurt to stop on it since not common otherwise
         
     | 
| 53 | 
         
            +
                        stop_words = ['### End']
         
     | 
| 54 | 
         
            +
                        encounters = [1]
         
     | 
| 55 | 
         
            +
                    stop_words_ids = [
         
     | 
| 56 | 
         
            +
                        tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
         
     | 
| 57 | 
         
            +
                    # handle single token case
         
     | 
| 58 | 
         
            +
                    stop_words_ids = [x if len(x.shape) > 0 else torch.tensor([x]) for x in stop_words_ids]
         
     | 
| 59 | 
         
            +
                    stop_words_ids = [x for x in stop_words_ids if x.shape[0] > 0]
         
     | 
| 60 | 
         
            +
                    # avoid padding in front of tokens
         
     | 
| 61 | 
         
            +
                    if tokenizer.pad_token:
         
     | 
| 62 | 
         
            +
                        stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
         
     | 
| 63 | 
         
            +
                    # handle fake \n added
         
     | 
| 64 | 
         
            +
                    stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
         
     | 
| 65 | 
         
            +
                    # build stopper
         
     | 
| 66 | 
         
            +
                    stopping_criteria = StoppingCriteriaList(
         
     | 
| 67 | 
         
            +
                        [StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters, device=device)])
         
     | 
| 68 | 
         
            +
                else:
         
     | 
| 69 | 
         
            +
                    stopping_criteria = StoppingCriteriaList()
         
     | 
| 70 | 
         
            +
                return stopping_criteria
         
     |
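A minimal usage sketch for the new get_stopping() helper, not the repo's actual generate.py wiring: it assumes stopping.py is importable from the working directory, and the model name is a placeholder assumption purely for illustration (substitute whatever causal LM the Space actually serves).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from stopping import get_stopping  # assumes running from the repo root

model_name = "EleutherAI/pythia-70m"  # placeholder assumption, not the model used by this Space
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# human_bot prompting: generation should stop once the model starts a new "<human>:" turn,
# or emits a second "<bot>:" (the first "<bot>:" is the one supplied in the prompt).
prompt = "<human>: Who are you?\n<bot>:"
stopping_criteria = get_stopping('human_bot', tokenizer, device)

inputs = tokenizer(prompt, return_tensors="pt").to(device)
output_ids = model.generate(**inputs,
                            max_new_tokens=128,
                            stopping_criteria=stopping_criteria)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

For other prompt types the same call applies: 'instruct_vicuna' stops on the "### Human:" / "### Assistant:" turn markers, 'instruct_with_end' stops on '### End', and any other prompt_type returns an empty StoppingCriteriaList so generation is unaffected.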