Spaces:

xu-song
/

self-chat

Running

App Files Files Community

xu song committed on Sep 7, 2024

Commit

2fa4e4c

1 Parent(s): c619300

update

Browse files

Files changed (6) hide show

README.md +4 -1
app.py +30 -22
app_util.py +12 -13
config.py +2 -1
models/cpp_qwen2.py +31 -17
models/hf_qwen2.py +11 -8

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Self Chat
-emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
@@ -8,6 +8,9 @@ sdk_version: 4.39.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
 An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

 ---
 title: Self Chat
+emoji: 🤖🤖
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
 app_file: app.py
 pinned: false
 license: apache-2.0
+tags:
+  - chatbot
+short_description: Generating synthetic data via self-chat
 ---
 An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

app.py CHANGED Viewed

@@ -1,19 +1,29 @@
 """
 """
 import gradio
 import config
 from app_util import *
 user_simulator_doc = """\
 There are mainly two types of user simulator:
 - prompt-based user-simulator (role-play)
 - model-based user-simulator
-In most cases, large language models (LLMs) are used to serve as assistant generator.
-Besides, it can also used as user simulator.
 """
 survey = """\
@@ -28,16 +38,16 @@ Essentially, it is a form of model compression.
 ## 有不用概率的知识蒸馏吗？
 """
-with gr.Blocks() as demo:
     # Knowledge Distillation through Self Chatting
     # Distilling the Knowledge from LLM through Self Chatting
     # Generating Synthetic Data through Self Chat
-    gr.HTML("""<h1 align="center">Generating Synthetic Data Through Self-Chat</h1>""")
     with gr.Row():
         with gr.Column(scale=5):
             system = gr.Dropdown(
                 choices=system_list,
-                value=system_list[0],
                 allow_custom_value=True,
                 interactive=True,
                 label="System message",
@@ -46,7 +56,8 @@ with gr.Blocks() as demo:
             chatbot = gr.Chatbot(show_copy_button=True,
                                  show_share_button=True,
-                                 avatar_images=("assets/man.png", "assets/bot.png"))
             # gr.Textbox("For faster inference, you can build locally with  ")
             # ss
@@ -54,30 +65,27 @@ with gr.Blocks() as demo:
                 input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
                 generate_btn = gr.Button("🤔️ Self-Chat", variant="primary")
                 with gr.Row():
-                    retry_btn = gr.Button("🔄  Retry", variant="secondary", size="sm", )
                     undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm", )
                     clear_btn = gr.Button("🗑️  Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
                     # stop_btn = gr.Button("停止生成", variant="stop", visible=False)
-                # gr.Markdown(
-                #     "Self-chat is a demo, which makes the model talk to itself. "
-                #     "It is based on user simulator and response generator.",
-                #     visible=True)
             # 也叫 chat-assistant,
-            with gradio.Tab("Response Generator", visible=False):
                 with gr.Row():
-                    input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
                     generate_btn_2 = gr.Button("Send", variant="primary")
                 with gr.Row():
                     retry_btn_2 = gr.Button("🔄  Regenerate", variant="secondary", size="sm", )
                     undo_btn_2 = gr.Button("↩️ Undo", variant="secondary", size="sm", )
                     clear_btn_2 = gr.Button("🗑️  Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
-                gr.Markdown("Response simulator is the most commonly used chatbot.")
             #
-            with gradio.Tab("User Simulator", visible=False):
                 with gr.Row():
-                    input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
                     generate_btn_3 = gr.Button("Send", variant="primary")
                 with gr.Row():
                     retry_btn_3 = gr.Button("🔄  Regenerate", variant="secondary", size="sm", )
@@ -85,7 +93,7 @@ with gr.Blocks() as demo:
                     clear_btn_3 = gr.Button("🗑️  Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
                 gr.Markdown(user_simulator_doc)
-        with gr.Column(variant="compact"):
             # with gr.Column():
             model = gr.Dropdown(
                 ["Qwen2-0.5B-Instruct", "llama3.1", "gemini"],
@@ -155,8 +163,8 @@ with gr.Blocks() as demo:
     slider_top_k.change(set_top_k, inputs=[slider_top_k])
-# demo.queue().launch(share=False, server_name="0.0.0.0")
 # demo.queue().launch(concurrency_count=1, max_size=5)
 demo.queue().launch()

 """
 """
+import random
 import gradio
 import config
 from app_util import *
 user_simulator_doc = """\
+The agent acts as user simulator.
 There are mainly two types of user simulator:
 - prompt-based user-simulator (role-play)
 - model-based user-simulator
+This demo is a model-based user simulator.
+"""
+# In most cases, large language models (LLMs) are used as the assistant generator.
+# Besides, they can also be used as a user simulator.
+assistant_simulator_doc = """\
+The agent acts as assistant simulator.
+"""
+self_chat_doc = """\
+Self-chat is a demo which makes the model talk to itself.
+It is a combination of user simulator and response generator.
 """
 survey = """\
 ## 有不用概率的知识蒸馏吗？
 """
+with gr.Blocks(head=None) as demo:
     # Knowledge Distillation through Self Chatting
     # Distilling the Knowledge from LLM through Self Chatting
     # Generating Synthetic Data through Self Chat
+    gr.HTML("""<h1 align="center">Generating Synthetic Data via Self-Chat</h1>""")
     with gr.Row():
         with gr.Column(scale=5):
             system = gr.Dropdown(
                 choices=system_list,
+                # value=system_list[0],
                 allow_custom_value=True,
                 interactive=True,
                 label="System message",
             chatbot = gr.Chatbot(show_copy_button=True,
                                  show_share_button=True,
+                                 avatar_images=("assets/man.png", "assets/bot.png"),
+                                 likeable=True)
             # gr.Textbox("For faster inference, you can build locally with  ")
             # ss
                 input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
                 generate_btn = gr.Button("🤔️ Self-Chat", variant="primary")
                 with gr.Row():
+                    retry_btn = gr.Button("🔄  Regenerate", variant="secondary", size="sm", )
                     undo_btn = gr.Button("↩️ Undo", variant="secondary", size="sm", )
                     clear_btn = gr.Button("🗑️  Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
                     # stop_btn = gr.Button("停止生成", variant="stop", visible=False)
+                gr.Markdown(self_chat_doc)
             # 也叫 chat-assistant,
+            with gradio.Tab("Response Generator"):
                 with gr.Row():
+                    input_text_2 = gr.Textbox(show_label=False, placeholder="Please type user input", scale=7)
                     generate_btn_2 = gr.Button("Send", variant="primary")
                 with gr.Row():
                     retry_btn_2 = gr.Button("🔄  Regenerate", variant="secondary", size="sm", )
                     undo_btn_2 = gr.Button("↩️ Undo", variant="secondary", size="sm", )
                     clear_btn_2 = gr.Button("🗑️  Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
+                gr.Markdown(assistant_simulator_doc)
             #
+            with gradio.Tab("User Simulator"):
                 with gr.Row():
+                    input_text_3 = gr.Textbox(show_label=False, placeholder="Please type assistant response", scale=7)
                     generate_btn_3 = gr.Button("Send", variant="primary")
                 with gr.Row():
                     retry_btn_3 = gr.Button("🔄  Regenerate", variant="secondary", size="sm", )
                     clear_btn_3 = gr.Button("🗑️  Clear", variant="secondary", size="sm", )  # 🧹 Clear History (清除历史)
                 gr.Markdown(user_simulator_doc)
+        with gr.Column(variant="compact", scale=1, min_width=300):
             # with gr.Column():
             model = gr.Dropdown(
                 ["Qwen2-0.5B-Instruct", "llama3.1", "gemini"],
     slider_top_k.change(set_top_k, inputs=[slider_top_k])
+    demo.load(lambda: gr.update(value=random.choice(system_list)), None, system)
+# demo.queue().launch(share=False, server_name="0.0.0.0", debug=True)
 # demo.queue().launch(concurrency_count=1, max_size=5)
 demo.queue().launch()

app_util.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import json
 import gradio as gr
 from utils.logging_util import logger
-from models.cpp_qwen2 import bot
-# from models.hf_qwen2 import bot
 #
@@ -22,15 +22,16 @@ system_list = [
     "You are a helpful assistant.",
     "你是一个导游。",
     "你是一名投资经理。",
-    # "你是一名医生。",
-    # "你是一个英语老师。",
-    # "你是一个程序员。",
-    # "你是一个心理咨询师。",
-    # "你是一名AI写作助手。"
-    # "你是一名作家，擅长写小说。"
 ]
-bot.pre_cache_system(system_list)
 def generate_user_message(chatbot, history):
     if history and history[-1]["role"] == "user":
@@ -52,7 +53,6 @@ def generate_assistant_message(chatbot, history):
     auto-mode：query is None
     manual-mode：query 是用户输入
     """
-    logger.info(f"generating {json.dumps(history, ensure_ascii=False)}")
     user_content = history[-1]["content"]
     if history[-1]["role"] != "user":
         gr.Warning('You should generate or type user-input first.')
@@ -65,13 +65,12 @@ def generate_assistant_message(chatbot, history):
         assistant_tokens = bot.strip_stoptokens(assistant_tokens)
         history.append({"role": "assistant", "content": assistant_content, "tokens": assistant_tokens})
-        print(f"chatbot is {chatbot}")
-        print(f"history is {history}")
         yield chatbot, history
 def generate(chatbot, history):
-    logger.info(f"chatbot: {chatbot}; history: {history}")
     streamer = None
     if history[-1]["role"] in ["assistant", "system"]:
         streamer = generate_user_message(chatbot, history)

 import json
 import gradio as gr
 from utils.logging_util import logger
+from models.cpp_qwen2 import Qwen2Simulator as Bot
+# from models.hf_qwen2 import Qwen2Simulator as Bot
 #
     "You are a helpful assistant.",
     "你是一个导游。",
     "你是一名投资经理。",
+    "你是一名医生。",
+    "你是一个英语老师。",
+    "你是一个程序员。",
+    "你是一个心理咨询师。",
+    "你是一名AI写作助手。"
+    "你是一名作家，擅长写小说。"
 ]
+bot = Bot(system_list)
 def generate_user_message(chatbot, history):
     if history and history[-1]["role"] == "user":
     auto-mode：query is None
     manual-mode：query 是用户输入
     """
     user_content = history[-1]["content"]
     if history[-1]["role"] != "user":
         gr.Warning('You should generate or type user-input first.')
         assistant_tokens = bot.strip_stoptokens(assistant_tokens)
         history.append({"role": "assistant", "content": assistant_content, "tokens": assistant_tokens})
         yield chatbot, history
 def generate(chatbot, history):
+    request_param = json.dumps({'chatbot': chatbot, 'history': history}, ensure_ascii=False)
+    logger.info(f"request_param: {request_param}")
     streamer = None
     if history[-1]["role"] in ["assistant", "system"]:
         streamer = generate_user_message(chatbot, history)

config.py CHANGED Viewed

@@ -1,6 +1,7 @@
-MAX_SEQUENCE_LENGTH = 32768  # max_seq_len
 DEFAULT_MAX_NEW_TOKENS = 128
 DEFAULT_TOP_K = 100

+# MAX_SEQUENCE_LENGTH = 32768  # 消耗内存太多
+MAX_SEQUENCE_LENGTH = 8192  #
 DEFAULT_MAX_NEW_TOKENS = 128
 DEFAULT_TOP_K = 100

models/cpp_qwen2.py CHANGED Viewed

@@ -77,7 +77,7 @@ import config
 class Qwen2Simulator(Simulator):
-    def __init__(self):
         local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
         if os.path.exists(local_path):
             self.hf_tokenizer = AutoTokenizer.from_pretrained(
@@ -105,30 +105,37 @@ class Qwen2Simulator(Simulator):
                     f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
                     f"env[CACHE]={os.environ.get('CACHE', None)}")
-        self.stop_words = [
             "<|im_end|>",
             "<|im_start|>",
             "<|endoftext|>",
         ]
-        self.stop_tokens = self.tokenize("".join(self.stop_words))
         self.generation_kwargs = dict(
             temperature=config.DEFAULT_TEMPERATURE,
             top_p=config.DEFAULT_TOP_P,
             top_k=config.DEFAULT_TOP_K,
             max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
             repeat_penalty=1.1,
-            # qwen2-0.5b-chat 有时内容生成结束没有<|im_end|>，直接跟 <|im_start|>
-            stop=self.stop_words,
         )
         self.user_start_tokens = self.tokenize("<|im_start|>user\n")
         self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
         # self.llm.generate  .set_cache   .last_n_tokens_size  .reset  .ctx ._ctx
         # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
-        cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30) # 2G
         self.llm.set_cache(cache)
     def tokenize(self, text):
         return self.llm.tokenize(text.encode("utf-8"))
@@ -136,10 +143,10 @@ class Qwen2Simulator(Simulator):
         return self.llm.detokenize(tokens).decode("utf-8")
     def strip_stoptokens(self, tokens):
-        while tokens and tokens[0] in self.stop_tokens:
             logger.info(f"head-striping {tokens[0]} {self.detokenize([tokens[0]])}")
             tokens.pop(0)
-        while tokens and tokens[-1] in self.stop_tokens:
             logger.info(f"tail-striping {tokens[-1]} {self.detokenize([tokens[-1]])}")
             tokens.pop()
         return tokens
@@ -154,9 +161,12 @@ class Qwen2Simulator(Simulator):
         """
         if history[-1]['role'] in ["user"]:
             start_tokens = self.assistant_start_tokens
             suffix_tokens = self.user_start_tokens
         elif history[-1]['role'] in ["assistant", "system"]:
             start_tokens = self.user_start_tokens
             suffix_tokens = self.assistant_start_tokens
         input_ids = []
@@ -168,15 +178,16 @@ class Qwen2Simulator(Simulator):
                          + self.tokenize("<|im_end|>\n")
         input_ids += start_tokens
         if stream:
-            return self._stream_generate(input_ids, suffix_tokens)
         else:
             return self._generate(input_ids)
-    def _stream_generate(self, input_ids, suffix_tokens=None):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
         output = self.llm.create_completion(
             input_ids,
             stream=True,
             **self.generation_kwargs
         )
         # TODO: 检测finish reason，如果是length，则shift，并继续生成。
@@ -201,37 +212,40 @@ class Qwen2Simulator(Simulator):
         for system_prompt in system_list:
             logger.info(f"pre caching '{system_prompt}'")
             input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
-            output = self.llm.create_completion(
                 input_ids,
                 stream=False,
                 max_tokens=1,
                 top_k=1
             )
-            logger.info(f"cache size {self.llm.cache.cache_size}, process_mem: "
-                        f"{psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
         self._disable_cache()
     def post_cache(self, suffix_tokens):
         """ warmup for next turn generation
         :param suffix_tokens:
         :return:
         """
         if suffix_tokens:
             logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
             self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
             logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
     def _disable_cache(self):
         llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
         llama_cpp.Llama.save_state = lambda *args: None
-bot = Qwen2Simulator()
 if __name__ == "__main__":
     messages = [{"role": "system", "content": "你是一个导游。"}]
     generated_tokens = None
     print("######## requesting", messages)

 class Qwen2Simulator(Simulator):
+    def __init__(self, system_list=None):
         local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
         if os.path.exists(local_path):
             self.hf_tokenizer = AutoTokenizer.from_pretrained(
                     f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
                     f"env[CACHE]={os.environ.get('CACHE', None)}")
+        # qwen2-0.5b-chat 有时内容生成结束没有<|im_end|>，直接跟 <|im_start|>
+        self.assistant_stop_words = [
             "<|im_end|>",
             "<|im_start|>",
             "<|endoftext|>",
         ]
+        self.assistant_stop_tokens = self.tokenize("".join(self.assistant_stop_words))
+        self.user_stop_words = self.assistant_stop_words + ["？", "?"]
+        self.user_stop_tokens = self.tokenize("".join(self.user_stop_words))
+        logger.info(f"assistant_stop_tokens: {self.assistant_stop_tokens}")
+        logger.info(f"user_stop_tokens: {self.user_stop_tokens}")
         self.generation_kwargs = dict(
             temperature=config.DEFAULT_TEMPERATURE,
             top_p=config.DEFAULT_TOP_P,
             top_k=config.DEFAULT_TOP_K,
             max_tokens=config.DEFAULT_MAX_NEW_TOKENS,
             repeat_penalty=1.1,
         )
         self.user_start_tokens = self.tokenize("<|im_start|>user\n")
         self.assistant_start_tokens = self.tokenize("<|im_start|>assistant\n")
         # self.llm.generate  .set_cache   .last_n_tokens_size  .reset  .ctx ._ctx
         # cache = llama_cpp.LlamaDiskCache(capacity_bytes=cache_size)
+        cache = llama_cpp.LlamaRAMCache(capacity_bytes=2 << 30)  # 2G
         self.llm.set_cache(cache)
+        if system_list is not None:
+            self.pre_cache_system(system_list)
     def tokenize(self, text):
         return self.llm.tokenize(text.encode("utf-8"))
         return self.llm.detokenize(tokens).decode("utf-8")
     def strip_stoptokens(self, tokens):
+        while tokens and tokens[0] in self.assistant_stop_tokens:
             logger.info(f"head-striping {tokens[0]} {self.detokenize([tokens[0]])}")
             tokens.pop(0)
+        while tokens and tokens[-1] in self.assistant_stop_tokens:
             logger.info(f"tail-striping {tokens[-1]} {self.detokenize([tokens[-1]])}")
             tokens.pop()
         return tokens
         """
         if history[-1]['role'] in ["user"]:
             start_tokens = self.assistant_start_tokens
+            stop_words = self.assistant_stop_words
             suffix_tokens = self.user_start_tokens
         elif history[-1]['role'] in ["assistant", "system"]:
             start_tokens = self.user_start_tokens
+            stop_words = self.user_stop_words
             suffix_tokens = self.assistant_start_tokens
         input_ids = []
                          + self.tokenize("<|im_end|>\n")
         input_ids += start_tokens
         if stream:
+            return self._stream_generate(input_ids, stop_words, suffix_tokens)
         else:
             return self._generate(input_ids)
+    def _stream_generate(self, input_ids, stop_words, suffix_tokens=None):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
         output = self.llm.create_completion(
             input_ids,
             stream=True,
+            stop=stop_words,
             **self.generation_kwargs
         )
         # TODO: 检测finish reason，如果是length，则shift，并继续生成。
         for system_prompt in system_list:
             logger.info(f"pre caching '{system_prompt}'")
             input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
+            _output = self.llm.create_completion(
                 input_ids,
                 stream=False,
                 max_tokens=1,
                 top_k=1
             )
+            logger.info(
+                f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
+                f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
         self._disable_cache()
     def post_cache(self, suffix_tokens):
         """ warmup for next turn generation
         :param suffix_tokens:
         :return:
         """
+        logger.info(f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
+                    f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
         if suffix_tokens:
             logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
             self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
             logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
+        logger.info(f"cache size {self.llm.cache.cache_size}={self.llm.cache.cache_size / 1024 / 1024 / 1024:.2f} GB, "
+                    f"process_mem: {psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024:.2f} GB")
     def _disable_cache(self):
         llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
         llama_cpp.Llama.save_state = lambda *args: None
 if __name__ == "__main__":
+    bot = Qwen2Simulator()
     messages = [{"role": "system", "content": "你是一个导游。"}]
     generated_tokens = None
     print("######## requesting", messages)

models/hf_qwen2.py CHANGED Viewed

@@ -14,13 +14,15 @@ class Qwen2Simulator(Simulator):
         在传递 device_map 时，low_cpu_mem_usage 会自动设置为 True
         """
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name_or_path,
-            torch_dtype="auto",
-            device_map="auto"
-        )
-        self.model.eval()
         self.generation_kwargs = dict(
             do_sample=True,
             temperature=0.7,
@@ -93,11 +95,12 @@ class Qwen2Simulator(Simulator):
         return self.tokenizer.decode(response[0][input_ids_length:], skip_special_tokens=True)
-bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
 # bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": "you are a helpful assistant"},
         {"role": "user", "content": "hi, what your name"}

         在传递 device_map 时，low_cpu_mem_usage 会自动设置为 True
         """
+        self.tokenizer = None
+        # self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        self.model = None
+        # self.model = AutoModelForCausalLM.from_pretrained(
+        #     model_name_or_path,
+        #     torch_dtype="auto",
+        #     device_map="auto"
+        # )
+        # self.model.eval()
         self.generation_kwargs = dict(
             do_sample=True,
             temperature=0.7,
         return self.tokenizer.decode(response[0][input_ids_length:], skip_special_tokens=True)
 # bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
 if __name__ == "__main__":
+    bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
     messages = [
         {"role": "system", "content": "you are a helpful assistant"},
         {"role": "user", "content": "hi, what your name"}