xu song committed
Commit · b70508d
1 Parent(s): 73119ac
update
Browse files:
- app.py +3 -3
- models/cpp_qwen2.py +14 -11
app.py
CHANGED
```diff
@@ -48,7 +48,7 @@ with gr.Blocks() as demo:
         show_share_button=True,
         avatar_images=("assets/man.png", "assets/bot.png"))
 
-    gr.Textbox("For faster inference, you can build locally with ")
+    # gr.Textbox("For faster inference, you can build locally with ")
     # ss
     with gradio.Tab("Self Chat"):
         input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
@@ -64,7 +64,7 @@ with gr.Blocks() as demo:
         visible=True)
 
     # also called chat-assistant,
-    with gradio.Tab("Response Generator"):
+    with gradio.Tab("Response Generator", visible=False):
         with gr.Row():
             input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
             generate_btn_2 = gr.Button("Send", variant="primary")
@@ -75,7 +75,7 @@ with gr.Blocks() as demo:
     gr.Markdown("Response simulator is the most commonly used chatbot.")
 
    #
-    with gradio.Tab("User Simulator"):
+    with gradio.Tab("User Simulator", visible=False):
         with gr.Row():
             input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
             generate_btn_3 = gr.Button("Send", variant="primary")
```
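In short: this commit comments out the unfinished "build locally" hint and hides the "Response Generator" and "User Simulator" tabs by passing `visible=False` to `gradio.Tab`, leaving only "Self Chat" visible. A minimal sketch of the pattern, assuming a Gradio version whose `Tab` accepts `visible`:

```python
# Minimal sketch, assuming gr.Tab() supports visible= (Gradio 4.x-style API).
import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Self Chat"):                          # shown
        gr.Textbox(show_label=False, placeholder="...")
    with gr.Tab("Response Generator", visible=False):  # hidden, wiring kept
        gr.Textbox(show_label=False, placeholder="Please type your input")

demo.launch()
```

Hiding a tab this way keeps its components and event handlers in place, so re-enabling it later is a one-argument change.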
models/cpp_qwen2.py
CHANGED
```diff
@@ -188,22 +188,18 @@ class Qwen2Simulator(Simulator):
         logger.info(
             f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
-        #
-
-        # <|im_end|>\n
-        logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
-        self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
-        logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
+        #
+        self.post_cache(suffix_tokens)
 
     def pre_cache_system(self, system_list):
-        """
+        """ warmup for system prompt
         :param system_list:
         :return:
         """
         logger.info(f"cache size {self.llm.cache.cache_size}")
         for system_prompt in system_list:
-            logger.info(f"pre caching {system_prompt}")
-            input_ids = self.tokenize(f"<|im_start|>system{system_prompt}<|im_end|>\n<|im_start|>user\n")
+            logger.info(f"pre caching '{system_prompt}'")
+            input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
             output = self.llm.create_completion(
                 input_ids,
                 stream=False,
```
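Two things change in this hunk: the inline end-of-turn warmup is factored out into a `post_cache` method (added in the next hunk), and the system-prompt template gains the newline that ChatML places between the role tag and its content. A sketch of the corrected prefix, with an illustrative prompt value:

```python
# Illustrative only: the ChatML prefix that pre_cache_system now tokenizes.
system_prompt = "You are a helpful assistant."
prefix = (
    "<|im_start|>system\n"          # the "\n" after the role tag is the fix
    f"{system_prompt}<|im_end|>\n"
    "<|im_start|>user\n"            # left open: the next user turn extends it
)
```

Every conversation that uses the same system prompt shares this prefix, so evaluating it once up front lets later requests skip recomputing it.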
```diff
@@ -215,8 +211,15 @@ class Qwen2Simulator(Simulator):
         # disable cache after
         llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
 
-    def
-
+    def post_cache(self, suffix_tokens):
+        """ warmup for next turn generation
+        :param suffix_tokens:
+        :return:
+        """
+        if suffix_tokens:
+            logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
+            self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
+            logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
 
 
 bot = Qwen2Simulator()
```
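The new `post_cache` makes the warmup explicit: `Llama.eval` runs a forward pass over the given tokens without sampling anything, so the KV cache and `self.llm.n_tokens` advance past the end-of-turn marker (ids 151645 and 198 are `<|im_end|>` and `"\n"` per the inline comment) plus whatever suffix the next turn is known to start with. A minimal sketch of the same idea, assuming llama-cpp-python's `eval`/`n_tokens`/`tokenize` API; the GGUF path is a placeholder:

```python
# Minimal sketch, assuming llama-cpp-python; the model path is a placeholder.
from llama_cpp import Llama

llm = Llama(model_path="qwen2-instruct-q8_0.gguf", n_ctx=2048, verbose=False)

# Tokens every next turn is known to begin with: close the current turn,
# then open the user turn. special=True keeps <|im_start|> as one token.
IM_END, NL = 151645, 198  # Qwen2 ids, per the comment in this diff
suffix = llm.tokenize(b"<|im_start|>user\n", add_bos=False, special=True)

llm.eval([IM_END, NL] + suffix)  # forward pass only; fills the KV cache
print(llm.n_tokens)              # position advanced; the next call reuses it
```

The `if suffix_tokens:` guard keeps `post_cache` a no-op when nothing is known about the next turn, and the `LlamaRAMCache.__setitem__` monkeypatch above it (commented "disable cache after") presumably freezes the RAM cache once the system prompts are stored, so later generations cannot overwrite them.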