xu song committed
Commit · b70508d
1 Parent(s): 73119ac
update
Browse files:
- app.py +3 -3
- models/cpp_qwen2.py +14 -11
app.py
CHANGED
```diff
@@ -48,7 +48,7 @@ with gr.Blocks() as demo:
         show_share_button=True,
         avatar_images=("assets/man.png", "assets/bot.png"))
 
-    gr.Textbox("For faster inference, you can build locally with ")
+    # gr.Textbox("For faster inference, you can build locally with ")
     # ss
     with gradio.Tab("Self Chat"):
         input_text_1 = gr.Textbox(show_label=False, placeholder="...", lines=10, visible=False)
@@ -64,7 +64,7 @@ with gr.Blocks() as demo:
         visible=True)
 
     # also called chat-assistant,
-    with gradio.Tab("Response Generator"):
+    with gradio.Tab("Response Generator", visible=False):
         with gr.Row():
             input_text_2 = gr.Textbox(show_label=False, placeholder="Please type your input", scale=7)
             generate_btn_2 = gr.Button("Send", variant="primary")
@@ -75,7 +75,7 @@ with gr.Blocks() as demo:
     gr.Markdown("Response simulator is the most commonly used chatbot.")
 
    #
-    with gradio.Tab("User Simulator"):
+    with gradio.Tab("User Simulator", visible=False):
         with gr.Row():
             input_text_3 = gr.Textbox(show_label=False, placeholder="Please type your response", scale=7)
             generate_btn_3 = gr.Button("Send", variant="primary")
```
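In short: this commit comments out the unfinished "build locally" hint and hides the "Response Generator" and "User Simulator" tabs by passing `visible=False` to `gradio.Tab`, leaving only "Self Chat" visible. A minimal sketch of the pattern, assuming a Gradio version whose `Tab` accepts `visible`:

```python
# Minimal sketch, assuming gr.Tab() supports visible= (Gradio 4.x-style API).
import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Self Chat"):                          # shown
        gr.Textbox(show_label=False, placeholder="...")
    with gr.Tab("Response Generator", visible=False):  # hidden, wiring kept
        gr.Textbox(show_label=False, placeholder="Please type your input")

demo.launch()
```

Hiding a tab this way keeps its components and event handlers in place, so re-enabling it later is a one-argument change.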
models/cpp_qwen2.py
CHANGED
```diff
@@ -188,22 +188,18 @@ class Qwen2Simulator(Simulator):
         logger.info(
             f'finish_reason {stream["choices"][0]["finish_reason"]} with text: {stream["choices"][0]["text"]}')
 
-        #
-
-        # <|im_end|>\n
-        logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
-        self.llm.eval([151645, 198] + suffix_tokens)  # increases n_tokens
-        logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
+        #
+        self.post_cache(suffix_tokens)
 
     def pre_cache_system(self, system_list):
-        """
+        """ warmup for system prompt
         :param system_list:
         :return:
         """
         logger.info(f"cache size {self.llm.cache.cache_size}")
         for system_prompt in system_list:
-            logger.info(f"pre caching {system_prompt}")
-            input_ids = self.tokenize(f"<|im_start|>system{system_prompt}<|im_end|>\n<|im_start|>user\n")
+            logger.info(f"pre caching '{system_prompt}'")
+            input_ids = self.tokenize(f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n")
             output = self.llm.create_completion(
                 input_ids,
                 stream=False,
```
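Two things change in this hunk: the inline end-of-turn warmup is factored out into a `post_cache` method (added in the next hunk), and the system-prompt template gains the newline that ChatML places between the role tag and its content. A sketch of the corrected prefix, with an illustrative prompt value:

```python
# Illustrative only: the ChatML prefix that pre_cache_system now tokenizes.
system_prompt = "You are a helpful assistant."
prefix = (
    "<|im_start|>system\n"          # the "\n" after the role tag is the fix
    f"{system_prompt}<|im_end|>\n"
    "<|im_start|>user\n"            # left open: the next user turn extends it
)
```

Every conversation that uses the same system prompt shares this prefix, so evaluating it once up front lets later requests skip recomputing it.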
```diff
@@ -215,8 +211,15 @@ class Qwen2Simulator(Simulator):
         # disable cache after
         llama_cpp.LlamaRAMCache.__setitem__ = lambda *args: None
 
-    def
-
+    def post_cache(self, suffix_tokens):
+        """ warmup for next turn generation
+        :param suffix_tokens:
+        :return:
+        """
+        if suffix_tokens:
+            logger.info(f"before warmup: n_tokens = {self.llm.n_tokens}")
+            self.llm.eval([151645, 198] + suffix_tokens)  # <|im_end|>\n
+            logger.info(f"after warmup: n_tokens = {self.llm.n_tokens}")
 
 
 bot = Qwen2Simulator()
```
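The new `post_cache` makes the warmup explicit: `Llama.eval` runs a forward pass over the given tokens without sampling anything, so the KV cache and `self.llm.n_tokens` advance past the end-of-turn marker (ids 151645 and 198 are `<|im_end|>` and `"\n"` per the inline comment) plus whatever suffix the next turn is known to start with. A minimal sketch of the same idea, assuming llama-cpp-python's `eval`/`n_tokens`/`tokenize` API; the GGUF path is a placeholder:

```python
# Minimal sketch, assuming llama-cpp-python; the model path is a placeholder.
from llama_cpp import Llama

llm = Llama(model_path="qwen2-instruct-q8_0.gguf", n_ctx=2048, verbose=False)

# Tokens every next turn is known to begin with: close the current turn,
# then open the user turn. special=True keeps <|im_start|> as one token.
IM_END, NL = 151645, 198  # Qwen2 ids, per the comment in this diff
suffix = llm.tokenize(b"<|im_start|>user\n", add_bos=False, special=True)

llm.eval([IM_END, NL] + suffix)  # forward pass only; fills the KV cache
print(llm.n_tokens)              # position advanced; the next call reuses it
```

The `if suffix_tokens:` guard keeps `post_cache` a no-op when nothing is known about the next turn, and the `LlamaRAMCache.__setitem__` monkeypatch above it (commented "disable cache after") presumably freezes the RAM cache once the system prompts are stored, so later generations cannot overwrite them.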