Update app.py
app.py CHANGED
@@ -1,7 +1,5 @@
 import huggingface_hub as hf_hub
-import time
 import openvino_genai as ov_genai
-import numpy as np
 import gradio as gr
 import re
 
@@ -17,50 +15,42 @@ pipe = ov_genai.LLMPipeline(model_path, device)
 tokenizer = pipe.get_tokenizer()
 tokenizer.set_chat_template(tokenizer.chat_template)
 
+# Modify the generate_response function to support streaming output
+def generate_response(prompt):
+    try:
+        response = ""
+        tokens_per_sec = "N/A"  # default value
 
-
-    full_prompt = tokenizer.apply_chat_template(
-        messages=history + [{"role": "user", "content": prompt}],
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Create a generator function for streaming
-    def stream_generate():
-        full_response = ""
-        start_time = time.time()
-        token_count = 0
-
+        # Define the streaming callback
         def streamer(subword):
-            nonlocal full_response, token_count
-            full_response += subword
-            token_count += 1
-            yield full_response, f'{token_count / (time.time() - start_time):.2f}'
+            nonlocal response
+            response += subword  # append each streamed chunk
+            print(subword, end='', flush=True)  # log to the console so output can be checked live
             return ov_genai.StreamingStatus.RUNNING
-
-        try:
-            pipe.generate(full_prompt, streamer=streamer, max_new_tokens=1024)
-            # Add the user prompt and the full response to the history
-            history.append({"role": "user", "content": prompt})
-            history.append({"role": "assistant", "content": full_response})
-        except Exception as e:
-            yield "An error occurred", f"Error while generating the response: {e}"
-
-        # Yield the final response and tokens/sec
-        yield full_response, f'{token_count / (time.time() - start_time):.2f}'
 
-
+        # Run streaming generation
+        pipe.start_chat()
+        result = pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
+        pipe.finish_chat()
+
+        # Compute tokens/sec from the generation's performance metrics
+        tokens_per_sec = f"{result.perf_metrics.get_throughput().mean:.2f}"
+        return tokens_per_sec, response
 
-
-
+    except Exception as e:
+        return "N/A", f"Error while generating the response: {e}"
+
+# Build the Gradio interface (kept unchanged)
+demo = gr.Interface(
     fn=generate_response,
-
-
-
-    gr.Textbox(label="
-    ]
+    inputs=gr.Textbox(lines=1, label="Prompt"),
+    outputs=[
+        gr.Textbox(label="tokens/sec"),
+        gr.Textbox(label="Response")
+    ],
+    title="Qwen3-0.6B-int4-ov",
+    description="An inference app based on Qwen3-0.6B-int4-ov, supporting thinking-process separation and a GUI."
 )
 
-
 if __name__ == "__main__":
-    demo.
+    demo.launch()
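A note on the tokens/sec figure: openvino_genai exposes performance counters on the result returned by generate() (a PerfMetrics object under result.perf_metrics), not on the LLMPipeline itself, which is why the hunk above captures the result and reads the throughput from it. A minimal sketch of those metrics, assuming the same pipe built earlier in app.py:

# Sketch: read OpenVINO GenAI performance metrics from a generate() result.
result = pipe.generate("What is OpenVINO?", max_new_tokens=100)
metrics = result.perf_metrics
print(f"Time to first token: {metrics.get_ttft().mean:.2f} ms")
print(f"Throughput: {metrics.get_throughput().mean:.2f} tokens/s")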
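As committed, generate_response returns only after generation finishes, so the Gradio textbox stays empty until the full answer is ready; the print() streaming shows up only in the Space logs. Gradio also accepts generator functions that yield partial outputs, so true in-UI streaming is possible. Below is a sketch of one way to bridge the blocking generate() call to such a generator with a stdlib queue and a worker thread (generate_stream is a hypothetical name; pipe is the pipeline built earlier in app.py):

import queue
import threading

import openvino_genai as ov_genai

def generate_stream(prompt):
    q = queue.Queue()

    def streamer(subword):
        q.put(subword)  # hand each decoded chunk to the UI thread
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        # generate() blocks until completion, so run it off the request thread
        pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
        q.put(None)  # sentinel: generation finished

    threading.Thread(target=worker, daemon=True).start()

    response = ""
    while (chunk := q.get()) is not None:
        response += chunk
        yield response  # Gradio re-renders the output on every yield

Wiring generate_stream into gr.Interface with a single Textbox output then updates the response live as tokens arrive.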
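One behavioral difference from the removed code: the old version threaded an explicit history list through tokenizer.apply_chat_template, while the new version wraps every request in pipe.start_chat() / pipe.finish_chat(), so each prompt starts a fresh conversation. If multi-turn memory is wanted, the session can instead stay open across turns, sketched here against the same pipe:

# Sketch: one chat session spanning several turns; finish_chat() clears the state.
pipe.start_chat()
for turn in ["Hello!", "What is OpenVINO?"]:
    print(pipe.generate(turn, max_new_tokens=100))
pipe.finish_chat()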