hsuwill000 committed (verified)
Commit 7c5f0ef · Parent: 1e44aa5

Update app.py

Files changed (1):
  1. app.py +16 -15
app.py CHANGED
@@ -1,7 +1,6 @@
 import huggingface_hub as hf_hub
 import openvino_genai as ov_genai
 import gradio as gr
-import re
 
 # Download the model
 model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
@@ -15,35 +14,37 @@ pipe = ov_genai.LLMPipeline(model_path, device)
 tokenizer = pipe.get_tokenizer()
 tokenizer.set_chat_template(tokenizer.chat_template)
 
-# Modified generate_response to support streaming output
+# Streaming generate_response function
 def generate_response(prompt):
     try:
         response = ""
-        tokens_per_sec = "N/A"  # default value
-
-        # Define the streaming callback function
+
+        # Define the streaming callback
         def streamer(subword):
-            nonlocal response
-            response += subword  # concatenate the output
-            print(subword, end='', flush=True)  # log to the console for real-time inspection
+            nonlocal response  # reference the outer variable `response`
+            response += subword  # concatenate the generated output
+            print(subword, end='', flush=True)  # print to the console for easy observation
             return ov_genai.StreamingStatus.RUNNING
 
-        # Start streaming generation
+        # Use streaming generation
         pipe.start_chat()
-        pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
+        generated = pipe.generate([prompt], streamer=streamer, max_length=1024)
        pipe.finish_chat()
 
-        # Compute tokens/sec from the performance metrics
-        tokens_per_sec = f"{pipe.get_throughput():.2f}"
-        return tokens_per_sec, response
+        # Compute the number of tokens generated per second
+        token_per_sec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
+
+        # Return the performance metric and the full generated result
+        return token_per_sec, response
 
     except Exception as e:
+        # Error handling
         return "N/A", f"Error while generating the response: {e}"
 
-# Build the Gradio interface (unchanged)
+# Gradio interface (unchanged)
 demo = gr.Interface(
     fn=generate_response,
-    inputs=gr.Textbox(lines=1, label="Input prompt"),
+    inputs=gr.Textbox(lines=5, label="Input prompt"),
     outputs=[
         gr.Textbox(label="tokens/sec"),
         gr.Textbox(label="Response")
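
For context on the new throughput readout: openvino_genai attaches per-call performance metrics to the value returned by pipe.generate(), and get_throughput() reports a mean/std pair in tokens per second. Below is a minimal standalone sketch of that pattern; the device string, local directory name, prompt, and max_length value are illustrative assumptions, not part of this commit.

import huggingface_hub as hf_hub
import openvino_genai as ov_genai

model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"  # assumed local directory name

# Fetch the int4 model once from the Hugging Face Hub.
hf_hub.snapshot_download(model_id, local_dir=model_path)

pipe = ov_genai.LLMPipeline(model_path, "CPU")  # device is an assumption

def streamer(subword):
    # Print each decoded chunk as soon as it arrives.
    print(subword, end='', flush=True)
    return ov_genai.StreamingStatus.RUNNING

pipe.start_chat()
results = pipe.generate(["Hello, who are you?"], streamer=streamer, max_length=256)
pipe.finish_chat()

# The returned results carry perf_metrics; get_throughput() yields
# tokens/sec with .mean and .std fields, as read by the commit.
print(f"\n{results.perf_metrics.get_throughput().mean:.2f} tokens/sec")

Wrapping the prompt in a single-element list mirrors the diff; the resulting object exposes perf_metrics alongside the decoded texts.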