hsuwill000 committed · Commit 1e44aa5 (verified) · 1 Parent(s): 04d3fa9

Update app.py

Files changed (1):
  1. app.py +30 -40
app.py CHANGED
@@ -1,7 +1,5 @@
 import huggingface_hub as hf_hub
-import time
 import openvino_genai as ov_genai
-import numpy as np
 import gradio as gr
 import re
 
@@ -17,50 +15,42 @@ pipe = ov_genai.LLMPipeline(model_path, device)
 tokenizer = pipe.get_tokenizer()
 tokenizer.set_chat_template(tokenizer.chat_template)
 
+# Rewrite generate_response to support streaming output
+def generate_response(prompt):
+    try:
+        response = ""
+        tokens_per_sec = "N/A"  # default value
 
-def generate_response(prompt, history=[]):  # Added history parameter
-    full_prompt = tokenizer.apply_chat_template(
-        messages=history + [{"role": "user", "content": prompt}],
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Create a generator function for streaming
-    def stream_generate():
-        full_response = ""
-        start_time = time.time()
-        token_count = 0
-
+        # Define the streaming callback
        def streamer(subword):
-            nonlocal full_response, token_count
-            full_response += subword
-            token_count += 1
-            yield full_response, f'{token_count / (time.time() - start_time):.2f}'
+            nonlocal response
+            response += subword  # accumulate the output
+            print(subword, end='', flush=True)  # echo to the console for live inspection
             return ov_genai.StreamingStatus.RUNNING
-
-        try:
-            pipe.generate(full_prompt, streamer=streamer, max_new_tokens=1024)
-            # Add user prompt and full response to the history
-            history.append({"role": "user", "content": prompt})
-            history.append({"role": "assistant", "content": full_response})
-        except Exception as e:
-            yield "An error occurred", f"Error while generating the response: {e}"
-
-        # Yield the final response and tokens/sec
-        yield full_response, f'{token_count / (time.time() - start_time):.2f}'
 
-    return stream_generate()
+        # Start streaming generation
+        pipe.start_chat()
+        result = pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
+        pipe.finish_chat()
+
+        # Compute tokens/sec from the result's performance metrics
+        tokens_per_sec = f"{result.perf_metrics.get_throughput().mean:.2f}"
+        return tokens_per_sec, response
 
-# Build the Gradio interface
-demo = gr.ChatInterface(
+    except Exception as e:
+        return "N/A", f"Error while generating the response: {e}"
+
+# Build the Gradio interface (kept unchanged)
+demo = gr.Interface(
     fn=generate_response,
-    title="Qwen3-0.6B-int4-ov ",
-    description="An inference app based on Qwen3-0.6B-int4-ov, supporting thinking-process separation and a GUI.",
-    additional_inputs=[
-        gr.Textbox(label="tokens/sec", value="", interactive=False)
-    ]
+    inputs=gr.Textbox(lines=1, label="Prompt"),
+    outputs=[
+        gr.Textbox(label="tokens/sec"),
+        gr.Textbox(label="Response")
+    ],
+    title="Qwen3-0.6B-int4-ov",
+    description="An inference app based on Qwen3-0.6B-int4-ov, supporting thinking-process separation and a GUI."
 )
 
-
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.launch()
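
For reference, a minimal standalone sketch of the streaming-plus-throughput pattern the updated app.py relies on. The model path ./Qwen3-0.6B-int4-ov and the CPU device are assumptions, and the throughput read follows openvino_genai's documented performance-metrics examples (metrics live on the generate result, not on the pipeline object):

import openvino_genai as ov_genai

# Assumption: an OpenVINO model already exported/downloaded to ./Qwen3-0.6B-int4-ov
pipe = ov_genai.LLMPipeline("./Qwen3-0.6B-int4-ov", "CPU")

chunks = []

def streamer(subword):
    # Collect each streamed subword; returning RUNNING lets generation continue.
    chunks.append(subword)
    return ov_genai.StreamingStatus.RUNNING

pipe.start_chat()
result = pipe.generate("Why is the sky blue?", streamer=streamer, max_new_tokens=100)
pipe.finish_chat()

print("".join(chunks))
# Throughput is exposed on the result's perf_metrics as a mean/std pair.
print(f"tokens/sec: {result.perf_metrics.get_throughput().mean:.2f}")

Reading throughput from perf_metrics avoids hand-timing the token loop, which is why this commit can drop the time and numpy imports.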