hsuwill000 committed on
Commit 6bda22b · verified · 1 Parent(s): 1353c4b

Update app.py

Files changed (1)
  1. app.py +24 -58
app.py CHANGED
@@ -1,86 +1,52 @@
  import huggingface_hub as hf_hub
  import gradio as gr
  import openvino_genai as ov_genai
  import time

- # Load the OpenVINO model
  model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
  model_path = "Qwen3-0.6B-int4-ov"
  hf_hub.snapshot_download(model_id, local_dir=model_path)
-
  pipe = ov_genai.LLMPipeline(model_path, "CPU")
  pipe.start_chat()

- # Called whenever a new token is produced
- def build_streamer(callback):
-     def streamer(subword):
-         callback(subword)
-         return ov_genai.StreamingStatus.RUNNING
-     return streamer
-
- # Chat handler
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
      prompt = system_message + "\n"
      for user_msg, bot_msg in history:
          prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n"
      prompt += f"<|user|>\n{message}\n<|assistant|>\n"

-     response = ""
-
-     # Wrap the streamer in a generator
-     def generator():
-         nonlocal response
-         start_time = time.time()

-         def collect_output(subword):
-             nonlocal response
-             response += subword
-             yield_fn.send(subword)  # hand the token to the outer generator
-
-         yield_fn = yield  # let the first yield pass in the collector

-         # Run generation
-         gen_result = pipe.generate(
              [prompt],
-             streamer=build_streamer(collect_output),
              max_new_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p
          )

-         elapsed = time.time() - start_time
-         tps = gen_result.perf_metrics.get_throughput().mean
-         print(f"\n--- TPS --- {tps:.2f} tokens/s --- {elapsed:.2f} sec")
-         yield_fn.close()  # close the generator

-     # Build the streaming generator
-     def streaming_generator():
-         gen = generator()
-         try:
-             next(gen)  # prime the generator
-             while True:
-                 token = (yield)
-                 gen.send(token)
-                 yield token
-         except StopIteration:
-             return
-
-     # Streaming to Gradio
-     stream = streaming_generator()
-     next(stream)  # prime the stream
-     for token in stream:
-         yield response
-
- # Build the Gradio ChatInterface
  demo = gr.ChatInterface(
-     respond,
      additional_inputs=[
          gr.Textbox(value="You are a helpful assistant.", label="System message"),
          gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
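
The version removed above tries to stream by wiring nested coroutines together, but pipe.generate blocks until generation finishes and nothing ever drives generator() from outside, so yield_fn.send(...) has no live target and no tokens reach Gradio mid-generation. The rewrite below bridges the blocking, callback-driven generate call to a plain Python generator using a worker thread and a queue.Queue. Here is a minimal, self-contained sketch of that bridge, where blocking_generate is a hypothetical stand-in for pipe.generate(..., streamer=...):

import queue
import threading

def blocking_generate(on_token):
    # Hypothetical stand-in for a blocking, callback-driven API.
    for tok in ["stream", "ing ", "works"]:
        on_token(tok)

def stream_tokens():
    q = queue.Queue()

    def worker():
        try:
            blocking_generate(q.put)  # producer: the callback pushes tokens
        finally:
            q.put(None)               # sentinel: always signal completion

    threading.Thread(target=worker).start()
    while (tok := q.get()) is not None:
        yield tok                     # consumer: a plain generator

print("".join(stream_tokens()))  # -> streaming works

Enqueueing the sentinel inside a finally block keeps the consumer's q.get() from hanging if generation raises; that is a defensive extra over the committed code, which enqueues it only after a successful generate.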
 
 
  import huggingface_hub as hf_hub
  import gradio as gr
  import openvino_genai as ov_genai
+ import threading
+ import queue
  import time

+ # Initialize the OpenVINO model
  model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
  model_path = "Qwen3-0.6B-int4-ov"
  hf_hub.snapshot_download(model_id, local_dir=model_path)
  pipe = ov_genai.LLMPipeline(model_path, "CPU")
  pipe.start_chat()

+ def respond(message, history, system_message, max_tokens, temperature, top_p):
      prompt = system_message + "\n"
      for user_msg, bot_msg in history:
          prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n{bot_msg}\n"
      prompt += f"<|user|>\n{message}\n<|assistant|>\n"

+     # Stream through a Queue fed by a worker thread
+     q = queue.Queue()

+     def streamer(subword):
+         q.put(subword)
+         return ov_genai.StreamingStatus.RUNNING

+     def worker():
+         pipe.generate(
              [prompt],
+             streamer=streamer,
              max_new_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p
          )
+         q.put(None)  # sentinel marking the end of generation

+     threading.Thread(target=worker).start()

+     response = ""
+     while True:
+         token = q.get()
+         if token is None:
+             break
+         response += token
+         yield response  # stream the partial response back to the Gradio UI
+
+ # Gradio interface
  demo = gr.ChatInterface(
+     fn=respond,
      additional_inputs=[
          gr.Textbox(value="You are a helpful assistant.", label="System message"),
          gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
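
When the function handed to gr.ChatInterface is a generator, Gradio re-renders the assistant message with each yielded value, which is why respond yields the cumulative response string rather than individual tokens. A minimal sketch of that contract, where echo_stream is a hypothetical stand-in for respond:

import gradio as gr

def echo_stream(message, history):
    partial = ""
    for ch in f"You said: {message}":
        partial += ch
        yield partial  # each yield replaces the entire assistant message

gr.ChatInterface(fn=echo_stream).launch()

Because every yield carries the full text so far, the UI stays consistent even if an intermediate update is dropped.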