hsuwill000 committed on
Commit
3592035
·
verified ·
1 Parent(s): 2ab439f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -33
app.py CHANGED
@@ -18,41 +18,43 @@ tokenizer = pipe.get_tokenizer()
18
  tokenizer.set_chat_template(tokenizer.chat_template)
19
 
20
 
21
def generate_response(prompt):
    """Run the OpenVINO GenAI pipeline on *prompt*.

    Returns:
        A ``(tokens_per_second, generated)`` pair. ``tokens_per_second`` is
        the mean decode throughput formatted to two decimals; ``generated``
        is whatever ``pipe.generate`` returns for the single-prompt batch.
        On failure, both elements carry an error message so the two Gradio
        output boxes each still receive exactly one value.
    """
    try:
        generated = pipe.generate([prompt], max_length=1024)
        # Mean throughput reported by the pipeline's performance metrics.
        tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'
        return tokenpersec, generated
    except Exception as e:
        # BUG FIX: the original returned THREE values here while the success
        # path returns TWO; the Interface wires exactly two outputs, so the
        # error tuple would mis-map. Return two values to match.
        return "發生錯誤", f"生成回應時發生錯誤:{e}"
41
-
42
 
43
# Assemble the Gradio front-end: a single prompt box feeding two read-only
# outputs (decode throughput and the model's reply).
prompt_box = gr.Textbox(lines=5, label="輸入提示 (Prompt)")
output_boxes = [
    gr.Textbox(label="tokens/sec"),
    gr.Textbox(label="回應"),
]

demo = gr.Interface(
    fn=generate_response,
    title="Qwen3-0.6B-int4-ov ",
    description="基於 Qwen3-0.6B-int4-ov 推理應用,支援思考過程分離與 GUI。",
    inputs=prompt_box,
    outputs=output_boxes,
)

if __name__ == "__main__":
    demo.launch()
 
18
  tokenizer.set_chat_template(tokenizer.chat_template)
19
 
20
 
21
def generate_response(prompt, history=None):
    """Stream one chat completion for *prompt* and report decode throughput.

    Args:
        prompt: User message handed to the OpenVINO GenAI pipeline.
        history: Optional list of ``(prompt, response)`` pairs. It is
            appended to in place, so a caller-held ``gr.State`` list keeps
            accumulating turns across calls.

    Returns:
        ``(tokens_per_second_str, history)`` where *history* now includes
        this turn.
    """
    # BUG FIX: the original used a mutable default (history=[]), which is
    # shared across every call that omits the argument — conversation turns
    # would silently leak between unrelated sessions.
    if history is None:
        history = []

    full_response = ""
    token_count = 0
    start_time = time.time()

    def streamer(subword):
        # Accumulate streamed subwords and count them for the tok/s figure.
        nonlocal full_response, token_count
        full_response += subword
        token_count += 1
        return ov_genai.StreamingStatus.RUNNING

    # NOTE(review): ov_genai's start_chat expects a system-message string;
    # passing the history list here may be a latent bug — verify against
    # the openvino_genai LLMPipeline API.
    pipe.start_chat(history)
    pipe.generate(prompt, streamer=streamer, max_new_tokens=1024)
    pipe.finish_chat()

    elapsed_time = time.time() - start_time
    # Guard against a zero-length interval on very fast generations.
    tokenpersec = f'{token_count / elapsed_time:.2f}' if elapsed_time > 0 else "0.00"

    history.append((prompt, full_response))
    return tokenpersec, history
42
 
43
# Build the Gradio chat UI: conversation view, prompt input and a
# tokens-per-second readout, wired to the streaming generate_response.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    state = gr.State([])  # conversation history, mutated in place per turn
    msg = gr.Textbox(label="輸入提示 (Prompt)")
    # FIX: name the throughput box instead of instantiating it inline in the
    # msg.submit(...) outputs list — its placement and identity are explicit.
    tokens_per_sec = gr.Textbox(label="tokens/sec")

    def respond(message, chat_history):
        """Run one chat turn; clear the input box and refresh the chatbot."""
        tokenpersec, chat_history = generate_response(message, chat_history)
        # (removed an unused local that read chat_history[-1][1] and was
        # never referenced)
        return "", chat_history, tokenpersec

    msg.submit(respond, [msg, state], [msg, chatbot, tokens_per_sec])

    # NOTE(review): plain attribute assignment on a Blocks instance does not
    # render a page title/description the way gr.Interface kwargs do —
    # confirm this shows up anywhere before relying on it.
    demo.title = "Qwen3-0.6B-int4-ov Streaming Demo"
    demo.description = "基於 Qwen3-0.6B-int4-ov 推理應用,支援流式輸出與 GUI。"

if __name__ == "__main__":
    demo.launch()