import huggingface_hub as hf_hub
import time
import openvino_genai as ov_genai
import numpy as np
import gradio as gr
import re

# Download the model
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"
hf_hub.snapshot_download(model_id, local_dir=model_path, local_dir_use_symlinks=False)

# Build the inference pipeline
device = "CPU"
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)


def generate_response(prompt, history):
    # gr.ChatInterface manages the conversation itself and passes it in as
    # `history` (a list of {"role": ..., "content": ...} dicts with type="messages").
    full_prompt = tokenizer.apply_chat_template(
        messages=history + [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

    full_response = ""
    token_count = 0
    start_time = time.time()

    # Streamer callback: accumulate each generated subword and tell the
    # pipeline to keep going. It must return a StreamingStatus, not yield.
    def streamer(subword):
        nonlocal full_response, token_count
        full_response += subword
        token_count += 1
        return ov_genai.StreamingStatus.RUNNING

    try:
        pipe.generate(full_prompt, streamer=streamer, max_new_tokens=1024)
    except Exception as e:
        return "An error occurred", f"Error while generating the response: {e}"

    # Return the full response together with the measured tokens/sec
    return full_response, f"{token_count / (time.time() - start_time):.2f}"


# Build the Gradio interface
demo = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title="Qwen3-0.6B-int4-ov",
    description="An inference app based on Qwen3-0.6B-int4-ov, with thinking-process separation and a GUI.",
    # additional_outputs (Gradio 5+) maps the second return value of
    # generate_response onto this read-only textbox.
    additional_outputs=[
        gr.Textbox(label="tokens/sec", value="", interactive=False)
    ],
)

if __name__ == "__main__":
    demo.queue().launch()
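
Because the streamer callback above only accumulates text, the reply appears in the chat window once generation has finished. If you want token-by-token updates, the callback itself cannot yield to Gradio, so one option is to run pipe.generate on a background thread and drain the subwords through a queue. The following is only a sketch under that assumption; it reuses `pipe` and `tokenizer` from above, and the helper name `generate_response_streaming` is illustrative, not part of the original app.

# Sketch: token-by-token streaming via a background thread and a queue.
# Assumes the same `pipe`, `tokenizer` and prompt construction as above.
import queue
import threading


def generate_response_streaming(prompt, history):
    full_prompt = tokenizer.apply_chat_template(
        messages=history + [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )

    q = queue.Queue()
    _DONE = object()  # sentinel marking the end of generation

    def streamer(subword):
        q.put(subword)                          # hand each subword to the UI thread
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        try:
            pipe.generate(full_prompt, streamer=streamer, max_new_tokens=1024)
        finally:
            q.put(_DONE)

    threading.Thread(target=worker, daemon=True).start()

    full_response = ""
    token_count = 0
    start_time = time.time()
    while True:
        item = q.get()
        if item is _DONE:
            break
        full_response += item
        token_count += 1
        # Each yield updates the chat bubble and the tokens/sec textbox.
        yield full_response, f"{token_count / (time.time() - start_time):.2f}"

Passing this generator as fn to gr.ChatInterface (instead of generate_response) keeps the rest of the interface definition unchanged.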
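
The description mentions separating the model's thinking process, and `re` is imported for that purpose, but the script above never actually strips the reasoning from the reply. Below is a minimal sketch, assuming Qwen3 in thinking mode wraps its reasoning in `<think>...</think>` tags; the helper name `split_thinking` is illustrative only.

def split_thinking(text):
    """Split a Qwen3 reply into (thinking, answer).

    Assumes the reasoning is wrapped in <think>...</think>; if no such
    block is present, the whole text is treated as the answer.
    """
    match = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    if match is None:
        return "", text.strip()
    thinking = match.group(1).strip()
    answer = (text[:match.start()] + text[match.end():]).strip()
    return thinking, answer

Calling split_thinking(full_response) before returning lets you show only the final answer in the chat and, if desired, surface the reasoning elsewhere in the interface.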