import huggingface_hub as hf_hub
import openvino_genai as ov_genai
import gradio as gr
import re

# Download the pre-quantized INT4 OpenVINO model from the Hugging Face Hub
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"

hf_hub.snapshot_download(model_id, local_dir=model_path)
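# (Optional) Other chat models can be exported to the same INT4 OpenVINO
# format with Optimum — a sketch, assuming optimum[openvino] is installed;
# not needed here since the repo above is already converted:
#   optimum-cli export openvino --model Qwen/Qwen3-0.6B --weight-format int4 Qwen3-0.6B-int4-ov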

# Build the inference pipeline
device = "CPU"  # "GPU" or "AUTO" also work when a supported accelerator is available
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
# Explicitly (re)apply the tokenizer's chat template to keep prompt formatting consistent.
tokenizer.set_chat_template(tokenizer.chat_template)


def generate_response(prompt):
    full_response = ""  # accumulates the streamed subwords

    def streamer(subword):
        # Called by the pipeline for every decoded subword; it must return a
        # StreamingStatus, so it cannot be a generator (no yield).
        nonlocal full_response
        full_response += subword
        return ov_genai.StreamingStatus.RUNNING

    try:
        # Streaming generation: subwords are appended to full_response as
        # they are produced.
        generated = pipe.generate(prompt, streamer=streamer, max_new_tokens=100)
        tokenpersec = f'{generated.perf_metrics.get_throughput().mean:.2f}'

        # Qwen3 wraps its reasoning in <think>...</think> tags; split it from
        # the final answer so the GUI can display the two parts separately.
        match = re.search(r"<think>(.*?)</think>", full_response, re.DOTALL)
        if match:
            thinking = match.group(1).strip()
            answer = full_response[match.end():].strip()
        else:
            thinking = ""
            answer = full_response

        return tokenpersec, thinking, answer
    except Exception as e:
        return "Error", "Error", f"An error occurred while generating the response: {e}"


# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="Prompt"),
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Thinking"),
        gr.Textbox(label="Response"),
    ],
    title="Qwen3-0.6B-int4-ov",
    description="An inference app based on Qwen3-0.6B-int4-ov, with thinking-process separation and a GUI.",
)

if __name__ == "__main__":
    demo.launch()
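# To run (assumed file name app.py):
#   pip install huggingface_hub openvino-genai gradio
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.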