maxtest01 / app.py
import gradio as gr
import openvino_genai as ov_genai
import queue
import threading
import time
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
import nncf
from llama_index.core import SimpleDirectoryReader
from rank_bm25 import BM25Okapi
import jieba
import subprocess
import os
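# Create the data directory and download the CWA 30-day tide forecast page for retrieval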
os.makedirs("./data/", exist_ok=True)
url = "https://www.cwa.gov.tw/V8/C/M/Fishery/tide_30day_MOD/T000311.html"
output_dir = "./data/"
cmd = ["wget", "-P", output_dir, url]
try:
    subprocess.run(cmd, check=True)
    print("Download succeeded")
except subprocess.CalledProcessError as e:
    print("Download failed:", e)
import huggingface_hub as hf_hub
# Initialize the OpenVINO model
#model_id = "hsuwill000/BitCPM4-1B_int4_ov"
model_id = "hsuwill000/MiniCPM3-4B_int4_ov"
model_path = "ov"
hf_hub.snapshot_download(model_id, local_dir=model_path)
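# Generation (sampling) parameters for the LLM pipeline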
config = ov_genai.GenerationConfig()
config.max_new_tokens = 4096
config.top_p = 0.9
config.top_k = 30
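# Build the OpenVINO GenAI pipeline on CPU and reapply the tokenizer's built-in chat template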
pipe = ov_genai.LLMPipeline(model_path, "CPU")
pipe.get_tokenizer().set_chat_template(pipe.get_tokenizer().chat_template)
# Load documents from the ./data directory (multiple files supported)
documents = SimpleDirectoryReader("./data").load_data()
texts = [doc.get_content() for doc in documents]
# Tokenize with jieba and build a BM25 index
tokenized_corpus = [list(jieba.cut(text)) for text in texts]
bm25 = BM25Okapi(tokenized_corpus)
def start_chat():
    pipe.start_chat()
    return "✅ 開始對話!"

def finish_chat():
    pipe.finish_chat()
    return "🛑 結束對話!"
# Inference function: stream tokens via a streamer and yield incremental results
def generate_stream(prompt):
    prompt = prompt  # + " /no_think" + " 答案短且明瞭"
    tokenized_query = list(jieba.cut(prompt))

    # Use BM25 to retrieve the top-k most relevant document passages
    top_k = 1
    doc_scores = bm25.get_scores(tokenized_query)
    top_k_indices = sorted(range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True)[:top_k]
    retrieved_texts = [texts[i] for i in top_k_indices]

    print("=== Retrieved passages ===")
    for i, txt in enumerate(retrieved_texts, 1):
        print(f"--- Passage {i} ---\n{txt}\n")

    # Build the prompt from the retrieved passages only, instead of pasting the full text
    context = "\n\n".join(retrieved_texts)
    final_prompt = f"根據以下資訊,請簡潔回答問題:\n{context}\n\n問題:{prompt}\n回答:"
    print("=== Final prompt ===")
    print(final_prompt)

    q = queue.Queue()
    tps_result = ""

    def streamer(subword):
        print(subword, end='', flush=True)
        q.put(subword)
        return ov_genai.StreamingStatus.RUNNING

    def worker():
        # Run inference in a background thread
        nonlocal tps_result
        gen_result = pipe.generate([final_prompt], streamer=streamer, config=config)
        tps = gen_result.perf_metrics.get_throughput().mean
        tps_result = f"{tps:.2f} tokens/s"
        q.put(None)  # sentinel marking the end of the stream

    threading.Thread(target=worker).start()

    result = ""
    while True:
        token = q.get()
        if token is None:
            break
        result += token
        yield result, ""  # push the incremental result to the output textbox
    yield result, tps_result
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 OpenVINO Streaming Demo with Gradio Textbox")

    with gr.Row():
        with gr.Column():
            start_btn = gr.Button("開始對話")
            end_btn = gr.Button("結束對話")
            status_box = gr.Textbox(label="狀態", interactive=False)
            TPS_box = gr.Textbox(label="TPS", interactive=False)

    with gr.Row():
        textbox_input = gr.Textbox(label="Prompt", lines=1, placeholder="Enter prompt here...")
        button = gr.Button("Submit")

    textbox_output = gr.Markdown(label="robot answer:", elem_id="scroll_output")

    start_btn.click(fn=start_chat, outputs=status_box)
    end_btn.click(fn=finish_chat, outputs=status_box)

    # When the button is clicked, call generate_stream and stream updates into textbox_output
    button.click(fn=generate_stream, inputs=textbox_input, outputs=[textbox_output, TPS_box])
demo.launch()