import gradio as gr
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
# Load the model and tokenizer
model_id = "hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino"
print("Loading model...")
model = OVModelForCausalLM.from_pretrained(model_id, device_map="auto")
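# Hedged aside: device_map="auto" is a transformers-style argument; optimum-intel's
# OV models target OpenVINO devices directly (e.g. a device="CPU" kwarg or
# model.to("GPU")), which may be the more conventional way to pick a device here.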
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
def respond(prompt, history):
    # Build the chat messages
    messages = [
        # System prompt (in Chinese): "Reply in Chinese; answer the user's
        # question directly and concisely, within 1024 tokens."
        {"role": "system", "content": "使用中文,直接回答用戶的問題,盡量簡潔在1024 token內。"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    print("Chat template text:", text)
    # Convert the templated text into model inputs
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    print("Model inputs:", model_inputs)
    # Generate a response
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=2048,
        temperature=0.7,  # lower randomness
        top_p=0.9,        # nucleus sampling limits diversity
        do_sample=True    # enable sampling
    )
    print("Generated IDs:", generated_ids)
    # Decode the generated token IDs, slicing off the prompt tokens so that
    # only the newly generated portion remains
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print("Decoded response:", response)
    # Mark the <think>...</think> reasoning block with visible **THINK** markers
    # (the tags are replaced, not removed)
    response = response.replace("<think>", "**THINK**").replace("</think>", "**THINK**").strip()
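    # Untested sketch: to drop the reasoning block entirely instead of marking it,
    # a regex over the decoded text would work, assuming the tags appear in
    # matched pairs:
    # import re
    # response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()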
    # Return the response
    return response
# Set up the Gradio chat interface
demo = gr.ChatInterface(
    fn=respond,
    title="DeepSeek-R1-Distill-Qwen-1.5B-openvino",
    description="DeepSeek-R1-Distill-Qwen-1.5B-openvino"
)
if __name__ == "__main__":
    print("Launching Gradio app...")
    demo.launch(server_name="0.0.0.0", server_port=7860)
    # demo.launch()  # alternative: launch with default host/port
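# Minimal sketch of querying the running app programmatically (assumes the
# gradio_client package, that this file is hosted as a Space with the same id as
# the model, and Gradio's default "/chat" endpoint for ChatInterface):
# from gradio_client import Client
# client = Client("hsuwill000/DeepSeek-R1-Distill-Qwen-1.5B-openvino")
# print(client.predict("你好", api_name="/chat"))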