import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
import os

MODEL_NAMES = {
    "DeepSeek-R1-Distill-Qwen-7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "DeepSeek-R1-Distill-Llama-8B": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
}

HF_TOKEN = os.getenv("HF_TOKEN")


def load_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, token=HF_TOKEN)
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=HF_TOKEN)
    if hasattr(config, "quantization_config"):
        del config.quantization_config  # Drop the quantization config to avoid loading FP8 weights
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        config=config,
        trust_remote_code=True,
        token=HF_TOKEN,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return model, tokenizer


current_model_name = "DeepSeek-R1-Distill-Llama-8B"
current_model, current_tokenizer = load_model(MODEL_NAMES[current_model_name])


def chat(message, history, model_name):
    global current_model, current_tokenizer, current_model_name
    # Reload only when the dropdown selection differs from the model already in memory.
    if model_name != current_model_name:
        current_model, current_tokenizer = load_model(MODEL_NAMES[model_name])
        current_model_name = model_name
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Build a chat-formatted prompt from the running history plus the new user message.
    messages = history + [{"role": "user", "content": message}]
    inputs = current_tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(device)
    outputs = current_model.generate(inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, not the echoed prompt.
    response = current_tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    return response


with gr.Blocks() as app:
    gr.Markdown("## Chatbot with DeepSeek Models")
    with gr.Row():
        # Create the dropdown first so it can be wired into the ChatInterface below.
        model_selector = gr.Dropdown(
            choices=list(MODEL_NAMES.keys()),
            value="DeepSeek-R1-Distill-Llama-8B",
            label="Select Model",
        )
        chat_interface = gr.ChatInterface(
            chat,
            type="messages",
            flagging_mode="manual",
            save_history=True,
            additional_inputs=[model_selector],  # Passes the selected model name to chat()
        )

app.launch()