Spaces:

Zakia
/

deepseek-r1-demo

Sleeping

File size: 1,432 Bytes

f7a736e
 
2fa9a9c
f7a736e
cda3c49
2fa9a9c
f7a736e
 
 
 
cda3c49
 
 
 
 
 
 
 
f7a736e
 
2fa9a9c
cda3c49
f7a736e
2fa9a9c
f7a736e
cda3c49
f7a736e
 
 
 
 
 
 
 
 
 
 
cda3c49
2fa9a9c
f7a736e
 
 
2fa9a9c

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Distilled, Llama-architecture DeepSeek-R1 checkpoint served by this app.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Load tokenizer.
# NOTE: trust_remote_code is deliberately left at its default (False). This
# checkpoint uses the standard Llama architecture, so no repository-supplied
# Python code is needed; enabling the flag would only allow arbitrary code
# from the Hub repo to execute at load time.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 4-bit bitsandbytes quantization so the 8B model fits in limited GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit instead of 8-bit
    bnb_4bit_compute_dtype=torch.float16,  # Run the matmuls in FP16
    bnb_4bit_use_double_quant=True,  # Also quantize the quantization constants
)

# Load the quantized model; device_map="auto" lets the loader decide where the
# weights are placed across the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

# Define text generation function
def generate_response(prompt):
    """Generate a continuation for ``prompt`` and return the decoded text.

    The prompt is tokenized, moved to the device the model lives on, and
    passed to ``model.generate``. The first returned sequence is decoded with
    special tokens stripped, so the result is the prompt followed by the
    newly generated text.

    Args:
        prompt: Text entered by the user.

    Returns:
        The decoded output sequence as a string.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Budget only the *new* tokens. ``max_length`` counts the prompt as
        # well, so a prompt close to (or over) the limit would leave no room
        # for a reply, or be rejected outright by ``generate``.
        output = model.generate(**inputs, max_new_tokens=150)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Build the Gradio UI: one text input wired to generate_response, one text output.
prompt_box = gr.Textbox(label="Enter your prompt")
response_box = gr.Textbox(label="AI Response")

interface = gr.Interface(
    fn=generate_response,
    inputs=prompt_box,
    outputs=response_box,
    title="DeepSeek-R1 Distill LLaMA Chatbot",
    description="Enter a prompt and receive a response from DeepSeek-R1-Distill-Llama-8B.",
)

# Start serving the app.
interface.launch()