# qwen2.5 / app.py
import os

# Set the Hugging Face cache directory before importing transformers, since
# the library reads this variable at import time (newer versions prefer HF_HOME)
os.environ["TRANSFORMERS_CACHE"] = "/root/.cache/huggingface"

import datetime

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Page configuration
st.set_page_config(
    page_title="💬 Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide"
)
# Initialize session state for conversation history
if 'messages' not in st.session_state:
st.session_state.messages = []
# Cache model loading so it is not repeated on every rerun or session
@st.cache_resource
def load_model_and_tokenizer():
model_name = "Qwen/Qwen2.5-Coder-3B-Instruct" # Smaller 3B model for efficiency
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True
)
# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
st.info(f"Using device: {device}")
# Load model (float32 on CPU, float16 on CUDA)
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float32 if device == "cpu" else torch.float16,
device_map="auto" if device == "cuda" else {"": device},
trust_remote_code=True,
low_cpu_mem_usage=True # Reduce memory usage for CPU
)
return tokenizer, model
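
# Optional: on a memory-constrained GPU, the model could instead be loaded in
# 4-bit via bitsandbytes. A minimal sketch, not wired into this app; it assumes
# the `bitsandbytes` package is installed and a CUDA device is available:
def load_model_4bit(model_name="Qwen/Qwen2.5-Coder-3B-Instruct"):
    from transformers import BitsAndBytesConfig
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    )
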
# Title
st.title("πŸ’¬ Qwen2.5-Coder Chat")
# Sidebar settings
with st.sidebar:
st.header("Settings")
max_length = st.slider(
"Maximum Length",
min_value=64,
max_value=1024, # Lowered for CPU
value=256, # Default setting for CPU
step=64,
help="Maximum number of tokens to generate"
)
    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=1.5,
        value=0.5,  # Conservative default: code generation benefits from lower temperature
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )
top_p = st.slider(
"Top P",
min_value=0.1,
max_value=1.0,
value=0.8,
step=0.1,
help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
)
if st.button("Clear Conversation"):
st.session_state.messages = []
st.rerun()
# Load model with caching
try:
with st.spinner("Loading model... Please wait..."):
tokenizer, model = load_model_and_tokenizer()
except Exception as e:
st.error(f"Error loading model: {str(e)}")
st.stop()
# Response generation function
def generate_response(prompt, max_new_tokens=256, temperature=0.5, top_p=0.8):
"""Generate response from the model"""
try:
# Tokenize the input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Generate response
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
do_sample=True,
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,  # fall back to EOS if no pad token is set
eos_token_id=tokenizer.eos_token_id,
)
        # Decode only the newly generated tokens; slicing token ids is more
        # robust than slicing the decoded string by len(prompt), since
        # detokenization does not always round-trip the prompt exactly
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
except Exception as e:
st.error(f"Error generating response: {str(e)}")
return None
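
# Note: Qwen2.5-Coder-Instruct is trained on a specific chat format, so
# building the prompt with the tokenizer's chat template would likely work
# better than the plain "Human:/Assistant:" transcript assembled below.
# A minimal sketch, not wired into the UI; it assumes the same role/content
# message dicts stored in session state:
def build_chat_prompt(messages):
    chat = [{"role": m["role"], "content": m["content"]} for m in messages]
    return tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
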
# Display conversation history
for message in st.session_state.messages[-5:]: # Limit to last 5 messages for efficiency
with st.chat_message(message["role"]):
st.write(f"{message['content']}\n\n_{message['timestamp']}_")
# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
# Add user message
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.session_state.messages.append({
"role": "user",
"content": prompt,
"timestamp": timestamp
})
# Display user message
with st.chat_message("user"):
st.write(f"{prompt}\n\n_{timestamp}_")
# Generate and display response
with st.chat_message("assistant"):
with st.spinner("Thinking..."):
# Prepare conversation context, limited to recent exchanges
conversation = "\n".join(
f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
for msg in st.session_state.messages[-3:] # Send only the last 3 messages
) + "\nAssistant:"
response = generate_response(
conversation,
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p
)
if response:
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
st.write(f"{response}\n\n_{timestamp}_")
# Add response to chat history
st.session_state.messages.append({
"role": "assistant",
"content": response,
"timestamp": timestamp
})
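
# Optional: responses could be streamed token-by-token instead of rendered all
# at once. A minimal sketch using transformers' TextIteratorStreamer together
# with st.write_stream (not wired into the UI; assumes the tokenizer and model
# loaded above):
def stream_response(prompt, max_new_tokens=256):
    from threading import Thread
    from transformers import TextIteratorStreamer
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Run generation in a background thread while the streamer yields text
    Thread(
        target=model.generate,
        kwargs=dict(**inputs, max_new_tokens=max_new_tokens, streamer=streamer),
    ).start()
    # st.write_stream consumes the iterator and returns the full response text
    return st.write_stream(streamer)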