import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime

# Page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide"
)

# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []
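# Each entry in st.session_state.messages is a dict with "role", "content", and "timestamp" keys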

# Cache the model loading so the checkpoint is only loaded once per session
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"  # Using smaller 7B model

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # Determine device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    st.info(f"Using device: {device}")

    # Load model with appropriate settings for CPU/GPU
    if device == "cuda":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map={"": device},
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
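        # Note: full-precision (float32) weights are used on CPU because
        # float16 inference is generally not well supported on CPU backends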

    return tokenizer, model

# Main title
st.title("💬 Qwen2.5-Coder Chat")

# Sidebar settings
with st.sidebar:
    st.header("Settings")

    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=2048,  # Reduced for CPU usage
        value=512,
        step=64,
        help="Maximum number of tokens to generate"
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )

    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.9,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
    )

    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model with error handling
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()
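
# Everything below only runs if the model loaded successfully,
# since st.stop() halts the script on failure.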

def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Generate a response from the model for the given prompt."""
    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (everything after the prompt)
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        return response

    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")

# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message to chat
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")

    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Prepare conversation history
            conversation = ""
            for msg in st.session_state.messages:
                if msg["role"] == "user":
                    conversation += f"Human: {msg['content']}\n"
                else:
                    conversation += f"Assistant: {msg['content']}\n"
            conversation += "Assistant:"
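            # The trailing "Assistant:" cues the model to produce the next assistant turn
            # (plain Human:/Assistant: formatting is used here instead of the tokenizer's chat template)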

            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )

            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")

                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })
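
# To try this locally (assuming the file is saved as app.py and streamlit,
# torch, transformers, and accelerate are installed):
#     streamlit run app.py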