import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime
# Page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide"
)
# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []
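# (st.session_state persists across Streamlit reruns within a browser session,
# so the chat history survives each widget interaction.)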
# Cache the model loading
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"  # smaller 7B variant keeps memory needs manageable

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # Determine device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    st.info(f"Using device: {device}")

    # Load model with appropriate settings for CPU/GPU
    if device == "cuda":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # half precision halves GPU memory use
            device_map="auto",
            trust_remote_code=True
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # CPUs generally lack fast float16 kernels
            device_map={"": device},
            trust_remote_code=True,
            low_cpu_mem_usage=True  # load weights incrementally to cap peak RAM
        )

    return tokenizer, model
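
# Note: @st.cache_resource keeps one tokenizer/model pair alive across reruns
# and sessions, so the weights are loaded once per process. As a rough,
# unmeasured estimate, a 7B-parameter model needs ~14 GB of GPU memory in
# float16 and ~28 GB of RAM in float32 on CPU, plus working overhead.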
# Main title
st.title("💬 Qwen2.5-Coder Chat")
# Sidebar settings
with st.sidebar:
    st.header("Settings")

    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=2048,  # capped to keep CPU generation times reasonable
        value=512,
        step=64,
        help="Maximum number of tokens to generate"
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Higher values make output more random; lower values make it more deterministic"
    )

    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.9,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens; lower values are more focused"
    )

    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()
# Load model with error handling
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()
def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Generate a response from the model for the given prompt."""
    try:
        # Tokenize input and move it to the model's device
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate response; fall back to the EOS token for padding,
        # since some tokenizers (including Qwen's) define no pad token
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(prompt) is fragile, since skip_special_tokens shifts offsets.
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        return response

    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None
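
# A possible refinement: stream tokens as they are generated for a more
# responsive UI. A minimal sketch, assuming transformers' TextIteratorStreamer
# and Streamlit's st.write_stream are available in the installed versions:
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   streamer = TextIteratorStreamer(tokenizer, skip_prompt=True,
#                                   skip_special_tokens=True)
#   Thread(target=model.generate,
#          kwargs={**inputs, "max_new_tokens": max_new_tokens,
#                  "streamer": streamer}).start()
#   st.write_stream(streamer)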
# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")
# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message to chat
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")

    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Flatten the conversation history into a plain-text prompt
            conversation = ""
            for msg in st.session_state.messages:
                if msg["role"] == "user":
                    conversation += f"Human: {msg['content']}\n"
                else:
                    conversation += f"Assistant: {msg['content']}\n"
            conversation += "Assistant:"
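            # Note: Qwen2.5-Coder-Instruct ships a chat template, which usually
            # handles multi-turn prompts better than this plain Human:/Assistant:
            # transcript. A sketch using the standard transformers API:
            #   chat = [{"role": m["role"], "content": m["content"]}
            #           for m in st.session_state.messages]
            #   conversation = tokenizer.apply_chat_template(
            #       chat, tokenize=False, add_generation_prompt=True)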
            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )

            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")

                # Add assistant response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })