import os
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime

# Page configuration
st.set_page_config(
    page_title="💬 Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide"
)

# Set cache directory explicitly for Hugging Face Spaces
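# Note: newer transformers releases prefer the HF_HOME variable; TRANSFORMERS_CACHE
# still works but may emit a deprecation warning.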
os.environ["TRANSFORMERS_CACHE"] = "/root/.cache/huggingface"

# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []

# Cache model loading to prevent re-loading each session
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"  # Smaller 3B model for efficiency

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )

    # Device configuration
    device = "cuda" if torch.cuda.is_available() else "cpu"
    st.info(f"Using device: {device}")

    # Load model with optimizations for CPU
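    # device_map={"": device} keeps the whole model on a single device; "auto" lets
    # accelerate spread it across available GPUs.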
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32 if device == "cpu" else torch.float16,
        device_map="auto" if device == "cuda" else {"": device},
        trust_remote_code=True,
        low_cpu_mem_usage=True  # Reduce memory usage for CPU
    )
    return tokenizer, model

# Title
st.title("💬 Qwen2.5-Coder Chat")

# Sidebar settings
with st.sidebar:
    st.header("Settings")

    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=1024,  # Lowered for CPU
        value=256,  # Default setting for CPU
        step=64,
        help="Maximum number of tokens to generate"
    )

    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=1.5,  # Capped so sampling never becomes overly random
        value=0.5,
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )

    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.8,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
    )

    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model with caching
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()

# Response generation function
def generate_response(prompt, max_new_tokens=256, temperature=0.5, top_p=0.8):
    """Generate response from the model"""
    try:
        # Tokenize the input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode and return only the model's response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response[len(prompt):].strip()  # Strip the echoed prompt
    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None

# Display conversation history
for message in st.session_state.messages[-5:]:  # Limit to last 5 messages for efficiency
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")

# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")

    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Prepare conversation context, limited to recent exchanges
            conversation = "\n".join(
                f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
                for msg in st.session_state.messages[-3:]  # Send only the last 3 messages
            ) + "\nAssistant:"
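            # Note: this builds a plain "Human:/Assistant:" prompt rather than using the
            # model's chat template; tokenizer.apply_chat_template() would match
            # Qwen2.5-Instruct's expected conversation format more closely.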

            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )

            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")

                # Add response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })