import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime
import gc
import os
# Enable memory efficient options
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
# Set page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
    layout="wide",
)
# Initialize session state
if 'messages' not in st.session_state:
    st.session_state.messages = []
if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False
@st.cache_resource(show_spinner=False)
def load_model_and_tokenizer():
    try:
        model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"

        with st.spinner("🔄 Loading tokenizer..."):
            # Load tokenizer first
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                trust_remote_code=True
            )

        with st.spinner("🔄 Loading model... (this may take a few minutes on CPU)"):
            # Load the model in float32 on CPU; bitsandbytes 8-bit quantization
            # (load_in_8bit=True) requires a CUDA device, so it is omitted here
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": "cpu"},
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float32
            )

        # Force CPU mode and eval mode
        model = model.to("cpu").eval()

        # Clear memory after loading
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        st.session_state.model_loaded = True
        return tokenizer, model
    except Exception as e:
        st.error(f"❌ Error loading model: {str(e)}")
        return None, None
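
# Optional sketch (not called anywhere by default): CPU-side dynamic int8
# quantization with torch.quantization.quantize_dynamic as an alternative to
# GPU-only bitsandbytes quantization. quantize_model_for_cpu is a hypothetical
# helper name; output quality and compatibility with generate() should be
# verified before enabling it.
def quantize_model_for_cpu(model):
    # Only nn.Linear weights are converted to int8; activations remain float32
    return torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )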

def generate_response(prompt, model, tokenizer, max_length=256):
    try:
        # Clear memory before generation
        gc.collect()

        # Tokenize with a short maximum input length
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).to("cpu")

        # Generate with minimal parameters for CPU
        with torch.no_grad(), st.spinner("🤔 Thinking... (please be patient)"):
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                num_beams=1  # Disable beam search (early_stopping only applies to beam search)
            )

        # Clear memory after generation
        gc.collect()

        # Decode only the newly generated tokens, not the echoed prompt
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    except torch.cuda.OutOfMemoryError:
        st.error("💾 Memory exceeded. Try reducing the maximum length.")
        return None
    except Exception as e:
        st.error(f"❌ Error: {str(e)}")
        return None
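
# Optional sketch (not wired into the app): Qwen2.5 instruct checkpoints ship a
# chat template, so the prompt could also be built with
# tokenizer.apply_chat_template instead of the plain "Human:/Assistant:" string
# used below; build_chat_prompt is a hypothetical helper name.
def build_chat_prompt(tokenizer, user_message):
    messages = [{"role": "user", "content": user_message}]
    # Returns a string prompt that ends with the assistant turn marker
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
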
# Main UI
st.title("💬 Qwen2.5-Coder Chat")
# Sidebar with minimal settings
with st.sidebar:
    st.header("⚙️ Settings")

    max_length = st.slider(
        "Response Length 📏",
        min_value=64,
        max_value=512,
        value=256,
        step=64,
        help="Shorter lengths are recommended for CPU"
    )

    if st.button("🗑️ Clear Conversation"):
        st.session_state.messages = []
        st.rerun()
# Load model (st.cache_resource returns the cached instance after the first load)
tokenizer, model = load_model_and_tokenizer()
if model is None:
    st.stop()
# Display conversation history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(f"{message['content']}\n\n_{message['timestamp']}_")
# Chat input
if prompt := st.chat_input("💭 Ask me anything about coding..."):
    # Add user message
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })

    # Display user message
    with st.chat_message("user"):
        st.markdown(f"{prompt}\n\n_{timestamp}_")

    # Generate and display response
    with st.chat_message("assistant"):
        # Keep only the last message for context to reduce memory usage
        conversation = f"Human: {prompt}\nAssistant:"

        response = generate_response(
            conversation,
            model,
            tokenizer,
            max_length=max_length
        )

        if response:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            st.markdown(f"{response}\n\n_{timestamp}_")

            # Add response to chat history
            st.session_state.messages.append({
                "role": "assistant",
                "content": response,
                "timestamp": timestamp
            })
        else:
            st.error("❌ Failed to generate response. Please try again with a shorter length.")

    # Clear memory after response
    gc.collect()