Avinash109 committed
Commit 6f080ab Β· verified Β· 1 Parent(s): 59fb13e

Update app.py

Files changed (1):
  app.py +116 -103
app.py CHANGED
@@ -1,129 +1,139 @@
- import os
  import streamlit as st
  import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import datetime

- # Page configuration
+ # Set page configuration
  st.set_page_config(
-     page_title="πŸ’¬ Qwen2.5-Coder Chat",
+     page_title="Qwen2.5-Coder Chat",
      page_icon="πŸ’¬",
      layout="wide"
  )

- # Set cache directory explicitly for Hugging Face Spaces
- os.environ["TRANSFORMERS_CACHE"] = "/root/.cache/huggingface"
-
- # Initialize session state for conversation history
+ # Initialize session state
  if 'messages' not in st.session_state:
      st.session_state.messages = []

- # Cache model loading to prevent re-loading each session
  @st.cache_resource
  def load_model_and_tokenizer():
-     model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"  # Smaller 3B model for efficiency
-
-     # Load tokenizer
-     tokenizer = AutoTokenizer.from_pretrained(
-         model_name,
-         trust_remote_code=True
-     )
-
-     # Device configuration
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     st.info(f"Using device: {device}")
-
-     # Load model with optimizations for CPU
-     model = AutoModelForCausalLM.from_pretrained(
-         model_name,
-         torch_dtype=torch.float32 if device == "cpu" else torch.float16,
-         device_map="auto" if device == "cuda" else {"": device},
-         trust_remote_code=True,
-         low_cpu_mem_usage=True  # Reduce memory usage for CPU
-     )
-
-     return tokenizer, model
-
- # Title
+     try:
+         # Display loading message
+         with st.spinner("πŸ”„ Loading model and tokenizer... This might take a few minutes..."):
+             model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
+
+             # Load tokenizer first
+             tokenizer = AutoTokenizer.from_pretrained(
+                 model_name,
+                 trust_remote_code=True
+             )
+
+             # Determine device and display info
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             st.info(f"πŸ’» Using device: {device}")
+
+             # Load model with appropriate settings
+             if device == "cuda":
+                 model = AutoModelForCausalLM.from_pretrained(
+                     model_name,
+                     torch_dtype=torch.float16,  # Use float16 for GPU
+                     device_map="auto",
+                     trust_remote_code=True
+                 ).eval()  # Set to evaluation mode
+             else:
+                 model = AutoModelForCausalLM.from_pretrained(
+                     model_name,
+                     device_map={"": device},
+                     trust_remote_code=True,
+                     low_cpu_mem_usage=True
+                 ).eval()  # Set to evaluation mode
+
+             return tokenizer, model
+     except Exception as e:
+         st.error(f"❌ Error loading model: {str(e)}")
+         raise e
+
+ def generate_response(prompt, model, tokenizer, max_new_tokens=512, temperature=0.7, top_p=0.9):
+     """Generate response from the model with better error handling"""
+     try:
+         # Tokenize input
+         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+         # Generate response with progress bar
+         with torch.no_grad(), st.spinner("πŸ€” Thinking..."):
+             outputs = model.generate(
+                 **inputs,
+                 max_new_tokens=max_new_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 do_sample=True,
+                 pad_token_id=tokenizer.pad_token_id,
+                 eos_token_id=tokenizer.eos_token_id,
+                 repetition_penalty=1.1,
+                 no_repeat_ngram_size=3
+             )
+
+         # Decode and return response
+         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         return response[len(prompt):].strip()
+
+     except torch.cuda.OutOfMemoryError:
+         st.error("πŸ’Ύ GPU memory exceeded. Try reducing the maximum length or clearing the conversation.")
+         return None
+     except Exception as e:
+         st.error(f"❌ Error generating response: {str(e)}")
+         return None
+
+ # Main UI
  st.title("πŸ’¬ Qwen2.5-Coder Chat")

  # Sidebar settings
  with st.sidebar:
-     st.header("Settings")
+     st.header("βš™οΈ Settings")

+     # Model settings
      max_length = st.slider(
-         "Maximum Length",
+         "Maximum Length πŸ“",
          min_value=64,
-         max_value=1024,  # Lowered for CPU
-         value=256,  # Default setting for CPU
-         step=64,
-         help="Maximum number of tokens to generate"
+         max_value=2048,
+         value=512,
+         step=64
      )

      temperature = st.slider(
-         "Temperature",
+         "Temperature 🌑️",
          min_value=0.1,
-         max_value=1.5,  # Lower range to make output more deterministic
-         value=0.5,
-         step=0.1,
-         help="Higher values make output more random, lower values more deterministic"
+         max_value=2.0,
+         value=0.7,
+         step=0.1
      )

      top_p = st.slider(
-         "Top P",
+         "Top P πŸ“Š",
          min_value=0.1,
          max_value=1.0,
-         value=0.8,
-         step=0.1,
-         help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
+         value=0.9,
+         step=0.1
      )

-     if st.button("Clear Conversation"):
+     # Clear conversation button
+     if st.button("πŸ—‘οΈ Clear Conversation"):
          st.session_state.messages = []
          st.rerun()

- # Load model with caching
+ # Load model
  try:
-     with st.spinner("Loading model... Please wait..."):
-         tokenizer, model = load_model_and_tokenizer()
+     tokenizer, model = load_model_and_tokenizer()
  except Exception as e:
-     st.error(f"Error loading model: {str(e)}")
+     st.error("❌ Failed to load model. Please check the logs and refresh the page.")
      st.stop()

- # Response generation function
- def generate_response(prompt, max_new_tokens=256, temperature=0.5, top_p=0.8):
-     """Generate response from the model"""
-     try:
-         # Tokenize the input
-         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-         # Generate response
-         with torch.no_grad():
-             outputs = model.generate(
-                 **inputs,
-                 max_new_tokens=max_new_tokens,
-                 temperature=temperature,
-                 top_p=top_p,
-                 do_sample=True,
-                 pad_token_id=tokenizer.pad_token_id,
-                 eos_token_id=tokenizer.eos_token_id,
-             )
-
-         # Decode and return response
-         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-         return response[len(prompt):].strip()  # Extract only the model's response
-
-     except Exception as e:
-         st.error(f"Error generating response: {str(e)}")
-         return None
-
  # Display conversation history
- for message in st.session_state.messages[-5:]:  # Limit to last 5 messages for efficiency
+ for message in st.session_state.messages:
      with st.chat_message(message["role"]):
-         st.write(f"{message['content']}\n\n_{message['timestamp']}_")
+         st.markdown(f"{message['content']}\n\n_{message['timestamp']}_")

  # Chat input
- if prompt := st.chat_input("Ask me anything about coding..."):
+ if prompt := st.chat_input("πŸ’­ Ask me anything about coding..."):
      # Add user message
      timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
      st.session_state.messages.append({
@@ -134,31 +144,34 @@ if prompt := st.chat_input("Ask me anything about coding..."):

      # Display user message
      with st.chat_message("user"):
-         st.write(f"{prompt}\n\n_{timestamp}_")
+         st.markdown(f"{prompt}\n\n_{timestamp}_")

      # Generate and display response
      with st.chat_message("assistant"):
-         with st.spinner("Thinking..."):
-             # Prepare conversation context, limited to recent exchanges
-             conversation = "\n".join(
-                 f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
-                 for msg in st.session_state.messages[-3:]  # Send only the last 3 messages
-             ) + "\nAssistant:"
-
-             response = generate_response(
-                 conversation,
-                 max_new_tokens=max_length,
-                 temperature=temperature,
-                 top_p=top_p
-             )
-
-             if response:
-                 timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                 st.write(f"{response}\n\n_{timestamp}_")
-
-                 # Add response to chat history
-                 st.session_state.messages.append({
-                     "role": "assistant",
-                     "content": response,
-                     "timestamp": timestamp
-                 })
+         # Prepare conversation context (limit to last 3 messages to prevent context overflow)
+         conversation = "\n".join(
+             f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
+             for msg in st.session_state.messages[-3:]
+         ) + "\nAssistant:"
+
+         response = generate_response(
+             conversation,
+             model,
+             tokenizer,
+             max_new_tokens=max_length,
+             temperature=temperature,
+             top_p=top_p
+         )
+
+         if response:
+             timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+             st.markdown(f"{response}\n\n_{timestamp}_")
+
+             # Add response to chat history
+             st.session_state.messages.append({
+                 "role": "assistant",
+                 "content": response,
+                 "timestamp": timestamp
+             })
+         else:
+             st.error("❌ Failed to generate response. Please try again with different settings.")