Update app.py
app.py
CHANGED
@@ -3,8 +3,7 @@ import os
 import threading
 import time
 from pathlib import Path
-from huggingface_hub import login
-
+from huggingface_hub import hf_hub_download

 # Try to import llama-cpp-python, fallback to instructions if not available
 try:
@@ -14,15 +13,14 @@ except ImportError:
     LLAMA_CPP_AVAILABLE = False
     print("llama-cpp-python not installed. Please install it with: pip install llama-cpp-python")

-hf_token = os.environ.get("HF_TOKEN")
-
-login(token = hf_token)
-
-
 # Global variables for model
 model = None
 model_loaded = False

+# HuggingFace repository information
+HF_REPO_ID = "Axcel1/MMed-llama-alpaca-Q4_K_M-GGUF"
+HF_FILENAME = "mmed-llama-alpaca-q4_k_m.gguf"
+
 def find_gguf_file(directory="."):
     """Find GGUF files in the specified directory"""
     gguf_files = []
@@ -32,11 +30,32 @@ def find_gguf_file(directory="."):
                 gguf_files.append(os.path.join(root, file))
     return gguf_files

+def download_model_from_hf(repo_id=HF_REPO_ID, filename=HF_FILENAME):
+    """Download GGUF model from HuggingFace Hub"""
+    try:
+        print(f"Downloading model from {repo_id}/{filename}...")
+        gguf_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            cache_dir="./models",
+            resume_download=True  # Resume partial downloads
+        )
+        print(f"Model downloaded to: {gguf_path}")
+        return gguf_path, None
+    except Exception as e:
+        error_msg = f"Error downloading model: {str(e)}"
+        print(error_msg)
+        return None, error_msg
+
 def get_optimal_settings():
     """Get optimal CPU threads and GPU layers automatically"""
     # Auto-detect CPU threads (use all available cores)
     n_threads = os.cpu_count()

+    # For Hugging Face Spaces, limit threads to avoid resource issues
+    if n_threads and n_threads > 4:
+        n_threads = 4
+
     # Auto-detect GPU layers (try to use GPU if available)
     n_gpu_layers = 0
     try:
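Note: a minimal standalone sketch (not part of this commit) of what the new download path does: it fetches the same GGUF file with huggingface_hub's hf_hub_download and prints the resulting local path. The repo and filename values are the constants added in the hunk above.

from huggingface_hub import hf_hub_download

REPO_ID = "Axcel1/MMed-llama-alpaca-Q4_K_M-GGUF"
FILENAME = "mmed-llama-alpaca-q4_k_m.gguf"

if __name__ == "__main__":
    # Downloads into ./models (or reuses a cached copy) and prints the local path.
    local_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, cache_dir="./models")
    print(local_path)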
@@ -52,59 +71,28 @@ def get_optimal_settings():

     return n_threads, n_gpu_layers

-def load_model_from_huggingface(repo_id, filename, n_ctx=2048):
-    """Load the model from Hugging Face"""
-    global model, model_loaded
-
-    if not LLAMA_CPP_AVAILABLE:
-        return False, "llama-cpp-python not installed. Please install it with: pip install llama-cpp-python"
-
-    try:
-        print(f"Loading model from Hugging Face: {repo_id}/{filename}")
-
-        # Get optimal settings automatically
-        n_threads, n_gpu_layers = get_optimal_settings()
-        print(f"Auto-detected settings: {n_threads} CPU threads, {n_gpu_layers} GPU layers")
-
-        # Load model from Hugging Face with optimized settings
-        model = Llama.from_pretrained(
-            repo_id=repo_id,
-            filename=filename,
-            n_ctx=n_ctx,  # Context window (configurable)
-            n_threads=n_threads,  # CPU threads (auto-detected)
-            n_gpu_layers=n_gpu_layers,  # Number of layers to offload to GPU (auto-detected)
-            verbose=False,
-            chat_format="chatml",  # Use Llama-3 chat format
-            n_batch=512,  # Batch size for prompt processing
-            use_mlock=True,  # Keep model in memory
-            use_mmap=True,  # Use memory mapping
-        )
-
-        model_loaded = True
-        print("Model loaded successfully!")
-        return True, f"✅ Model loaded successfully from {repo_id}/{filename}\n📊 Context: {n_ctx} tokens\n🖥️ CPU Threads: {n_threads}\n🎮 GPU Layers: {n_gpu_layers}"
-
-    except Exception as e:
-        model_loaded = False
-        error_msg = f"Error loading model: {str(e)}"
-        print(error_msg)
-        return False, f"❌ {error_msg}"
-
-def load_model_from_gguf(gguf_path=None, n_ctx=2048):
-    """Load the model from a local GGUF file with automatic optimization"""
+def load_model_from_gguf(gguf_path=None, n_ctx=2048, use_hf_download=True):
+    """Load the model from a GGUF file with automatic optimization"""
     global model, model_loaded

     if not LLAMA_CPP_AVAILABLE:
         return False, "llama-cpp-python not installed. Please install it with: pip install llama-cpp-python"

     try:
-        # If no path provided, try to find GGUF files automatically
+        # If no path provided, try different approaches
         if gguf_path is None:
-            gguf_files = find_gguf_file()
-            if not gguf_files:
-                return False, "No GGUF files found in the repository"
-            gguf_path = gguf_files[0]  # Use the first one found
-            print(f"Found GGUF file: {gguf_path}")
+            if use_hf_download:
+                # Try to download from HuggingFace first
+                gguf_path, error = download_model_from_hf()
+                if error:
+                    return False, f"❌ Failed to download from HuggingFace: {error}"
+            else:
+                # Try to find local GGUF files
+                gguf_files = find_gguf_file()
+                if not gguf_files:
+                    return False, "No GGUF files found in the repository"
+                gguf_path = gguf_files[0]  # Use the first one found
+                print(f"Found local GGUF file: {gguf_path}")

         # Check if file exists
         if not os.path.exists(gguf_path):
@@ -116,22 +104,22 @@ def load_model_from_gguf(gguf_path=None, n_ctx=2048):
         n_threads, n_gpu_layers = get_optimal_settings()
         print(f"Auto-detected settings: {n_threads} CPU threads, {n_gpu_layers} GPU layers")

-        # Load model with optimized settings
+        # Load model with optimized settings for Hugging Face Spaces
         model = Llama(
             model_path=gguf_path,
             n_ctx=n_ctx,  # Context window (configurable)
-            n_threads=n_threads,  # CPU threads (auto-detected)
-            n_gpu_layers=n_gpu_layers,  # Number of layers to offload to GPU (auto-detected)
+            n_threads=n_threads,  # CPU threads (limited for Spaces)
+            n_gpu_layers=n_gpu_layers,  # Number of layers to offload to GPU
             verbose=False,
             chat_format="llama-3",  # Use Llama-3 chat format
-            n_batch=512,  # Batch size for prompt processing
-            use_mlock=True,  # Keep model in memory
+            n_batch=256,  # Smaller batch size for Spaces
+            use_mlock=False,  # Disabled for Spaces compatibility
             use_mmap=True,  # Use memory mapping
         )

         model_loaded = True
         print("Model loaded successfully!")
-        return True, f"✅ Model loaded successfully from {os.path.basename(gguf_path)}\n📊 Context: {n_ctx} tokens\n🖥️ CPU Threads: {n_threads}\n🎮 GPU Layers: {n_gpu_layers}"
+        return True, f"✅ Model loaded successfully from {os.path.basename(gguf_path)}\n📊 Context: {n_ctx} tokens\n🖥️ CPU Threads: {n_threads}\n🎮 GPU Layers: {n_gpu_layers}\n📦 Source: {HF_REPO_ID}"

     except Exception as e:
         model_loaded = False
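Note: a minimal usage sketch (not from the commit) of the llama-cpp-python constructor arguments the Space now uses; MODEL_PATH is a placeholder and assumes a GGUF file has already been downloaded.

from llama_cpp import Llama

MODEL_PATH = "./models/mmed-llama-alpaca-q4_k_m.gguf"  # placeholder path

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=4,           # capped, as in the new get_optimal_settings()
    n_gpu_layers=0,        # CPU-only fallback
    chat_format="llama-3",
    n_batch=256,
    use_mlock=False,
    use_mmap=True,
    verbose=False,
)

# Stream a single chat completion.
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=64,
    stream=True,
):
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        print(delta["content"], end="", flush=True)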
@@ -202,34 +190,39 @@ def clear_chat():
     """Clear the chat history"""
     return [], ""

-def load_model_interface(source_type, repo_id, filename, gguf_path, context_size):
+def load_model_interface(context_size, use_hf_download):
     """Interface function to load model with configurable context size"""
-    if source_type == "Hugging Face":
-        success, message = load_model_from_huggingface(repo_id, filename, n_ctx=int(context_size))
-    else:  # Local file
-        success, message = load_model_from_gguf(gguf_path, n_ctx=int(context_size))
+    success, message = load_model_from_gguf(gguf_path=None, n_ctx=int(context_size), use_hf_download=use_hf_download)
     return message

 def get_available_gguf_files():
     """Get list of available GGUF files"""
     gguf_files = find_gguf_file()
     if not gguf_files:
-        return ["No GGUF files found"]
+        return ["No local GGUF files found"]
     return [os.path.basename(f) for f in gguf_files]

+def check_model_availability():
+    """Check if model is available locally or needs to be downloaded"""
+    local_files = find_gguf_file()
+    if local_files:
+        return f"Local GGUF files found: {len(local_files)}"
+    else:
+        return "No local GGUF files found. Will download from HuggingFace."
+
 # Create the Gradio interface
 def create_interface():
-    # Find available GGUF files
-    gguf_files = find_gguf_file()
-    gguf_choices = [os.path.basename(f) for f in gguf_files] if gguf_files else ["No GGUF files found"]
+    # Check for available models
+    availability_status = check_model_availability()

-    with gr.Blocks(title="Llama-
+    with gr.Blocks(title="MMed-Llama-Alpaca GGUF Chatbot", theme=gr.themes.Soft()) as demo:
         gr.HTML("""
             <h1 style="text-align: center; color: #2E86AB; margin-bottom: 30px;">
                 🦙 MMed-Llama-Alpaca GGUF Chatbot
             </h1>
             <p style="text-align: center; color: #666; margin-bottom: 30px;">
-                Chat with the MMed-Llama-Alpaca model (Q4_K_M quantized) for medical assistance
+                Chat with the MMed-Llama-Alpaca model (Q4_K_M quantized) for medical assistance!<br>
+                <strong>⚠️ This is for educational purposes only. Always consult healthcare professionals for medical advice.</strong>
             </p>
         """)

@@ -246,7 +239,7 @@ def create_interface():

             with gr.Row():
                 msg = gr.Textbox(
-                    placeholder="Type your
+                    placeholder="Type your medical question here...",
                     container=False,
                     scale=7,
                     show_label=False
@@ -259,59 +252,30 @@ def create_interface():
                 gr.HTML("<h3>🔧 Model Control</h3>")

                 # Model source selection
-                source_type = gr.Radio(
-                    choices=["Hugging Face", "Local File"],
-                    value="Hugging Face",
-                    label="Model Source",
-                    info="Choose where to load the model from"
+                use_hf_download = gr.Checkbox(
+                    value=True,
+                    label="Download from HuggingFace",
+                    info="Uncheck to use local GGUF files"
                 )

-                # Hugging Face settings
-                with gr.Group(visible=True) as hf_group:
-                    gr.HTML("<h4>🤗 Hugging Face Settings</h4>")
-                    repo_id = gr.Textbox(
-                        value="Axcel1/MMed-llama-alpaca-Q4_K_M-GGUF",
-                        label="Repository ID",
-                        info="e.g., username/repo-name"
-                    )
-                    filename = gr.Textbox(
-                        value="mmed-llama-alpaca-q4_k_m.gguf",
-                        label="Filename",
-                        info="GGUF filename in the repository"
-                    )
-
-                # Local file settings
-                with gr.Group(visible=False) as local_group:
-                    gr.HTML("<h4>📁 Local File Settings</h4>")
-                    if gguf_files:
-                        gguf_dropdown = gr.Dropdown(
-                            choices=gguf_choices,
-                            value=gguf_choices[0] if gguf_choices[0] != "No GGUF files found" else None,
-                            label="Select GGUF File",
-                            info="Choose which GGUF file to load"
-                        )
-                    else:
-                        gguf_dropdown = gr.Textbox(
-                            value="No GGUF files found in repository",
-                            label="GGUF File",
-                            interactive=False
-                        )
+                gr.HTML(f"<p style='font-size: 0.9em; color: #666;'><strong>Repository:</strong> {HF_REPO_ID}</p>")
+                gr.HTML(f"<p style='font-size: 0.9em; color: #666;'><strong>File:</strong> {HF_FILENAME}</p>")

                 load_btn = gr.Button("Load Model", variant="primary", size="lg")
                 model_status = gr.Textbox(
                     label="Status",
-                    value="Model not loaded
+                    value=f"Model not loaded.\n{availability_status}\n⚙️ Auto-optimized: CPU threads & GPU layers auto-detected\n📝 Context size can be configured below",
                     interactive=False,
-                    max_lines=
+                    max_lines=6
                 )

                 # Generation parameters
                 gr.HTML("<h3>⚙️ Generation Settings</h3>")

-                # Context size (
+                # Context size (limited for Spaces)
                 context_size = gr.Slider(
                     minimum=512,
-                    maximum=
+                    maximum=4096,
                     value=2048,
                     step=256,
                     label="Context Size",
@@ -320,7 +284,7 @@ def create_interface():

                 max_tokens = gr.Slider(
                     minimum=50,
-                    maximum=
+                    maximum=1024,
                     value=512,
                     step=50,
                     label="Max Tokens",
@@ -354,12 +318,13 @@ def create_interface():
                 # Information section
                 gr.HTML("""
                     <h3>ℹ️ About</h3>
+                    <p><strong>Model:</strong> MMed-Llama-Alpaca</p>
+                    <p><strong>Quantization:</strong> Q4_K_M</p>
                     <p><strong>Format:</strong> GGUF (optimized)</p>
                     <p><strong>Backend:</strong> llama-cpp-python</p>
                     <p><strong>Features:</strong> CPU/GPU support, streaming</p>
-                    <p><strong>
+                    <p><strong>Specialty:</strong> Medical assistance</p>
                     <p><strong>Auto-Optimization:</strong> CPU threads & GPU layers detected automatically</p>
-                    <p><strong>Sources:</strong> Hugging Face Hub or Local Files</p>
                 """)

                 if not LLAMA_CPP_AVAILABLE:
@@ -374,21 +339,9 @@ def create_interface():
                     """)

         # Event handlers
-        def toggle_source_visibility(source_type):
-            if source_type == "Hugging Face":
-                return gr.update(visible=True), gr.update(visible=False)
-            else:
-                return gr.update(visible=False), gr.update(visible=True)
-
-        source_type.change(
-            toggle_source_visibility,
-            inputs=source_type,
-            outputs=[hf_group, local_group]
-        )
-
         load_btn.click(
             load_model_interface,
-            inputs=[
+            inputs=[context_size, use_hf_download],
             outputs=model_status
         )

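Note: a self-contained sketch (not from the commit) of the simplified event wiring above: a slider and a checkbox feed one click handler whose return value lands in the status textbox. The handler body is a stand-in for load_model_from_gguf().

import gradio as gr

def load_model_interface(context_size, use_hf_download):
    # Placeholder handler; the real app calls load_model_from_gguf() here.
    return f"Would load with n_ctx={int(context_size)}, download_from_hf={use_hf_download}"

with gr.Blocks() as demo:
    context_size = gr.Slider(512, 4096, value=2048, step=256, label="Context Size")
    use_hf_download = gr.Checkbox(value=True, label="Download from HuggingFace")
    load_btn = gr.Button("Load Model")
    model_status = gr.Textbox(label="Status", interactive=False)
    load_btn.click(load_model_interface, inputs=[context_size, use_hf_download], outputs=model_status)

if __name__ == "__main__":
    demo.launch()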
@@ -415,7 +368,7 @@ if __name__ == "__main__":
     # Create and launch the interface
     demo = create_interface()

-    # Launch with
+    # Launch with settings optimized for Hugging Face Spaces
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,