Axcel1 committed on
Commit 764c2d9 · verified · 1 Parent(s): ccb3f7a

Update app.py

Files changed (1)
  1. app.py +80 -127
app.py CHANGED
@@ -3,8 +3,7 @@ import os
 import threading
 import time
 from pathlib import Path
-from huggingface_hub import login
-
+from huggingface_hub import hf_hub_download
 
 # Try to import llama-cpp-python, fallback to instructions if not available
 try:
@@ -14,15 +13,14 @@ except ImportError:
     LLAMA_CPP_AVAILABLE = False
     print("llama-cpp-python not installed. Please install it with: pip install llama-cpp-python")
 
-hf_token = os.environ.get("HF_TOKEN")
-
-login(token = hf_token)
-
-
 # Global variables for model
 model = None
 model_loaded = False
 
+# HuggingFace repository information
+HF_REPO_ID = "Axcel1/MMed-llama-alpaca-Q4_K_M-GGUF"
+HF_FILENAME = "mmed-llama-alpaca-q4_k_m.gguf"
+
 def find_gguf_file(directory="."):
     """Find GGUF files in the specified directory"""
     gguf_files = []
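This hunk drops the unconditional `login(token = hf_token)` call and instead pins the model source in two module-level constants. Since the referenced GGUF repository is public, no authentication is needed; the sketch below shows how a token could still be supplied per download if a gated repository were ever used. The `HF_TOKEN` lookup and the `token=` argument are illustrative only and not part of this commit.

```python
# Illustrative sketch, not part of the commit: pass a token per call instead of a global login().
import os
from huggingface_hub import hf_hub_download

token = os.environ.get("HF_TOKEN")  # None is fine for public repos
path = hf_hub_download(
    repo_id="Axcel1/MMed-llama-alpaca-Q4_K_M-GGUF",
    filename="mmed-llama-alpaca-q4_k_m.gguf",
    token=token,  # assumption: only needed if the repo were gated/private
)
print(path)
```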
@@ -32,11 +30,32 @@ def find_gguf_file(directory="."):
                 gguf_files.append(os.path.join(root, file))
     return gguf_files
 
+def download_model_from_hf(repo_id=HF_REPO_ID, filename=HF_FILENAME):
+    """Download GGUF model from HuggingFace Hub"""
+    try:
+        print(f"Downloading model from {repo_id}/{filename}...")
+        gguf_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            cache_dir="./models",
+            resume_download=True  # Resume partial downloads
+        )
+        print(f"Model downloaded to: {gguf_path}")
+        return gguf_path, None
+    except Exception as e:
+        error_msg = f"Error downloading model: {str(e)}"
+        print(error_msg)
+        return None, error_msg
+
 def get_optimal_settings():
     """Get optimal CPU threads and GPU layers automatically"""
     # Auto-detect CPU threads (use all available cores)
     n_threads = os.cpu_count()
 
+    # For Hugging Face Spaces, limit threads to avoid resource issues
+    if n_threads and n_threads > 4:
+        n_threads = 4
+
     # Auto-detect GPU layers (try to use GPU if available)
     n_gpu_layers = 0
     try:
@@ -52,59 +71,28 @@ def get_optimal_settings():
 
     return n_threads, n_gpu_layers
 
-def load_model_from_huggingface(repo_id, filename, n_ctx=2048):
-    """Load the model from Hugging Face repository"""
-    global model, model_loaded
-
-    if not LLAMA_CPP_AVAILABLE:
-        return False, "llama-cpp-python not installed. Please install it with: pip install llama-cpp-python"
-
-    try:
-        print(f"Loading model from Hugging Face: {repo_id}/{filename}")
-
-        # Get optimal settings automatically
-        n_threads, n_gpu_layers = get_optimal_settings()
-        print(f"Auto-detected settings: {n_threads} CPU threads, {n_gpu_layers} GPU layers")
-
-        # Load model from Hugging Face with optimized settings
-        model = Llama.from_pretrained(
-            repo_id=repo_id,
-            filename=filename,
-            n_ctx=n_ctx,  # Context window (configurable)
-            n_threads=n_threads,  # CPU threads (auto-detected)
-            n_gpu_layers=n_gpu_layers,  # Number of layers to offload to GPU (auto-detected)
-            verbose=False,
-            chat_format="chatml",  # Use Llama-3 chat format
-            n_batch=512,  # Batch size for prompt processing
-            use_mlock=True,  # Keep model in memory
-            use_mmap=True,  # Use memory mapping
-        )
-
-        model_loaded = True
-        print("Model loaded successfully!")
-        return True, f"✅ Model loaded successfully from {repo_id}/{filename}\n📊 Context: {n_ctx} tokens\n🖥️ CPU Threads: {n_threads}\n🎮 GPU Layers: {n_gpu_layers}"
-
-    except Exception as e:
-        model_loaded = False
-        error_msg = f"Error loading model: {str(e)}"
-        print(error_msg)
-        return False, f"❌ {error_msg}"
-
-def load_model_from_gguf(gguf_path=None, n_ctx=2048):
-    """Load the model from a local GGUF file with automatic optimization"""
+def load_model_from_gguf(gguf_path=None, n_ctx=2048, use_hf_download=True):
+    """Load the model from a GGUF file with automatic optimization"""
     global model, model_loaded
 
     if not LLAMA_CPP_AVAILABLE:
         return False, "llama-cpp-python not installed. Please install it with: pip install llama-cpp-python"
 
     try:
-        # If no path provided, try to find GGUF files
+        # If no path provided, try different approaches
         if gguf_path is None:
-            gguf_files = find_gguf_file()
-            if not gguf_files:
-                return False, "No GGUF files found in the repository"
-            gguf_path = gguf_files[0]  # Use the first one found
-            print(f"Found GGUF file: {gguf_path}")
+            if use_hf_download:
+                # Try to download from HuggingFace first
+                gguf_path, error = download_model_from_hf()
+                if error:
+                    return False, f" Failed to download from HuggingFace: {error}"
+            else:
+                # Try to find local GGUF files
+                gguf_files = find_gguf_file()
+                if not gguf_files:
+                    return False, "No GGUF files found in the repository"
+                gguf_path = gguf_files[0]  # Use the first one found
+                print(f"Found local GGUF file: {gguf_path}")
 
         # Check if file exists
         if not os.path.exists(gguf_path):
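With this hunk the two separate loaders collapse into a single `load_model_from_gguf` that downloads from the Hub by default and only scans the repository for local GGUF files when `use_hf_download` is False. A minimal driver sketch, assuming `app.py` exposes the function exactly as in this diff:

```python
# Sketch: exercise the merged loader outside Gradio (assumes app.py as in this diff).
from app import load_model_from_gguf

# Default path: download mmed-llama-alpaca-q4_k_m.gguf from the Hub, then load it.
ok, status = load_model_from_gguf(gguf_path=None, n_ctx=2048, use_hf_download=True)
print(ok)
print(status)
```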
@@ -116,22 +104,22 @@ def load_model_from_gguf(gguf_path=None, n_ctx=2048):
         n_threads, n_gpu_layers = get_optimal_settings()
         print(f"Auto-detected settings: {n_threads} CPU threads, {n_gpu_layers} GPU layers")
 
-        # Load model with optimized settings
+        # Load model with optimized settings for Hugging Face Spaces
         model = Llama(
             model_path=gguf_path,
             n_ctx=n_ctx,  # Context window (configurable)
-            n_threads=n_threads,  # CPU threads (auto-detected)
-            n_gpu_layers=n_gpu_layers,  # Number of layers to offload to GPU (auto-detected)
+            n_threads=n_threads,  # CPU threads (limited for Spaces)
+            n_gpu_layers=n_gpu_layers,  # Number of layers to offload to GPU
             verbose=False,
             chat_format="llama-3",  # Use Llama-3 chat format
-            n_batch=512,  # Batch size for prompt processing
-            use_mlock=True,  # Keep model in memory
+            n_batch=256,  # Smaller batch size for Spaces
+            use_mlock=False,  # Disabled for Spaces compatibility
             use_mmap=True,  # Use memory mapping
         )
 
         model_loaded = True
         print("Model loaded successfully!")
-        return True, f"✅ Model loaded successfully from {os.path.basename(gguf_path)}\n📊 Context: {n_ctx} tokens\n🖥️ CPU Threads: {n_threads}\n🎮 GPU Layers: {n_gpu_layers}"
+        return True, f"✅ Model loaded successfully from {os.path.basename(gguf_path)}\n📊 Context: {n_ctx} tokens\n🖥️ CPU Threads: {n_threads}\n🎮 GPU Layers: {n_gpu_layers}\n📦 Source: {HF_REPO_ID}"
 
     except Exception as e:
         model_loaded = False
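The `Llama(...)` call is retuned for Spaces here: `n_batch` drops from 512 to 256 and `use_mlock` is disabled. A standalone sketch of the same llama-cpp-python configuration followed by one chat-format request; the model path, thread count, and prompt are placeholders, not values fixed by this commit:

```python
# Sketch only: mirrors the flags in this hunk; path and prompt are placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="models/mmed-llama-alpaca-q4_k_m.gguf",  # placeholder path
    n_ctx=2048,
    n_threads=4,            # capped, as in get_optimal_settings()
    n_gpu_layers=0,         # CPU-only unless a GPU is detected
    chat_format="llama-3",
    n_batch=256,
    use_mlock=False,
    use_mmap=True,
    verbose=False,
)

# llama-cpp-python applies the llama-3 chat template to these messages.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=64,
)
print(out["choices"][0]["message"]["content"])
```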
@@ -202,34 +190,39 @@ def clear_chat():
     """Clear the chat history"""
     return [], ""
 
-def load_model_interface(source_type, gguf_path, repo_id, filename, context_size):
+def load_model_interface(context_size, use_hf_download):
     """Interface function to load model with configurable context size"""
-    if source_type == "Hugging Face":
-        success, message = load_model_from_huggingface(repo_id, filename, n_ctx=int(context_size))
-    else:  # Local file
-        success, message = load_model_from_gguf(gguf_path, n_ctx=int(context_size))
+    success, message = load_model_from_gguf(gguf_path=None, n_ctx=int(context_size), use_hf_download=use_hf_download)
     return message
 
 def get_available_gguf_files():
     """Get list of available GGUF files"""
     gguf_files = find_gguf_file()
     if not gguf_files:
-        return ["No GGUF files found"]
+        return ["No local GGUF files found"]
     return [os.path.basename(f) for f in gguf_files]
 
+def check_model_availability():
+    """Check if model is available locally or needs to be downloaded"""
+    local_files = find_gguf_file()
+    if local_files:
+        return f"Local GGUF files found: {len(local_files)}"
+    else:
+        return "No local GGUF files found. Will download from HuggingFace."
+
 # Create the Gradio interface
 def create_interface():
-    # Get available GGUF files
-    gguf_files = find_gguf_file()
-    gguf_choices = [os.path.basename(f) for f in gguf_files] if gguf_files else ["No GGUF files found"]
+    # Check for available models
+    availability_status = check_model_availability()
 
-    with gr.Blocks(title="Llama-3-8B GGUF Chatbot", theme=gr.themes.Soft()) as demo:
+    with gr.Blocks(title="MMed-Llama-Alpaca GGUF Chatbot", theme=gr.themes.Soft()) as demo:
         gr.HTML("""
             <h1 style="text-align: center; color: #2E86AB; margin-bottom: 30px;">
                 🦙 MMed-Llama-Alpaca GGUF Chatbot
            </h1>
            <p style="text-align: center; color: #666; margin-bottom: 30px;">
-                Chat with the MMed-Llama-Alpaca model (Q4_K_M quantized) for medical assistance!
+                Chat with the MMed-Llama-Alpaca model (Q4_K_M quantized) for medical assistance!<br>
+                <strong>⚠️ This is for educational purposes only. Always consult healthcare professionals for medical advice.</strong>
            </p>
        """)
 
@@ -246,7 +239,7 @@ def create_interface():
 
                 with gr.Row():
                     msg = gr.Textbox(
-                        placeholder="Type your message here...",
+                        placeholder="Type your medical question here...",
                         container=False,
                         scale=7,
                         show_label=False
@@ -259,59 +252,30 @@ def create_interface():
                 gr.HTML("<h3>🔧 Model Control</h3>")
 
                 # Model source selection
-                source_type = gr.Radio(
-                    choices=["Hugging Face", "Local File"],
-                    value="Hugging Face",
-                    label="Model Source",
-                    info="Choose where to load the model from"
+                use_hf_download = gr.Checkbox(
+                    value=True,
+                    label="Download from HuggingFace",
+                    info="Uncheck to use local GGUF files"
                 )
 
-                # Hugging Face settings
-                with gr.Group(visible=True) as hf_group:
-                    gr.HTML("<h4>🤗 Hugging Face Settings</h4>")
-                    repo_id = gr.Textbox(
-                        value="Axcel1/MMed-llama-alpaca-Q4_K_M-GGUF",
-                        label="Repository ID",
-                        info="e.g., username/repo-name"
-                    )
-                    filename = gr.Textbox(
-                        value="mmed-llama-alpaca-q4_k_m.gguf",
-                        label="Filename",
-                        info="GGUF filename in the repository"
-                    )
-
-                # Local file settings
-                with gr.Group(visible=False) as local_group:
-                    gr.HTML("<h4>📁 Local File Settings</h4>")
-                    if gguf_files:
-                        gguf_dropdown = gr.Dropdown(
-                            choices=gguf_choices,
-                            value=gguf_choices[0] if gguf_choices[0] != "No GGUF files found" else None,
-                            label="Select GGUF File",
-                            info="Choose which GGUF file to load"
-                        )
-                    else:
-                        gguf_dropdown = gr.Textbox(
-                            value="No GGUF files found in repository",
-                            label="GGUF File",
-                            interactive=False
-                        )
+                gr.HTML(f"<p style='font-size: 0.9em; color: #666;'><strong>Repository:</strong> {HF_REPO_ID}</p>")
+                gr.HTML(f"<p style='font-size: 0.9em; color: #666;'><strong>File:</strong> {HF_FILENAME}</p>")
 
                 load_btn = gr.Button("Load Model", variant="primary", size="lg")
                 model_status = gr.Textbox(
                     label="Status",
-                    value="Model not loaded. Configure settings and click 'Load Model'.\n⚙️ Auto-optimized: CPU threads & GPU layers auto-detected\n📝 Context size can be configured in Generation Settings",
+                    value=f"Model not loaded.\n{availability_status}\n⚙️ Auto-optimized: CPU threads & GPU layers auto-detected\n📝 Context size can be configured below",
                     interactive=False,
-                    max_lines=5
+                    max_lines=6
                 )
 
                 # Generation parameters
                 gr.HTML("<h3>⚙️ Generation Settings</h3>")
 
-                # Context size (now as a slider)
+                # Context size (limited for Spaces)
                 context_size = gr.Slider(
                     minimum=512,
-                    maximum=8192,
+                    maximum=4096,
                     value=2048,
                     step=256,
                     label="Context Size",
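Here the Radio button and the two source-specific Groups are replaced by a single Checkbox, which is also what makes the visibility-toggling callback removed further down unnecessary. A stripped-down sketch of the resulting wiring, with a stub standing in for the real loader (the stub body is illustrative, not code from this commit):

```python
# Sketch of the simplified control flow: checkbox + slider -> button -> status box.
import gradio as gr

def load_model_interface(context_size, use_hf_download):
    # Stub: the real function calls load_model_from_gguf() as shown in this diff.
    source = "HuggingFace download" if use_hf_download else "local GGUF file"
    return f"Would load with n_ctx={int(context_size)} from {source}"

with gr.Blocks() as demo:
    use_hf_download = gr.Checkbox(value=True, label="Download from HuggingFace")
    context_size = gr.Slider(minimum=512, maximum=4096, value=2048, step=256,
                             label="Context Size")
    load_btn = gr.Button("Load Model", variant="primary")
    model_status = gr.Textbox(label="Status", interactive=False)

    load_btn.click(load_model_interface,
                   inputs=[context_size, use_hf_download],
                   outputs=model_status)

if __name__ == "__main__":
    demo.launch()
```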
@@ -320,7 +284,7 @@ def create_interface():
 
                 max_tokens = gr.Slider(
                     minimum=50,
-                    maximum=2048,
+                    maximum=1024,
                     value=512,
                     step=50,
                     label="Max Tokens",
@@ -354,12 +318,13 @@ def create_interface():
                 # Information section
                 gr.HTML("""
                     <h3>ℹ️ About</h3>
+                    <p><strong>Model:</strong> MMed-Llama-Alpaca</p>
+                    <p><strong>Quantization:</strong> Q4_K_M</p>
                     <p><strong>Format:</strong> GGUF (optimized)</p>
                     <p><strong>Backend:</strong> llama-cpp-python</p>
                     <p><strong>Features:</strong> CPU/GPU support, streaming</p>
-                    <p><strong>Memory:</strong> Optimized usage</p>
+                    <p><strong>Specialty:</strong> Medical assistance</p>
                     <p><strong>Auto-Optimization:</strong> CPU threads & GPU layers detected automatically</p>
-                    <p><strong>Sources:</strong> Hugging Face Hub or Local Files</p>
                 """)
 
         if not LLAMA_CPP_AVAILABLE:
@@ -374,21 +339,9 @@ def create_interface():
             """)
 
         # Event handlers
-        def toggle_source_visibility(source_type):
-            if source_type == "Hugging Face":
-                return gr.update(visible=True), gr.update(visible=False)
-            else:
-                return gr.update(visible=False), gr.update(visible=True)
-
-        source_type.change(
-            toggle_source_visibility,
-            inputs=source_type,
-            outputs=[hf_group, local_group]
-        )
-
         load_btn.click(
             load_model_interface,
-            inputs=[source_type, gguf_dropdown, repo_id, filename, context_size],
+            inputs=[context_size, use_hf_download],
             outputs=model_status
         )
 
@@ -415,7 +368,7 @@ if __name__ == "__main__":
     # Create and launch the interface
     demo = create_interface()
 
-    # Launch with appropriate settings for Hugging Face Spaces
+    # Launch with settings optimized for Hugging Face Spaces
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
 