Use device_map='auto' + offload_folder to avoid OOM
```diff
--- a/app.py
+++ b/app.py
@@ -1,9 +1,6 @@
 # app.py
 
 import os
-import json
-import re
-
 import gradio as gr
 import pdfplumber
 import nltk
@@ -17,7 +14,7 @@ import huggingface_hub
 
 from document_analyzer import HealthcareFraudAnalyzer
 
-print("Running updated app.py with
+print("Running updated app.py with restricted GPU usage (version: 2025-04-22 v2)")
 
 # — Ensure NLTK punkt tokenizer is available
 try:
@@ -43,13 +40,17 @@ quant_config = BitsAndBytesConfig(
     llm_int8_enable_fp32_cpu_offload=True
 )
 
-print("Loading model with 8-bit quantization, CPU offload,
+print("Loading model with 8-bit quantization, CPU offload, auto device mapping + max_memory cap")
 model = Llama4ForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
-    device_map="auto",
+    device_map="auto",
+    max_memory={ # cap GPU usage to ~11 GiB
+        "0": "11GiB",
+        "cpu": "200GiB"
+    },
     quantization_config=quant_config,
-    offload_folder="./offload"
+    offload_folder="./offload"
 )
 
 # — Resize embeddings if we added a pad token
```