Use device_map='auto' + offload_folder to avoid OOM
```diff
--- a/app.py
+++ b/app.py
@@ -1,9 +1,6 @@
 # app.py
 
 import os
-import json
-import re
-
 import gradio as gr
 import pdfplumber
 import nltk
@@ -17,7 +14,7 @@ import huggingface_hub
 
 from document_analyzer import HealthcareFraudAnalyzer
 
-print("Running updated app.py with
+print("Running updated app.py with restricted GPU usage (version: 2025-04-22 v2)")
 
 # — Ensure NLTK punkt tokenizer is available
 try:
@@ -43,13 +40,17 @@ quant_config = BitsAndBytesConfig(
     llm_int8_enable_fp32_cpu_offload=True
 )
 
-print("Loading model with 8-bit quantization, CPU offload,
+print("Loading model with 8-bit quantization, CPU offload, auto device mapping + max_memory cap")
 model = Llama4ForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
-    device_map="auto",
+    device_map="auto",
+    max_memory={ # cap GPU usage to ~11 GiB
+        "0": "11GiB",
+        "cpu": "200GiB"
+    },
     quantization_config=quant_config,
-    offload_folder="./offload"
+    offload_folder="./offload"
 )
 
 # — Resize embeddings if we added a pad token
```