Cylanoid committed
Commit e73698a · 1 Parent(s): b2c8265

Use device_map='auto' + offload_folder to avoid OOM

Files changed (1)
  1. app.py +8 -7
app.py CHANGED
@@ -1,9 +1,6 @@
 # app.py
 
 import os
-import json
-import re
-
 import gradio as gr
 import pdfplumber
 import nltk
@@ -17,7 +14,7 @@ import huggingface_hub
 
 from document_analyzer import HealthcareFraudAnalyzer
 
-print("Running updated app.py with CPU offloading (version: 2025-04-22 v1)")
+print("Running updated app.py with restricted GPU usage (version: 2025-04-22 v2)")
 
 # — Ensure NLTK punkt tokenizer is available
 try:
@@ -43,13 +40,17 @@ quant_config = BitsAndBytesConfig(
     llm_int8_enable_fp32_cpu_offload=True
 )
 
-print("Loading model with 8-bit quantization, CPU offload, and automatic device mapping")
+print("Loading model with 8-bit quantization, CPU offload, auto device mapping + max_memory cap")
 model = Llama4ForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
-    device_map="auto",            # let Accelerate decide which layers go to GPU vs. CPU
+    device_map="auto",
+    max_memory={                  # cap GPU usage to ~11 GiB
+        "0": "11GiB",
+        "cpu": "200GiB"
+    },
     quantization_config=quant_config,
-    offload_folder="./offload"    # spill CPU-offloaded weights here
+    offload_folder="./offload"
 )
 
 # — Resize embeddings if we added a pad token
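
For reference, below is a minimal self-contained sketch of the loading pattern this commit ends up with: 8-bit quantization with fp32 CPU offload, device_map="auto", a max_memory cap, and an offload_folder. The MODEL_ID value is a placeholder (the real value is defined earlier in app.py and is not shown in this diff), and the 11 GiB / 200 GiB limits are copied from the diff rather than tuned for any particular hardware.

# memory_capped_load.py - sketch only, not the Space's actual app.py
import torch
from transformers import BitsAndBytesConfig, Llama4ForConditionalGeneration

MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"  # assumed placeholder ID

# 8-bit quantization, allowing fp32 CPU offload for layers that do not fit on the GPU
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = Llama4ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",                          # Accelerate decides GPU vs. CPU placement
    max_memory={0: "11GiB", "cpu": "200GiB"},   # cap GPU 0 at ~11 GiB, spill the rest to CPU
    quantization_config=quant_config,
    offload_folder="./offload",                 # disk staging area for offloaded weights
)

One detail worth noting: the Transformers/Accelerate examples key max_memory by integer GPU index (0) rather than the string "0" used in the diff, so the sketch uses the integer form.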