Cylanoid committed
Commit fab7ed8 · Parent(s): 80efb49
Files changed (1):
  1. app.py +23 -14
app.py CHANGED
@@ -2,7 +2,7 @@
 # Gradio app for Llama 4 Maverick healthcare fraud detection (text-only with CPU offloading)
 
 import gradio as gr
-from transformers import AutoTokenizer, Llama4ForConditionalGeneration
+from transformers import AutoTokenizer, Llama4ForConditionalGeneration, BitsAndBytesConfig
 import datasets
 import torch
 import json
@@ -24,7 +24,7 @@ except LookupError:
 from document_analyzer import HealthcareFraudAnalyzer
 
 # Debug: Confirm file version
-print("Running updated app.py with CPU offloading (version: 2025-04-21 v2)")
+print("Running updated app.py with CPU offloading (version: 2025-04-21 v3)")
 
 # Debug: Print environment variables
 print("Environment variables:", dict(os.environ))
@@ -47,27 +47,36 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 
-# Custom device map for CPU offloading
+# Explicit quantization configuration
+quant_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_enable_fp32_cpu_offload=True
+)
+
+# Custom device map for CPU offloading (more layers to CPU)
 device_map = {
     "model.embed_tokens": 0,
-    "model.layers.0-15": 0,  # First 16 layers on GPU
-    "model.layers.16-31": "cpu",  # Remaining layers on CPU
+    "model.layers.0-10": 0,  # First 11 layers on GPU
+    "model.layers.11-31": "cpu",  # Remaining layers on CPU
     "model.norm": 0,
     "lm_head": 0
 }
 
 # Debug: Confirm offloading settings
-print("Loading model with CPU offloading: llm_int8_enable_fp32_cpu_offload=True, device_map=", device_map)
+print("Loading model with: quantization_config=", quant_config, ", device_map=", device_map)
 
 # Load model with 8-bit quantization and CPU offloading
-model = Llama4ForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map=device_map,
-    quantization_config={"load_in_8bit": True},
-    llm_int8_enable_fp32_cpu_offload=True,
-    attn_implementation="flex_attention"
-)
+try:
+    model = Llama4ForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16,
+        device_map=device_map,
+        quantization_config=quant_config,
+        attn_implementation="flex_attention"
+    )
+except Exception as e:
+    print(f"Model loading failed: {str(e)}")
+    raise
 
 # Resize token embeddings if pad token was added
 model.resize_token_embeddings(len(tokenizer))
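One caveat worth flagging on the committed device_map: as far as I know, accelerate matches device_map keys against exact module names (or their prefixes) and does not expand range shorthand such as "model.layers.0-10", so those entries would likely not match any module and the layers would fall through to default placement. A minimal sketch of an equivalent map with enumerated keys, assuming 32 decoder layers reachable under the "model.layers.<i>" prefix as the diff's "0-31" comments imply (the real prefix may differ for Llama4ForConditionalGeneration):

# Sketch only, not the committed code. Assumes 32 decoder layers named
# "model.layers.<i>", per the "0-31" comments in the diff.
NUM_LAYERS = 32
GPU_LAYERS = 11  # layers 0-10 on GPU, matching the new split

device_map = {"model.embed_tokens": 0, "model.norm": 0, "lm_head": 0}
for i in range(NUM_LAYERS):
    device_map[f"model.layers.{i}"] = 0 if i < GPU_LAYERS else "cpu"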
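After a successful load it is also cheap to confirm where the weights actually landed: when a model is loaded with a device_map, transformers records the resolved per-module placement on the model object. A small check, assuming the `model` variable from app.py above:

# Sanity check: hf_device_map is set by transformers when loading with a
# device_map; it shows the placement that was actually resolved.
print("Resolved device map:", getattr(model, "hf_device_map", "not set"))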