johnbridges committed on
Commit
a9fd6b4
·
1 Parent(s): 557adf7
Files changed (1) hide show
  1. app.py +4 -31
app.py CHANGED
@@ -9,7 +9,6 @@ import requests
9
  from transformers import AutoModelForImageTextToText, AutoProcessor
10
  from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
11
  import torch
12
- from torch.ao.quantization import quantize_dynamic
13
  import re
14
  import traceback
15
 
@@ -18,32 +17,6 @@ MODEL_ID = "Hcompany/Holo1-3B"
18
 
19
  # --- Helpers (robust across different transformers versions) ---
20
 
21
- def locate_text_backbone(model):
22
- """
23
- Tries common attribute names used by VLMs to find the LLM/text stack.
24
- Falls back to the whole model if unknown.
25
- """
26
- # common in Qwen-like / custom repos
27
- for name in [
28
- "language_model", # e.g., model.language_model
29
- "text_model", # e.g., model.text_model
30
- "model", # sometimes the text core is 'model'
31
- "llm", # generic
32
- "transformer", # some repos expose raw transformer as 'transformer'
33
- ]:
34
- m = getattr(model, name, None)
35
- if m is not None:
36
- return m, name
37
-
38
- # last resort: look for a child that has an lm_head or tied weights
39
- for name, child in model.named_children():
40
- if hasattr(child, "lm_head") or hasattr(child, "get_input_embeddings"):
41
- return child, name
42
-
43
- # if still not found, return the model itself
44
- return model, None
45
-
46
-
47
  def pick_device() -> str:
48
  # Force CPU per request
49
  return "cpu"
@@ -105,10 +78,10 @@ model_loaded = False
105
  load_error_message = ""
106
 
107
  try:
108
- # CPU-friendly dtype; bf16 on CPU is spotty, so prefer bfloat16
109
  model = AutoModelForImageTextToText.from_pretrained(
110
  MODEL_ID,
111
- torch_dtype=torch.bfloat16,
112
  trust_remote_code=True
113
  ).to(pick_device())
114
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -266,7 +239,7 @@ except Exception as e:
266
  pass
267
 
268
  # --- Gradio UI ---
269
- title = "Holo1-3B: Action VLM Localization Demo (CPU)"
270
  article = f"""
271
  <p style='text-align: center'>
272
  Model: <a href='https://huggingface.co/{MODEL_ID}' target='_blank'>{MODEL_ID}</a> by HCompany |
@@ -325,4 +298,4 @@ else:
325
 
326
  if __name__ == "__main__":
327
  # CPU Spaces can be slow; keep debug True for logs
328
- demo.launch(debug=True)
 
9
  from transformers import AutoModelForImageTextToText, AutoProcessor
10
  from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
11
  import torch
 
12
  import re
13
  import traceback
14
 
 
17
 
18
  # --- Helpers (robust across different transformers versions) ---
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def pick_device() -> str:
21
  # Force CPU per request
22
  return "cpu"
 
78
  load_error_message = ""
79
 
80
  try:
81
+ # CPU-friendly dtype; bf16 on CPU is spotty, so prefer float32
82
  model = AutoModelForImageTextToText.from_pretrained(
83
  MODEL_ID,
84
+ torch_dtype=torch.float32,
85
  trust_remote_code=True
86
  ).to(pick_device())
87
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
239
  pass
240
 
241
  # --- Gradio UI ---
242
+ title = "Holo1-7B: Action VLM Localization Demo (CPU)"
243
  article = f"""
244
  <p style='text-align: center'>
245
  Model: <a href='https://huggingface.co/{MODEL_ID}' target='_blank'>{MODEL_ID}</a> by HCompany |
 
298
 
299
  if __name__ == "__main__":
300
  # CPU Spaces can be slow; keep debug True for logs
301
+ demo.launch(debug=True)