vidhanm committed · Commit 97c8139 · Parent(s): e198913

trying to solve config file error

app.py CHANGED
@@ -2,26 +2,22 @@ import sys
 import os
 
 # Add the cloned nanoVLM directory to Python's system path
-
-NANOVLM_REPO_PATH = "/app/nanoVLM" # Path where we cloned it in Dockerfile
+NANOVLM_REPO_PATH = "/app/nanoVLM"
 if NANOVLM_REPO_PATH not in sys.path:
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor # AutoProcessor
+from transformers import AutoProcessor # AutoProcessor should still be fine
 
-#
+# Import the custom VisionLanguageModel class from the cloned nanoVLM repository
 try:
     from models.vision_language_model import VisionLanguageModel
-
-    print("Successfully imported VisionLanguageModel and VisionLanguageConfig from nanoVLM clone.")
+    print("Successfully imported VisionLanguageModel from nanoVLM clone.")
 except ImportError as e:
-    print(f"Error importing from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
+    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
     VisionLanguageModel = None
-    VisionLanguageConfig = None
-
 
 # Determine the device to use
 device_choice = os.environ.get("DEVICE", "auto")
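
The pattern this hunk converges on (prepend the clone to sys.path, then guard the custom import) is easy to lift out and exercise on its own. A minimal sketch reusing only names that appear in the diff; the helper function itself is illustrative, not part of app.py:

    import sys

    def import_vlm(repo_path="/app/nanoVLM"):
        """Import nanoVLM's custom model class from a cloned checkout.

        Returns the class, or None when the clone is missing or broken,
        mirroring the fallback behaviour app.py relies on later.
        """
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)  # prepend so the clone shadows site-packages
        try:
            from models.vision_language_model import VisionLanguageModel
            return VisionLanguageModel
        except ImportError as e:
            print(f"nanoVLM import failed: {e}")
            return None
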
@@ -36,39 +32,33 @@ model_id = "lusxvr/nanoVLM-222M"
 processor = None
 model = None
 
-if VisionLanguageModel
+if VisionLanguageModel:
     try:
         print(f"Attempting to load processor for {model_id}")
-        #
-        #
-        # trust_remote_code might be needed if processor has custom code too.
+        # trust_remote_code=True might be beneficial if the processor config itself refers to custom code,
+        # though less likely for processors.
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
         print("Processor loaded.")
-
-        print(f"Attempting to load model config for {model_id} using VisionLanguageConfig")
-        # Load the configuration using the custom config class, pointing to your model_id
-        # trust_remote_code=True allows it to use any specific code paths from your model_id if needed for config.
-        config = VisionLanguageConfig.from_pretrained(model_id, trust_remote_code=True)
-        print("Model config loaded.")
 
-        print(f"Attempting to load model
-        #
-
-
+        print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
+        # The VisionLanguageModel.from_pretrained method should handle its own configuration loading
+        # from the model_id repository (which includes config.json).
+        # trust_remote_code=True here allows the custom VisionLanguageModel code to run.
+        model = VisionLanguageModel.from_pretrained(model_id, trust_remote_code=True).to(device)
+        print("Model loaded successfully.")
         model.eval() # Set to evaluation mode
 
     except Exception as e:
-        print(f"Error loading model
-        # Fallback if any step fails
+        print(f"Error loading model or processor: {e}")
         processor = None
         model = None
 else:
-    print("Custom
+    print("Custom VisionLanguageModel class not imported, cannot load model.")
 
 
 def generate_text_for_image(image_input, prompt_input):
-    if model is None or processor is None
-        return "Error: Model or processor not loaded correctly
+    if model is None or processor is None:
+        return "Error: Model or processor not loaded correctly. Check logs."
 
     if image_input is None:
         return "Please upload an image."
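
After this hunk the load path collapses to one guarded from_pretrained call instead of the earlier config-then-model two-step. A sketch of that flow as a reusable helper, assuming (as the diff does) that the custom from_pretrained accepts a Hub id plus trust_remote_code, and that the elided device logic maps "auto" to CUDA when available:

    import os
    import torch
    from transformers import AutoProcessor

    def resolve_device():
        """Honour a DEVICE env override, defaulting to auto-detection."""
        choice = os.environ.get("DEVICE", "auto")
        if choice != "auto":
            return choice
        return "cuda" if torch.cuda.is_available() else "cpu"

    def load_nanovlm(model_cls, model_id="lusxvr/nanoVLM-222M"):
        """Return (processor, model), or (None, None) if any step fails."""
        if model_cls is None:  # the import from the clone failed upstream
            return None, None
        try:
            processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            model = model_cls.from_pretrained(model_id, trust_remote_code=True).to(resolve_device())
            model.eval()  # inference only
            return processor, model
        except Exception as e:
            print(f"Error loading model or processor: {e}")
            return None, None

Passing the class in, rather than importing it inside the helper, keeps the fallback branch testable even when the clone is absent.
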
@@ -84,22 +74,19 @@ def generate_text_for_image(image_input, prompt_input):
     if pil_image.mode != "RGB":
         pil_image = pil_image.convert("RGB")
 
-    # Prepare inputs for the model using the processor
-    # The exact format for nanoVLM's custom model might require specific handling.
-    # The processor from AutoProcessor should generally work.
     inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
 
-    #
-    #
+    # Call the generate method of the VisionLanguageModel instance
+    # Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist
+    # It likely expects pixel_values and input_ids directly or as part of a dictionary
     generated_ids = model.generate(
-        inputs
-        inputs
-        attention_mask=inputs.get('attention_mask'),
+        pixel_values=inputs.get('pixel_values'),
+        input_ids=inputs.get('input_ids'),
+        attention_mask=inputs.get('attention_mask'),
         max_new_tokens=150,
         num_beams=3,
         no_repeat_ngram_size=2,
         early_stopping=True
-        # Check nanoVLM's VisionLanguageModel.generate() for specific parameters
     )
 
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
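
With keyword arguments in place of the earlier positional ones, inference is a straight processor, then generate, then batch_decode round trip. A condensed sketch of that loop; note the generate signature here is the diff's own assumption about nanoVLM, not a verified API:

    from PIL import Image

    def describe(processor, model, image_path, prompt, device="cpu"):
        """Run one image+prompt pair through the model and decode the output."""
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")  # the processor expects 3-channel input
        inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
        generated_ids = model.generate(
            pixel_values=inputs.get("pixel_values"),
            input_ids=inputs.get("input_ids"),
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=150,  # generation settings copied from the diff
            num_beams=3,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

If generation still fails, the hunk's own comment points at nanoVLM/models/vision_language_model.py as the place to confirm the expected parameters.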