trying to solve config error
vidhanm committed
Commit 16bf2d1 · 1 parent: 97c8139
app.py CHANGED
@@ -9,14 +9,15 @@ if NANOVLM_REPO_PATH not in sys.path:
 import gradio as gr
 from PIL import Image
 import torch
-
+# Import specific processor components
+from transformers import CLIPImageProcessor, GPT2TokenizerFast
 
-# Import the custom VisionLanguageModel class
+# Import the custom VisionLanguageModel class
 try:
     from models.vision_language_model import VisionLanguageModel
     print("Successfully imported VisionLanguageModel from nanoVLM clone.")
 except ImportError as e:
-    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}.
+    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}.")
     VisionLanguageModel = None
 
 # Determine the device to use
@@ -27,38 +28,68 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-# Load the model and processor
+# Load the model and processor components
 model_id = "lusxvr/nanoVLM-222M"
-
+image_processor = None
+tokenizer = None
 model = None
 
 if VisionLanguageModel:
     try:
-        print(f"Attempting to load processor for {model_id}")
-        #
-
-
-
+        print(f"Attempting to load specific processor components for {model_id}")
+        # Load the image processor
+        image_processor = CLIPImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+        print("CLIPImageProcessor loaded.")
+
+        # Load the tokenizer
+        tokenizer = GPT2TokenizerFast.from_pretrained(model_id, trust_remote_code=True)
+        # Add a padding token if it's not already there (common for GPT2)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            print("Set tokenizer pad_token to eos_token.")
+        print("GPT2TokenizerFast loaded.")
 
         print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
-
-
-
-
+        model = VisionLanguageModel.from_pretrained(
+            model_id,
+            trust_remote_code=True # Allows custom model code to run
+            # The VisionLanguageModel might need image_processor and tokenizer passed during init,
+            # or it might retrieve them from its config. Check its __init__ if issues persist.
+            # For now, assume it gets them from config or they are not strictly needed at init.
+        ).to(device)
         print("Model loaded successfully.")
-        model.eval()
+        model.eval()
 
     except Exception as e:
-        print(f"Error loading model or processor: {e}")
-
+        print(f"Error loading model or processor components: {e}")
+        image_processor = None
+        tokenizer = None
         model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
+# Define a simple processor-like function for preparing inputs
+def prepare_inputs(text, image, image_processor_instance, tokenizer_instance, device_to_use):
+    if image_processor_instance is None or tokenizer_instance is None:
+        raise ValueError("Image processor or tokenizer not initialized.")
+
+    # Process image
+    processed_image = image_processor_instance(images=image, return_tensors="pt").pixel_values.to(device_to_use)
+
+    # Process text
+    # Ensure padding is handled correctly for batching (even if batch size is 1)
+    processed_text = tokenizer_instance(
+        text=text, return_tensors="pt", padding=True, truncation=True
+    )
+    input_ids = processed_text.input_ids.to(device_to_use)
+    attention_mask = processed_text.attention_mask.to(device_to_use)
+
+    return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
+
 
 def generate_text_for_image(image_input, prompt_input):
-    if model is None or
-        return "Error: Model or processor not loaded correctly. Check logs."
+    if model is None or image_processor is None or tokenizer is None:
+        return "Error: Model or processor components not loaded correctly. Check logs."
 
     if image_input is None:
         return "Please upload an image."
@@ -74,23 +105,33 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-
+        # Use our custom input preparation function
+        inputs = prepare_inputs(
+            text=[prompt_input], # Expects a list of text prompts
+            image=pil_image, # Expects a single PIL image or list
+            image_processor_instance=image_processor,
+            tokenizer_instance=tokenizer,
+            device_to_use=device
+        )
 
-        #
-        # Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist
-        # It likely expects pixel_values and input_ids directly or as part of a dictionary
+        # Generate text using the model's generate method
         generated_ids = model.generate(
-            pixel_values=inputs
-            input_ids=inputs
-            attention_mask=inputs
+            pixel_values=inputs['pixel_values'],
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
             max_new_tokens=150,
            num_beams=3,
             no_repeat_ngram_size=2,
-            early_stopping=True
+            early_stopping=True,
+            pad_token_id=tokenizer.pad_token_id # Important for generation
         )
 
-
-
+        # Decode the generated tokens
+        # skip_special_tokens=True removes special tokens like <|endoftext|>
+        generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        generated_text = generated_text_list[0] if generated_text_list else ""
+
+        # Basic cleaning of the prompt if the model includes it in the output
         if prompt_input and generated_text.startswith(prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
@@ -100,6 +141,8 @@ def generate_text_for_image(image_input, prompt_input):
 
     except Exception as e:
         print(f"Error during generation: {e}")
+        import traceback
+        traceback.print_exc() # Print full traceback for debugging
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
@@ -125,8 +168,8 @@ iface = gr.Interface(
 )
 
 if __name__ == "__main__":
-    if model is None or
-        print("CRITICAL: Model or processor failed to load.
+    if model is None or image_processor is None or tokenizer is None:
+        print("CRITICAL: Model or processor components failed to load.")
     else:
         print("Launching Gradio interface...")
         iface.launch(server_name="0.0.0.0", server_port=7860)
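
Review note: the removed comment from the old version ("Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist") still applies to the new call, which now passes pixel_values, attention_mask, early_stopping, and pad_token_id as keywords. The sketch below is one hedged way to confirm what the cloned class actually accepts before redeploying the Space; it assumes only that the nanoVLM clone is importable the same way app.py arranges it, and uses the standard-library inspect module.

import inspect

# Assumption: the nanoVLM clone directory is already on sys.path, exactly as
# app.py sets it up before its own import of this module.
from models.vision_language_model import VisionLanguageModel

# List the parameters that generate() actually declares.
sig = inspect.signature(VisionLanguageModel.generate)
for name, param in sig.parameters.items():
    print(f"{name}: kind={param.kind.name}, default={param.default!r}")

# If a keyword used in app.py (pixel_values, attention_mask, early_stopping,
# pad_token_id, ...) is not listed and there is no **kwargs parameter, the
# generate() call in this commit will raise a TypeError at runtime.
accepts_kwargs = any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
print("accepts **kwargs:", accepts_kwargs)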
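
A minimal smoke test for the new loading and inference path, built only from calls that appear in this commit. It assumes app.py's globals load cleanly on import (the Gradio launch stays behind the __main__ guard) and that a local RGB image exists at the placeholder path "test.jpg"; the generate keywords are copied from this commit and may still need adapting to the real VisionLanguageModel signature.

from PIL import Image

# Importing app runs the loading code above but does not launch Gradio.
from app import model, image_processor, tokenizer, prepare_inputs, device

assert model is not None and image_processor is not None and tokenizer is not None, \
    "model or processor components failed to load; check the logs printed on import"

# "test.jpg" is a placeholder path for any local RGB test image.
image = Image.open("test.jpg").convert("RGB")
inputs = prepare_inputs(
    text=["What is in this image?"],
    image=image,
    image_processor_instance=image_processor,
    tokenizer_instance=tokenizer,
    device_to_use=device,
)

# Same keyword arguments as generate_text_for_image() uses in this commit.
generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])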