vidhanm committed · Commit 978a6b3
1 Parent(s): 16bf2d1
app.py CHANGED
@@ -28,65 +28,63 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-#
-
+# --- Configuration for model components ---
+# The main model ID for weights and overall config
+model_id_for_weights = "lusxvr/nanoVLM-222M"
+# The ID for the vision backbone's image processor configuration
+image_processor_id = "openai/clip-vit-base-patch32"
+# The ID for the tokenizer (can be the main model ID if it provides specific tokenizer files)
+tokenizer_id = "lusxvr/nanoVLM-222M"  # Or directly "gpt2" if preferred, but model_id is usually safer
+
 image_processor = None
 tokenizer = None
 model = None
 
 if VisionLanguageModel:
     try:
-        print(f"Attempting to load
-
-        image_processor = CLIPImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+        print(f"Attempting to load CLIPImageProcessor from: {image_processor_id}")
+        image_processor = CLIPImageProcessor.from_pretrained(image_processor_id, trust_remote_code=True)
         print("CLIPImageProcessor loaded.")
 
-
-        tokenizer = GPT2TokenizerFast.from_pretrained(
-        # Add a padding token if it's not already there (common for GPT2)
+        print(f"Attempting to load GPT2TokenizerFast from: {tokenizer_id}")
+        tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_id, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             print("Set tokenizer pad_token to eos_token.")
         print("GPT2TokenizerFast loaded.")
 
-        print(f"Attempting to load model {
+        print(f"Attempting to load model weights from {model_id_for_weights} using VisionLanguageModel.from_pretrained")
         model = VisionLanguageModel.from_pretrained(
-
-            trust_remote_code=True
-            # The VisionLanguageModel might need image_processor and tokenizer passed during init,
-            # or it might retrieve them from its config. Check its __init__ if issues persist.
-            # For now, assume it gets them from config or they are not strictly needed at init.
+            model_id_for_weights,
+            trust_remote_code=True
         ).to(device)
         print("Model loaded successfully.")
         model.eval()
 
     except Exception as e:
         print(f"Error loading model or processor components: {e}")
+        import traceback
+        traceback.print_exc()  # Print full traceback
         image_processor = None
         tokenizer = None
         model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
-
-def prepare_inputs(text, image, image_processor_instance, tokenizer_instance, device_to_use):
+def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
     if image_processor_instance is None or tokenizer_instance is None:
         raise ValueError("Image processor or tokenizer not initialized.")
 
-
-    processed_image = image_processor_instance(images=image, return_tensors="pt").pixel_values.to(device_to_use)
+    processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
 
-    # Process text
-    # Ensure padding is handled correctly for batching (even if batch size is 1)
     processed_text = tokenizer_instance(
-        text=
+        text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=tokenizer_instance.model_max_length
     )
     input_ids = processed_text.input_ids.to(device_to_use)
     attention_mask = processed_text.attention_mask.to(device_to_use)
 
     return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
 
-
 def generate_text_for_image(image_input, prompt_input):
     if model is None or image_processor is None or tokenizer is None:
         return "Error: Model or processor components not loaded correctly. Check logs."
@@ -105,16 +103,14 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-        # Use our custom input preparation function
         inputs = prepare_inputs(
-
-
+            text_list=[prompt_input],
+            image_input=pil_image,
             image_processor_instance=image_processor,
             tokenizer_instance=tokenizer,
             device_to_use=device
         )
 
-        # Generate text using the model's generate method
         generated_ids = model.generate(
             pixel_values=inputs['pixel_values'],
             input_ids=inputs['input_ids'],
@@ -123,15 +119,12 @@
             num_beams=3,
             no_repeat_ngram_size=2,
             early_stopping=True,
-            pad_token_id=tokenizer.pad_token_id
+            pad_token_id=tokenizer.pad_token_id
         )
 
-        # Decode the generated tokens
-        # skip_special_tokens=True removes special tokens like <|endoftext|>
         generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         generated_text = generated_text_list[0] if generated_text_list else ""
 
-        # Basic cleaning of the prompt if the model includes it in the output
         if prompt_input and generated_text.startswith(prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
@@ -142,12 +135,12 @@
     except Exception as e:
         print(f"Error during generation: {e}")
         import traceback
-        traceback.print_exc()
+        traceback.print_exc()
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
 example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
+# gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")  # Not used for now
 
 iface = gr.Interface(
     fn=generate_text_for_image,
@@ -162,8 +155,8 @@ iface = gr.Interface(
         [example_image_url, "a photo of a"],
         [example_image_url, "Describe the image in detail."],
     ],
-    cache_examples=True,
-    examples_cache_folder=gradio_cache_dir,
+    cache_examples=True,  # This might cause issues if Gradio version is old. Remove if needed.
+    # examples_cache_folder=gradio_cache_dir,  # Removed due to potential Gradio version issue
     allow_flagging="never"
 )
 
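
A minimal smoke-test sketch of the changed code path, for reference. It is not part of the commit: it assumes the definitions from app.py above are already in scope (for example, run in the same interpreter session before the interface is launched) and that requests and Pillow are installed.

# Hypothetical smoke test, not part of this commit.
import requests
from PIL import Image

# Fetch the same COCO example image used in the Gradio examples list.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
pil_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# Exercise the updated prepare_inputs() signature directly.
inputs = prepare_inputs(
    text_list=["a photo of a"],
    image_input=pil_image,
    image_processor_instance=image_processor,
    tokenizer_instance=tokenizer,
    device_to_use=device,
)
print(inputs["pixel_values"].shape, inputs["input_ids"].shape)

# Or run the full generation path that the Gradio callback uses.
print(generate_text_for_image(pil_image, "a photo of a"))

Checking the tensor shapes first makes it easy to tell whether a failure comes from input preparation (processor/tokenizer config) or from model.generate itself.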