vidhanm committed on
Commit 978a6b3 · 1 Parent(s): 16bf2d1
Files changed (1)
  1. app.py +27 -34
app.py CHANGED
@@ -28,65 +28,63 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-# Load the model and processor components
-model_id = "lusxvr/nanoVLM-222M"
+# --- Configuration for model components ---
+# The main model ID for weights and overall config
+model_id_for_weights = "lusxvr/nanoVLM-222M"
+# The ID for the vision backbone's image processor configuration
+image_processor_id = "openai/clip-vit-base-patch32"
+# The ID for the tokenizer (can be the main model ID if it provides specific tokenizer files)
+tokenizer_id = "lusxvr/nanoVLM-222M" # Or directly "gpt2" if preferred, but model_id is usually safer
+
 image_processor = None
 tokenizer = None
 model = None
 
 if VisionLanguageModel:
     try:
-        print(f"Attempting to load specific processor components for {model_id}")
-        # Load the image processor
-        image_processor = CLIPImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+        print(f"Attempting to load CLIPImageProcessor from: {image_processor_id}")
+        image_processor = CLIPImageProcessor.from_pretrained(image_processor_id, trust_remote_code=True)
         print("CLIPImageProcessor loaded.")
 
-        # Load the tokenizer
-        tokenizer = GPT2TokenizerFast.from_pretrained(model_id, trust_remote_code=True)
-        # Add a padding token if it's not already there (common for GPT2)
+        print(f"Attempting to load GPT2TokenizerFast from: {tokenizer_id}")
+        tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_id, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             print("Set tokenizer pad_token to eos_token.")
         print("GPT2TokenizerFast loaded.")
 
-        print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
+        print(f"Attempting to load model weights from {model_id_for_weights} using VisionLanguageModel.from_pretrained")
         model = VisionLanguageModel.from_pretrained(
-            model_id,
-            trust_remote_code=True # Allows custom model code to run
-            # The VisionLanguageModel might need image_processor and tokenizer passed during init,
-            # or it might retrieve them from its config. Check its __init__ if issues persist.
-            # For now, assume it gets them from config or they are not strictly needed at init.
+            model_id_for_weights,
+            trust_remote_code=True
         ).to(device)
         print("Model loaded successfully.")
         model.eval()
 
     except Exception as e:
         print(f"Error loading model or processor components: {e}")
+        import traceback
+        traceback.print_exc() # Print full traceback
         image_processor = None
         tokenizer = None
         model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
-# Define a simple processor-like function for preparing inputs
-def prepare_inputs(text, image, image_processor_instance, tokenizer_instance, device_to_use):
+def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
     if image_processor_instance is None or tokenizer_instance is None:
         raise ValueError("Image processor or tokenizer not initialized.")
 
-    # Process image
-    processed_image = image_processor_instance(images=image, return_tensors="pt").pixel_values.to(device_to_use)
+    processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
 
-    # Process text
-    # Ensure padding is handled correctly for batching (even if batch size is 1)
     processed_text = tokenizer_instance(
-        text=text, return_tensors="pt", padding=True, truncation=True
+        text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=tokenizer_instance.model_max_length
     )
     input_ids = processed_text.input_ids.to(device_to_use)
     attention_mask = processed_text.attention_mask.to(device_to_use)
 
     return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
 
-
 def generate_text_for_image(image_input, prompt_input):
     if model is None or image_processor is None or tokenizer is None:
         return "Error: Model or processor components not loaded correctly. Check logs."
@@ -105,16 +103,14 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-        # Use our custom input preparation function
         inputs = prepare_inputs(
-            text=[prompt_input], # Expects a list of text prompts
-            image=pil_image, # Expects a single PIL image or list
+            text_list=[prompt_input],
+            image_input=pil_image,
             image_processor_instance=image_processor,
             tokenizer_instance=tokenizer,
             device_to_use=device
         )
 
-        # Generate text using the model's generate method
         generated_ids = model.generate(
             pixel_values=inputs['pixel_values'],
             input_ids=inputs['input_ids'],
@@ -123,15 +119,12 @@
             num_beams=3,
             no_repeat_ngram_size=2,
             early_stopping=True,
-            pad_token_id=tokenizer.pad_token_id # Important for generation
+            pad_token_id=tokenizer.pad_token_id
         )
 
-        # Decode the generated tokens
-        # skip_special_tokens=True removes special tokens like <|endoftext|>
         generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         generated_text = generated_text_list[0] if generated_text_list else ""
 
-        # Basic cleaning of the prompt if the model includes it in the output
         if prompt_input and generated_text.startswith(prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
@@ -142,12 +135,12 @@
     except Exception as e:
         print(f"Error during generation: {e}")
         import traceback
-        traceback.print_exc() # Print full traceback for debugging
+        traceback.print_exc()
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
 example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
+# gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp") # Not used for now
 
 iface = gr.Interface(
     fn=generate_text_for_image,
@@ -162,8 +155,8 @@ iface = gr.Interface(
         [example_image_url, "a photo of a"],
         [example_image_url, "Describe the image in detail."],
     ],
-    cache_examples=True,
-    examples_cache_folder=gradio_cache_dir,
+    cache_examples=True, # This might cause issues if Gradio version is old. Remove if needed.
+    # examples_cache_folder=gradio_cache_dir, # Removed due to potential Gradio version issue
     allow_flagging="never"
 )
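
For reference, a minimal standalone sketch of the reworked `prepare_inputs` flow using stock `transformers` components. The plain `"gpt2"` tokenizer ID and the dummy PIL image below are illustrative stand-ins, not what this commit necessarily loads (the Space pulls the tokenizer from `lusxvr/nanoVLM-222M` and feeds real Gradio uploads), and the custom `VisionLanguageModel.generate` call is intentionally omitted since its exact signature lives in the nanoVLM repo.

```python
# Sketch only: exercises the same input-preparation logic as the updated app.py,
# with stand-in component IDs. Assumes transformers, torch, and Pillow are installed.
from PIL import Image
from transformers import CLIPImageProcessor, GPT2TokenizerFast

image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")  # stand-in for the Space's tokenizer_id
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default


def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use="cpu"):
    # Image -> pixel_values tensor; text -> padded/truncated input_ids and attention_mask.
    pixel_values = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
    encoded = tokenizer_instance(
        text=text_list,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=tokenizer_instance.model_max_length,
    )
    return {
        "pixel_values": pixel_values,
        "input_ids": encoded.input_ids.to(device_to_use),
        "attention_mask": encoded.attention_mask.to(device_to_use),
    }


dummy_image = Image.new("RGB", (224, 224), color="white")  # stand-in for a Gradio upload
batch = prepare_inputs(["a photo of a"], dummy_image, image_processor, tokenizer)
print(batch["pixel_values"].shape, batch["input_ids"].shape)  # e.g. torch.Size([1, 3, 224, 224]) torch.Size([1, 4])
```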