vidhanm committed
Commit 16bf2d1 · Parent(s): 97c8139

trying to solve config error
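
The failure being worked around here is most likely AutoProcessor being unable to resolve a processor class for lusxvr/nanoVLM-222M from the repo's config, so this commit loads the two components the app actually needs (CLIPImageProcessor and GPT2TokenizerFast) explicitly. A quick diagnostic sketch, not part of the commit, for checking which preprocessing files the repo actually ships (the filenames are the usual transformers defaults and are assumptions here):

    from huggingface_hub import hf_hub_download

    # If any of these downloads fail, the corresponding from_pretrained()
    # call in app.py would fail for the same underlying reason.
    for fname in ["preprocessor_config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]:
        try:
            path = hf_hub_download("lusxvr/nanoVLM-222M", fname)
            print(f"{fname}: found at {path}")
        except Exception as err:
            print(f"{fname}: not available ({err})")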

Files changed (1)
  1. app.py +74 -31
app.py CHANGED
@@ -9,14 +9,15 @@ if NANOVLM_REPO_PATH not in sys.path:
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor  # AutoProcessor should still be fine
+# Import specific processor components
+from transformers import CLIPImageProcessor, GPT2TokenizerFast
 
-# Import the custom VisionLanguageModel class from the cloned nanoVLM repository
+# Import the custom VisionLanguageModel class
 try:
     from models.vision_language_model import VisionLanguageModel
     print("Successfully imported VisionLanguageModel from nanoVLM clone.")
 except ImportError as e:
-    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
+    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}.")
     VisionLanguageModel = None
 
 # Determine the device to use
@@ -27,38 +28,68 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-# Load the model and processor
+# Load the model and processor components
 model_id = "lusxvr/nanoVLM-222M"
-processor = None
+image_processor = None
+tokenizer = None
 model = None
 
 if VisionLanguageModel:
     try:
-        print(f"Attempting to load processor for {model_id}")
-        # trust_remote_code=True might be beneficial if the processor config itself refers to custom code,
-        # though less likely for processors.
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-        print("Processor loaded.")
+        print(f"Attempting to load specific processor components for {model_id}")
+        # Load the image processor
+        image_processor = CLIPImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+        print("CLIPImageProcessor loaded.")
+
+        # Load the tokenizer
+        tokenizer = GPT2TokenizerFast.from_pretrained(model_id, trust_remote_code=True)
+        # Add a padding token if it's not already there (common for GPT2)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            print("Set tokenizer pad_token to eos_token.")
+        print("GPT2TokenizerFast loaded.")
 
         print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
-        # The VisionLanguageModel.from_pretrained method should handle its own configuration loading
-        # from the model_id repository (which includes config.json).
-        # trust_remote_code=True here allows the custom VisionLanguageModel code to run.
-        model = VisionLanguageModel.from_pretrained(model_id, trust_remote_code=True).to(device)
+        model = VisionLanguageModel.from_pretrained(
+            model_id,
+            trust_remote_code=True  # Allows custom model code to run
+            # The VisionLanguageModel might need image_processor and tokenizer passed during init,
+            # or it might retrieve them from its config. Check its __init__ if issues persist.
+            # For now, assume it gets them from config or they are not strictly needed at init.
+        ).to(device)
         print("Model loaded successfully.")
-        model.eval()  # Set to evaluation mode
+        model.eval()
 
     except Exception as e:
-        print(f"Error loading model or processor: {e}")
-        processor = None
+        print(f"Error loading model or processor components: {e}")
+        image_processor = None
+        tokenizer = None
         model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
+# Define a simple processor-like function for preparing inputs
+def prepare_inputs(text, image, image_processor_instance, tokenizer_instance, device_to_use):
+    if image_processor_instance is None or tokenizer_instance is None:
+        raise ValueError("Image processor or tokenizer not initialized.")
+
+    # Process image
+    processed_image = image_processor_instance(images=image, return_tensors="pt").pixel_values.to(device_to_use)
+
+    # Process text
+    # Ensure padding is handled correctly for batching (even if batch size is 1)
+    processed_text = tokenizer_instance(
+        text=text, return_tensors="pt", padding=True, truncation=True
+    )
+    input_ids = processed_text.input_ids.to(device_to_use)
+    attention_mask = processed_text.attention_mask.to(device_to_use)
+
+    return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
+
 
 def generate_text_for_image(image_input, prompt_input):
-    if model is None or processor is None:
-        return "Error: Model or processor not loaded correctly. Check logs."
+    if model is None or image_processor is None or tokenizer is None:
+        return "Error: Model or processor components not loaded correctly. Check logs."
 
     if image_input is None:
         return "Please upload an image."
@@ -74,23 +105,33 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-        inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
+        # Use our custom input preparation function
+        inputs = prepare_inputs(
+            text=[prompt_input],  # Expects a list of text prompts
+            image=pil_image,  # Expects a single PIL image or list
+            image_processor_instance=image_processor,
+            tokenizer_instance=tokenizer,
+            device_to_use=device
+        )
 
-        # Call the generate method of the VisionLanguageModel instance
-        # Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist
-        # It likely expects pixel_values and input_ids directly or as part of a dictionary
+        # Generate text using the model's generate method
        generated_ids = model.generate(
-            pixel_values=inputs.get('pixel_values'),
-            input_ids=inputs.get('input_ids'),
-            attention_mask=inputs.get('attention_mask'),
+            pixel_values=inputs['pixel_values'],
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
             max_new_tokens=150,
             num_beams=3,
             no_repeat_ngram_size=2,
-            early_stopping=True
+            early_stopping=True,
+            pad_token_id=tokenizer.pad_token_id  # Important for generation
        )
 
-        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
+        # Decode the generated tokens
+        # skip_special_tokens=True removes special tokens like <|endoftext|>
+        generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        generated_text = generated_text_list[0] if generated_text_list else ""
+
+        # Basic cleaning of the prompt if the model includes it in the output
        if prompt_input and generated_text.startswith(prompt_input):
            cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
        else:
@@ -100,6 +141,8 @@ def generate_text_for_image(image_input, prompt_input):
 
     except Exception as e:
         print(f"Error during generation: {e}")
+        import traceback
+        traceback.print_exc()  # Print full traceback for debugging
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
@@ -125,8 +168,8 @@ iface = gr.Interface(
 )
 
 if __name__ == "__main__":
-    if model is None or processor is None:
-        print("CRITICAL: Model or processor failed to load. Gradio interface may not function correctly.")
+    if model is None or image_processor is None or tokenizer is None:
+        print("CRITICAL: Model or processor components failed to load.")
     else:
         print("Launching Gradio interface...")
         iface.launch(server_name="0.0.0.0", server_port=7860)
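
A minimal smoke test for the new loading path, assuming app.py imports cleanly, that its module-level names (model, image_processor, tokenizer, prepare_inputs, device) are populated on import, and that test.jpg is a stand-in path for any local image:

    from PIL import Image
    import app  # importing app.py runs the component loading above

    img = Image.open("test.jpg").convert("RGB")  # hypothetical test image
    batch = app.prepare_inputs(
        text=["Describe this image."],
        image=img,
        image_processor_instance=app.image_processor,
        tokenizer_instance=app.tokenizer,
        device_to_use=app.device,
    )
    # Mirrors the generate() call in app.py; nanoVLM's exact signature may differ.
    ids = app.model.generate(
        pixel_values=batch["pixel_values"],
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=32,
        pad_token_id=app.tokenizer.pad_token_id,
    )
    print(app.tokenizer.batch_decode(ids, skip_special_tokens=True)[0])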