vidhanm committed · Commit 978a6b3
1 Parent(s): 16bf2d1
app.py CHANGED
@@ -28,65 +28,63 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-#
-
+# --- Configuration for model components ---
+# The main model ID for weights and overall config
+model_id_for_weights = "lusxvr/nanoVLM-222M"
+# The ID for the vision backbone's image processor configuration
+image_processor_id = "openai/clip-vit-base-patch32"
+# The ID for the tokenizer (can be the main model ID if it provides specific tokenizer files)
+tokenizer_id = "lusxvr/nanoVLM-222M"  # Or directly "gpt2" if preferred, but model_id is usually safer
+
 image_processor = None
 tokenizer = None
 model = None
 
 if VisionLanguageModel:
     try:
-        print(f"Attempting to load
-
-        image_processor = CLIPImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+        print(f"Attempting to load CLIPImageProcessor from: {image_processor_id}")
+        image_processor = CLIPImageProcessor.from_pretrained(image_processor_id, trust_remote_code=True)
         print("CLIPImageProcessor loaded.")
 
-
-        tokenizer = GPT2TokenizerFast.from_pretrained(
-        # Add a padding token if it's not already there (common for GPT2)
+        print(f"Attempting to load GPT2TokenizerFast from: {tokenizer_id}")
+        tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_id, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
             print("Set tokenizer pad_token to eos_token.")
         print("GPT2TokenizerFast loaded.")
 
-        print(f"Attempting to load model {
+        print(f"Attempting to load model weights from {model_id_for_weights} using VisionLanguageModel.from_pretrained")
         model = VisionLanguageModel.from_pretrained(
-
-            trust_remote_code=True
-            # The VisionLanguageModel might need image_processor and tokenizer passed during init,
-            # or it might retrieve them from its config. Check its __init__ if issues persist.
-            # For now, assume it gets them from config or they are not strictly needed at init.
+            model_id_for_weights,
+            trust_remote_code=True
         ).to(device)
         print("Model loaded successfully.")
         model.eval()
 
     except Exception as e:
         print(f"Error loading model or processor components: {e}")
+        import traceback
+        traceback.print_exc()  # Print full traceback
         image_processor = None
         tokenizer = None
         model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
-
-def prepare_inputs(text, image, image_processor_instance, tokenizer_instance, device_to_use):
+def prepare_inputs(text_list, image_input, image_processor_instance, tokenizer_instance, device_to_use):
     if image_processor_instance is None or tokenizer_instance is None:
         raise ValueError("Image processor or tokenizer not initialized.")
 
-
-    processed_image = image_processor_instance(images=image, return_tensors="pt").pixel_values.to(device_to_use)
+    processed_image = image_processor_instance(images=image_input, return_tensors="pt").pixel_values.to(device_to_use)
 
-    # Process text
-    # Ensure padding is handled correctly for batching (even if batch size is 1)
     processed_text = tokenizer_instance(
-        text=
+        text=text_list, return_tensors="pt", padding=True, truncation=True, max_length=tokenizer_instance.model_max_length
     )
     input_ids = processed_text.input_ids.to(device_to_use)
     attention_mask = processed_text.attention_mask.to(device_to_use)
 
     return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
 
-
 def generate_text_for_image(image_input, prompt_input):
     if model is None or image_processor is None or tokenizer is None:
         return "Error: Model or processor components not loaded correctly. Check logs."
@@ -105,16 +103,14 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-        # Use our custom input preparation function
         inputs = prepare_inputs(
-
-
+            text_list=[prompt_input],
+            image_input=pil_image,
             image_processor_instance=image_processor,
             tokenizer_instance=tokenizer,
             device_to_use=device
         )
 
-        # Generate text using the model's generate method
         generated_ids = model.generate(
             pixel_values=inputs['pixel_values'],
             input_ids=inputs['input_ids'],
@@ -123,15 +119,12 @@
             num_beams=3,
             no_repeat_ngram_size=2,
             early_stopping=True,
-            pad_token_id=tokenizer.pad_token_id
+            pad_token_id=tokenizer.pad_token_id
         )
 
-        # Decode the generated tokens
-        # skip_special_tokens=True removes special tokens like <|endoftext|>
         generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         generated_text = generated_text_list[0] if generated_text_list else ""
 
-        # Basic cleaning of the prompt if the model includes it in the output
         if prompt_input and generated_text.startswith(prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
@@ -142,12 +135,12 @@
     except Exception as e:
         print(f"Error during generation: {e}")
         import traceback
-        traceback.print_exc()
+        traceback.print_exc()
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
 example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
+# gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")  # Not used for now
 
 iface = gr.Interface(
     fn=generate_text_for_image,
@@ -162,8 +155,8 @@ iface = gr.Interface(
         [example_image_url, "a photo of a"],
         [example_image_url, "Describe the image in detail."],
     ],
-    cache_examples=True,
-    examples_cache_folder=gradio_cache_dir,
+    cache_examples=True,  # This might cause issues if Gradio version is old. Remove if needed.
+    # examples_cache_folder=gradio_cache_dir,  # Removed due to potential Gradio version issue
     allow_flagging="never"
 )
 
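
A minimal smoke-test sketch of the changed code path, for reference. It is not part of the commit: it assumes the definitions from app.py above are already in scope (for example, run in the same interpreter session before the interface is launched) and that requests and Pillow are installed.

# Hypothetical smoke test, not part of this commit.
import requests
from PIL import Image

# Fetch the same COCO example image used in the Gradio examples list.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
pil_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# Exercise the updated prepare_inputs() signature directly.
inputs = prepare_inputs(
    text_list=["a photo of a"],
    image_input=pil_image,
    image_processor_instance=image_processor,
    tokenizer_instance=tokenizer,
    device_to_use=device,
)
print(inputs["pixel_values"].shape, inputs["input_ids"].shape)

# Or run the full generation path that the Gradio callback uses.
print(generate_text_for_image(pil_image, "a photo of a"))

Checking the tensor shapes first makes it easy to tell whether a failure comes from input preparation (processor/tokenizer config) or from model.generate itself.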