trying to solve config error
vidhanm committed
Commit 16bf2d1 · 1 parent: 97c8139
app.py CHANGED
@@ -9,14 +9,15 @@ if NANOVLM_REPO_PATH not in sys.path:
 import gradio as gr
 from PIL import Image
 import torch
-
+# Import specific processor components
+from transformers import CLIPImageProcessor, GPT2TokenizerFast
 
-# Import the custom VisionLanguageModel class
+# Import the custom VisionLanguageModel class
 try:
     from models.vision_language_model import VisionLanguageModel
     print("Successfully imported VisionLanguageModel from nanoVLM clone.")
 except ImportError as e:
-    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}.
+    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}.")
     VisionLanguageModel = None
 
 # Determine the device to use
@@ -27,38 +28,68 @@ else:
     device = device_choice
 print(f"Using device: {device}")
 
-# Load the model and processor
+# Load the model and processor components
 model_id = "lusxvr/nanoVLM-222M"
-
+image_processor = None
+tokenizer = None
 model = None
 
 if VisionLanguageModel:
     try:
-        print(f"Attempting to load processor for {model_id}")
-        #
-
-
-
+        print(f"Attempting to load specific processor components for {model_id}")
+        # Load the image processor
+        image_processor = CLIPImageProcessor.from_pretrained(model_id, trust_remote_code=True)
+        print("CLIPImageProcessor loaded.")
+
+        # Load the tokenizer
+        tokenizer = GPT2TokenizerFast.from_pretrained(model_id, trust_remote_code=True)
+        # Add a padding token if it's not already there (common for GPT2)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            print("Set tokenizer pad_token to eos_token.")
+        print("GPT2TokenizerFast loaded.")
 
         print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
-
-
-
-
+        model = VisionLanguageModel.from_pretrained(
+            model_id,
+            trust_remote_code=True # Allows custom model code to run
+            # The VisionLanguageModel might need image_processor and tokenizer passed during init,
+            # or it might retrieve them from its config. Check its __init__ if issues persist.
+            # For now, assume it gets them from config or they are not strictly needed at init.
+        ).to(device)
         print("Model loaded successfully.")
-        model.eval()
+        model.eval()
 
     except Exception as e:
-        print(f"Error loading model or processor: {e}")
-
+        print(f"Error loading model or processor components: {e}")
+        image_processor = None
+        tokenizer = None
         model = None
 else:
     print("Custom VisionLanguageModel class not imported, cannot load model.")
 
+# Define a simple processor-like function for preparing inputs
+def prepare_inputs(text, image, image_processor_instance, tokenizer_instance, device_to_use):
+    if image_processor_instance is None or tokenizer_instance is None:
+        raise ValueError("Image processor or tokenizer not initialized.")
+
+    # Process image
+    processed_image = image_processor_instance(images=image, return_tensors="pt").pixel_values.to(device_to_use)
+
+    # Process text
+    # Ensure padding is handled correctly for batching (even if batch size is 1)
+    processed_text = tokenizer_instance(
+        text=text, return_tensors="pt", padding=True, truncation=True
+    )
+    input_ids = processed_text.input_ids.to(device_to_use)
+    attention_mask = processed_text.attention_mask.to(device_to_use)
+
+    return {"pixel_values": processed_image, "input_ids": input_ids, "attention_mask": attention_mask}
+
 
 def generate_text_for_image(image_input, prompt_input):
-    if model is None or
-        return "Error: Model or processor not loaded correctly. Check logs."
+    if model is None or image_processor is None or tokenizer is None:
+        return "Error: Model or processor components not loaded correctly. Check logs."
 
     if image_input is None:
         return "Please upload an image."
@@ -74,23 +105,33 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-
+        # Use our custom input preparation function
+        inputs = prepare_inputs(
+            text=[prompt_input], # Expects a list of text prompts
+            image=pil_image, # Expects a single PIL image or list
+            image_processor_instance=image_processor,
+            tokenizer_instance=tokenizer,
+            device_to_use=device
+        )
 
-        #
-        # Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist
-        # It likely expects pixel_values and input_ids directly or as part of a dictionary
+        # Generate text using the model's generate method
         generated_ids = model.generate(
-            pixel_values=inputs
-            input_ids=inputs
-            attention_mask=inputs
+            pixel_values=inputs['pixel_values'],
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
             max_new_tokens=150,
            num_beams=3,
             no_repeat_ngram_size=2,
-            early_stopping=True
+            early_stopping=True,
+            pad_token_id=tokenizer.pad_token_id # Important for generation
         )
 
-
-
+        # Decode the generated tokens
+        # skip_special_tokens=True removes special tokens like <|endoftext|>
+        generated_text_list = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        generated_text = generated_text_list[0] if generated_text_list else ""
+
+        # Basic cleaning of the prompt if the model includes it in the output
         if prompt_input and generated_text.startswith(prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
@@ -100,6 +141,8 @@ def generate_text_for_image(image_input, prompt_input):
 
     except Exception as e:
         print(f"Error during generation: {e}")
+        import traceback
+        traceback.print_exc() # Print full traceback for debugging
         return f"An error occurred during text generation: {str(e)}"
 
 description = "Interactive demo for lusxvr/nanoVLM-222M."
@@ -125,8 +168,8 @@ iface = gr.Interface(
 )
 
 if __name__ == "__main__":
-    if model is None or
-        print("CRITICAL: Model or processor failed to load.
+    if model is None or image_processor is None or tokenizer is None:
+        print("CRITICAL: Model or processor components failed to load.")
     else:
         print("Launching Gradio interface...")
         iface.launch(server_name="0.0.0.0", server_port=7860)
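
Review note: the removed comment from the old version ("Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist") still applies to the new call, which now passes pixel_values, attention_mask, early_stopping, and pad_token_id as keywords. The sketch below is one hedged way to confirm what the cloned class actually accepts before redeploying the Space; it assumes only that the nanoVLM clone is importable the same way app.py arranges it, and uses the standard-library inspect module.

import inspect

# Assumption: the nanoVLM clone directory is already on sys.path, exactly as
# app.py sets it up before its own import of this module.
from models.vision_language_model import VisionLanguageModel

# List the parameters that generate() actually declares.
sig = inspect.signature(VisionLanguageModel.generate)
for name, param in sig.parameters.items():
    print(f"{name}: kind={param.kind.name}, default={param.default!r}")

# If a keyword used in app.py (pixel_values, attention_mask, early_stopping,
# pad_token_id, ...) is not listed and there is no **kwargs parameter, the
# generate() call in this commit will raise a TypeError at runtime.
accepts_kwargs = any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
print("accepts **kwargs:", accepts_kwargs)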
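
A minimal smoke test for the new loading and inference path, built only from calls that appear in this commit. It assumes app.py's globals load cleanly on import (the Gradio launch stays behind the __main__ guard) and that a local RGB image exists at the placeholder path "test.jpg"; the generate keywords are copied from this commit and may still need adapting to the real VisionLanguageModel signature.

from PIL import Image

# Importing app runs the loading code above but does not launch Gradio.
from app import model, image_processor, tokenizer, prepare_inputs, device

assert model is not None and image_processor is not None and tokenizer is not None, \
    "model or processor components failed to load; check the logs printed on import"

# "test.jpg" is a placeholder path for any local RGB test image.
image = Image.open("test.jpg").convert("RGB")
inputs = prepare_inputs(
    text=["What is in this image?"],
    image=image,
    image_processor_instance=image_processor,
    tokenizer_instance=tokenizer,
    device_to_use=device,
)

# Same keyword arguments as generate_text_for_image() uses in this commit.
generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])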