vidhanm committed · Commit 97c8139 · Parent(s): e198913

trying to solve config file error

app.py CHANGED
@@ -2,26 +2,22 @@ import sys
 import os
 
 # Add the cloned nanoVLM directory to Python's system path
-
-NANOVLM_REPO_PATH = "/app/nanoVLM" # Path where we cloned it in Dockerfile
+NANOVLM_REPO_PATH = "/app/nanoVLM"
 if NANOVLM_REPO_PATH not in sys.path:
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor # AutoProcessor
+from transformers import AutoProcessor # AutoProcessor should still be fine
 
-#
+# Import the custom VisionLanguageModel class from the cloned nanoVLM repository
 try:
     from models.vision_language_model import VisionLanguageModel
-
-    print("Successfully imported VisionLanguageModel and VisionLanguageConfig from nanoVLM clone.")
+    print("Successfully imported VisionLanguageModel from nanoVLM clone.")
 except ImportError as e:
-    print(f"Error importing from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
+    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
     VisionLanguageModel = None
-    VisionLanguageConfig = None
-
 
 # Determine the device to use
 device_choice = os.environ.get("DEVICE", "auto")
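
The pattern this hunk converges on (prepend the clone to sys.path, then guard the custom import) is easy to lift out and exercise on its own. A minimal sketch reusing only names that appear in the diff; the helper function itself is illustrative, not part of app.py:

    import sys

    def import_vlm(repo_path="/app/nanoVLM"):
        """Import nanoVLM's custom model class from a cloned checkout.

        Returns the class, or None when the clone is missing or broken,
        mirroring the fallback behaviour app.py relies on later.
        """
        if repo_path not in sys.path:
            sys.path.insert(0, repo_path)  # prepend so the clone shadows site-packages
        try:
            from models.vision_language_model import VisionLanguageModel
            return VisionLanguageModel
        except ImportError as e:
            print(f"nanoVLM import failed: {e}")
            return None
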
@@ -36,39 +32,33 @@ model_id = "lusxvr/nanoVLM-222M"
 processor = None
 model = None
 
-if VisionLanguageModel
+if VisionLanguageModel:
     try:
         print(f"Attempting to load processor for {model_id}")
-        #
-        #
-        # trust_remote_code might be needed if processor has custom code too.
+        # trust_remote_code=True might be beneficial if the processor config itself refers to custom code,
+        # though less likely for processors.
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
         print("Processor loaded.")
-
-        print(f"Attempting to load model config for {model_id} using VisionLanguageConfig")
-        # Load the configuration using the custom config class, pointing to your model_id
-        # trust_remote_code=True allows it to use any specific code paths from your model_id if needed for config.
-        config = VisionLanguageConfig.from_pretrained(model_id, trust_remote_code=True)
-        print("Model config loaded.")
 
-        print(f"Attempting to load model
-        #
-
-
+        print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
+        # The VisionLanguageModel.from_pretrained method should handle its own configuration loading
+        # from the model_id repository (which includes config.json).
+        # trust_remote_code=True here allows the custom VisionLanguageModel code to run.
+        model = VisionLanguageModel.from_pretrained(model_id, trust_remote_code=True).to(device)
+        print("Model loaded successfully.")
         model.eval() # Set to evaluation mode
 
     except Exception as e:
-        print(f"Error loading model
-        # Fallback if any step fails
+        print(f"Error loading model or processor: {e}")
         processor = None
         model = None
 else:
-    print("Custom
+    print("Custom VisionLanguageModel class not imported, cannot load model.")
 
 
 def generate_text_for_image(image_input, prompt_input):
-    if model is None or processor is None
-        return "Error: Model or processor not loaded correctly
+    if model is None or processor is None:
+        return "Error: Model or processor not loaded correctly. Check logs."
 
     if image_input is None:
         return "Please upload an image."
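
After this hunk the load path collapses to one guarded from_pretrained call instead of the earlier config-then-model two-step. A sketch of that flow as a reusable helper, assuming (as the diff does) that the custom from_pretrained accepts a Hub id plus trust_remote_code, and that the elided device logic maps "auto" to CUDA when available:

    import os
    import torch
    from transformers import AutoProcessor

    def resolve_device():
        """Honour a DEVICE env override, defaulting to auto-detection."""
        choice = os.environ.get("DEVICE", "auto")
        if choice != "auto":
            return choice
        return "cuda" if torch.cuda.is_available() else "cpu"

    def load_nanovlm(model_cls, model_id="lusxvr/nanoVLM-222M"):
        """Return (processor, model), or (None, None) if any step fails."""
        if model_cls is None:  # the import from the clone failed upstream
            return None, None
        try:
            processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            model = model_cls.from_pretrained(model_id, trust_remote_code=True).to(resolve_device())
            model.eval()  # inference only
            return processor, model
        except Exception as e:
            print(f"Error loading model or processor: {e}")
            return None, None

Passing the class in, rather than importing it inside the helper, keeps the fallback branch testable even when the clone is absent.
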
@@ -84,22 +74,19 @@ def generate_text_for_image(image_input, prompt_input):
     if pil_image.mode != "RGB":
         pil_image = pil_image.convert("RGB")
 
-    # Prepare inputs for the model using the processor
-    # The exact format for nanoVLM's custom model might require specific handling.
-    # The processor from AutoProcessor should generally work.
     inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
 
-    #
-    #
+    # Call the generate method of the VisionLanguageModel instance
+    # Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist
+    # It likely expects pixel_values and input_ids directly or as part of a dictionary
     generated_ids = model.generate(
-        inputs
-        inputs
-        attention_mask=inputs.get('attention_mask'),
+        pixel_values=inputs.get('pixel_values'),
+        input_ids=inputs.get('input_ids'),
+        attention_mask=inputs.get('attention_mask'),
         max_new_tokens=150,
         num_beams=3,
         no_repeat_ngram_size=2,
         early_stopping=True
-        # Check nanoVLM's VisionLanguageModel.generate() for specific parameters
     )
 
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
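
With keyword arguments in place of the earlier positional ones, inference is a straight processor, then generate, then batch_decode round trip. A condensed sketch of that loop; note the generate signature here is the diff's own assumption about nanoVLM, not a verified API:

    from PIL import Image

    def describe(processor, model, image_path, prompt, device="cpu"):
        """Run one image+prompt pair through the model and decode the output."""
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")  # the processor expects 3-channel input
        inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
        generated_ids = model.generate(
            pixel_values=inputs.get("pixel_values"),
            input_ids=inputs.get("input_ids"),
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=150,  # generation settings copied from the diff
            num_beams=3,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )
        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

If generation still fails, the hunk's own comment points at nanoVLM/models/vision_language_model.py as the place to confirm the expected parameters.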