vidhanm committed
Commit 97c8139 · Parent: e198913

trying to solve config file error

Files changed (1): app.py (+24 -37)
app.py CHANGED
@@ -2,26 +2,22 @@ import sys
 import os
 
 # Add the cloned nanoVLM directory to Python's system path
-# This allows us to import from the 'models' directory within nanoVLM
-NANOVLM_REPO_PATH = "/app/nanoVLM" # Path where we cloned it in Dockerfile
+NANOVLM_REPO_PATH = "/app/nanoVLM"
 if NANOVLM_REPO_PATH not in sys.path:
     sys.path.insert(0, NANOVLM_REPO_PATH)
 
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor # AutoProcessor might still work
+from transformers import AutoProcessor # AutoProcessor should still be fine
 
-# Now import the custom classes from the cloned nanoVLM repository
+# Import the custom VisionLanguageModel class from the cloned nanoVLM repository
 try:
     from models.vision_language_model import VisionLanguageModel
-    from models.configurations import VisionLanguageConfig # Or the specific config class used by nanoVLM
-    print("Successfully imported VisionLanguageModel and VisionLanguageConfig from nanoVLM clone.")
+    print("Successfully imported VisionLanguageModel from nanoVLM clone.")
 except ImportError as e:
-    print(f"Error importing from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
+    print(f"Error importing VisionLanguageModel from nanoVLM clone: {e}. Check NANOVLM_REPO_PATH and ensure nanoVLM cloned correctly.")
     VisionLanguageModel = None
-    VisionLanguageConfig = None
-
 
 # Determine the device to use
 device_choice = os.environ.get("DEVICE", "auto")
@@ -36,39 +32,33 @@ model_id = "lusxvr/nanoVLM-222M"
 processor = None
 model = None
 
-if VisionLanguageModel and VisionLanguageConfig:
+if VisionLanguageModel:
     try:
         print(f"Attempting to load processor for {model_id}")
-        # Processor loading might still be okay with AutoProcessor,
-        # as processor_config.json is usually standard.
-        # trust_remote_code might be needed if processor has custom code too.
+        # trust_remote_code=True might be beneficial if the processor config itself refers to custom code,
+        # though less likely for processors.
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
         print("Processor loaded.")
 
-        print(f"Attempting to load model config for {model_id} using VisionLanguageConfig")
-        # Load the configuration using the custom config class, pointing to your model_id
-        # trust_remote_code=True allows it to use any specific code paths from your model_id if needed for config.
-        config = VisionLanguageConfig.from_pretrained(model_id, trust_remote_code=True)
-        print("Model config loaded.")
-
-        print(f"Attempting to load model weights for {model_id} using VisionLanguageModel")
-        # Load the model weights using the custom model class and the loaded config
-        model = VisionLanguageModel.from_pretrained(model_id, config=config, trust_remote_code=True).to(device)
-        print("Model weights loaded successfully.")
+        print(f"Attempting to load model {model_id} using VisionLanguageModel.from_pretrained")
+        # The VisionLanguageModel.from_pretrained method should handle its own configuration loading
+        # from the model_id repository (which includes config.json).
+        # trust_remote_code=True here allows the custom VisionLanguageModel code to run.
+        model = VisionLanguageModel.from_pretrained(model_id, trust_remote_code=True).to(device)
+        print("Model loaded successfully.")
         model.eval() # Set to evaluation mode
 
     except Exception as e:
-        print(f"Error loading model, processor, or config: {e}")
-        # Fallback if any step fails
+        print(f"Error loading model or processor: {e}")
         processor = None
         model = None
 else:
-    print("Custom nanoVLM classes not imported, cannot load model.")
+    print("Custom VisionLanguageModel class not imported, cannot load model.")
 
 
 def generate_text_for_image(image_input, prompt_input):
-    if model is None or processor is None or not hasattr(model, 'generate'): # Check if model has generate
-        return "Error: Model or processor not loaded correctly or model doesn't have 'generate' method. Check logs."
+    if model is None or processor is None:
+        return "Error: Model or processor not loaded correctly. Check logs."
 
     if image_input is None:
         return "Please upload an image."
@@ -84,22 +74,19 @@ def generate_text_for_image(image_input, prompt_input):
     if pil_image.mode != "RGB":
         pil_image = pil_image.convert("RGB")
 
-    # Prepare inputs for the model using the processor
-    # The exact format for nanoVLM's custom model might require specific handling.
-    # The processor from AutoProcessor should generally work.
     inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
 
-    # Generate text using the model's generate method
-    # Common parameters for generation:
+    # Call the generate method of the VisionLanguageModel instance
+    # Check the definition of generate in nanoVLM/models/vision_language_model.py for exact signature if issues persist
+    # It likely expects pixel_values and input_ids directly or as part of a dictionary
     generated_ids = model.generate(
-        inputs['pixel_values'], # Assuming processor output has 'pixel_values'
-        inputs['input_ids'], # Assuming processor output has 'input_ids'
-        attention_mask=inputs.get('attention_mask'), # Optional, but good to include
+        pixel_values=inputs.get('pixel_values'),
+        input_ids=inputs.get('input_ids'),
+        attention_mask=inputs.get('attention_mask'),
         max_new_tokens=150,
         num_beams=3,
         no_repeat_ngram_size=2,
         early_stopping=True
-        # Check nanoVLM's VisionLanguageModel.generate() for specific parameters
     )
 
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
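For reference, the loading path this commit lands on can be smoke-tested outside the Space. The sketch below mirrors the new app.py: the repo path, model_id, and the trust_remote_code flag are taken straight from the diff, and whether nanoVLM's custom from_pretrained actually accepts trust_remote_code is an assumption carried over from the commit rather than a documented API.

import sys
sys.path.insert(0, "/app/nanoVLM")  # where the Space's Dockerfile clones nanoVLM

import torch
from transformers import AutoProcessor
from models.vision_language_model import VisionLanguageModel  # custom class from the clone

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "lusxvr/nanoVLM-222M"

# Processor configs on the Hub are usually standard, so AutoProcessor stays.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# The fix in this commit: no separate VisionLanguageConfig.from_pretrained call;
# the custom class is expected to resolve config.json from the model repo itself.
model = VisionLanguageModel.from_pretrained(model_id, trust_remote_code=True).to(device)
model.eval()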
 
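The new generate call passes pixel_values, input_ids, and attention_mask as keywords, but the diff's own comment concedes that the exact signature of nanoVLM's generate is unverified. If the keyword call still fails, one defensive option (a hypothetical helper, not part of this commit) is to filter the processor output against whatever parameters model.generate actually declares:

import inspect

def call_generate_safely(model, inputs, **gen_kwargs):
    # Keep only the kwargs that model.generate declares, since nanoVLM's
    # custom generate may not follow transformers' signature conventions.
    params = inspect.signature(model.generate).parameters
    takes_var_kwargs = any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )
    kwargs = {**dict(inputs), **gen_kwargs}
    if not takes_var_kwargs:
        kwargs = {k: v for k, v in kwargs.items() if k in params}
    return model.generate(**kwargs)

Inside generate_text_for_image this would replace the direct call: generated_ids = call_generate_safely(model, inputs, max_new_tokens=150, num_beams=3, no_repeat_ngram_size=2, early_stopping=True). Any generation option the custom method does not accept is then silently dropped instead of raising a TypeError.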