Spaces:

witcher23
/

nanoVLM-inference

Running

App Files Community

vidhanm committed on May 22

Commit

4670dfa

1 Parent(s): 1313dd4

Add application files for nanoVLM

Browse files

Files changed (3) hide show

Dockerfile +30 -0
app.py +113 -0
requirements.txt +6 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+# Use a slim Python base image. For GPU, you'd need a CUDA-enabled base.
+FROM python:3.9-slim
+# Install git (useful for some Hugging Face model/tokenizer downloads that might use it)
+# Also install common build tools often needed for Python packages
+# --no-install-recommends keeps optional apt packages out of the image
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Docker Spaces run the container as UID 1000 rather than root, so create that
+# user and give it a home directory it owns. Without this, the model download
+# cache (under $HOME/.cache) and the example cache Gradio writes next to app.py
+# land in root-owned directories and fail with permission errors.
+RUN useradd -m -u 1000 user
+USER user
+# ~/.local/bin is where pip places console scripts for a non-root install
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Create the working directory as the unprivileged user so it is writable at runtime
+RUN mkdir -p /home/user/app
+# Set the working directory in the container
+WORKDIR /home/user/app
+# Copy the requirements file first to leverage Docker layer caching
+COPY --chown=user requirements.txt requirements.txt
+# Install Python dependencies
+# --no-cache-dir reduces image size
+# --prefer-binary can speed up builds for packages with binary distributions
+RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
+# Copy the application code into the container
+COPY --chown=user app.py app.py
+# Expose the port Gradio will run on (default is 7860)
+EXPOSE 7860
+# Set the default command to run the Gradio application
+# Using `python -u` for unbuffered output, which is good for logging
+CMD ["python", "-u", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import gradio as gr
+from PIL import Image
+import torch
+from transformers import AutoProcessor, AutoModelForVision2Seq
+import os
+# Pick the compute device.
+# The DEVICE environment variable overrides the choice; when it is unset or
+# set to "auto", CUDA is used if torch reports it as available, else the CPU.
+device_choice = os.environ.get("DEVICE", "auto")
+if device_choice != "auto":
+    device = device_choice
+elif torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
+print(f"Using device: {device}")
+# Load the processor and the model once, at import time
+model_id = "lusxvr/nanoVLM-222M"
+try:
+    processor = AutoProcessor.from_pretrained(model_id)
+    model = AutoModelForVision2Seq.from_pretrained(model_id).to(device)
+except Exception as e:
+    print(f"Error loading model/processor: {e}")
+    # Leave both handles as None so the request handler can report the failure
+    # in the UI instead of the app dying during startup.
+    processor = None
+    model = None
+else:
+    print("Model and processor loaded successfully.")
+def generate_text_for_image(image_input, prompt_input):
+    """
+    Generate text for an image and a text prompt.
+
+    Args:
+        image_input: The uploaded image. A PIL image is used as-is; any other
+            value is passed to ``Image.fromarray``. ``None`` means no upload.
+        prompt_input: The question or instruction to send with the image.
+
+    Returns:
+        The decoded model output with a leading copy of the prompt removed,
+        or a human-readable message when the model is not loaded, an input
+        is missing, or generation raises.
+    """
+    if model is None or processor is None:
+        return "Error: Model or processor not loaded. Check the Space logs for details."
+    if image_input is None:
+        return "Please upload an image."
+    # A whitespace-only prompt carries no instruction, so reject it like an empty one
+    if not prompt_input or not prompt_input.strip():
+        return "Please provide a prompt (e.g., 'Describe this image' or 'What color is the car?')."
+    try:
+        # Ensure the image is in PIL format and RGB
+        if not isinstance(image_input, Image.Image):
+            pil_image = Image.fromarray(image_input)
+        else:
+            pil_image = image_input
+        if pil_image.mode != "RGB":
+            pil_image = pil_image.convert("RGB")
+        # Prepare inputs for the model
+        # The prompt for nanoVLM is typically a question or an instruction.
+        inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
+        # Generate text
+        # You can adjust max_new_tokens, temperature, top_k, etc.
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=150,  # Increased for potentially longer descriptions
+            num_beams=3,         # Example of adding beam search
+            no_repeat_ngram_size=2,
+            early_stopping=True
+        )
+        # Decode the generated tokens; a single prompt/image pair was submitted,
+        # so only the first decoded sequence is used
+        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # The output might sometimes include the prompt itself, depending on the model.
+        # Simple heuristic to remove prompt if it appears at the beginning:
+        if generated_text.startswith(prompt_input):
+            cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
+        else:
+            cleaned_text = generated_text
+        return cleaned_text.strip()
+    except Exception as e:
+        # Report the failure in the UI rather than letting the request crash
+        print(f"Error during generation: {e}")
+        return f"An error occurred: {str(e)}"
+# Sample picture from the COCO dataset, shared by every example row
+example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" # A cat and a remote
+# Description text handed to the interface
+description = """
+Upload an image and provide a text prompt (e.g., "What is in this image?", "Describe the animal in detail.").
+The model will generate a textual response based on the visual content and your query.
+This Space uses the `lusxvr/nanoVLM-222M` model.
+"""
+# Build the Gradio interface: one image and one prompt in, one text box out
+iface = gr.Interface(
+    title="Interactive nanoVLM-222M Demo",
+    description=description,
+    fn=generate_text_for_image,
+    inputs=[
+        gr.Image(label="Upload Image", type="pil"),
+        gr.Textbox(
+            label="Your Prompt/Question",
+            info="e.g., 'What is this a picture of?', 'Describe the main subject.', 'How many animals are there?'",
+        ),
+    ],
+    outputs=gr.Textbox(show_copy_button=True, label="Generated Text"),
+    # Each example pairs the sample image with a different prompt
+    examples=[
+        [example_image_url, example_prompt]
+        for example_prompt in (
+            "a photo of a",
+            "Describe the image in detail.",
+            "What objects are on the sofa?",
+        )
+    ],
+    cache_examples=True,  # Cache results for examples to load faster
+)
+if __name__ == "__main__":
+    # Bind to all interfaces on the port the Dockerfile exposes; the Space
+    # infrastructure handles the public URL and port mapping.
+    iface.launch(server_port=7860, server_name="0.0.0.0")

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Python dependencies, installed with `pip install -r requirements.txt`.
+# torch, transformers, Pillow (imported as PIL) and gradio are imported directly by app.py.
+torch>=2.0.0
+transformers>=4.36.0
+Pillow>=10.0.0
+gradio
+# sentencepiece and accelerate are not imported by app.py itself.
+# NOTE(review): presumably pulled in for transformers tokenizer/model loading — confirm they are needed.
+sentencepiece
+accelerate