import gradio as gr
import torch
from PIL import Image
import sys
import os
# Ensure the nanoVLM code is available locally
REPO_URL = "https://github.com/huggingface/nanoVLM.git"
REPO_DIR = "nanoVLM"
LOCAL_MODEL_DIR = os.path.join(REPO_DIR, "models")
if not os.path.isdir(LOCAL_MODEL_DIR):
    # Shallow, blobless, sparse clone of just the models/ folder.
    # git refuses to clone into a non-empty directory, so clone into
    # REPO_DIR instead of the current working directory.
    from git import Repo
    repo = Repo.clone_from(REPO_URL, REPO_DIR, depth=1, no_single_branch=True,
                           multi_options=["--filter=blob:none", "--sparse"])
    # Enable sparse checkout of models/
    repo.git.sparse_checkout("set", "models")
# Put the repo root and models/ on sys.path so the import below works even
# if the model code uses `models.`-prefixed imports internally
sys.path.insert(0, os.path.abspath(REPO_DIR))
sys.path.insert(0, os.path.abspath(LOCAL_MODEL_DIR))
from vision_language_model import VisionLanguageModel
# Load the VLM
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M")
model.eval()
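
# Optional device placement: a minimal sketch assuming VisionLanguageModel
# is a regular torch.nn.Module, so it can be moved to a GPU when one is
# available; predict() below sends the input tensor to the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)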

def predict(img: Image.Image, prompt: str = "") -> str:
    # Preprocess the image, add a batch dimension, and match the model's device
    img_tensor = model.preprocess_image(img).unsqueeze(0).to(device)  # (1, 3, H, W)
    with torch.no_grad():
        # generate_text handles the prompt internally
        output = model.generate_text(img_tensor, prompt=prompt)
    return output
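
# Quick sanity check outside Gradio (hypothetical image path):
#   print(predict(Image.open("example.jpg"), "What is in this picture?"))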

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Prompt (e.g. 'What is in this picture?')",
                   label="Prompt"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="nanoVLM-222M Vision-Language Demo",
    description="A minimal Gradio app for image captioning and VQA with nanoVLM-222M.",
)

if __name__ == "__main__":
    demo.launch()
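
# Usage: run this file with Python and open the local URL Gradio prints
# (http://127.0.0.1:7860 by default); pass share=True to demo.launch() for a
# temporary public link. Requires gradio, torch, pillow, and gitpython.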