import os
import sys

import gradio as gr
import torch
from PIL import Image
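
# Note: besides the imports above, the sparse-clone step below assumes GitPython
# (`pip install gitpython`) is available in the environment.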

REPO_ID = "huggingface/nanoVLM"  # GitHub repository that ships the nanoVLM model code
LOCAL_REPO_DIR = "nanoVLM"       # where the sparse clone is placed
LOCAL_MODEL_DIR = os.path.join(LOCAL_REPO_DIR, "models")  # contains vision_language_model.py

# If the model code is not present locally, fetch just the `models` directory of the
# upstream repository via a shallow, blob-less sparse clone.
if not os.path.isdir(LOCAL_MODEL_DIR):
    from git import Repo

    repo = Repo.clone_from(
        f"https://github.com/{REPO_ID}.git",
        LOCAL_REPO_DIR,
        depth=1,
        no_single_branch=True,
        multi_options=["--filter=blob:none", "--sparse"],
    )
    repo.git.sparse_checkout("set", "models")

# Make vision_language_model.py importable.
sys.path.insert(0, os.path.abspath(LOCAL_MODEL_DIR))

from vision_language_model import VisionLanguageModel

# Load the pretrained 222M-parameter checkpoint from the Hugging Face Hub and
# put the model in inference mode.
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M")
model.eval()
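# Optional (not done here): on a machine with a CUDA GPU, inference could be moved to it
# with `model.to("cuda")`; the image tensor built in predict() would then need `.to("cuda")` too.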


def predict(img: Image.Image, prompt: str = "") -> str:
    """Answer an optional text prompt about a single uploaded image."""
    # Preprocess the PIL image into a batched tensor, then generate text without tracking gradients.
    img_tensor = model.preprocess_image(img).unsqueeze(0)
    with torch.no_grad():
        output = model.generate_text(img_tensor, prompt=prompt)
    return output
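
# For a quick check outside the UI, predict() can be called directly, e.g.
# (assuming some local image file "example.jpg" exists):
#   print(predict(Image.open("example.jpg"), "What is in this picture?"))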


# Wire the prediction function into a simple Gradio UI: one image input, one prompt box, one text output.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=1, placeholder="Prompt (e.g. 'What is in this picture?')", label="Prompt"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="nanoVLM-222M Vision-Language Demo",
    description="A minimal Gradio app for image captioning and VQA with nanoVLM-222M.",
)

if __name__ == "__main__":
    demo.launch()
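
# To get a temporary public link when running locally, Gradio also supports demo.launch(share=True).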