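"""Gradio demo for transformers' ImageVisualizer.

Pick a vision-text checkpoint and the app renders what its processor
actually feeds the model, with an optional patch grid, plus a compact
chat-template preview.
"""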
import io
from typing import Optional

import gradio as gr
import matplotlib.pyplot as plt
from PIL import Image

from transformers.utils.processor_visualizer_utils import ImageVisualizer

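# Preset checkpoints for the model picker; the "Or type a model id" textbox
# in the UI accepts any other hub id.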
MODELS = [
    "openai/clip-vit-base-patch32",
    "HuggingFaceM4/Idefics3-8B-Llama3",
    "llava-hf/llava-1.5-7b-hf",
    "OpenGVLab/InternVL2-2B",
    "OpenGVLab/InternVL3-8B-hf",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip2-flan-t5-xl",
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "meta-llama/Llama-3.2-11B-Vision",
    "microsoft/Florence-2-base",
    "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
]


def _fig_to_pil(fig) -> Image.Image:
    """Render a Matplotlib figure to an in-memory PIL image."""
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", dpi=160)
    buf.seek(0)
    pil = Image.open(buf).convert("RGB")  # convert() forces a full decode of buf
    plt.close(fig)  # free the figure so repeated requests don't accumulate
    return pil

def _run(model_id: str, image: Optional[Image.Image], use_sample: bool, add_grid: bool):
    """Run the visualizer and capture every figure it tries to plt.show()."""
    viz = ImageVisualizer(model_id)

    captured = []
    orig_show = plt.show

    def _capture_show(*_, **__):
        # Grab the current figure instead of popping a GUI window.
        captured.append(plt.gcf())

    try:
        plt.show = _capture_show  # monkeypatch: the app runs headless
        viz.visualize(images=None if use_sample else image, add_grid=add_grid)
    finally:
        plt.show = orig_show  # always restore, even if visualize() raises

    imgs = [_fig_to_pil(fig) for fig in captured]
    prompt_preview = viz.default_message(full_output=False)
    return imgs, prompt_preview
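# A minimal headless sketch (no UI), assuming the checkpoint is available:
#   imgs, preview = _run("openai/clip-vit-base-patch32", image=None,
#                        use_sample=True, add_grid=True)
# would yield one PIL image per figure the visualizer produced.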

def _resolve_and_run(model_pick, custom_model, image, use_sample, add_grid):
    """Prefer a typed model id over the radio selection, then render."""
    model_id = (custom_model or "").strip() or (model_pick or "").strip()
    if not model_id:
        raise gr.Error("Pick a model from the list or type a model id.")
    return _run(model_id, image, use_sample, add_grid)

def _on_image_change(img):
    # Uncheck "use sample" when an image is set; re-check when it is cleared.
    return img is None

theme = gr.themes.Soft(primary_hue="orange", neutral_hue="gray")

with gr.Blocks(title="Transformers Processor Visualizer", theme=theme) as demo:
    gr.Markdown("### Visualize what a processor feeds a vision–text model (uses the existing `ImageVisualizer`).")

    with gr.Row():
        # LEFT: clickable models + custom field
        with gr.Column(scale=1, min_width=260):
            model_pick = gr.Radio(
                label="Models",
                choices=MODELS,
                value=MODELS[0],
                interactive=True,
            )
            custom_model = gr.Textbox(
                label="Or type a model id",
                placeholder="owner/repo",
                lines=1,
            )
        # RIGHT: controls + outputs
        with gr.Column(scale=3):
            with gr.Row():
                add_grid = gr.Checkbox(label="Show patch grid", value=True)
                use_sample = gr.Checkbox(label="Use HF logo sample", value=True)
            image = gr.Image(
                label="Upload custom image",
                type="pil",
                height=140,
                sources=["upload"],
            )
            gr.Markdown("### Render")
            gallery = gr.Gallery(label="Processor output", columns=[1], height=900)
            prompt = gr.Textbox(label="Compact chat template preview", lines=2)

    # Reactive updates: any change to the model, options, or image re-renders.
    run_inputs = [model_pick, custom_model, image, use_sample, add_grid]
    run_outputs = [gallery, prompt]
    for trigger in (model_pick.change, custom_model.submit, add_grid.change, use_sample.change):
        trigger(_resolve_and_run, inputs=run_inputs, outputs=run_outputs)
    image.change(_on_image_change, inputs=image, outputs=use_sample).then(
        _resolve_and_run, inputs=run_inputs, outputs=run_outputs
    )

    # Initial render so there is output before any interaction
    demo.load(_resolve_and_run, inputs=run_inputs, outputs=run_outputs)

if __name__ == "__main__":
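    # For local debugging, standard Blocks.launch options apply, e.g.
    # demo.launch(server_name="0.0.0.0") to listen on all interfaces, or
    # share=True for a temporary public link.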
    demo.launch()