"""Gradio demo: visualize what a Transformers processor feeds a vision-text model.

Renders the processor's output image(s) and a compact chat-template preview for a
user-selected model, capturing the figures that ``ImageVisualizer`` would normally
display via ``plt.show()``.
"""

import io
import threading
from functools import lru_cache
from typing import Optional

import gradio as gr
import matplotlib.pyplot as plt
from PIL import Image
from transformers.utils.processor_visualizer_utils import ImageVisualizer

# Models offered in the radio selector; any hub id can also be typed manually.
MODELS = [
    "openai/clip-vit-base-patch32",
    "HuggingFaceM4/Idefics3-8B-Llama3",
]

# Guards the temporary monkey-patch of the process-global ``plt.show`` in ``_run``:
# Gradio may serve requests concurrently, and pyplot state is shared process-wide,
# so two overlapping requests could otherwise capture each other's figures or
# restore the patch out of order.
_PLT_SHOW_LOCK = threading.Lock()


def _fig_to_pil(fig) -> Image.Image:
    """Render a Matplotlib figure to an RGB PIL image and release the figure.

    Closing the figure here fixes a memory leak: captured figures were
    previously never closed, so a long-running server accumulated them.
    """
    buf = io.BytesIO()
    fig.savefig(buf, format="png", dpi=160)
    buf.seek(0)
    pil_image = Image.open(buf).convert("RGB")
    plt.close(fig)
    return pil_image


@lru_cache(maxsize=64)
def get_viz(model_id: str) -> ImageVisualizer:
    """Return a cached ``ImageVisualizer`` for *model_id* (construction is costly)."""
    return ImageVisualizer(model_id)


def _run(model_id: str, image: Optional[Image.Image], add_grid: bool):
    """Run the visualizer and return (processor output, global image, prompt preview).

    Temporarily replaces ``plt.show`` to intercept the figures the visualizer
    would display. The first captured figure is the processor output; a second
    one (if any) is the model's "global" image. Missing figures yield ``None``.
    """
    viz = get_viz(model_id)
    captured: list = []

    def _capture_show(*_, **__):
        captured.append(plt.gcf())

    # Serialize the patch window: pyplot is global to the process.
    with _PLT_SHOW_LOCK:
        original_show = plt.show
        try:
            plt.show = _capture_show
            # If image is None, the visualizer will use its default sample.
            viz.visualize(images=image, add_grid=add_grid)
        finally:
            plt.show = original_show

    left_img = _fig_to_pil(captured[0]) if captured else None
    right_img = _fig_to_pil(captured[1]) if len(captured) >= 2 else None
    # Close any extra figures beyond the two we display, so they don't leak.
    for extra_fig in captured[2:]:
        plt.close(extra_fig)

    prompt_preview = viz.default_message(full_output=False)
    return left_img, right_img, prompt_preview


def _resolve_and_run(model_pick, custom_model, image, add_grid):
    """Prefer the typed model id over the radio pick, then delegate to ``_run``."""
    model_id = (custom_model or "").strip() or (model_pick or "").strip()
    if not model_id:
        raise gr.Error("Pick a model or enter one.")
    return _run(model_id, image, add_grid)


def _preload_models():
    """Best-effort warm-up of the visualizer cache for the preset models."""
    for mid in MODELS:
        try:
            get_viz(mid)
        except Exception:
            # Deliberate best-effort: a model that fails to preload will simply
            # be constructed (or error visibly) on first real request.
            pass


theme = gr.themes.Soft(primary_hue="orange", neutral_hue="gray")

with gr.Blocks(title="Transformers Processor Visualizer", theme=theme) as demo:
    gr.Markdown("## Visualize what a processor feeds a vision–text model")
    with gr.Row():
        with gr.Column(scale=1, min_width=280):
            model_pick = gr.Radio(label="Models", choices=MODELS, value=MODELS[0], interactive=True)
            custom_model = gr.Textbox(label="Or type a model id", placeholder="owner/repo", lines=1)
        with gr.Column(scale=3):
            with gr.Row():
                add_grid = gr.Checkbox(label="Show patch grid", value=True)
                image = gr.Image(label="Upload custom image", type="pil", height=140, sources=["upload"])

    gr.Markdown("## Output")
    with gr.Row():
        left_output = gr.Image(label="Processor output", type="pil", height=900)
        right_output = gr.Image(label="Global image (if any)", type="pil", height=900)
    prompt = gr.Textbox(label="Compact chat template preview", lines=2)

    # Reactive updates: every input change re-renders the visualization.
    model_pick.change(_resolve_and_run, [model_pick, custom_model, image, add_grid], [left_output, right_output, prompt])
    custom_model.submit(_resolve_and_run, [model_pick, custom_model, image, add_grid], [left_output, right_output, prompt])
    add_grid.change(_resolve_and_run, [model_pick, custom_model, image, add_grid], [left_output, right_output, prompt])
    image.change(_resolve_and_run, [model_pick, custom_model, image, add_grid], [left_output, right_output, prompt])

    # Preload models into cache and render once on page load.
    demo.load(_preload_models, [], [])
    demo.load(_resolve_and_run, [model_pick, custom_model, image, add_grid], [left_output, right_output, prompt])

if __name__ == "__main__":
    demo.launch()