# InternVL3-8B / app.py
import gradio as gr
import spaces
import torch
import math
import numpy as np
import os
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig
# =============================================================================
# InternVL-3 preprocessing utilities (image-only version)
# =============================================================================
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size: int = 448):
"""Return torchvision transform matching InternVL pre‑training."""
return T.Compose(
[
T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
)
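# Quick sanity check of the transform (illustrative sketch, not app code;
# the dummy image below is a made-up example):
#
#     tfm = build_transform(448)
#     dummy = Image.new("RGB", (1920, 1080))
#     print(tfm(dummy).shape)  # -> torch.Size([3, 448, 448])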
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
tgt_ar = ratio[0] / ratio[1]
diff = abs(aspect_ratio - tgt_ar)
if diff < best_ratio_diff or (diff == best_ratio_diff and area > 0.5 * image_size * image_size * ratio[0] * ratio[1]):
best_ratio_diff = diff
best_ratio = ratio
return best_ratio
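# Worked example (illustrative): a 1920x1080 image has aspect ratio ~1.78.
# Among grids with i*j <= 12, (2, 1) and (4, 2) both give ratio 2.0 and tie on
# |1.78 - 2.0|; the area tie-break then prefers the larger grid because
# 1920 * 1080 > 0.5 * 448 * 448 * 4 * 2, so (4, 2) wins.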
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split an arbitrarily sized image into at most `max_num` tiles of
    image_size x image_size pixels (InternVL spec), plus an optional thumbnail."""
ow, oh = image.size
aspect_ratio = ow / oh
target_ratios = sorted(
{(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if min_num <= i * j <= max_num},
key=lambda x: x[0] * x[1],
)
ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, ow, oh, image_size)
tw, th = image_size * ratio[0], image_size * ratio[1]
blocks = ratio[0] * ratio[1]
resized = image.resize((tw, th))
    cols = tw // image_size  # tiles per row in the resized image
    tiles = [
        resized.crop(
            (
                (idx % cols) * image_size,
                (idx // cols) * image_size,
                (idx % cols + 1) * image_size,
                (idx // cols + 1) * image_size,
            )
        )
        for idx in range(blocks)
    ]
if use_thumbnail and blocks != 1:
tiles.append(image.resize((image_size, image_size)))
return tiles
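# Minimal usage sketch (hypothetical image, not part of the app):
#
#     img = Image.new("RGB", (1920, 1080))
#     tiles = dynamic_preprocess(img, image_size=448, use_thumbnail=True)
#     print(len(tiles))  # -> 9: a 4x2 grid of 448x448 crops plus 1 thumbnail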
def load_image(path: str, input_size: int = 448, max_num: int = 12):
"""Return tensor of shape (N, 3, H, W) ready for InternVL."""
img = Image.open(path).convert("RGB")
transform = build_transform(input_size)
tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
return torch.stack([transform(t) for t in tiles])
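# Shape check (assumes some local "example.jpg"; purely illustrative):
#
#     pv = load_image("example.jpg")
#     print(pv.shape)  # -> (N, 3, 448, 448), N = number of tiles (+ thumbnail)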
# =============================================================================
# InternVL-3-8B model loading (multi-GPU aware)
# =============================================================================
MODEL_ID = "OpenGVLab/InternVL3-8B"
def split_model(model_name: str):
"""Distribute LLM layers across GPUs, keeping vision encoder on GPU 0."""
n_gpu = torch.cuda.device_count()
if n_gpu < 2:
return "auto" # let transformers decide
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
n_layers = cfg.llm_config.num_hidden_layers # type: ignore[attr-defined]
# GPU0 does vision + some text layers => treat as 0.5 GPU
per_gpu = math.ceil(n_layers / (n_gpu - 0.5))
alloc = [per_gpu] * n_gpu
alloc[0] = math.ceil(alloc[0] * 0.5)
dmap = {
"vision_model": 0,
"mlp1": 0,
"language_model.model.tok_embeddings": 0,
"language_model.model.embed_tokens": 0,
"language_model.output": 0,
"language_model.model.norm": 0,
"language_model.model.rotary_emb": 0,
"language_model.lm_head": 0,
}
layer_idx = 0
for gpu, n in enumerate(alloc):
for _ in range(n):
if layer_idx >= n_layers:
break
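            # The final decoder layer is pinned to GPU 0 so it sits next to
            # norm and lm_head (mirrors the official InternVL split recipe).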
dmap[f"language_model.model.layers.{layer_idx}"] = 0 if layer_idx == n_layers - 1 else gpu
layer_idx += 1
return dmap
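# Rough walk-through, assuming 2 GPUs and a 28-layer LLM (what InternVL3-8B's
# Qwen2.5-7B language model reports; the real count is read from the config):
#   per_gpu = ceil(28 / 1.5) = 19; alloc = [19, 19] -> alloc[0] = ceil(9.5) = 10
#   GPU 0: vision encoder, embeddings, layers 0-9, and final layer 27
#   GPU 1: layers 10-26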
device_map = split_model(MODEL_ID)
model = AutoModel.from_pretrained(
MODEL_ID,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map,
).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
# =============================================================================
# Inference function (image-only)
# =============================================================================
@spaces.GPU
def internvl_inference(image_path: str | None, text_input: str | None = None):
if image_path is None:
return "Please upload an image first."
pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()
prompt = f"<image>\n{text_input}" if text_input else "<image>\n"
gen_cfg = dict(max_new_tokens=1024, do_sample=True)
return model.chat(tokenizer, pixel_values, prompt, gen_cfg)
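# The UI below calls this via the Submit button; it can also be exercised
# directly, e.g. (hypothetical file name):
#
#     print(internvl_inference("example.jpg", "Describe this image."))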
# =============================================================================
# Gradio UI (image-only, Gradio 5 compatible)
# =============================================================================
DESCRIPTION = (
    "[InternVL3-8B demo](https://huggingface.co/OpenGVLab/InternVL3-8B) - "
    "upload an image and ask anything about it."
)
css = """
#output_text {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css, theme="origin") as demo:
gr.Markdown(DESCRIPTION)
with gr.Row():
# Left column: image, question, submit button (stacked vertically)
with gr.Column(scale=1):
input_image = gr.Image(label="Upload Image", type="filepath")
text_input = gr.Textbox(label="Question")
submit_btn = gr.Button("Submit")
# Right column: model output
with gr.Column(scale=1):
output_text = gr.Textbox(label="Model Output", elem_id="output_text")
submit_btn.click(internvl_inference, [input_image, text_input], [output_text])
if __name__ == "__main__":
demo.launch()