# Hugging Face Space: GrassData/cliptagger-12b (running on A100)
# File size: 8,989 bytes
# Revision hashes captured from the page's per-line revision column (scrape residue):
#   dcdd99b, 0557d7f, ace6ed9, 21b17c3, b989be2, 9f13dde, 6a4178d, 8d3d460, 30343a2, 1cffa06

import os, json, traceback
from typing import Any, Dict, Tuple
import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig

# ===== Env / params =====
# Hub repository to load; override with the MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "inference-net/ClipTagger-12b")
# Optional Hub access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Latency/quality knobs (tuned for A100)
# Sampling temperature used by the sampled retry pass (per model docs).
TEMP = 0.1
# Generation budget: fast and sufficient for the schema (raise to 512/768 later if needed).
MAX_NEW_TOKENS = 384
# Longest image side fed to the model; per the original note this matches
# the model's vision_config.image_size.
VISION_LONG_SIDE = 896
# bfloat16 when CUDA is available, float32 otherwise.
if torch.cuda.is_available():
    DTYPE = torch.bfloat16
else:
    DTYPE = torch.float32

# ===== Prompts (exact, no example output) =====
# These strings are sent to the model verbatim; edit them with care because
# every character is part of the request.

# System-role text: used by build_messages() as the content of the "system" turn.
SYSTEM_PROMPT = (
    "You are an image annotation API trained to analyze YouTube video keyframes. "
    "You will be given instructions on the output format, what to caption, and how to perform your job. "
    "Follow those instructions. For descriptions and summaries, provide them directly and do not lead them with "
    "'This image shows' or 'This keyframe displays...', just get right into the details."
)

# User-role text: spells out the JSON structure and the rules for the answer.
# NOTE: the structure below is a template, not valid JSON (it contains bare
# `...` inside the arrays), so it will not parse with json.loads as written.
# generate() also falls back to this string as the whole prompt when the
# processor has no apply_chat_template method.
USER_PROMPT = """You are an image annotation API trained to analyze YouTube video keyframes. You must respond with a valid JSON object matching the exact structure below.

Your job is to extract detailed **factual elements directly visible** in the image. Do not speculate or interpret artistic intent, camera focus, or composition. Do not include phrases like "this appears to be", "this looks like", or anything about the image itself. Describe what **is physically present in the frame**, and nothing more.

Return JSON in this structure:

{
    "description": "A detailed, factual account of what is visibly happening (4 sentences max). Only mention concrete elements or actions that are clearly shown. Do not include anything about how the image is styled, shot, or composed. Do not lead the description with something like 'This image shows' or 'this keyframe is...', just get right into the details.",
    "objects": ["object1 with relevant visual details", "object2 with relevant visual details", ...],
    "actions": ["action1 with participants and context", "action2 with participants and context", ...],
    "environment": "Detailed factual description of the setting and atmosphere based on visible cues (e.g., interior of a classroom with fluorescent lighting, or outdoor forest path with snow-covered trees).",
    "content_type": "The type of content it is, e.g., 'real-world footage', 'video game', 'animation', 'cartoon', 'CGI', 'VTuber', etc.",
    "specific_style": "Specific genre, aesthetic, or platform style (e.g., anime, 3D animation, mobile gameplay, vlog, tutorial, news broadcast, etc.)",
    "production_quality": "Visible production level: e.g., 'professional studio', 'amateur handheld', 'webcam recording', 'TV broadcast', etc.",
    "summary": "One clear, comprehensive sentence summarizing the visual content of the frame. Like the description, get right to the point.",
    "logos": ["logo1 with visual description", "logo2 with visual description", ...]
}

Rules:
- Be specific and literal. Focus on what is explicitly visible.
- Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
- No artistic or cinematic analysis.
- Always include the language of any text in the image if present as an object, e.g., "English text", "Japanese text", "Russian text", etc.
- Maximum 10 objects and 5 actions.
- Return an empty array for 'logos' if none are present.
- Always output strictly valid JSON with proper escaping.
- Output **only the JSON**, no extra text or explanation.
- Do not use placeholder strings or ellipses ('...'). Replace with concrete values directly observed in the image only.
"""

# ===== Utils =====
def extract_top_level_json(s: str):
    """Parse JSON from *s*, tolerating surrounding non-JSON text.

    The whole string is tried first, so any valid JSON document (object,
    array, scalar) is returned as-is. Otherwise the text is scanned for
    top-level ``{`` characters and the first one that starts a valid JSON
    object wins.

    Decoding is delegated to ``json.JSONDecoder.raw_decode`` so braces that
    appear inside JSON string values (e.g. ``{"a": "}"}``) no longer confuse
    the extraction. A candidate that fails to decode is skipped as a whole
    brace-balanced block, so objects nested inside a broken or truncated
    outer object are never returned as if they were the top-level answer.

    Args:
        s: Text that may contain a JSON object.

    Returns:
        The decoded value, or ``None`` when nothing could be decoded
        (including when *s* is not a string).
    """
    try:
        return json.loads(s)
    except (TypeError, ValueError, RecursionError):
        pass
    if not isinstance(s, str):
        # Nothing to scan; the caller treats None as "no JSON found".
        return None

    decoder = json.JSONDecoder()
    depth = 0
    for idx, ch in enumerate(s):
        if ch == "{":
            if depth == 0:
                # Only top-level openers are candidates.
                try:
                    obj, _end = decoder.raw_decode(s, idx)
                except (ValueError, RecursionError):
                    pass
                else:
                    return obj
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
    return None

def build_messages(image):
    """Build the two-turn chat payload (system + user) for one image.

    The system turn carries SYSTEM_PROMPT as text; the user turn carries the
    image followed by USER_PROMPT as text.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_parts = [
        {"type": "image", "image": image},
        {"type": "text", "text": USER_PROMPT},
    ]
    user_turn = {"role": "user", "content": user_parts}
    return [system_turn, user_turn]

def resize_to_vision(pil: Image.Image, long_side: int = VISION_LONG_SIDE) -> Image.Image:
    """Return an RGB copy of *pil* whose longest side is at most *long_side*.

    Images that already fit are only converted to RGB. Larger images are
    scaled down with bicubic resampling, preserving the aspect ratio.

    The target dimensions are rounded (not truncated) and clamped to at
    least 1 pixel: truncation could produce a 0-pixel side for extreme
    aspect ratios (which makes ``resize`` fail) and could leave the long
    side one pixel short because of floating-point error.

    Args:
        pil: Source image, or ``None``.
        long_side: Maximum allowed size of the longest side, in pixels.

    Returns:
        The converted (and possibly resized) image, or ``None`` if *pil*
        was ``None``.
    """
    if pil is None:
        return pil
    rgb = pil.convert("RGB")
    width, height = rgb.size
    longest = max(width, height)
    if longest <= long_side:
        return rgb
    scale = long_side / longest
    new_size = (
        max(1, round(width * scale)),
        max(1, round(height * scale)),
    )
    return rgb.resize(new_size, Image.BICUBIC)

# ===== Load model (A100) =====
# Start from a "nothing loaded" state so the rest of the module can tell
# whether startup succeeded; any failure below is captured in LOAD_ERROR
# instead of crashing the app.
processor = None
tokenizer = None
model = None
LOAD_ERROR = None
try:
    # Inspect the config first so an encoder-only CLIP repo is rejected early.
    cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
    if "clip" in type(cfg).__name__.lower():
        raise RuntimeError(f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder repo; need a causal VLM.")

    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        token=HF_TOKEN,
        torch_dtype=DTYPE,
        device_map="cuda",  # keep on A100
        # quantization_config=None,  # uncomment to force full precision if you removed quant
    )
    # Prefer the tokenizer bundled with the processor; load one separately
    # only when the processor does not expose it.
    tokenizer = (
        getattr(processor, "tokenizer", None)
        or AutoTokenizer.from_pretrained(
            MODEL_ID, use_fast=True, trust_remote_code=True, token=HF_TOKEN
        )
    )
except Exception as exc:
    # Message first, then the full traceback, separated by a blank line.
    LOAD_ERROR = "\n\n".join((str(exc), traceback.format_exc()))

# ===== Inference =====
def generate(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
    """Annotate *image* with the loaded model and return UI-ready outputs.

    The image is resized, wrapped in the chat prompt and passed to the model.
    Two decoding strategies are tried in order: greedy first, then one
    sampled retry at temperature ``TEMP``. The first attempt whose output
    parses to a JSON object without any ``...`` placeholder wins.

    Only the newly generated tokens are decoded. For decoder-only models
    ``generate`` returns the prompt followed by the continuation, and the
    prompt itself contains a JSON-like template; decoding it would mix the
    instructions into the text handed to the JSON extractor.

    Args:
        image: Uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(text, parsed, ok)`` tuple: ``text`` is pretty-printed JSON on
        success or a human-readable error message otherwise, ``parsed`` is
        the decoded dict (``None`` on failure), and ``ok`` tells whether
        valid JSON was produced.
    """
    if image is None:
        return "Please upload an image.", None, False
    if model is None or processor is None:
        return f"❌ Load error:\n{LOAD_ERROR}", None, False

    image = resize_to_vision(image, VISION_LONG_SIDE)

    # Chat prompt (fall back to the raw user prompt without a chat template).
    if hasattr(processor, "apply_chat_template"):
        prompt = processor.apply_chat_template(build_messages(image), add_generation_prompt=True, tokenize=False)
    else:
        prompt = USER_PROMPT

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    # Number of leading tokens in the output that are just the echoed prompt.
    if getattr(model.config, "is_encoder_decoder", False):
        prompt_len = 0
    else:
        prompt_len = inputs["input_ids"].shape[-1]

    # Settings shared by every attempt.
    base_kwargs = {"max_new_tokens": MAX_NEW_TOKENS}
    eos = getattr(model.config, "eos_token_id", None)
    if eos is not None:
        base_kwargs["eos_token_id"] = eos

    strategies = (
        # (1) Greedy (fast, stable)
        ("greedy", {"do_sample": False}),
        # (2) Short sampled retry
        (f"sample_t{TEMP}", {"do_sample": True, "temperature": TEMP}),
    )

    tried = []
    for label, sampling in strategies:
        try:
            with torch.inference_mode():
                out = model.generate(**inputs, **base_kwargs, **sampling)
            text = processor.decode(out[0][prompt_len:], skip_special_tokens=True)
            parsed = extract_top_level_json(text)
            if isinstance(parsed, dict) and "..." not in json.dumps(parsed):
                return json.dumps(parsed, indent=2), parsed, True
            tried.append((label, "parse-failed-or-ellipses"))
        except Exception as e:
            # UI boundary: record the failure and move on to the next strategy.
            tried.append((label, f"err={e}"))

    return "Generation failed.\n" + str(tried), None, False

# ===== UI =====
# Single-page Gradio app: image upload + button on the left, results on the right.
with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="ClipTagger (VLM)") as demo:
    gr.Markdown("# ClipTagger\nUpload an image to get **strict JSON** annotations.")
    # If model loading failed at startup, expose the captured error text in a
    # collapsed accordion.
    if LOAD_ERROR:
        with gr.Accordion("Startup Error Details", open=False):
            gr.Markdown(f"```\n{LOAD_ERROR}\n```")
    with gr.Row():
        # Left column: input image and the trigger button.
        with gr.Column(scale=1):
            image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")
            btn = gr.Button("Annotate", variant="primary")
        # Right column: one component per element of generate()'s return tuple.
        with gr.Column(scale=1):
            out_text = gr.Code(label="Output (JSON or error)")
            out_json = gr.JSON(label="Parsed JSON")
            ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)

    # generate(image) -> (text, parsed dict or None, success flag)
    btn.click(generate, inputs=[image], outputs=[out_text, out_json, ok_flag])

# Enable the request queue (max_size=32) and start the app at import time.
demo.queue(max_size=32).launch()