# Hugging Face Space: GrassData/cliptagger-12b (running on A100)
# File size: 10,102 bytes
# Revisions listed in the file view: 6a4178d, ace6ed9, b72fbba, bf8afaf

import os, json, re, traceback
import gradio as gr
from PIL import Image
import torch

# --------------------------
# Config (via Space secrets)
# --------------------------
# MODEL_ID / ADAPTER_ID: the fine-tune adapter repo (PEFT). MODEL_ID is read first and
#   ADAPTER_ID is only consulted when MODEL_ID is unset. Example: GrassData/cliptagger-12b
# BASE_ID: the Gemma-3 VLM base the adapter was fine-tuned from. Example: google/gemma-3-12b-it (gated)
# HF_TOKEN: user access token that has access to BASE_ID (if gated)
ADAPTER_ID = os.getenv("MODEL_ID", os.getenv("ADAPTER_ID", "GrassData/cliptagger-12b"))
BASE_ID = os.getenv("BASE_ID", "google/gemma-3-12b-it")
HF_TOKEN = os.getenv("HF_TOKEN")

# bfloat16 is selected only when CUDA is available; otherwise float32 on CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.bfloat16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# --------------------------
# Prompts (your spec)
# --------------------------
# System-role text sent with every request (see build_messages). It frames the
# model as an annotation API and forbids lead-ins such as "This image shows".
SYSTEM_PROMPT = (
    "You are an image annotation API trained to analyze YouTube video keyframes. "
    "You will be given instructions on the output format, what to caption, and how to perform your job. "
    "Follow those instructions. For descriptions and summaries, provide them directly and do not lead them with "
    "'This image shows' or 'This keyframe displays...', just get right into the details."
)

# User-role text sent alongside the image (see build_messages). It spells out
# the JSON structure the model must return and the annotation rules. The braces
# block below is a schema illustration, not valid JSON itself (it contains "...").
USER_PROMPT = """You are an image annotation API trained to analyze YouTube video keyframes. You must respond with a valid JSON object matching the exact structure below.

Your job is to extract detailed **factual elements directly visible** in the image. Do not speculate or interpret artistic intent, camera focus, or composition. Do not include phrases like "this appears to be", "this looks like", or anything about the image itself. Describe what **is physically present in the frame**, and nothing more.

Return JSON in this structure:

{
    "description": "A detailed, factual account of what is visibly happening (4 sentences max). Only mention concrete elements or actions that are clearly shown. Do not include anything about how the image is styled, shot, or composed. Do not lead the description with something like 'This image shows' or 'this keyframe is...', just get right into the details.",
    "objects": ["object1 with relevant visual details", "object2 with relevant visual details", ...],
    "actions": ["action1 with participants and context", "action2 with participants and context", ...],
    "environment": "Detailed factual description of the setting and atmosphere based on visible cues (e.g., interior of a classroom with fluorescent lighting, or outdoor forest path with snow-covered trees).",
    "content_type": "The type of content it is, e.g. 'real-world footage', 'video game', 'animation', 'cartoon', 'CGI', 'VTuber', etc.",
    "specific_style": "Specific genre, aesthetic, or platform style (e.g., anime, 3D animation, mobile gameplay, vlog, tutorial, news broadcast, etc.)",
    "production_quality": "Visible production level: e.g., 'professional studio', 'amateur handheld', 'webcam recording', 'TV broadcast', etc.",
    "summary": "One clear, comprehensive sentence summarizing the visual content of the frame. Like the description, get right to the point.",
    "logos": ["logo1 with visual description", "logo2 with visual description", ...]
}

Rules:
- Be specific and literal. Focus on what is explicitly visible.
- Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
- No artistic or cinematic analysis.
- Always include the language of any text in the image if present as an object, e.g. "English text", "Japanese text", "Russian text", etc.
- Maximum 10 objects and 5 actions.
- Return an empty array for 'logos' if none are present.
- Always output strictly valid JSON with proper escaping.
- Output **only the JSON**, no extra text or explanation.
"""

# --------------------------
# Load base + adapter (PEFT)
# --------------------------
def load_model_stack():
    """Load the processor, tokenizer and adapter-augmented model.

    The processor and base weights are fetched from BASE_ID; the PEFT adapter
    is fetched from ADAPTER_ID and merged into the base when merging succeeds.

    Returns:
        A ``(processor, tokenizer, model)`` tuple.

    Raises:
        RuntimeError: if the config resolved from ADAPTER_ID has a class name
            starting with "clip" (case-insensitive).
    """
    from transformers import AutoProcessor, AutoTokenizer, AutoConfig, AutoModelForCausalLM
    from peft import PeftModel

    # Arguments shared by every hub download except the adapter itself.
    hub_args = {"token": HF_TOKEN, "trust_remote_code": True}

    # The processor comes from BASE_ID (which carries the preprocessing files).
    # If those files are vendored into the adapter repo, ADAPTER_ID could be
    # used here instead.
    try:
        processor = AutoProcessor.from_pretrained(BASE_ID, use_fast=True, **hub_args)
    except TypeError:
        # Retry without use_fast for processor classes that reject it.
        processor = AutoProcessor.from_pretrained(BASE_ID, **hub_args)

    # Refuse early when ADAPTER_ID resolves to a CLIP-style config.
    adapter_cfg = AutoConfig.from_pretrained(ADAPTER_ID, **hub_args)
    cfg_class_name = adapter_cfg.__class__.__name__.lower()
    if cfg_class_name.startswith("clip"):
        raise RuntimeError(
            f"MODEL_ID/ADAPTER_ID ({ADAPTER_ID}) resolves to a CLIP/encoder config "
            "and cannot be used with AutoModelForCausalLM. Point to your PEFT adapter "
            "repo (Gemma-3 VLM adapters) or a full causal VLM checkpoint."
        )

    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_ID,
        device_map="auto",
        torch_dtype=DTYPE,
        **hub_args,
    )
    peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)

    # Merging the adapter is an optional speed-up; if it fails for any reason
    # the PEFT wrapper is used as-is.
    try:
        model = peft_model.merge_and_unload()
    except Exception:
        model = peft_model

    # Prefer the tokenizer bundled with the processor; load one only if absent.
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True, **hub_args)

    return processor, tokenizer, model

# Load the stack once at import time. A failure is not allowed to abort the
# module: the message plus traceback are kept in LOAD_ERROR instead, and the
# three handles stay None.
processor, tokenizer, model = None, None, None
LOAD_ERROR = None
try:
    processor, tokenizer, model = load_model_stack()
except Exception as exc:
    LOAD_ERROR = "\n\n".join((str(exc), traceback.format_exc()))

# --------------------------
# Inference
# --------------------------
def build_messages(image: Image.Image):
    """Build the two-message chat (system, then user) for one image.

    The system message carries SYSTEM_PROMPT as a text chunk. The user message
    carries the image chunk first, followed by USER_PROMPT as a text chunk.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_chunks = [
        {"type": "image", "image": image},
        {"type": "text", "text": USER_PROMPT},
    ]
    user_turn = {"role": "user", "content": user_chunks}
    return [system_turn, user_turn]

def _extract_json_object(text):
    """Return the first JSON object embedded in *text*, or None if there is none.

    Every "{" is tried as a possible start, so surrounding prose, code fences
    or an earlier malformed brace group do not prevent recovery. This replaces
    a recursive ``(?R)`` regex, which Python's ``re`` module cannot compile.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def generate_json(image: "Image.Image"):
    """Annotate one image and return the model output as JSON when possible.

    Args:
        image: PIL image to annotate, or None when nothing was uploaded.

    Returns:
        A ``(text, parsed, ok)`` tuple. On success ``text`` is the
        pretty-printed JSON, ``parsed`` the decoded value and ``ok`` True.
        Otherwise ``text`` is a user-facing message or the raw model output,
        ``parsed`` is None and ``ok`` is False.
    """
    if image is None:
        return "Please upload an image.", None, False

    if model is None or processor is None:
        msg = (
            "❌ Model failed to load.\n\n"
            f"{LOAD_ERROR or 'Unknown error. Check BASE_ID/ADAPTER_ID/HF_TOKEN.'}\n"
            "• Ensure HF_TOKEN belongs to an account with access to the BASE_ID (if gated).\n"
            "• Ensure MODEL_ID/ADAPTER_ID points to a Gemma-3 VLM PEFT adapter (not CLIP).\n"
            "• Optionally vendor processor files into your adapter repo."
        )
        return msg, None, False

    # Prepare chat prompt
    messages = build_messages(image)
    if hasattr(processor, "apply_chat_template"):
        prompt = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        )
    else:
        # Fallback join (rare for Gemma-3)
        parts = []
        for message in messages:
            role = message["role"].upper()
            for chunk in message["content"]:
                if chunk["type"] == "text":
                    parts.append(f"{role}: {chunk['text']}\n")
                elif chunk["type"] == "image":
                    parts.append(f"{role}: [IMAGE]\n")
        prompt = "".join(parts)

    # Tokenize with vision
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    # Generate with fixed params. `response_format` is intentionally not
    # passed: it is not a generate() argument and unknown keyword arguments
    # are rejected during model-kwargs validation.
    gen_kwargs = dict(max_new_tokens=2000, temperature=0.1)
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    if eos_token_id is not None:
        # Only override when known; an explicit None would clear the model's
        # own end-of-sequence setting.
        # NOTE(review): this still replaces any eos list in the model's
        # generation config with the tokenizer's single id — confirm intended.
        gen_kwargs["eos_token_id"] = eos_token_id

    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)

    # A causal LM returns prompt + continuation; keep only the continuation so
    # the echoed prompt (which contains a non-JSON template) is not parsed.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = out[0][prompt_len:]

    # Decode
    if hasattr(processor, "decode"):
        text = processor.decode(new_tokens, skip_special_tokens=True)
    else:
        text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Best-effort: trim any preamble that still echoes the user prompt
    if USER_PROMPT in text:
        text = text.split(USER_PROMPT)[-1]
    text = text.strip()

    # Parse JSON: first the whole output, then the first embedded object.
    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        parsed = _extract_json_object(text)
        if parsed is None:
            return text, None, False
    return json.dumps(parsed, indent=2), parsed, True

# --------------------------
# UI
# --------------------------
# `spaces` is needed for the GPU decorator below but is not imported at the top
# of this file, so the bare name raised NameError at startup. Import it here
# and fall back to a pass-through decorator when the package is unavailable.
# NOTE(review): on ZeroGPU hardware `spaces` may need to be imported before
# CUDA is initialised — confirm if this Space moves off dedicated GPU hardware.
try:
    import spaces

    _gpu_task = spaces.GPU
except ImportError:
    def _gpu_task(fn):
        """Return *fn* unchanged when the `spaces` package is unavailable."""
        return fn


with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 + Adapter)") as demo:
    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT)\nUpload an image to get **strict JSON** annotations.")

    # Surface the startup failure (if any) in a collapsed panel.
    if LOAD_ERROR:
        with gr.Accordion("Startup Error Details", open=False):
            gr.Markdown(f"```\n{LOAD_ERROR}\n```")

    with gr.Row():
        with gr.Column(scale=1):
            image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")
            annotate_btn = gr.Button("Annotate", variant="primary")
        with gr.Column(scale=1):
            out_code = gr.Code(label="Model Output (JSON or error text)")
            out_json = gr.JSON(label="Parsed JSON")
            ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)

    @_gpu_task  # ensures a GPU task is registered
    def on_submit(img):
        """Annotate the uploaded image; returns (text, parsed JSON, valid flag)."""
        return generate_json(img)

    annotate_btn.click(on_submit, inputs=[image], outputs=[out_code, out_json, ok_flag])

demo.queue(max_size=32).launch()