File size: 10,289 Bytes
6a4178d
0557d7f
ace6ed9
 
 
21b17c3
ace6ed9
30343a2
 
 
0557d7f
30343a2
0557d7f
 
9f13dde
0557d7f
30343a2
 
 
ace6ed9
 
 
 
 
 
 
 
 
 
 
 
 
 
6a4178d
 
 
 
 
 
 
 
 
ace6ed9
 
 
6a4178d
 
 
 
 
 
 
 
ace6ed9
 
30343a2
 
 
21b17c3
30343a2
21b17c3
 
 
 
 
 
 
 
 
 
6a4178d
21b17c3
 
 
 
 
 
0557d7f
30343a2
 
9f13dde
 
 
 
 
 
 
 
 
 
30343a2
 
 
 
 
 
 
 
 
 
 
 
21b17c3
30343a2
 
 
0557d7f
6a4178d
30343a2
 
 
 
 
 
 
21b17c3
 
 
9f13dde
21b17c3
6a4178d
21b17c3
30343a2
21b17c3
30343a2
 
 
 
 
 
 
 
21b17c3
30343a2
21b17c3
30343a2
 
 
9f13dde
30343a2
 
6a4178d
30343a2
 
 
 
6a4178d
 
30343a2
 
 
 
 
 
 
ace6ed9
30343a2
9f13dde
30343a2
 
 
6a4178d
30343a2
0557d7f
6a4178d
 
 
 
 
 
 
 
 
30343a2
 
ace6ed9
30343a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f13dde
30343a2
 
0557d7f
30343a2
 
 
 
 
 
 
 
ace6ed9
 
 
 
0557d7f
ace6ed9
0557d7f
ace6ed9
30343a2
6a4178d
30343a2
 
ace6ed9
30343a2
21b17c3
30343a2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os, json, re, traceback
from typing import Any, Dict, Tuple
import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig

# --------------------------
# Env / params
# --------------------------
# Hugging Face repo id of the checkpoint to load; override with the MODEL_ID env var.
MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
# Access token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
# Temperature passed to model.generate() by run().
TEMP = 0.1
# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 2000
# bfloat16 when CUDA is available, float32 otherwise.
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

# --------------------------
# Prompts (yours)
# --------------------------
# System-role message sent with every request: frames the model as an
# annotation API and forbids lead-in phrases in descriptions and summaries.
SYSTEM_PROMPT = (
    "You are an image annotation API trained to analyze YouTube video keyframes. "
    "You will be given instructions on the output format, what to caption, and how to perform your job. "
    "Follow those instructions. For descriptions and summaries, provide them directly and do not lead them with "
    "'This image shows' or 'This keyframe displays...', just get right into the details."
)

# User-role instruction sent alongside the image. It spells out the JSON
# structure the model must return and the annotation rules. run() also uses
# this exact text to strip an echoed prompt from decoded output, so any edit
# here changes runtime behavior.
USER_PROMPT = """You are an image annotation API trained to analyze YouTube video keyframes. You must respond with a valid JSON object matching the exact structure below.

Your job is to extract detailed **factual elements directly visible** in the image. Do not speculate or interpret artistic intent, camera focus, or composition. Do not include phrases like "this appears to be", "this looks like", or anything about the image itself. Describe what **is physically present in the frame**, and nothing more.

Return JSON in this structure:

{
    "description": "A detailed, factual account of what is visibly happening (4 sentences max). Only mention concrete elements or actions that are clearly shown. Do not include anything about how the image is styled, shot, or composed. Do not lead the description with something like 'This image shows' or 'this keyframe is...', just get right into the details.",
    "objects": ["object1 with relevant visual details", "object2 with relevant visual details", ...],
    "actions": ["action1 with participants and context", "action2 with participants and context", ...],
    "environment": "Detailed factual description of the setting and atmosphere based on visible cues (e.g., interior of a classroom with fluorescent lighting, or outdoor forest path with snow-covered trees).",
    "content_type": "The type of content it is, e.g. 'real-world footage', 'video game', 'animation', 'cartoon', 'CGI', 'VTuber', etc.",
    "specific_style": "Specific genre, aesthetic, or platform style (e.g., anime, 3D animation, mobile gameplay, vlog, tutorial, news broadcast, etc.)",
    "production_quality": "Visible production level: e.g., 'professional studio', 'amateur handheld', 'webcam recording', 'TV broadcast', etc.",
    "summary": "One clear, comprehensive sentence summarizing the visual content of the frame. Like the description, get right to the point.",
    "logos": ["logo1 with visual description", "logo2 with visual description", ...]
}

Rules:
- Be specific and literal. Focus on what is explicitly visible.
- Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
- No artistic or cinematic analysis.
- Always include the language of any text in the image if present as an object, e.g. "English text", "Japanese text", "Russian text", etc.
- Maximum 10 objects and 5 actions.
- Return an empty array for 'logos' if none are present.
- Always output strictly valid JSON with proper escaping.
- Output **only the JSON**, no extra text or explanation.
"""

# --------------------------
# Utilities
# --------------------------
def _json_extract(text: str):
    """Strict JSON parse with top-level {...} fallback."""
    try:
        return json.loads(text)
    except Exception:
        m = re.search(r"\{(?:[^{}]|(?R))*\}", text, flags=re.DOTALL)
        if m:
            try:
                return json.loads(m.group(0))
            except Exception:
                pass
    return None

def _build_messages(image: Image.Image):
    """Return the two-turn chat (system, then user) for one image.

    The system turn carries SYSTEM_PROMPT as text; the user turn carries the
    image followed by USER_PROMPT as text.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_parts = [
        {"type": "image", "image": image},
        {"type": "text", "text": USER_PROMPT},
    ]
    user_turn = {"role": "user", "content": user_parts}
    return [system_turn, user_turn]

def _downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
    """Return an RGB copy of *pil* whose longest side is at most *max_side*.

    Cap longest side to keep memory predictable; A100 is roomy but this avoids
    extreme uploads.

    Args:
        pil: Input image, or None.
        max_side: Maximum allowed length in pixels of the longer side.

    Returns:
        None when *pil* is None. Otherwise the image converted to RGB, resized
        with bicubic resampling (aspect ratio preserved) only when its longest
        side exceeds *max_side*.
    """
    if pil is None:
        return pil
    rgb = pil.convert("RGB")
    w, h = rgb.size
    longest = max(w, h)
    if longest <= max_side:
        return rgb
    scale = max_side / longest
    # int() truncates, so on a very elongated image the short side can round
    # down to 0, which resize() cannot handle; keep every side at least 1 px.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    return rgb.resize(new_size, Image.BICUBIC)

# --------------------------
# Load model (dedicated GPU)
# --------------------------
# Module-level handles filled in by the loading block below. They remain None
# when loading fails, which run() checks before attempting inference.
processor = tokenizer = model = None
# Exception message plus traceback from a failed startup, or None on success.
LOAD_ERROR = None

try:
    # Inspect the config first and reject checkpoints whose config class name
    # contains "clip", since this app needs a generative (causal) model.
    cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
    if "clip" in cfg.__class__.__name__.lower():
        raise RuntimeError(
            f"MODEL_ID '{MODEL_ID}' resolves to a CLIP/encoder config; need a causal VLM checkpoint."
        )

    # Load the processor, asking for the fast implementation; retry without
    # the flag if from_pretrained rejects it with a TypeError.
    try:
        processor = AutoProcessor.from_pretrained(
            MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
        )
    except TypeError:
        processor = AutoProcessor.from_pretrained(
            MODEL_ID, token=HF_TOKEN, trust_remote_code=True
        )

    # Try quantized path (compressed-tensors) per your config
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            token=HF_TOKEN,
            device_map="auto",
            torch_dtype=DTYPE,
            trust_remote_code=True,
        )
    except Exception as e:
        # Fallback: disable quantization if the backend isn't available.
        # Only errors whose text mentions "compressed_tensors" are retried;
        # anything else is re-raised to the outer handler.
        # NOTE(review): presumably quantization_config=None makes the loader
        # skip the checkpoint's quantization settings - confirm.
        if "compressed_tensors" in str(e):
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                token=HF_TOKEN,
                device_map="auto",
                torch_dtype=DTYPE,
                trust_remote_code=True,
                quantization_config=None,
            )
        else:
            raise

    # Prefer the tokenizer bundled with the processor; load one separately
    # only when the processor does not expose a (truthy) `tokenizer`.
    tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
        MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
    )

except Exception as e:
    # Keep the module importable so the UI can start and display the failure.
    LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"

# --------------------------
# Inference
# --------------------------
def run(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
    """Annotate one image and return ``(text, parsed, ok)``.

    Args:
        image: PIL image from the UI, or None when nothing was uploaded.

    Returns:
        A 3-tuple of: the text to display (pretty-printed JSON on success,
        otherwise a user-facing message), the parsed dict (None on failure),
        and a flag that is True only when a JSON object was parsed.

    Errors raised while generating or decoding are caught per attempt and
    listed in the returned text once every strategy has failed.
    """
    if image is None:
        return "Please upload an image.", None, False
    if model is None or processor is None:
        msg = (
            "❌ Model failed to load.\n\n"
            f"{LOAD_ERROR or 'Unknown error.'}\n"
            "Check MODEL_ID/HF_TOKEN and that the repo includes model + processor files."
        )
        return msg, None, False

    image = _downscale_if_huge(image)

    # Build chat prompt
    if hasattr(processor, "apply_chat_template"):
        prompt = processor.apply_chat_template(_build_messages(image), add_generation_prompt=True, tokenize=False)
    else:
        # Very rare fallback path: flatten the chat into "ROLE: text" lines.
        msgs = _build_messages(image)
        prompt = ""
        for m in msgs:
            role = m["role"].upper()
            for chunk in m["content"]:
                if chunk["type"] == "text":
                    prompt += f"{role}: {chunk['text']}\n"
                elif chunk["type"] == "image":
                    prompt += f"{role}: [IMAGE]\n"

    # Tokenize with vision
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

    # Decoder-only generate() returns the prompt tokens followed by the
    # completion. Remember the prompt length so only the new tokens are
    # decoded; otherwise the JSON template inside the prompt ends up in the
    # text handed to the parser.
    prompt_len = 0
    if not getattr(model.config, "is_encoder_decoder", False):
        try:
            prompt_len = inputs["input_ids"].shape[-1]
        except (KeyError, AttributeError, TypeError):
            prompt_len = 0

    # Pick a decoder once: the processor if it can decode, else the tokenizer
    # loaded at startup. A fresh tokenizer is fetched lazily as a last resort.
    decode = getattr(processor, "decode", None)
    if decode is None and tokenizer is not None:
        decode = tokenizer.decode

    # Gen args
    # NOTE(review): do_sample is not set here, so temperature may have no
    # effect under the model's default generation config - confirm.
    gen_kwargs = dict(
        temperature=TEMP,
        max_new_tokens=MAX_NEW_TOKENS,
    )
    eos = getattr(model.config, "eos_token_id", None)
    if eos is not None:
        gen_kwargs["eos_token_id"] = eos

    # Try to enforce JSON; if unsupported, we'll retry without.
    # NOTE(review): `response_format` does not look like a standard
    # generate() argument; when rejected, that attempt is recorded as an
    # error and the loop falls through to the next strategy.
    tried = []
    for tag, extra in [
        ("json_object", {"response_format": {"type": "json_object"}}),
        ("no_response_format", {}),
        ("short_deterministic", {"temperature": 0.0, "max_new_tokens": min(512, MAX_NEW_TOKENS)}),
    ]:
        try:
            with torch.inference_mode():
                out = model.generate(**inputs, **{**gen_kwargs, **extra})
            if decode is None:
                decode = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN, use_fast=True).decode
            text = decode(out[0][prompt_len:], skip_special_tokens=True)
            # Safety net for the case where the prompt could not be sliced off.
            if USER_PROMPT in text:
                text = text.split(USER_PROMPT)[-1].strip()
            parsed = _json_extract(text)
            if isinstance(parsed, dict):
                return json.dumps(parsed, indent=2), parsed, True
            tried.append((tag, "parsed-failed"))
        except Exception as e:
            tried.append((tag, f"err={e}"))

    # If all strategies failed, return debug info
    return "Generation failed.\nTried: " + "\n".join([f"{t[0]} -> {t[1]}" for t in tried]), None, False

# --------------------------
# UI
# --------------------------
# Two-column layout: image upload + button on the left; raw output, parsed
# JSON and a validity flag on the right.
with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM)") as demo:
    # The heading separator was mojibake ("Β·"); it is the middle dot "·".
    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · A100)\nUpload an image to get **strict JSON** annotations.")
    if LOAD_ERROR:
        # Surface the startup failure in the page instead of only in logs.
        with gr.Accordion("Startup Error Details", open=False):
            gr.Markdown(f"```\n{LOAD_ERROR}\n```")

    with gr.Row():
        with gr.Column(scale=1):
            image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")
            btn = gr.Button("Annotate", variant="primary")
        with gr.Column(scale=1):
            out_text = gr.Code(label="Output (JSON or error)")
            out_json = gr.JSON(label="Parsed JSON")
            ok = gr.Checkbox(label="Valid JSON", value=False, interactive=False)

    def on_click(img):
        """Button handler: forward the uploaded image to run()."""
        return run(img)

    # run() returns (text, parsed, ok), matching the three outputs in order.
    btn.click(on_click, inputs=[image], outputs=[out_text, out_json, ok])

# Conservative concurrency to avoid OOM spikes; A100-80GB can increase this.
# Blocks.queue() takes `default_concurrency_limit` (Gradio 4+); the earlier
# `max_concurrency` keyword is not a queue() parameter.
demo.queue(max_size=32, default_concurrency_limit=1).launch()