andrejrad committed
Commit 9f13dde (verified)
1 Parent(s): 62cada7

Update app.py

Files changed (1)
  1. app.py +92 -48
app.py CHANGED
@@ -6,19 +6,20 @@ import torch
  import spaces
  from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig

- # --------- ENV / PARAMS ----------
  MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
- HF_TOKEN = os.environ.get("HF_TOKEN") # put this in Space -> Settings -> Variables & secrets
  TEMP = 0.1
  MAX_NEW_TOKENS = 2000

- # Lazy globals (ZeroGPU-safe)
  _processor: Any = None
  _tokenizer: Any = None
  _model: Any = None
  _last_load_error: str | None = None

- # --------- PROMPTS (yours) ----------
  SYSTEM_PROMPT = (
      "You are an image annotation API trained to analyze YouTube video keyframes. "
      "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -55,9 +56,8 @@ Rules:
  - Output **only the JSON**, no extra text or explanation.
  """

- # --------- HELPERS ----------
  def _json_extract(text: str):
-     """Strict parse -> top-level {...} fallback."""
      try:
          return json.loads(text)
      except Exception:
@@ -76,26 +76,32 @@ def _build_messages(image: Image.Image):
              {"type": "text", "text": USER_PROMPT}]}
      ]

- # --------- ZERO-GPU LAZY LOADER ----------
  @spaces.GPU
  def _ensure_loaded() -> str:
-     """
-     Load the model only when a ZeroGPU worker with a GPU is attached.
-     Tries quantized path first (compressed-tensors), then falls back to unquantized.
-     """
      global _processor, _tokenizer, _model, _last_load_error
      if _model is not None and _processor is not None:
          return "already_loaded"
-
      try:
-         # Sanity: config should be gemma3 causal VLM (not CLIP)
          cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
          if "clip" in cfg.__class__.__name__.lower():
              raise RuntimeError(
-                 f"MODEL_ID '{MODEL_ID}' resolves to CLIP/encoder config; need a causal VLM checkpoint."
              )

-         # Try quantized (as per your config)
          _processor = AutoProcessor.from_pretrained(
              MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
          )
@@ -103,7 +109,7 @@ def _ensure_loaded() -> str:
              MODEL_ID,
              token=HF_TOKEN,
              device_map="auto",
-             torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
              trust_remote_code=True,
          )
          _tokenizer = getattr(_processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
@@ -112,7 +118,7 @@ def _ensure_loaded() -> str:
          _last_load_error = None
          return "ok_quant"
      except Exception as e:
-         # Fallback: disable quantization (more VRAM)
          if "compressed_tensors" in str(e):
              try:
                  _processor = AutoProcessor.from_pretrained(
@@ -122,7 +128,7 @@ def _ensure_loaded() -> str:
                      MODEL_ID,
                      token=HF_TOKEN,
                      device_map="auto",
-                     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                      trust_remote_code=True,
                      quantization_config=None, # force dequantized load
                  )
@@ -140,7 +146,53 @@ def _ensure_loaded() -> str:
          _processor = _tokenizer = _model = None
          return "fail"

- # --------- INFERENCE ----------
  @spaces.GPU
  def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
      status = _ensure_loaded()
@@ -150,10 +202,13 @@ def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool
      if image is None:
          return "Please upload an image.", None, False

-     # Prompt assembly
      if hasattr(_processor, "apply_chat_template"):
          prompt = _processor.apply_chat_template(_build_messages(image), add_generation_prompt=True, tokenize=False)
      else:
          msgs = _build_messages(image)
          prompt = ""
          for m in msgs:
@@ -164,39 +219,28 @@ def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool
                  elif chunk["type"] == "image":
                      prompt += f"{role}: [IMAGE]\n"

-     inputs = _processor(text=prompt, images=image, return_tensors="pt").to(_model.device)
-
-     gen_kwargs = dict(
-         temperature=TEMP,
-         max_new_tokens=MAX_NEW_TOKENS,
-     )
-     # respect multiple eos ids if present
-     eos = getattr(_model.config, "eos_token_id", None)
-     if eos is not None:
-         gen_kwargs["eos_token_id"] = eos
-
-     # Try JSON-only output (if supported)
      try:
-         gen_kwargs["response_format"] = {"type": "json_object"}
-     except Exception:
-         pass
-
-     with torch.inference_mode():
-         out = _model.generate(**inputs, **gen_kwargs)

-     text = (_processor.decode(out[0], skip_special_tokens=True)
-             if hasattr(_processor, "decode")
-             else _tokenizer.decode(out[0], skip_special_tokens=True))

-     if USER_PROMPT in text:
-         text = text.split(USER_PROMPT)[-1].strip()

-     parsed = _json_extract(text)
      if isinstance(parsed, dict):
          return json.dumps(parsed, indent=2), parsed, True
-     return text, None, False

- # Optional: quick warmup to validate loading on first worker
  @spaces.GPU(duration=60)
  def _warmup():
      try:
@@ -204,7 +248,7 @@ def _warmup():
      except Exception as e:
          return f"warmup error: {e}"

- # --------- UI ----------
  with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (ZeroGPU)") as demo:
      gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · ZeroGPU)\nUpload an image to get **strict JSON** annotations.")

@@ -219,7 +263,7 @@ with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe

      btn.click(annotate_image, inputs=[image], outputs=[out_text, out_json, ok_flag])

-     # fire a non-blocking warmup
      try:
          _ = _warmup()
      except Exception:
 
@@ -6,19 +6,20 @@ import torch
  import spaces
  from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig

+ # ------------------ ENV ------------------
  MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
+ HF_TOKEN = os.environ.get("HF_TOKEN")
  TEMP = 0.1
  MAX_NEW_TOKENS = 2000
+ DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

+ # ------------------ GLOBALS (lazy) ------------------
  _processor: Any = None
  _tokenizer: Any = None
  _model: Any = None
  _last_load_error: str | None = None

+ # ------------------ PROMPTS ------------------
  SYSTEM_PROMPT = (
      "You are an image annotation API trained to analyze YouTube video keyframes. "
      "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -55,9 +56,8 @@ Rules:
  - Output **only the JSON**, no extra text or explanation.
  """

+ # ------------------ HELPERS ------------------
  def _json_extract(text: str):
      try:
          return json.loads(text)
      except Exception:
@@ -76,26 +76,32 @@ def _build_messages(image: Image.Image):
              {"type": "text", "text": USER_PROMPT}]}
      ]

+ def _downscale_if_huge(pil: Image.Image, max_side: int = 1280) -> Image.Image:
+     # keep aspect, cap longest side to max_side to avoid enormous tensors on ZeroGPU
+     if pil is None:
+         return pil
+     w, h = pil.size
+     m = max(w, h)
+     if m <= max_side:
+         return pil.convert("RGB")
+     scale = max_side / m
+     new_w, new_h = int(w * scale), int(h * scale)
+     return pil.convert("RGB").resize((new_w, new_h), Image.BICUBIC)
+
+ # ------------------ ZERO-GPU LAZY LOADER ------------------
  @spaces.GPU
  def _ensure_loaded() -> str:
      global _processor, _tokenizer, _model, _last_load_error
      if _model is not None and _processor is not None:
          return "already_loaded"
      try:
          cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
          if "clip" in cfg.__class__.__name__.lower():
              raise RuntimeError(
+                 f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder config; need a causal VLM."
              )

+         # Try quantized (as requested by your config)
          _processor = AutoProcessor.from_pretrained(
              MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
          )
@@ -103,7 +109,7 @@ def _ensure_loaded() -> str:
              MODEL_ID,
              token=HF_TOKEN,
              device_map="auto",
+             torch_dtype=DTYPE,
              trust_remote_code=True,
          )
          _tokenizer = getattr(_processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
@@ -112,7 +118,7 @@ def _ensure_loaded() -> str:
          _last_load_error = None
          return "ok_quant"
      except Exception as e:
+         # If the worker image doesn't have compressed-tensors, fall back dequantized
          if "compressed_tensors" in str(e):
              try:
                  _processor = AutoProcessor.from_pretrained(
@@ -122,7 +128,7 @@ def _ensure_loaded() -> str:
                      MODEL_ID,
                      token=HF_TOKEN,
                      device_map="auto",
+                     torch_dtype=DTYPE,
                      trust_remote_code=True,
                      quantization_config=None, # force dequantized load
                  )
@@ -140,7 +146,53 @@ def _ensure_loaded() -> str:
          _processor = _tokenizer = _model = None
          return "fail"

+ def _safe_generate(inputs, try_json: bool = True) -> Tuple[str, bool, str]:
+     """
+     Multi-try generation to dodge ZeroGPU/transformers edge cases:
+       1) with response_format=json_object (if supported)
+       2) no response_format
+       3) shorter output + temp 0.0
+     Returns: (text_or_error, ok, detail_tag)
+     """
+     gen_sets = []
+
+     # (1) Preferred
+     g1 = dict(temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
+     eos = getattr(_model.config, "eos_token_id", None)
+     if eos is not None:
+         g1["eos_token_id"] = eos
+     if try_json:
+         g1["response_format"] = {"type": "json_object"}
+     gen_sets.append(("json_object", g1))
+
+     # (2) No response_format
+     g2 = dict(temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
+     if eos is not None:
+         g2["eos_token_id"] = eos
+     gen_sets.append(("no_response_format", g2))
+
+     # (3) Shorter, deterministic
+     g3 = dict(temperature=0.0, max_new_tokens=min(512, MAX_NEW_TOKENS))
+     if eos is not None:
+         g3["eos_token_id"] = eos
+     gen_sets.append(("short_deterministic", g3))
+
+     last_err = None
+     for tag, g in gen_sets:
+         try:
+             with torch.inference_mode():
+                 out = _model.generate(**inputs, **g)
+             if hasattr(_processor, "decode"):
+                 text = _processor.decode(out[0], skip_special_tokens=True)
+             else:
+                 text = _tokenizer.decode(out[0], skip_special_tokens=True)
+             return text, True, tag
+         except Exception as e:
+             last_err = f"{tag}: {e}\n{traceback.format_exc()}"
+             # continue to next strategy
+     return f"Generation failed.\n{last_err or ''}", False, "all_failed"
+
+ # ------------------ INFERENCE ------------------
  @spaces.GPU
  def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
      status = _ensure_loaded()
@@ -150,10 +202,13 @@ def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool
      if image is None:
          return "Please upload an image.", None, False

+     image = _downscale_if_huge(image, max_side=1280)
+
+     # Build prompt
      if hasattr(_processor, "apply_chat_template"):
          prompt = _processor.apply_chat_template(_build_messages(image), add_generation_prompt=True, tokenize=False)
      else:
+         # conservative fallback (rarely used on Gemma-3)
          msgs = _build_messages(image)
          prompt = ""
          for m in msgs:
@@ -164,39 +219,28 @@ def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool
                  elif chunk["type"] == "image":
                      prompt += f"{role}: [IMAGE]\n"

      try:
+         inputs = _processor(text=prompt, images=image, return_tensors="pt").to(_model.device)
+     except Exception as e:
+         err = f"Preprocessing failed: {e}\n{traceback.format_exc()}"
+         return err, None, False

+     txt, ok, tag = _safe_generate(inputs, try_json=True)
+     if not ok:
+         return txt, None, False

+     # Trim echoed prompt if present
+     if USER_PROMPT in txt:
+         txt = txt.split(USER_PROMPT)[-1].strip()

+     parsed = _json_extract(txt)
      if isinstance(parsed, dict):
          return json.dumps(parsed, indent=2), parsed, True

+     # Show raw + tag to help debug ValueError causes
+     return f"(strategy={tag})\n" + txt, None, False
+
+ # Optional warmup to validate load on first worker
  @spaces.GPU(duration=60)
  def _warmup():
      try:
@@ -204,7 +248,7 @@ def _warmup():
      except Exception as e:
          return f"warmup error: {e}"

+ # ------------------ UI ------------------
  with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (ZeroGPU)") as demo:
      gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · ZeroGPU)\nUpload an image to get **strict JSON** annotations.")

@@ -219,7 +263,7 @@ with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe

      btn.click(annotate_image, inputs=[image], outputs=[out_text, out_json, ok_flag])

+     # best-effort warmup
      try:
          _ = _warmup()
      except Exception:
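
Note: a minimal local smoke test of the updated annotate_image path might look like the sketch below. This is an assumption, not part of the commit: it presumes app.py is importable from the working directory, that HF_TOKEN is exported if the checkpoint is gated, and that the machine can actually hold the 12B model (a GPU, or enough memory for the dequantized fallback). Outside a ZeroGPU Space the @spaces.GPU decorator is a pass-through, so the functions run locally.

    # hypothetical smoke test -- "keyframe.jpg" is a placeholder test image
    import os
    from PIL import Image

    os.environ.setdefault("MODEL_ID", "inference-net/ClipTagger-12b")
    # os.environ["HF_TOKEN"] = "hf_..."  # only needed for gated checkpoints

    import app  # if app.py calls demo.launch() at import time, comment that out first

    frame = Image.open("keyframe.jpg")
    text, parsed, ok = app.annotate_image(frame)  # (raw text, parsed dict or None, ok flag)
    print("ok:", ok)
    print(text)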