Commit 30343a2 · verified · 1 Parent(s): 906a676
andrejrad committed

Update app.py

Files changed (1): app.py (+114 -151)
app.py CHANGED
@@ -3,23 +3,20 @@ from typing import Any, Dict, Tuple
 import gradio as gr
 from PIL import Image
 import torch
-import spaces
 from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoConfig

-# ------------------ ENV ------------------
+# --------------------------
+# Env / params
+# --------------------------
 MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
-HF_TOKEN = os.environ.get("HF_TOKEN")
+HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
 TEMP = 0.1
 MAX_NEW_TOKENS = 2000
 DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

-# ------------------ GLOBALS (lazy) ------------------
-_processor: Any = None
-_tokenizer: Any = None
-_model: Any = None
-_last_load_error: str | None = None
-
-# ------------------ PROMPTS ------------------
+# --------------------------
+# Prompts (yours)
+# --------------------------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
     "You will be given instructions on the output format, what to caption, and how to perform your job. "
@@ -56,8 +53,11 @@ Rules:
 - Output **only the JSON**, no extra text or explanation.
 """

-# ------------------ HELPERS ------------------
+# --------------------------
+# Utilities
+# --------------------------
 def _json_extract(text: str):
+    """Strict JSON parse with top-level {...} fallback."""
     try:
         return json.loads(text)
     except Exception:
@@ -76,8 +76,8 @@ def _build_messages(image: Image.Image):
              {"type": "text", "text": USER_PROMPT}]}
     ]

-def _downscale_if_huge(pil: Image.Image, max_side: int = 1280) -> Image.Image:
-    # keep aspect, cap longest side to max_side to avoid enormous tensors on ZeroGPU
+def _downscale_if_huge(pil: Image.Image, max_side: int = 1792) -> Image.Image:
+    """Cap longest side to keep memory predictable; A100 is roomy but this avoids extreme uploads."""
     if pil is None:
         return pil
     w, h = pil.size
@@ -88,127 +88,79 @@ def _downscale_if_huge(pil: Image.Image, max_side: int = 1280) -> Image.Image:
     new_w, new_h = int(w * scale), int(h * scale)
     return pil.convert("RGB").resize((new_w, new_h), Image.BICUBIC)

-# ------------------ ZERO-GPU LAZY LOADER ------------------
-@spaces.GPU
-def _ensure_loaded() -> str:
-    global _processor, _tokenizer, _model, _last_load_error
-    if _model is not None and _processor is not None:
-        return "already_loaded"
-    try:
-        cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
-        if "clip" in cfg.__class__.__name__.lower():
-            raise RuntimeError(
-                f"MODEL_ID '{MODEL_ID}' is a CLIP/encoder config; need a causal VLM."
-            )
+# --------------------------
+# Load model (dedicated GPU)
+# --------------------------
+processor = tokenizer = model = None
+LOAD_ERROR = None
+
+try:
+    cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
+    if "clip" in cfg.__class__.__name__.lower():
+        raise RuntimeError(
+            f"MODEL_ID '{MODEL_ID}' resolves to a CLIP/encoder config; need a causal VLM checkpoint."
+        )

-        # Try quantized (as requested by your config)
-        _processor = AutoProcessor.from_pretrained(
+    # Try quantized path (compressed-tensors) per your config
+    try:
+        processor = AutoProcessor.from_pretrained(
             MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
         )
-        _model = AutoModelForCausalLM.from_pretrained(
+    except TypeError:
+        processor = AutoProcessor.from_pretrained(
+            MODEL_ID, token=HF_TOKEN, trust_remote_code=True
+        )
+
+    try:
+        model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             token=HF_TOKEN,
             device_map="auto",
             torch_dtype=DTYPE,
             trust_remote_code=True,
         )
-        _tokenizer = getattr(_processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
-            MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
-        )
-        _last_load_error = None
-        return "ok_quant"
     except Exception as e:
-        # If the worker image doesn't have compressed-tensors, fall back dequantized
+        # Fallback: disable quantization if the backend isn't available
         if "compressed_tensors" in str(e):
-            try:
-                _processor = AutoProcessor.from_pretrained(
-                    MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
-                )
-                _model = AutoModelForCausalLM.from_pretrained(
-                    MODEL_ID,
-                    token=HF_TOKEN,
-                    device_map="auto",
-                    torch_dtype=DTYPE,
-                    trust_remote_code=True,
-                    quantization_config=None,  # force dequantized load
-                )
-                _tokenizer = getattr(_processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
-                    MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
-                )
-                _last_load_error = None
-                return "ok_dequant"
-            except Exception as e2:
-                _last_load_error = f"{e}\n\nFallback failed:\n{e2}\n{traceback.format_exc()}"
-                _processor = _tokenizer = _model = None
-                return "fail"
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_ID,
+                token=HF_TOKEN,
+                device_map="auto",
+                torch_dtype=DTYPE,
+                trust_remote_code=True,
+                quantization_config=None,
+            )
         else:
-            _last_load_error = f"{e}\n{traceback.format_exc()}"
-            _processor = _tokenizer = _model = None
-            return "fail"
+            raise

-def _safe_generate(inputs, try_json: bool = True) -> Tuple[str, bool, str]:
-    """
-    Multi-try generation to dodge ZeroGPU/transformers edge cases:
-      1) with response_format=json_object (if supported)
-      2) no response_format
-      3) shorter output + temp 0.0
-    Returns: (text_or_error, ok, detail_tag)
-    """
-    gen_sets = []
+    tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
+        MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
+    )

-    # (1) Preferred
-    g1 = dict(temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
-    eos = getattr(_model.config, "eos_token_id", None)
-    if eos is not None:
-        g1["eos_token_id"] = eos
-    if try_json:
-        g1["response_format"] = {"type": "json_object"}
-    gen_sets.append(("json_object", g1))
-
-    # (2) No response_format
-    g2 = dict(temperature=TEMP, max_new_tokens=MAX_NEW_TOKENS)
-    if eos is not None:
-        g2["eos_token_id"] = eos
-    gen_sets.append(("no_response_format", g2))
-
-    # (3) Shorter, deterministic
-    g3 = dict(temperature=0.0, max_new_tokens=min(512, MAX_NEW_TOKENS))
-    if eos is not None:
-        g3["eos_token_id"] = eos
-    gen_sets.append(("short_deterministic", g3))
-
-    last_err = None
-    for tag, g in gen_sets:
-        try:
-            with torch.inference_mode():
-                out = _model.generate(**inputs, **g)
-            if hasattr(_processor, "decode"):
-                text = _processor.decode(out[0], skip_special_tokens=True)
-            else:
-                text = _tokenizer.decode(out[0], skip_special_tokens=True)
-            return text, True, tag
-        except Exception as e:
-            last_err = f"{tag}: {e}\n{traceback.format_exc()}"
-            # continue to next strategy
-    return f"Generation failed.\n{last_err or ''}", False, "all_failed"
-
-# ------------------ INFERENCE ------------------
-@spaces.GPU
-def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
-    status = _ensure_loaded()
-    if status == "fail":
-        return f"❌ Load error:\n{_last_load_error}", None, False
+except Exception as e:
+    LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"

+# --------------------------
+# Inference
+# --------------------------
+def run(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool]:
     if image is None:
         return "Please upload an image.", None, False
+    if model is None or processor is None:
+        msg = (
+            "❌ Model failed to load.\n\n"
+            f"{LOAD_ERROR or 'Unknown error.'}\n"
+            "Check MODEL_ID/HF_TOKEN and that the repo includes model + processor files."
+        )
+        return msg, None, False

-    image = _downscale_if_huge(image, max_side=1280)
+    image = _downscale_if_huge(image)

-    # Build prompt
-    if hasattr(_processor, "apply_chat_template"):
-        prompt = _processor.apply_chat_template(_build_messages(image), add_generation_prompt=True, tokenize=False)
+    # Build chat prompt
+    if hasattr(processor, "apply_chat_template"):
+        prompt = processor.apply_chat_template(_build_messages(image), add_generation_prompt=True, tokenize=False)
     else:
-        # conservative fallback (rarely used on Gemma-3)
+        # Very rare fallback path
        msgs = _build_messages(image)
        prompt = ""
        for m in msgs:
@@ -219,38 +171,51 @@ def annotate_image(image: Image.Image) -> Tuple[str, Dict[str, Any] | None, bool
             elif chunk["type"] == "image":
                 prompt += f"{role}: [IMAGE]\n"

-    try:
-        inputs = _processor(text=prompt, images=image, return_tensors="pt").to(_model.device)
-    except Exception as e:
-        err = f"Preprocessing failed: {e}\n{traceback.format_exc()}"
-        return err, None, False
-
-    txt, ok, tag = _safe_generate(inputs, try_json=True)
-    if not ok:
-        return txt, None, False
+    # Tokenize with vision
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

-    # Trim echoed prompt if present
-    if USER_PROMPT in txt:
-        txt = txt.split(USER_PROMPT)[-1].strip()
-
-    parsed = _json_extract(txt)
-    if isinstance(parsed, dict):
-        return json.dumps(parsed, indent=2), parsed, True
-
-    # Show raw + tag to help debug ValueError causes
-    return f"(strategy={tag})\n" + txt, None, False
+    # Gen args
+    gen_kwargs = dict(
+        temperature=TEMP,
+        max_new_tokens=MAX_NEW_TOKENS,
+    )
+    eos = getattr(model.config, "eos_token_id", None)
+    if eos is not None:
+        gen_kwargs["eos_token_id"] = eos
+
+    # Try to enforce JSON; if unsupported, we'll retry without
+    tried = []
+    for tag, extra in [
+        ("json_object", {"response_format": {"type": "json_object"}}),
+        ("no_response_format", {}),
+        ("short_deterministic", {"temperature": 0.0, "max_new_tokens": min(512, MAX_NEW_TOKENS)}),
+    ]:
+        try:
+            with torch.inference_mode():
+                out = model.generate(**inputs, **{**gen_kwargs, **extra})
+            text = (processor.decode(out[0], skip_special_tokens=True)
+                    if hasattr(processor, "decode")
+                    else AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN, use_fast=True).decode(out[0], skip_special_tokens=True))
+            if USER_PROMPT in text:
+                text = text.split(USER_PROMPT)[-1].strip()
+            parsed = _json_extract(text)
+            if isinstance(parsed, dict):
+                return json.dumps(parsed, indent=2), parsed, True
+            tried.append((tag, "parsed-failed"))
+        except Exception as e:
+            tried.append((tag, f"err={e}"))

-# Optional warmup to validate load on first worker
-@spaces.GPU(duration=60)
-def _warmup():
-    try:
-        return _ensure_loaded()
-    except Exception as e:
-        return f"warmup error: {e}"
+    # If all strategies failed, return debug info
+    return "Generation failed.\nTried: " + "\n".join([f"{t[0]} -> {t[1]}" for t in tried]), None, False

-# ------------------ UI ------------------
-with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (ZeroGPU)") as demo:
-    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · ZeroGPU)\nUpload an image to get **strict JSON** annotations.")
+# --------------------------
+# UI
+# --------------------------
+with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM)") as demo:
+    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT · A100)\nUpload an image to get **strict JSON** annotations.")
+    if LOAD_ERROR:
+        with gr.Accordion("Startup Error Details", open=False):
+            gr.Markdown(f"```\n{LOAD_ERROR}\n```")

     with gr.Row():
         with gr.Column(scale=1):
@@ -259,14 +224,12 @@ with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe
         with gr.Column(scale=1):
             out_text = gr.Code(label="Output (JSON or error)")
             out_json = gr.JSON(label="Parsed JSON")
-            ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
+            ok = gr.Checkbox(label="Valid JSON", value=False, interactive=False)

-    btn.click(annotate_image, inputs=[image], outputs=[out_text, out_json, ok_flag])
+    def on_click(img):
+        return run(img)

-# best-effort warmup
-try:
-    _ = _warmup()
-except Exception:
-    pass
+    btn.click(on_click, inputs=[image], outputs=[out_text, out_json, ok])

-demo.queue(max_size=32).launch()
+# Conservative concurrency to avoid OOM spikes; A100-80GB can increase this.
+demo.queue(max_size=32, max_concurrency=1).launch()
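
For a quick smoke test outside the web UI, one option is the gradio_client package. This is a sketch, not part of the commit: the Space id is a placeholder, and the endpoint name (likely "/on_click", since btn.click wires a handler of that name without an explicit api_name) should be confirmed with view_api().

# Sketch only: Space id is hypothetical; confirm the real endpoint name with view_api().
from gradio_client import Client, handle_file

client = Client("your-username/your-space")      # hypothetical Space id; pass hf_token=... if the Space is private
print(client.view_api())                         # lists the actual endpoint names and signatures

# Outputs mirror btn.click: raw JSON text, parsed JSON (or None), and the "Valid JSON" flag.
text, parsed, ok = client.predict(
    handle_file("keyframe.jpg"),                 # path to a local test image
    api_name="/on_click",                        # assumed from the handler name
)
print(ok)
print(text)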