Spaces:

GrassData
/

cliptagger-12b

Running on A100

App Files Files Community

andrejrad committed 3 days ago

Commit

0557d7f

verified ·

1 Parent(s): 501fb5d

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -86

app.py CHANGED Viewed

@@ -1,24 +1,24 @@
 import os, json, re, traceback
 import gradio as gr
 from PIL import Image
 import torch
 import spaces
 # --------------------------
-# Config (via Space secrets)
 # --------------------------
-# ADAPTER_ID: your fine-tune adapter repo (PEFT). Example: GrassData/cliptagger-12b
-# BASE_ID: the Gemma-3 VLM base you fine-tuned from. Example: google/gemma-3-12b-it (gated)
-# HF_TOKEN: user access token that has access to BASE_ID (if gated)
-ADAPTER_ID = os.environ.get("MODEL_ID", os.environ.get("ADAPTER_ID", "inference-net/ClipTagger-12b"))
-BASE_ID    = os.environ.get("BASE_ID", "google/gemma-3-12b-it")
-HF_TOKEN   = os.environ.get("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE  = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 # --------------------------
-# Prompts (your spec)
 # --------------------------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
@@ -57,101 +57,77 @@ Rules:
 """
 # --------------------------
-# Load base + adapter (PEFT)
 # --------------------------
-def load_model_stack():
-    from transformers import AutoProcessor, AutoTokenizer, AutoConfig, AutoModelForCausalLM
-    from peft import PeftModel
-    # Prefer loading processor from BASE_ID (has preproc files). If you've vendored
-    # processor files into the adapter repo, you can switch to ADAPTER_ID here.
     try:
         processor = AutoProcessor.from_pretrained(
-            BASE_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
         )
     except TypeError:
-        # Some processor classes don't accept use_fast
         processor = AutoProcessor.from_pretrained(
-            BASE_ID, token=HF_TOKEN, trust_remote_code=True
         )
-    # Sanity check: ADAPTER should not be CLIP-only
-    cfg = AutoConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN, trust_remote_code=True)
-    if cfg.__class__.__name__.lower().startswith("clip"):
-        raise RuntimeError(
-            f"MODEL_ID/ADAPTER_ID ({ADAPTER_ID}) resolves to a CLIP/encoder config "
-            "and cannot be used with AutoModelForCausalLM. Point to your PEFT adapter "
-            "repo (Gemma-3 VLM adapters) or a full causal VLM checkpoint."
-        )
-    base = AutoModelForCausalLM.from_pretrained(
-        BASE_ID,
         token=HF_TOKEN,
         device_map="auto",
         torch_dtype=DTYPE,
         trust_remote_code=True,
     )
-    model = PeftModel.from_pretrained(
-        base,
-        ADAPTER_ID,
-        token=HF_TOKEN,
     )
-    # Merge adapters for faster inference (optional)
-    try:
-        model = model.merge_and_unload()
-    except Exception:
-        # If merge isn’t supported, we keep PEFT wrapper
-        pass
-    tokenizer = getattr(processor, "tokenizer", None)
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(
-            BASE_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
-        )
-    return processor, tokenizer, model
-LOAD_ERROR = None
-processor = tokenizer = model = None
-try:
-    processor, tokenizer, model = load_model_stack()
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
 # --------------------------
 # Inference
 # --------------------------
-def build_messages(image: Image.Image):
     return [
         {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
-        {"role": "user",   "content": [{"type": "image", "image": image},
-                                       {"type": "text", "text": USER_PROMPT}]}
     ]
-def generate_json(image: Image.Image):
     if image is None:
         return "Please upload an image.", None, False
     if model is None or processor is None:
         msg = (
             "❌ Model failed to load.\n\n"
-            f"{LOAD_ERROR or 'Unknown error. Check BASE_ID/ADAPTER_ID/HF_TOKEN.'}\n"
-            "• Ensure HF_TOKEN belongs to an account with access to the BASE_ID (if gated).\n"
-            "• Ensure MODEL_ID/ADAPTER_ID points to a Gemma-3 VLM PEFT adapter (not CLIP).\n"
-            "• Optionally vendor processor files into your adapter repo."
         )
         return msg, None, False
-    # Prepare chat prompt
     if hasattr(processor, "apply_chat_template"):
         prompt = processor.apply_chat_template(
-            build_messages(image), add_generation_prompt=True, tokenize=False
         )
     else:
-        # Fallback join (rare for Gemma-3)
-        msgs = build_messages(image)
         prompt = ""
         for m in msgs:
             role = m["role"].upper()
@@ -164,39 +140,49 @@ def generate_json(image: Image.Image):
     # Tokenize with vision
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
-    # Generate with fixed params
     gen_kwargs = dict(
-        max_new_tokens=2000,
-        temperature=0.1,
-        eos_token_id=getattr(tokenizer, "eos_token_id", None),
     )
-    # Ask for JSON-only if supported by the model head
-    # (Some trust_remote_code models accept response_format)
     try:
         gen_kwargs["response_format"] = {"type": "json_object"}
     except Exception:
         pass
     with torch.inference_mode():
-        out = model.generate(**inputs, **gen_kwargs)
-    # Decode
     if hasattr(processor, "decode"):
-        text = processor.decode(out[0], skip_special_tokens=True)
     else:
-        text = tokenizer.decode(out[0], skip_special_tokens=True)
-    # Best-effort: trim any preamble
     if USER_PROMPT in text:
         text = text.split(USER_PROMPT)[-1].strip()
-    # Parse JSON
     try:
         parsed = json.loads(text)
         return json.dumps(parsed, indent=2), parsed, True
     except Exception:
-        # Try to recover a top-level {...}
         m = re.search(r"\{(?:[^{}]|(?R))*\}", text, flags=re.DOTALL)
         if m:
             try:
@@ -204,14 +190,37 @@ def generate_json(image: Image.Image):
                 return json.dumps(parsed, indent=2), parsed, True
             except Exception:
                 pass
         return text, None, False
 # --------------------------
-# UI
 # --------------------------
-with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (ClipTagger + Adapter)") as demo:
-    gr.Markdown("# Keyframe Annotator (ClipTagger)\nUpload an image to get **strict JSON** annotations.")
     if LOAD_ERROR:
         with gr.Accordion("Startup Error Details", open=False):
             gr.Markdown(f"```\n{LOAD_ERROR}\n```")
@@ -219,17 +228,16 @@ with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe
     with gr.Row():
         with gr.Column(scale=1):
             image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")
-            annotate_btn = gr.Button("Annotate", variant="primary")
         with gr.Column(scale=1):
-            out_code = gr.Code(label="Model Output (JSON or error text)")
             out_json = gr.JSON(label="Parsed JSON")
             ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
-    @spaces.GPU  # ensures a GPU task is registered
-    def on_submit(img):
-        text, js, ok = generate_json(img)
         return text, js, ok
-    annotate_btn.click(on_submit, inputs=[image], outputs=[out_code, out_json, ok_flag])
 demo.queue(max_size=32).launch()

 import os, json, re, traceback
+from typing import Any, Dict, Tuple
 import gradio as gr
 from PIL import Image
 import torch
 import spaces
 # --------------------------
+# Environment
 # --------------------------
+MODEL_ID = os.environ.get("MODEL_ID", "inference-net/ClipTagger-12b")
+HF_TOKEN = os.environ.get("HF_TOKEN")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE  = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+TEMP = 0.1
+MAX_NEW_TOKENS = 2000
 # --------------------------
+# Prompts (yours)
 # --------------------------
 SYSTEM_PROMPT = (
     "You are an image annotation API trained to analyze YouTube video keyframes. "
 """
 # --------------------------
+# Load full VLM (Gemma-3)
 # --------------------------
+from transformers import AutoConfig, AutoProcessor, AutoTokenizer, AutoModelForCausalLM
+processor = tokenizer = model = None
+LOAD_ERROR = None
+try:
+    cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)
+    if "clip" in cfg.__class__.__name__.lower():
+        raise RuntimeError(
+            f"MODEL_ID '{MODEL_ID}' resolves to a CLIP/encoder config. "
+            "Point MODEL_ID to your full VLM checkpoint (this repo's config shows gemma3)."
+        )
+    # Processor (has vision + tokenizer routing)
     try:
         processor = AutoProcessor.from_pretrained(
+            MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
         )
     except TypeError:
         processor = AutoProcessor.from_pretrained(
+            MODEL_ID, token=HF_TOKEN, trust_remote_code=True
         )
+    # Model
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
         token=HF_TOKEN,
         device_map="auto",
         torch_dtype=DTYPE,
         trust_remote_code=True,
     )
+    # Tokenizer (fall back in case processor doesn't expose it)
+    tokenizer = getattr(processor, "tokenizer", None) or AutoTokenizer.from_pretrained(
+        MODEL_ID, token=HF_TOKEN, trust_remote_code=True, use_fast=True
     )
 except Exception as e:
     LOAD_ERROR = f"{e}\n\n{traceback.format_exc()}"
 # --------------------------
 # Inference
 # --------------------------
+def _build_messages(image: Image.Image):
     return [
         {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
+        {"role": "user", "content": [{"type": "image", "image": image},
+                                     {"type": "text",  "text": USER_PROMPT}]}
     ]
+def _run(image: Image.Image) -> Tuple[str, Dict[str, Any], bool]:
     if image is None:
         return "Please upload an image.", None, False
     if model is None or processor is None:
         msg = (
             "❌ Model failed to load.\n\n"
+            f"{LOAD_ERROR or 'Unknown error.'}\n"
+            "Check: MODEL_ID, HF_TOKEN, and that the repo includes processor + model shards."
         )
         return msg, None, False
+    # Build chat input
     if hasattr(processor, "apply_chat_template"):
         prompt = processor.apply_chat_template(
+            _build_messages(image), add_generation_prompt=True, tokenize=False
         )
     else:
+        # Conservative fallback
+        msgs = _build_messages(image)
         prompt = ""
         for m in msgs:
             role = m["role"].upper()
     # Tokenize with vision
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
+    # Generation args
     gen_kwargs = dict(
+        temperature=TEMP,
+        max_new_tokens=MAX_NEW_TOKENS,
     )
+    # If your config has multiple eos ids (yours does: [1, 106]), pass them
+    eos_id = getattr(tokenizer, "eos_token_id", None)
+    try:
+        # prefer config’s eos_token_id if list-like
+        from transformers.utils import is_torch_available
+        cfg_eos = getattr(model.config, "eos_token_id", None)
+        if isinstance(cfg_eos, (list, tuple)):
+            gen_kwargs["eos_token_id"] = list(cfg_eos)
+        elif eos_id is not None:
+            gen_kwargs["eos_token_id"] = eos_id
+    except Exception:
+        if eos_id is not None:
+            gen_kwargs["eos_token_id"] = eos_id
+    # Ask model to emit strict JSON (supported in newer transformers for some models)
     try:
         gen_kwargs["response_format"] = {"type": "json_object"}
     except Exception:
         pass
     with torch.inference_mode():
+        out_ids = model.generate(**inputs, **gen_kwargs)
+    # Decode via processor if available (some VLMs override decode)
     if hasattr(processor, "decode"):
+        text = processor.decode(out_ids[0], skip_special_tokens=True)
     else:
+        text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
+    # Trim any echoed prompt
     if USER_PROMPT in text:
         text = text.split(USER_PROMPT)[-1].strip()
+    # Strict parse, with fallback to top-level {...}
     try:
         parsed = json.loads(text)
         return json.dumps(parsed, indent=2), parsed, True
     except Exception:
         m = re.search(r"\{(?:[^{}]|(?R))*\}", text, flags=re.DOTALL)
         if m:
             try:
                 return json.dumps(parsed, indent=2), parsed, True
             except Exception:
                 pass
+        # Return raw text to help debug prompt adherence if needed
         return text, None, False
 # --------------------------
+# Spaces GPU entry + warmup
 # --------------------------
+@spaces.GPU
+def annotate_image(pil: Image.Image):
+    return _run(pil)
+@spaces.GPU(duration=60)
+def _warmup():
+    if model is None or processor is None:
+        return "skip"
+    try:
+        dummy = Image.new("RGB", (64, 64), (127, 127, 127))
+        _ = _run(dummy)
+        return "ok"
+    except Exception as e:
+        return f"warmup error: {e}"
+try:
+    _ = _warmup()
+except Exception:
+    pass
+# --------------------------
+# UI
+# --------------------------
+with gr.Blocks(theme=gr.themes.Soft(), analytics_enabled=False, title="Keyframe Annotator (Gemma-3 VLM)") as demo:
+    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT)\nUpload an image to get **strict JSON** annotations.")
     if LOAD_ERROR:
         with gr.Accordion("Startup Error Details", open=False):
             gr.Markdown(f"```\n{LOAD_ERROR}\n```")
     with gr.Row():
         with gr.Column(scale=1):
             image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")
+            btn = gr.Button("Annotate", variant="primary")
         with gr.Column(scale=1):
+            out_text = gr.Code(label="Output (JSON or error)")
             out_json = gr.JSON(label="Parsed JSON")
             ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)
+    def on_click(img):
+        text, js, ok = _run(img)
         return text, js, ok
+    btn.click(annotate_image, inputs=[image], outputs=[out_text, out_json, ok_flag])
 demo.queue(max_size=32).launch()