Spaces:

GrassData
/

cliptagger-12b

Running on A100

App Files Files Community

cliptagger-12b / app.py

andrejrad

Update app.py

0557d7f verified 8 days ago

raw

history blame

9.91 kB

	import json
	import os
	import re
	import traceback
	from typing import Any, Dict, Optional, Tuple

	import gradio as gr
	import spaces
	import torch
	from PIL import Image

	# --------------------------
	# Environment
	# --------------------------
	# Checkpoint to load and the optional Hub access token, both overridable
	# through environment variables of the same name.
	MODEL_ID = os.getenv("MODEL_ID", "inference-net/ClipTagger-12b")
	HF_TOKEN = os.getenv("HF_TOKEN")

	# bfloat16 on GPU, float32 on CPU. DTYPE is handed to from_pretrained below;
	# NOTE(review): DEVICE is not referenced anywhere else in the visible file.
	if torch.cuda.is_available():
	    DEVICE, DTYPE = "cuda", torch.bfloat16
	else:
	    DEVICE, DTYPE = "cpu", torch.float32

	# Generation settings passed to model.generate().
	TEMP = 0.1
	MAX_NEW_TOKENS = 2000

	# --------------------------
	# Prompts
	# --------------------------
	# System-role text: sent as the "system" message built by _build_messages().
	SYSTEM_PROMPT = (
	"You are an image annotation API trained to analyze YouTube video keyframes. "
	"You will be given instructions on the output format, what to caption, and how to perform your job. "
	"Follow those instructions. For descriptions and summaries, provide them directly and do not lead them with "
	"'This image shows' or 'This keyframe displays...', just get right into the details."
	)

	# User-role text: sent alongside the image. It spells out the JSON schema the
	# model must return. _run() also searches the decoded output for this exact
	# string to strip an echoed prompt, so any edit here changes that matching too.
	USER_PROMPT = """You are an image annotation API trained to analyze YouTube video keyframes. You must respond with a valid JSON object matching the exact structure below.

	Your job is to extract detailed factual elements directly visible in the image. Do not speculate or interpret artistic intent, camera focus, or composition. Do not include phrases like "this appears to be", "this looks like", or anything about the image itself. Describe what is physically present in the frame, and nothing more.

	Return JSON in this structure:

	{
	"description": "A detailed, factual account of what is visibly happening (4 sentences max). Only mention concrete elements or actions that are clearly shown. Do not include anything about how the image is styled, shot, or composed. Do not lead the description with something like 'This image shows' or 'this keyframe is...', just get right into the details.",
	"objects": ["object1 with relevant visual details", "object2 with relevant visual details", ...],
	"actions": ["action1 with participants and context", "action2 with participants and context", ...],
	"environment": "Detailed factual description of the setting and atmosphere based on visible cues (e.g., interior of a classroom with fluorescent lighting, or outdoor forest path with snow-covered trees).",
	"content_type": "The type of content it is, e.g. 'real-world footage', 'video game', 'animation', 'cartoon', 'CGI', 'VTuber', etc.",
	"specific_style": "Specific genre, aesthetic, or platform style (e.g., anime, 3D animation, mobile gameplay, vlog, tutorial, news broadcast, etc.)",
	"production_quality": "Visible production level: e.g., 'professional studio', 'amateur handheld', 'webcam recording', 'TV broadcast', etc.",
	"summary": "One clear, comprehensive sentence summarizing the visual content of the frame. Like the description, get right to the point.",
	"logos": ["logo1 with visual description", "logo2 with visual description", ...]
	}

	Rules:
	- Be specific and literal. Focus on what is explicitly visible.
	- Do NOT include interpretations of emotion, mood, or narrative unless it's visually explicit.
	- No artistic or cinematic analysis.
	- Always include the language of any text in the image if present as an object, e.g. "English text", "Japanese text", "Russian text", etc.
	- Maximum 10 objects and 5 actions.
	- Return an empty array for 'logos' if none are present.
	- Always output strictly valid JSON with proper escaping.
	- Output only the JSON, no extra text or explanation.
	"""

	# --------------------------
	# Load full VLM (Gemma-3)
	# --------------------------
	from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

	# Each handle stays None until its from_pretrained call succeeds. LOAD_ERROR
	# holds the message and traceback of whatever failed, or None.
	processor = None
	tokenizer = None
	model = None
	LOAD_ERROR = None

	try:
	    cfg = AutoConfig.from_pretrained(MODEL_ID, token=HF_TOKEN, trust_remote_code=True)

	    # Refuse a bare CLIP/encoder checkpoint early with an actionable message.
	    if "clip" in type(cfg).__name__.lower():
	        raise RuntimeError(
	            f"MODEL_ID '{MODEL_ID}' resolves to a CLIP/encoder config. "
	            "Point MODEL_ID to your full VLM checkpoint (this repo's config shows gemma3)."
	        )

	    # Processor (vision + tokenizer routing). Retry without use_fast if this
	    # processor's loader rejects that keyword.
	    try:
	        processor = AutoProcessor.from_pretrained(
	            MODEL_ID,
	            token=HF_TOKEN,
	            trust_remote_code=True,
	            use_fast=True,
	        )
	    except TypeError:
	        processor = AutoProcessor.from_pretrained(
	            MODEL_ID,
	            token=HF_TOKEN,
	            trust_remote_code=True,
	        )

	    # Model weights, placed automatically across the available devices.
	    model = AutoModelForCausalLM.from_pretrained(
	        MODEL_ID,
	        token=HF_TOKEN,
	        device_map="auto",
	        torch_dtype=DTYPE,
	        trust_remote_code=True,
	    )

	    # Prefer the processor's own tokenizer; load one separately only when the
	    # processor does not expose a usable one.
	    tokenizer = getattr(processor, "tokenizer", None)
	    if not tokenizer:
	        tokenizer = AutoTokenizer.from_pretrained(
	            MODEL_ID,
	            token=HF_TOKEN,
	            trust_remote_code=True,
	            use_fast=True,
	        )

	except Exception as exc:
	    # Keep the app importable; the failure is shown in the UI and by _run().
	    LOAD_ERROR = "\n\n".join((str(exc), traceback.format_exc()))

	# --------------------------
	# Inference
	# --------------------------
	def _build_messages(image: Image.Image):
	    """Build the two-turn chat (system, then user) for one image.

	    The system turn carries SYSTEM_PROMPT as text. The user turn carries the
	    image first, followed by USER_PROMPT as text.
	    """
	    system_turn = {
	        "role": "system",
	        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
	    }
	    user_turn = {
	        "role": "user",
	        "content": [
	            {"type": "image", "image": image},
	            {"type": "text", "text": USER_PROMPT},
	        ],
	    }
	    return [system_turn, user_turn]

	def _extract_json(text: str) -> Optional[Dict[str, Any]]:
	    """Return the first JSON object embedded in ``text``, or ``None``.

	    A decode is attempted at every ``{`` in turn, so anything before or after
	    the object (role labels, code fences, trailing commentary) is ignored.
	    This replaces a recursive ``(?R)`` regex, which Python's ``re`` module
	    cannot compile.
	    """
	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            candidate, _ = decoder.raw_decode(text, start)
	        except ValueError:
	            candidate = None
	        if isinstance(candidate, dict):
	            return candidate
	        start = text.find("{", start + 1)
	    return None


	def _run(image: "Image.Image") -> Tuple[str, Optional[Dict[str, Any]], bool]:
	    """Annotate one image with the loaded model and parse its JSON reply.

	    Args:
	        image: PIL image to annotate. ``None`` is reported as a missing upload.

	    Returns:
	        A ``(text, parsed, ok)`` tuple. When the reply parses as JSON, ``text``
	        is the pretty-printed JSON, ``parsed`` the decoded value and ``ok`` is
	        ``True``. Otherwise ``text`` is a user-facing message or the raw model
	        output, ``parsed`` is ``None`` and ``ok`` is ``False``.

	    Raises:
	        Whatever the processor or ``model.generate`` raises; callers such as
	        ``_warmup`` handle that themselves.
	    """
	    if image is None:
	        return "Please upload an image.", None, False
	    if model is None or processor is None:
	        msg = (
	            "❌ Model failed to load.\n\n"
	            f"{LOAD_ERROR or 'Unknown error.'}\n"
	            "Check: MODEL_ID, HF_TOKEN, and that the repo includes processor + model shards."
	        )
	        return msg, None, False

	    # Build chat input
	    messages = _build_messages(image)
	    if hasattr(processor, "apply_chat_template"):
	        prompt = processor.apply_chat_template(
	            messages, add_generation_prompt=True, tokenize=False
	        )
	    else:
	        # Conservative fallback: one "ROLE: ..." line per content chunk.
	        parts = []
	        for message in messages:
	            role = message["role"].upper()
	            for chunk in message["content"]:
	                if chunk["type"] == "text":
	                    parts.append(f"{role}: {chunk['text']}\n")
	                elif chunk["type"] == "image":
	                    parts.append(f"{role}: [IMAGE]\n")
	        prompt = "".join(parts)

	    # Tokenize with vision
	    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

	    # Generation args. No "response_format" entry: it is not a generate()
	    # argument and generate() rejects keyword arguments it does not use.
	    gen_kwargs = dict(
	        temperature=TEMP,
	        max_new_tokens=MAX_NEW_TOKENS,
	    )
	    # Prefer the config's eos_token_id when it lists several ids, otherwise
	    # fall back to the tokenizer's single id.
	    cfg_eos = getattr(model.config, "eos_token_id", None)
	    tok_eos = getattr(tokenizer, "eos_token_id", None)
	    if isinstance(cfg_eos, (list, tuple)):
	        gen_kwargs["eos_token_id"] = list(cfg_eos)
	    elif tok_eos is not None:
	        gen_kwargs["eos_token_id"] = tok_eos

	    with torch.inference_mode():
	        # Both mappings must be unpacked into keyword arguments.
	        out_ids = model.generate(**inputs, **gen_kwargs)

	    # A decoder-only model returns prompt + completion. Keep only the
	    # completion so the strict JSON parse is not defeated by the echoed chat.
	    sequence = out_ids[0]
	    if not getattr(model.config, "is_encoder_decoder", False):
	        sequence = sequence[inputs["input_ids"].shape[-1]:]

	    # Decode via processor if available (some VLMs override decode)
	    if hasattr(processor, "decode"):
	        text = processor.decode(sequence, skip_special_tokens=True)
	    else:
	        text = tokenizer.decode(sequence, skip_special_tokens=True)
	    text = text.strip()

	    # Trim any echoed prompt that is still present.
	    if USER_PROMPT in text:
	        text = text.split(USER_PROMPT)[-1].strip()

	    # Strict parse first, then fall back to the first embedded {...} object.
	    try:
	        parsed = json.loads(text)
	    except ValueError:
	        parsed = _extract_json(text)
	        if parsed is None:
	            # Return raw text to help debug prompt adherence if needed
	            return text, None, False
	    return json.dumps(parsed, indent=2), parsed, True

	# --------------------------
	# Spaces GPU entry + warmup
	# --------------------------
	@spaces.GPU
	def annotate_image(pil: Image.Image):
	    """Click handler for the Annotate button: run inference under spaces.GPU.

	    Returns the (text, parsed JSON, valid flag) triple produced by _run().
	    """
	    display_text, parsed, is_valid = _run(pil)
	    return display_text, parsed, is_valid

	@spaces.GPU(duration=60)
	def _warmup():
	    """Push one small grey image through _run() and report the outcome.

	    Returns "skip" when the model or processor is not loaded, "ok" when the
	    pass completes, and "warmup error: ..." when it raises.
	    """
	    if processor is None or model is None:
	        return "skip"
	    try:
	        _run(Image.new("RGB", (64, 64), (127, 127, 127)))
	    except Exception as exc:
	        return f"warmup error: {exc}"
	    return "ok"

	# Run one warmup pass at startup. This is best-effort: the outcome is reported
	# on stdout, but a failure must never prevent the UI below from starting.
	try:
	    _warmup_status = _warmup()
	except Exception:
	    _warmup_status = f"warmup raised:\n{traceback.format_exc()}"
	print(f"[startup] warmup: {_warmup_status}", flush=True)

	# --------------------------
	# UI
	# --------------------------
	with gr.Blocks(
	    theme=gr.themes.Soft(),
	    analytics_enabled=False,
	    title="Keyframe Annotator (Gemma-3 VLM)",
	) as demo:
	    gr.Markdown("# Keyframe Annotator (Gemma-3-12B FT)\nUpload an image to get strict JSON annotations.")

	    # Show the captured load failure, if any, in a collapsed panel.
	    if LOAD_ERROR:
	        with gr.Accordion("Startup Error Details", open=False):
	            gr.Markdown(f"```\n{LOAD_ERROR}\n```")

	    with gr.Row():
	        # Left column: input image and trigger button.
	        with gr.Column(scale=1):
	            image = gr.Image(type="pil", label="Upload Image", image_mode="RGB")
	            btn = gr.Button("Annotate", variant="primary")
	        # Right column: raw text, parsed JSON and validity flag.
	        with gr.Column(scale=1):
	            out_text = gr.Code(label="Output (JSON or error)")
	            out_json = gr.JSON(label="Parsed JSON")
	            ok_flag = gr.Checkbox(label="Valid JSON", value=False, interactive=False)

	    # NOTE(review): on_click is not wired to any event; the button below uses
	    # annotate_image, which wraps the same _run() call in spaces.GPU.
	    def on_click(img):
	        return _run(img)

	    btn.click(
	        annotate_image,
	        inputs=[image],
	        outputs=[out_text, out_json, ok_flag],
	    )

	# Queue up to 32 pending requests, then start serving.
	demo.queue(max_size=32).launch()