from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch
import re

BLIP2_MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
BLIP_DEVICE = "cpu"
MAX_LENGTH = 120

processor = None
model = None


def lazy_load_blip2():
    """Load the BLIP-2 processor and model on first use and cache them in module globals."""
    global processor, model
    if processor is None or model is None:
        print("\U0001F680 [BLIP2] Loading BLIP-2 model and processor on CPU...")
        processor = Blip2Processor.from_pretrained(BLIP2_MODEL_NAME)
        model = Blip2ForConditionalGeneration.from_pretrained(
            BLIP2_MODEL_NAME,
            torch_dtype=torch.float32  # full precision for CPU inference
        ).to(BLIP_DEVICE).eval()


def clean_caption(text: str) -> str:
    """Collapse whitespace, strip surrounding quotes/newlines, and capitalize the first character."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    text = text.strip(' "\n')
    return text[0].upper() + text[1:] if text else text
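
# Illustrative example only (not part of the original file), showing the normalization
# clean_caption applies to raw BLIP-2 output:
#   clean_caption('  "a  watercolor  portrait"\n')  ->  'A watercolor portrait'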


def describe_uploaded_images(images: list[Image.Image]) -> dict:
    """Caption each uploaded image with BLIP-2 and return the deduplicated captions joined into one description."""
    if not images:
        return {"style_description": "", "full_caption": ""}
    lazy_load_blip2()
    captions = []
    prompt = (
        "Describe this image in detail. Focus on the art medium, visual style, mood or tone, lighting or rendering cues, "
        "and describe how people interact with objects if applicable."
    )
    for img in images:
        try:
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(BLIP_DEVICE)
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_LENGTH)
            caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            cleaned = clean_caption(caption)
            if cleaned and cleaned not in captions:  # skip empty or duplicate captions
                captions.append(cleaned)
        except Exception as e:
            print(f"\u274C [BLIP-2 ERROR] Failed to describe image: {e}")
            continue
    joined_caption = "; ".join(captions)
    return {
        "style_description": joined_caption,
        "full_caption": joined_caption
    }
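

# Minimal usage sketch (not part of the original file): assumes a local image at
# "example.jpg"; the path and the __main__ guard are illustrative only.
if __name__ == "__main__":
    sample = Image.open("example.jpg").convert("RGB")
    result = describe_uploaded_images([sample])
    print(result["style_description"])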