import torch
from huggingface_hub import snapshot_download
from peft import PeftModel
from PIL import Image, ImageOps
from transformers import AutoModelForVision2Seq, AutoProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

base_model_name = "HuggingFaceTB/SmolVLM-256M-Instruct"

# The processor only handles tokenization and image preprocessing; dtype and
# attention implementation are model arguments, so they belong on the model
# loader below (the original passed them to the processor, where they are
# silently ignored).
processor = AutoProcessor.from_pretrained(base_model_name)

base_model = AutoModelForVision2Seq.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    # flash_attention_2 requires the flash-attn package and a CUDA GPU;
    # fall back to the eager implementation on CPU.
    _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
).to(device)

# Download the LoRA adapter from the Hub and attach it to the base model.
repo_local_path = snapshot_download(repo_id="Irina1402/smolvlm-painting-description")
model = PeftModel.from_pretrained(base_model, model_id=repo_local_path)
model.eval()


def process_chat(text: str = None, image: Image.Image = None):
    """Process the input and generate a response using SmolVLM."""
    image_data = None
    content = []

    if image:
        # Normalize color mode and EXIF orientation before preprocessing.
        image_data = ImageOps.exif_transpose(image.convert("RGB"))
        content.append({"type": "image"})
    if text:
        content.append({"type": "text", "text": text})

    # Build the chat message from whatever inputs were actually provided,
    # rather than hardcoding both an image and a text entry.
    message = [{"role": "user", "content": content}]
    prompt = processor.apply_chat_template(message, add_generation_prompt=True)
    print(f"Prepared prompt:\n{prompt}")

    processed_inputs = processor(
        text=prompt,
        images=[image_data] if image_data else None,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            **processed_inputs, max_new_tokens=50, repetition_penalty=1.2
        )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # The decoded output contains the full chat transcript; keep only the
    # assistant's reply.
    assistant_text = generated_text.split("Assistant:", 1)[-1].strip()
    return assistant_text
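

# Usage sketch (not part of the original script): a minimal example call,
# assuming a hypothetical local image file "painting.jpg" exists; replace the
# path and prompt with your own.
if __name__ == "__main__":
    sample_image = Image.open("painting.jpg")
    answer = process_chat(text="Describe this painting.", image=sample_image)
    print(answer)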