"""Gradio image-captioning demo backed by a SmolVLM2 vision-language model."""

import os
from typing import Optional

import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Model cards:
#   https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct
#   https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
# To try the smaller checkpoint, set model_name to "SmolVLM2-500M-Video-Instruct".

# Load model & processor
model_name = "SmolVLM2-2.2B-Instruct"
model_path = f"HuggingFaceTB/{model_name}"
processor = AutoProcessor.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# FP16 gives better performance on GPUs such as the T4; on a CPU-only host
# fall back to FP32, since half precision brings no benefit there.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# device_map="auto" already places the weights on the available device(s).
# The model must not be moved again with .to(): that is redundant on a single
# GPU and is rejected/warned about when the weights are sharded or offloaded.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=model_dtype,
    device_map="auto",
)


def describe_image(
    image_path: Optional[str],
    user_prompt: str = "Describe the image in detail.",
    system_role: str = "",
) -> Optional[str]:
    """Generate a text answer about an image with the module-level model.

    Args:
        image_path: Filesystem path of the image to describe. May be ``None``
            or empty (e.g. when the UI is submitted without an image).
        user_prompt: Instruction sent to the model together with the image.
        system_role: Optional system message; omitted from the conversation
            when empty.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``None``
        when ``image_path`` is missing or does not exist on disk.
    """
    # Guard against None/"" first: os.path.exists(None) raises TypeError.
    if not image_path or not os.path.exists(image_path):
        return None

    messages = []
    if system_role:
        messages.append(
            {
                "role": "system",
                "content": [{"type": "text", "text": system_role}],
            }
        )
    messages.append(
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image", "path": image_path},
            ],
        }
    )

    # Prepare input. Passing dtype casts only the floating-point tensors
    # (pixel values) to the model's precision; integer token ids and masks
    # keep their dtype.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)

    # Generate response; no gradients are needed for inference.
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs, do_sample=False, max_new_tokens=1024
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # transcript on "Assistant:" would truncate any reply that itself
    # contains that string.
    prompt_length = inputs["input_ids"].shape[1]
    reply_ids = generated_ids[:, prompt_length:]
    generated_texts = processor.batch_decode(reply_ids, skip_special_tokens=True)
    return generated_texts[0].replace("\n\n\n\n\n\n", "").strip()


def ui():
    """Build the Gradio interface that wraps :func:`describe_image`."""
    return gr.Interface(
        fn=describe_image,
        inputs=[
            gr.Image(type="filepath", label="Upload Image"),
            gr.Textbox(value="Describe the image in detail.", label="User Prompt"),
            gr.Textbox(value="", label="System Role (Optional)"),
        ],
        outputs=gr.Textbox(label="Image Description"),
        title="Image Captioning App",
        description="Upload an image and customize prompts to get a detailed description.",
    )


demo = ui()

# Only start the server when run as a script, so importing this module
# (e.g. to reuse describe_image) does not block on launch().
if __name__ == "__main__":
    demo.queue().launch()