File size: 2,605 Bytes
4255784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

# https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct
# https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
# model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
# model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"

# Load model & processor
# The checkpoint is addressed as "<organisation>/<model name>" on the Hugging Face Hub.
model_name = "SmolVLM2-2.2B-Instruct"
model_path = f"HuggingFaceTB/{model_name}"
processor = AutoProcessor.from_pretrained(model_path)

# Preferred device for this process. The model itself is placed by
# device_map below; this name is kept for any code that refers to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: device_map="auto" already decides where the weights live, so no
# explicit .to(device) is chained here. Moving a model that was dispatched
# across several devices (or partially offloaded) is reported by Accelerate
# as a warning or a RuntimeError; on a single device the extra move was a no-op.
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use FP16 for better performance on T4
    device_map="auto",  # Auto-assign model to GPU
)


import torch
import os
def describe_image(image_path, user_prompt="Describe the image in detail.", system_role=""):
    """Generate a text description of an image with the module-level model.

    Args:
        image_path: Filesystem path of the image to describe. May be ``None``
            or empty (e.g. when the UI submits without an uploaded image).
        user_prompt: Instruction placed in the user turn next to the image.
        system_role: Optional system prompt; skipped when empty or ``None``.

    Returns:
        The text after the last ``"Assistant:"`` marker in the decoded output,
        stripped of surrounding whitespace, or ``None`` when ``image_path`` is
        missing or does not point to an existing file.
    """
    # os.path.exists(None) raises TypeError, so reject a missing path first.
    if not image_path or not os.path.exists(image_path):
        return None

    messages = []
    if system_role:
        messages.append(
            {
                "role": "system",
                "content": [{"type": "text", "text": system_role}],
            }
        )
    messages.append(
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image", "path": image_path},
            ],
        }
    )

    # Prepare input
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Cast floating-point tensors to the model's own dtype instead of a
    # hard-coded float16, so the inputs match however the model was loaded.
    # Integer tensors are left untouched. Assumes every value is a tensor,
    # as the original dtype check did.
    for key, value in inputs.items():
        if value.is_floating_point():
            inputs[key] = value.to(model.dtype)

    # Generate response (greedy decoding, capped at 1024 new tokens)
    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=1024)

    # Decode and keep only what follows the last "Assistant:" marker.
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    answer = generated_texts[0].split("Assistant:")[-1]
    return answer.replace("\n\n\n\n\n\n", "").strip()

import gradio as gr

def ui():
    """Build the Gradio interface that wraps ``describe_image``.

    Returns:
        A ``gr.Interface`` with an image upload, a user-prompt box and an
        optional system-role box as inputs, and one text box as output.
    """
    # Components are created in the same order as the positional arguments
    # of describe_image: image path, user prompt, system role.
    image_input = gr.Image(type="filepath", label="Upload Image")
    prompt_input = gr.Textbox(value="Describe the image in detail.", label="User Prompt")
    role_input = gr.Textbox(value="", label="System Role (Optional)")
    description_output = gr.Textbox(label="Image Description")

    return gr.Interface(
        fn=describe_image,
        inputs=[image_input, prompt_input, role_input],
        outputs=description_output,
        title="Image Captioning App",
        description="Upload an image and customize prompts to get a detailed description.",
    )
# Build the interface once at import time; `demo` stays a module-level name.
demo = ui()
# Enable the request queue, then start serving the app.
demo.queue().launch()