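# Multimodal Gradio chat demo that streams answers from SmolVLM-256M-Instruct.
# Intended for a Hugging Face Space: the `spaces` package provides the ZeroGPU
# @spaces.GPU decorator. To run elsewhere, remove that import and decorator.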
import gradio as gr
import torch
from PIL import Image
from threading import Thread
from transformers import AutoProcessor, AutoModelForVision2Seq, TextIteratorStreamer
import spaces

# Prefer bfloat16 on GPU for speed and memory; bfloat16 support on CPU is
# inconsistent, so fall back to float32 there.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    trust_remote_code=True,  # defensive: SmolVLM is supported natively in recent transformers
).to(device)


@spaces.GPU  # on ZeroGPU Spaces, allocates a GPU for the duration of each call
def respond(
    message: dict,
    history: list[dict],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Stream the assistant reply; ChatInterface renders each yielded prefix."""
    # SmolVLM's chat template expects each turn's content to be a list of
    # typed parts ({"type": "text"} / {"type": "image"}), so wrap the system
    # prompt accordingly.
    messages = [{"role": "system", "content": [{"type": "text", "text": system_message}]}]

    # With type="messages", history is a list of {"role", "content"} dicts.
    # File turns carry non-string content; skip them and keep only the text
    # turns (only the currently attached image is sent to the model).
    for turn in history:
        if isinstance(turn["content"], str):
            messages.append(
                {"role": turn["role"], "content": [{"type": "text", "text": turn["content"]}]}
            )

    images = []
    if message["files"]:
        # Use the first attached file; convert to RGB for a consistent mode.
        pil_image = Image.open(message["files"][0]).convert("RGB")
        images.append(pil_image)

    # An {"type": "image"} part makes apply_chat_template emit the <image>
    # placeholder that the processor pairs with the PIL image.
    content = [{"type": "text", "text": message["text"]}]
    if images:
        content.insert(0, {"type": "image"})
    messages.append({"role": "user", "content": content})

    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Pass None instead of an empty list so text-only turns skip image processing.
    inputs = processor(text=prompt, images=images if images else None, return_tensors="pt").to(device, torch_dtype)

    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # Generate on a background thread so partial text can be yielded from the
    # streamer while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()  # the streamer is exhausted once generation finishes

# multimodal=True pairs the chat box with a file upload; type="messages"
# keeps history in openai-style {"role", "content"} dicts.
demo = gr.ChatInterface(
    respond,
    type="messages",
    multimodal=True,
    additional_inputs=[
        gr.Textbox(value="You are a helpful and friendly multimodal assistant. You can analyze images and answer questions about them.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Chatbot",
    description="Ask me anything or upload an image. This version uses AutoModel and AutoProcessor directly.",
)

if __name__ == "__main__":
    demo.launch()