File size: 5,211 Bytes
0e53af6
 
 
bdb8c95
0e53af6
 
abc934b
 
 
 
 
 
0e53af6
 
 
 
abc934b
 
 
 
 
 
0e53af6
811dbd7
 
0e53af6
abc934b
 
 
 
 
 
0e53af6
 
 
 
 
 
abc934b
13b9696
abc934b
0e53af6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abc934b
 
 
 
13b9696
abc934b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e53af6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abc934b
0e53af6
 
 
 
 
 
 
 
 
 
 
 
 
abc934b
 
 
 
 
 
 
 
 
c57ffa7
 
 
 
 
 
 
 
abc934b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c57ffa7
 
0e53af6
abc934b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import spaces
import torch
import re
import os
import gradio as gr
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    AutoTokenizer,
    AutoModelForCausalLM,
    StaticCache,
)
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize

import subprocess

# Install flash-attn at startup (HF Spaces pattern: the build image lacks the
# CUDA toolchain, so skip the CUDA build and rely on the prebuilt wheel).
# BUG FIX: `env=` replaces the *entire* child environment; merge with
# os.environ so pip still sees PATH, HOME, etc.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Gated model access: use the Space secret when present; `True` falls back to
# locally cached Hugging Face credentials.
auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
moondream = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream-next",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
    token=auth_token,
)
# Inference only — disable dropout etc.
moondream.eval()


@spaces.GPU(duration=10)
def answer_question(img, prompt):
    """Stream an answer to `prompt` about image `img`.

    Yields the accumulated response text so Gradio renders it incrementally.
    Yields a single empty string when no image is provided.
    """
    if img is None:
        yield ""
        return  # BUG FIX: without this, encode_image(None) is attempted below

    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # Generation runs on a worker thread; the streamer feeds tokens back to
    # this generator as they are produced.
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()


@spaces.GPU(duration=10)
def caption(img, mode):
    """Stream a caption for `img` ("Short" mode yields a shorter caption).

    Yields the accumulated caption text so Gradio renders it incrementally.
    Yields a single empty string when no image is provided.
    """
    if img is None:
        yield ""
        return  # BUG FIX: without this, captioning of a None image is attempted

    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # Generation runs on a worker thread; the streamer feeds tokens back to
    # this generator as they are produced.
    thread = Thread(
        target=moondream.caption,
        kwargs={
            "images": [img],
            # None selects the model's default ("Normal") caption length.
            "length": "short" if mode == "Short" else None,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()


def extract_floats(text):
    """Find the first bracketed list of four decimal floats in *text*.

    Matches e.g. "[0.1, 0.2, 0.3, 0.4]" (whitespace-tolerant, optional minus
    signs). Returns the four numbers as a list of floats, or None when no
    such pattern occurs.
    """
    number = r"(-?\d+\.\d+)"
    sep = r"\s*,\s*"
    pattern = r"\[\s*" + sep.join([number] * 4) + r"\s*\]"
    found = re.search(pattern, text)
    if found is None:
        return None
    return [float(group) for group in found.groups()]


def extract_bbox(text):
    """Return the first four-float bounding box in *text* as an (x1, y1, x2, y2)
    tuple, or None when no box is present.
    """
    # Parse once instead of twice (the original called extract_floats both to
    # test for None and again to unpack the values).
    floats = extract_floats(text)
    if floats is None:
        return None
    x1, y1, x2, y2 = floats
    return (x1, y1, x2, y2)


def process_answer(img, answer):
    """If *answer* contains a normalized bounding box, draw it on a resized
    copy of *img*.

    Returns a gr.update for the annotated-image component: visible with the
    drawn image when a box was found, hidden otherwise.
    """
    # Parse once instead of twice (the original called extract_bbox both to
    # test for None and again to unpack the values).
    bbox = extract_bbox(answer)
    if bbox is None:
        return gr.update(visible=False, value=None)

    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size
    # Box coordinates are normalized to [0, 1]; scale them to pixel space.
    x1, x2 = int(x1 * width), int(x2 * width)
    y1, y2 = int(y1 * height), int(y2 * height)
    ImageDraw.Draw(draw_image).rectangle((x1, y1, x2, y2), outline="red", width=3)
    return gr.update(visible=True, value=draw_image)


# UI definition. Layout: a mode selector on the left whose input widgets are
# re-rendered per mode, and a streamed-markdown response column on the right.
with gr.Blocks(title="moondream vl (new)") as demo:
    # Inline CSS: enlarge the streamed markdown response text.
    gr.HTML(
        """
        <style type="text/css">
            .output-text span p { font-size: 1.4rem !important; }
        </style>
        """
    )
    gr.Markdown(
        """
        # 🌔 moondream vl (new)
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )
    with gr.Row():
        with gr.Column():
            # Top-level mode switch; the widgets below are rebuilt by the
            # @gr.render callback whenever its value changes.
            mode_radio = gr.Radio(
                ["Caption", "Query", "Detect"],
                show_label=False,
                value=lambda: "Caption",
            )

            # Rebuilds the input column for the selected mode. `output` is
            # defined later in this Blocks context; the render callback runs
            # after the context body completes, so the closure resolves it.
            @gr.render(inputs=[mode_radio])
            def show_inputs(mode):
                if mode == "Query":
                    with gr.Group():
                        with gr.Row():
                            prompt = gr.Textbox(
                                label="Input",
                                value="How many people are in this image?",
                                scale=4,
                            )
                            submit = gr.Button("Submit")
                        img = gr.Image(type="pil", label="Upload an Image")
                    # Any of button click, textbox submit, or image change
                    # triggers a (re-)query.
                    submit.click(answer_question, [img, prompt], output)
                    prompt.submit(answer_question, [img, prompt], output)
                    img.change(answer_question, [img, prompt], output)
                elif mode == "Caption":
                    with gr.Group():
                        caption_mode = gr.Radio(
                            ["Short", "Normal"],
                            show_label=False,
                            value=lambda: "Normal",
                        )
                        img = gr.Image(type="pil", label="Upload an Image")
                    caption_mode.change(caption, [img, caption_mode], output)
                    img.change(caption, [img, caption_mode], output)
                else:
                    # "Detect" mode is a placeholder for now.
                    gr.Markdown("Coming soon!")

        with gr.Column():
            output = gr.Markdown(label="Response", elem_classes=["output-text"])
            # NOTE(review): `ann` (and process_answer above) is never wired to
            # any event handler in this view — looks like leftover/planned
            # detect-mode plumbing; confirm before removing.
            ann = gr.Image(visible=False, label="Annotated Image")


# Enable request queuing (needed for streaming generators) and start the app.
demo.queue().launch()