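"""Gradio demo for moondream, a tiny vision language model.

Runs on Hugging Face ZeroGPU (via the `spaces` package below) and supports
streaming captioning and visual question answering; detection is stubbed out.
"""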
import spaces
import torch
import re
import os
import gradio as gr
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    AutoTokenizer,
    AutoModelForCausalLM,
    StaticCache,
)
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
import subprocess

# Install FlashAttention at startup; the env flag tells its setup to skip
# compiling the CUDA extension and use a prebuilt wheel instead. Merge in
# os.environ so pip still sees PATH and the rest of the environment.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
# Use the Space secret if set; `True` falls back to the locally cached HF token.
auth_token = os.environ.get("TOKEN_FROM_SECRET") or True

tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
moondream = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream-next",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
    token=auth_token,
)
moondream.eval()
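
# ZeroGPU: each decorated call below is allotted a GPU slice for at most
# `duration` seconds.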
@spaces.GPU(duration=10)
def answer_question(img, prompt):
    if img is None:
        yield ""
        return

    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # Run generation on a background thread so tokens can be streamed
    # back through the TextIteratorStreamer as they are produced.
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()
@spaces.GPU(duration=10)
def caption(img, mode):
    if img is None:
        yield ""
        return

    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.caption,
        kwargs={
            "images": [img],
            # None falls through to the model's default caption length.
            "length": "short" if mode == "Short" else None,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()
def extract_floats(text):
    # Regular expression matching an array of four floating point numbers
    pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
    match = re.search(pattern, text)
    if match:
        # Convert the captured groups to floats
        return [float(num) for num in match.groups()]
    return None  # No match found
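
# Illustration (not executed): extract_floats("box: [0.10, 0.20, 0.85, 0.90]")
# returns [0.1, 0.2, 0.85, 0.9]; extract_floats("no box here") returns None.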
def extract_bbox(text):
    floats = extract_floats(text)
    if floats is not None:
        x1, y1, x2, y2 = floats
        return (x1, y1, x2, y2)
    return None
def process_answer(img, answer):
    # The model reports boxes as normalized [x1, y1, x2, y2] in [0, 1];
    # scale them to pixel coordinates on the resized image before drawing.
    bbox = extract_bbox(answer)
    if bbox is not None:
        x1, y1, x2, y2 = bbox
        draw_image = Resize(768)(img)
        width, height = draw_image.size
        x1, x2 = int(x1 * width), int(x2 * width)
        y1, y2 = int(y1 * height), int(y2 * height)
        ImageDraw.Draw(draw_image).rectangle(
            (x1, y1, x2, y2), outline="red", width=3
        )
        return gr.update(visible=True, value=draw_image)
    return gr.update(visible=False, value=None)
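
# Note: process_answer and the annotated-image component below are defined but
# not wired to any event yet; "Detect" mode is still a "Coming soon!" stub.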
with gr.Blocks(title="moondream vl (new)") as demo:
    gr.HTML(
        """
        <style type="text/css">
            .output-text span p { font-size: 1.4rem !important; }
        </style>
        """
    )
    gr.Markdown(
        """
        # 🌔 moondream vl (new)
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )

    with gr.Row():
        with gr.Column():
            mode_radio = gr.Radio(
                ["Caption", "Query", "Detect"],
                show_label=False,
                value=lambda: "Caption",
            )
            # Re-render the input widgets whenever the mode changes. `output`
            # is defined in the second column below; that is fine because this
            # function only runs after the full Blocks context has been built.
            @gr.render(inputs=[mode_radio])
            def show_inputs(mode):
                if mode == "Query":
                    with gr.Group():
                        with gr.Row():
                            prompt = gr.Textbox(
                                label="Input",
                                value="How many people are in this image?",
                                scale=4,
                            )
                            submit = gr.Button("Submit")
                        img = gr.Image(type="pil", label="Upload an Image")
                    submit.click(answer_question, [img, prompt], output)
                    prompt.submit(answer_question, [img, prompt], output)
                    img.change(answer_question, [img, prompt], output)
                elif mode == "Caption":
                    with gr.Group():
                        caption_mode = gr.Radio(
                            ["Short", "Normal"],
                            show_label=False,
                            value=lambda: "Normal",
                        )
                        img = gr.Image(type="pil", label="Upload an Image")
                    caption_mode.change(caption, [img, caption_mode], output)
                    img.change(caption, [img, caption_mode], output)
                else:
                    gr.Markdown("Coming soon!")
        with gr.Column():
            output = gr.Markdown(label="Response", elem_classes=["output-text"])
            ann = gr.Image(visible=False, label="Annotated Image")
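
# Enable the request queue (used by the streaming generator handlers) and launch.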
demo.queue().launch()