# test-space / app.py
# vikhyatk's picture
# Update app.py
# 1d3ecfd verified
# raw
# history blame
# 4.81 kB
import spaces
import torch
import re
import os
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
import subprocess
# Install flash-attn when the app starts. The extra variable is merged into
# the current environment instead of replacing it: passing a bare dict as
# `env` would strip PATH, HOME, etc. from the child process and could keep
# `pip` from being found. The result is deliberately not checked, as in the
# original best-effort install.
# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the
# flash-attn build to skip compiling CUDA kernels -- confirm upstream.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Use the token from the Space secret when it is set; otherwise pass True.
# NOTE(review): True presumably makes transformers fall back to the locally
# stored Hugging Face credentials -- confirm for the installed version.
auth_token = os.environ.get("TOKEN_FROM_SECRET") or True

# NOTE(review): the tokenizer is loaded from "vikhyatk/moondream2" while the
# model weights come from "vikhyatk/moondream-next" -- confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
moondream = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream-next",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
    use_auth_token=auth_token,
)
moondream.eval()
@spaces.GPU(duration=10)
def answer_question(img, prompt):
    """Stream the model's answer to ``prompt`` about ``img``.

    This is a generator: after each chunk of text produced by the model it
    yields the whole answer accumulated so far (stripped of surrounding
    whitespace), so the UI can show the text growing.

    Args:
        img: The image to ask about, or None when nothing was uploaded.
        prompt: The question text.

    Yields:
        The accumulated answer text, or a short hint when ``img`` is None.
    """
    # Without an image there is nothing to encode; show a hint instead of
    # letting encode_image fail on None.
    if img is None:
        yield "Please upload an image first."
        return

    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

    # Generation runs in a worker thread and pushes text into the streamer,
    # while this generator consumes the streamer and forwards the text.
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()

    # The streamer is exhausted, so generation is finishing; wait for the
    # worker so it does not outlive this call.
    thread.join()
def extract_floats(text):
    """Find the first bracketed list of four decimal numbers in ``text``.

    Each number must be written with a decimal point and digits on both
    sides of it (an optional leading minus sign is allowed); plain integers
    do not match.

    Returns:
        A list of four floats, or None when ``text`` contains no such list.
    """
    box_pattern = re.compile(
        r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
    )
    found = box_pattern.search(text)
    if found is None:
        # Nothing that looks like "[a.b, c.d, e.f, g.h]" in the text.
        return None
    return list(map(float, found.groups()))
def extract_bbox(text):
    """Return the bounding box found in ``text`` as an ``(x1, y1, x2, y2)`` tuple.

    The text is parsed once with ``extract_floats``.

    Returns:
        A tuple of four floats, or None when ``extract_floats`` finds no
        four-number list in ``text``.
    """
    floats = extract_floats(text)
    if floats is None:
        return None
    x1, y1, x2, y2 = floats
    return (x1, y1, x2, y2)
def process_answer(img, answer):
    """Build the update for the annotated-image component from an answer.

    When ``answer`` contains a bounding box (see ``extract_bbox``), the image
    is resized with ``Resize(768)``, the box is drawn on the resized copy in
    red, and the component is made visible with that image. Otherwise the
    component is hidden and cleared.

    Args:
        img: The uploaded image, or None when no image is present.
        answer: The answer text to scan for a bounding box.

    Returns:
        A ``gr.update(...)`` for the annotated-image component.
    """
    # Nothing to draw on, or nothing to parse: keep the annotation hidden.
    if img is None or not answer:
        return gr.update(visible=False, value=None)

    bbox = extract_bbox(answer)
    if bbox is None:
        return gr.update(visible=False, value=None)

    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size

    # The box values are scaled by the resized image's width/height to get
    # pixel positions. Each axis is sorted so the rectangle is always given
    # as (left, top, right, bottom), even if the text lists the corners in
    # the other order.
    left, right = sorted((int(x1 * width), int(x2 * width)))
    top, bottom = sorted((int(y1 * height), int(y2 * height)))

    ImageDraw.Draw(draw_image).rectangle(
        (left, top, right, bottom), outline="red", width=3
    )
    return gr.update(visible=True, value=draw_image)
# Stylesheet handed to gr.Blocks(css=...) below. The class names defined here
# are the ones the layout attaches via elem_classes ("container", "header",
# "input-section", "response-section") and the "github-link" class used by the
# anchor inside the header markdown.
custom_css = """
.container {
max-width: 900px;
margin: auto;
padding: 20px;
}
.header {
text-align: center;
margin-bottom: 2rem;
}
.github-link {
display: inline-block;
padding: 8px 16px;
background-color: #24292e;
color: white;
text-decoration: none;
border-radius: 6px;
margin-top: 10px;
}
.github-link:hover {
background-color: #2f363d;
}
.input-section {
margin: 20px 0;
}
.response-section {
margin-top: 20px;
padding: 20px;
border-radius: 8px;
background-color: #f8f9fa;
}
"""
# Page layout: a header, an input section (image + question + button) and a
# response section (streamed answer + optional annotated image).
with gr.Blocks(css=custom_css) as demo:
    with gr.Column(elem_classes="container"):
        # The emoji in the user-facing strings below were mis-decoded UTF-8
        # sequences in the previous revision; they are restored here.
        gr.Markdown(
            """
<div class="header">
# 🌔 Moondream VL
### A Tiny Vision Language Model
<a href="https://github.com/vikhyat/moondream" class="github-link" target="_blank">
📦 GitHub Repository
</a>
</div>
""",
            elem_classes="header"
        )
        with gr.Column(elem_classes="input-section"):
            img = gr.Image(
                type="pil",
                label="Upload an Image",
                elem_id="image-upload",
                height=400
            )
            with gr.Row():
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask something about the image...",
                    value="Describe this image.",
                    scale=4
                )
                submit = gr.Button(
                    "✨ Analyze",
                    variant="primary",
                    scale=1
                )
        with gr.Column(elem_classes="response-section"):
            output = gr.Markdown(label="AI Response")
            # Hidden until process_answer finds a bounding box in the answer.
            ann = gr.Image(
                visible=False,
                label="Annotated Image"
            )

    # Event handlers
    # Both the button and pressing Enter in the textbox start the same
    # streaming answer.
    submit.click(
        answer_question,
        inputs=[img, prompt],
        outputs=output
    )
    prompt.submit(
        answer_question,
        inputs=[img, prompt],
        outputs=output
    )
    # Whenever the answer text changes, re-check it for a bounding box and
    # show or hide the annotated image accordingly.
    output.change(
        process_answer,
        inputs=[img, output],
        outputs=ann,
        show_progress=False
    )

demo.queue().launch()