# NOTE(review): import order is preserved from the original file; `spaces` is
# deliberately kept first (presumably required before torch initialises CUDA
# on Hugging Face Spaces -- confirm before reordering).
import spaces
import torch
import re
import os
import gradio as gr
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    AutoTokenizer,
    AutoModelForCausalLM,
    StaticCache,
)
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
import subprocess

# Install flash-attn at startup, before the model is loaded with
# attn_implementation="flash_attention_2" below.
# NOTE(review): `env=` replaces the child's whole environment rather than
# extending it, so the shell runs with only this one variable set. Left
# unchanged on purpose; the return code is also not checked.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Use the token from the environment when present; otherwise pass True.
auth_token = os.environ.get("TOKEN_FROM_SECRET") or True

tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
moondream = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream-next",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
    token=auth_token,
)
moondream.eval()


def _stream_generation(target, **kwargs):
    """Run ``target`` in a background thread and yield its streamed text.

    ``target`` is called with ``kwargs`` plus ``tokenizer`` and a fresh
    ``TextIteratorStreamer`` passed as ``streamer``. After every chunk the
    streamer produces, the accumulated text so far (stripped of surrounding
    whitespace) is yielded, so callers always receive the full partial answer.
    """
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=target,
        kwargs={**kwargs, "tokenizer": tokenizer, "streamer": streamer},
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()

    # The stream is exhausted; wait for the worker so it does not outlive
    # this generator.
    thread.join()


@spaces.GPU(duration=10)
def answer_question(img, prompt):
    """Stream the model's answer to ``prompt`` about ``img``.

    Yields the accumulated answer text after each streamed chunk. When no
    image is supplied, yields a single empty string and stops.
    """
    if img is None:
        yield ""
        # Without this return the generator would go on to encode `None`.
        return

    image_embeds = moondream.encode_image(img)
    yield from _stream_generation(
        moondream.answer_question,
        image_embeds=image_embeds,
        question=prompt,
    )


@spaces.GPU(duration=10)
def caption(img, mode):
    """Stream a caption for ``img``.

    ``mode == "Short"`` requests ``length="short"``; any other value passes
    ``length=None``. Yields the accumulated caption after each streamed chunk.
    When no image is supplied, yields a single empty string and stops.
    """
    if img is None:
        yield ""
        # Without this return the generator would go on to caption `[None]`.
        return

    yield from _stream_generation(
        moondream.caption,
        images=[img],
        length="short" if mode == "Short" else None,
    )


# Matches a bracketed array of four floating point numbers, e.g.
# "[0.1, 0.2, 0.3, 0.4]". Each number must contain a decimal point.
_FLOAT_ARRAY_PATTERN = re.compile(
    r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
)


def extract_floats(text):
    """Return the first four-float array found in ``text`` as a list.

    Returns ``None`` when ``text`` contains no such array.
    """
    match = _FLOAT_ARRAY_PATTERN.search(text)
    if match is None:
        return None
    return [float(num) for num in match.groups()]


def extract_bbox(text):
    """Return ``(x1, y1, x2, y2)`` parsed from ``text``, or ``None``."""
    floats = extract_floats(text)  # parse once instead of twice
    if floats is None:
        return None
    x1, y1, x2, y2 = floats
    return (x1, y1, x2, y2)


def process_answer(img, answer):
    """Build a Gradio update showing ``img`` annotated with the answer's box.

    If ``answer`` contains a bounding box, the image is resized with
    ``Resize(768)``, the box coordinates are scaled by the resized width and
    height, and a red rectangle is drawn. Otherwise (or when ``img`` is
    ``None``) the update hides the component and clears its value.
    """
    bbox = extract_bbox(answer)  # parse once instead of twice
    if bbox is None or img is None:
        return gr.update(visible=False, value=None)

    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size
    # Coordinates are multiplied by the image size to get pixel positions.
    x1, x2 = int(x1 * width), int(x2 * width)
    y1, y2 = int(y1 * height), int(y2 * height)
    ImageDraw.Draw(draw_image).rectangle((x1, y1, x2, y2), outline="red", width=3)
    return gr.update(visible=True, value=draw_image)


with gr.Blocks(title="moondream vl (new)") as demo:
    gr.HTML(""" """)
    gr.Markdown(
        """
        # 🌔 moondream vl (new)
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )
    with gr.Row():
        with gr.Column():
            mode_radio = gr.Radio(
                ["Caption", "Query", "Detect"],
                show_label=False,
                value=lambda: "Caption",
            )

            # Re-renders the input widgets whenever the selected mode changes.
            # `output` is defined further down; it is only looked up when this
            # function is actually called.
            @gr.render(inputs=[mode_radio])
            def show_inputs(mode):
                if mode == "Query":
                    with gr.Group():
                        with gr.Row():
                            prompt = gr.Textbox(
                                label="Input",
                                value="How many people are in this image?",
                                scale=4,
                            )
                            submit = gr.Button("Submit")
                        img = gr.Image(type="pil", label="Upload an Image")
                    submit.click(answer_question, [img, prompt], output)
                    prompt.submit(answer_question, [img, prompt], output)
                    img.change(answer_question, [img, prompt], output)
                elif mode == "Caption":
                    with gr.Group():
                        caption_mode = gr.Radio(
                            ["Short", "Normal"],
                            show_label=False,
                            value=lambda: "Normal",
                        )
                        img = gr.Image(type="pil", label="Upload an Image")
                    caption_mode.change(caption, [img, caption_mode], output)
                    img.change(caption, [img, caption_mode], output)
                else:
                    gr.Markdown("Coming soon!")

        with gr.Column():
            output = gr.Markdown(label="Response", elem_classes=["output-text"])
            # NOTE(review): `ann` is never updated by any handler in this file
            # and `process_answer` is not wired to an event -- likely reserved
            # for the "Detect" mode.
            ann = gr.Image(visible=False, label="Annotated Image")

demo.queue().launch()