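# Gradio demo for the moondream vision-language model: streams short or normal
# captions and answers to free-form questions about an uploaded image. The
# Detect mode is stubbed out as "coming soon".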
import spaces
import torch
import re
import os
import gradio as gr
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    AutoTokenizer,
    AutoModelForCausalLM,
    StaticCache,
)
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
import subprocess
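# flash-attn is installed at startup rather than baked into the image; setting
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips the slow CUDA kernel compilation
# step (the usual pattern for installing flash-attn on Spaces).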
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
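
# Use the Space's secret token when configured; `True` falls back to the
# locally cached Hugging Face credentials.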
auth_token = os.environ.get("TOKEN_FROM_SECRET") or True
tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
moondream = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream-next",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
    token=auth_token,
)
moondream.eval()
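
# Both generation entry points below run on ZeroGPU: spaces.GPU(duration=10)
# allocates a GPU for each call and releases it after at most ten seconds,
# while tokens are streamed back to the UI as they are produced.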
@spaces.GPU(duration=10)
def answer_question(img, prompt):
    if img is None:
        yield ""
        return  # nothing to answer without an image

    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()

@spaces.GPU(duration=10)
def caption(img, mode):
    if img is None:
        yield ""
        return  # nothing to caption without an image

    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.caption,
        kwargs={
            "images": [img],
            "length": "short" if mode == "Short" else None,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer.strip()

def extract_floats(text):
    # Regular expression to match an array of four floating point numbers
    pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
    match = re.search(pattern, text)
    if match:
        # Extract the numbers and convert them to floats
        return [float(num) for num in match.groups()]
    return None  # no bounding box found in the text
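# e.g. extract_floats("box: [0.10, 0.25, 0.40, 0.75]") -> [0.1, 0.25, 0.4, 0.75]
# and extract_floats("no box here") -> None.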

def extract_bbox(text):
    floats = extract_floats(text)
    if floats is None:
        return None
    x1, y1, x2, y2 = floats
    return (x1, y1, x2, y2)

def process_answer(img, answer):
    bbox = extract_bbox(answer)
    if bbox is None:
        return gr.update(visible=False, value=None)
    # The coordinates in the answer are normalized to [0, 1]; scale them to the
    # pixel dimensions of the resized copy before drawing.
    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size
    x1, x2 = int(x1 * width), int(x2 * width)
    y1, y2 = int(y1 * height), int(y2 * height)
    ImageDraw.Draw(draw_image).rectangle((x1, y1, x2, y2), outline="red", width=3)
    return gr.update(visible=True, value=draw_image)
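# Hypothetical wiring for a future Detect mode (component names assumed from
# the layout below; this is not part of the current UI): re-parse the streamed
# answer and surface the annotated image next to the text, e.g.
#
#   output.change(process_answer, inputs=[img, output], outputs=ann)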

with gr.Blocks(title="moondream vl (new)") as demo:
    gr.HTML(
        """
        <style type="text/css">
            .output-text span p { font-size: 1.4rem !important; }
        </style>
        """
    )
    gr.Markdown(
        """
        # 🌔 moondream vl (new)
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )

    with gr.Row():
        with gr.Column():
            mode_radio = gr.Radio(
                ["Caption", "Query", "Detect"],
                show_label=False,
                value=lambda: "Caption",
            )

            @gr.render(inputs=[mode_radio])
            def show_inputs(mode):
                if mode == "Query":
                    with gr.Group():
                        with gr.Row():
                            prompt = gr.Textbox(
                                label="Input",
                                value="How many people are in this image?",
                                scale=4,
                            )
                            submit = gr.Button("Submit")
                        img = gr.Image(type="pil", label="Upload an Image")
                    submit.click(answer_question, [img, prompt], output)
                    prompt.submit(answer_question, [img, prompt], output)
                    img.change(answer_question, [img, prompt], output)
                elif mode == "Caption":
                    with gr.Group():
                        caption_mode = gr.Radio(
                            ["Short", "Normal"],
                            show_label=False,
                            value=lambda: "Normal",
                        )
                        img = gr.Image(type="pil", label="Upload an Image")
                    caption_mode.change(caption, [img, caption_mode], output)
                    img.change(caption, [img, caption_mode], output)
                else:
                    gr.Markdown("Coming soon!")

        with gr.Column():
            output = gr.Markdown(label="Response", elem_classes=["output-text"])
            ann = gr.Image(visible=False, label="Annotated Image")

demo.queue().launch()