# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
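"""Gradio demo for the XiaomiMiMo/MiMo-VL-7B-RL-2508 vision-language model.

Offers an "Offline" tab for multimodal chat (text, images, .mp4 videos) and
an "Online" tab that records webcam frames and chats about them, running
inference on a Hugging Face ZeroGPU Space.
"""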
import os
import gradio as gr
from infer import MiMoVLInfer
import spaces
# infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL")
infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL-2508")
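
# `MiMoVLInfer` (from the sibling infer.py) is used below as a callable
# generator: infer(inputs={'text': ..., 'files': [...]}, history=...,
# temperature=...) yields (partial_response_text, updated_history) tuples,
# and exposes to_device("cuda"/"cpu") to move weights on and off the GPU.

# UI strings keyed by widget, then by language. Only English is populated;
# add entries here (and to the language dropdown) to localize the interface.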
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
    },
    "gr_tab_ol": {
        "English": "Online",
    },
    "gr_tab_ofl": {
        "English": "Offline",
    },
    "gr_temperature": {
        "English": "Temperature",
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
    },
}
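
# On ZeroGPU Spaces, @spaces.GPU reserves a GPU only while the decorated
# function runs; `duration` is the per-call time budget in seconds.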
@spaces.GPU(duration=120)  # bump `duration` if a single request can exceed 120s
def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list, temperature: float):
    infer.to_device("cuda")
    try:
        # Immediate placeholder so the UI shows progress before the first token.
        yield [{"role": "assistant", "content": "⏳ Reserving GPU & preparing inference…"}], infer_history
        for response_text, infer_history in infer(inputs=gr_inputs,
                                                  history=infer_history,
                                                  temperature=temperature):
            if response_text.startswith('<think>') and '</think>' not in response_text:
                # Open <think> block: stream the chain of thought under a
                # collapsible "Thinking" title. removeprefix (not lstrip) so
                # only the literal tag is dropped, not any matching characters.
                reasoning_text = response_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    "metadata": {"title": "🤔 Thinking"}
                }]
                yield response_message, infer_history
            elif '<think>' in response_text and '</think>' in response_text:
                # Closed block: split the reasoning from the final answer.
                reasoning_text, response_text2 = response_text.split('</think>', 1)
                reasoning_text = reasoning_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    "metadata": {"title": "🤔 Thinking"}
                }, {
                    "role": "assistant",
                    "content": response_text2
                }]
                yield response_message, infer_history
            else:
                yield [{"role": "assistant", "content": response_text}], infer_history
    finally:
        # Always release the GPU copy of the weights.
        infer.to_device("cpu")
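
# The online tab reuses offline_chat: webcam frames collected since the last
# turn are attached as image files, and gr_counter tracks how many gallery
# frames have already been consumed.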
@spaces.GPU(duration=120)
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list, gr_counter: int,
                       infer_history: list, temperature: float):
    infer.to_device("cuda")
    try:
        if not gr_webcam_images:
            gr_webcam_images = []
        # gr.Number delivers a float; cast before slicing. Only frames captured
        # since the last turn (index >= gr_counter) are sent to the model.
        gr_counter = int(gr_counter)
        gr_webcam_images = gr_webcam_images[gr_counter:]
        inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
        # Send an immediate chunk so the UI responds before inference starts.
        yield f'received {len(gr_webcam_images)} new frames, processing…', gr_counter + len(gr_webcam_images), infer_history
        # Delegate to offline_chat, which streams (messages, history) pairs.
        for response_message, infer_history in offline_chat(
                inputs, gr_history, infer_history, temperature):
            yield response_message, gr.skip(), infer_history
    finally:
        infer.to_device("cpu")
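
# UI: a single Blocks app with a language selector and two tabs
# ("Offline" file-based chat, "Online" webcam chat).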
with gr.Blocks() as demo:
    gr.Markdown("""<center><font size=8>MiMo-VL-7B</font></center>""")
    with gr.Column():
        # gr_title = gr.Markdown('# MiMo-VL')
        with gr.Row():
            gr_lang_selector = gr.Dropdown(choices=["English"],
                                           value="English",
                                           label="🌐 Interface",
                                           interactive=True,
                                           min_width=250,
                                           scale=0)
        with gr.Tabs():
            with gr.Tab("Offline") as gr_tab_ofl:
                gr_infer_history = gr.State([])
                # Hidden slider mirrored from the visible one below:
                # ChatInterface additional_inputs must already exist when the
                # interface is constructed.
                gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                  maximum=2.0,
                                                  step=0.1,
                                                  value=1.0,
                                                  interactive=True,
                                                  visible=False)
                gr_chatinterface_ofl = gr.ChatInterface(
                    fn=offline_chat,
                    type="messages",
                    multimodal=True,
                    chatbot=gr.Chatbot(height=800),
                    textbox=gr.MultimodalTextbox(
                        file_count="multiple",
                        file_types=["image", ".mp4"],
                        sources=["upload"],
                        stop_btn=True,
                        placeholder=label_translations[
                            'gr_chatinterface_ofl.textbox.placeholder']['English'],
                    ),
                    additional_inputs=[
                        gr_infer_history, gr_temperature_hidden
                    ],
                    additional_outputs=[gr_infer_history],
                )
                # Reset the model-side history whenever the chat is cleared.
                gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                      fn=lambda: [],
                      outputs=[gr_infer_history])
                with gr.Row():
                    with gr.Column(scale=1, min_width=200):
                        gr_temperature_ofl = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.4,
                            label=label_translations['gr_temperature']['English'],
                            interactive=True)
                        gr_temperature_ofl.change(lambda x: x,
                                                  inputs=gr_temperature_ofl,
                                                  outputs=gr_temperature_hidden)
                    with gr.Column(scale=8):
                        with gr.Column(visible=True) as gr_examples_en:
                            gr.Examples(
                                examples=[
                                    {
                                        "text": "Who are you?",
                                        "files": []
                                    },
                                ],
                                inputs=[gr_chatinterface_ofl.textbox],
                            )
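
            # Online tab: the webcam streams one frame per second into a
            # gallery; each chat turn sends only the frames captured since
            # the previous turn.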
with gr.Tab("Online") as gr_tab_ol:
with gr.Row():
with gr.Column(scale=1):
gr_infer_history = gr.State([])
gr_temperature_hidden = gr.Slider(minimum=0.0,
maximum=2.0,
step=0.1,
value=1.0,
interactive=True,
visible=False)
with gr.Row():
with gr.Column(scale=1):
gr_webcam_image = gr.Image(
label=label_translations['gr_webcam_image']
['English'],
sources="webcam",
height=250,
type='filepath')
gr_webcam_images = gr.Gallery(
label=label_translations['gr_webcam_images']
['English'],
show_label=True,
format='webp',
columns=1,
height=250,
preview=True,
interactive=False)
gr_counter = gr.Number(value=0, visible=False)
                    with gr.Column(scale=3):
                        gr_chatinterface_ol = gr.ChatInterface(
                            fn=online_record_chat,
                            type="messages",
                            multimodal=False,
                            chatbot=gr.Chatbot(height=800),
                            textbox=gr.Textbox(
                                placeholder=label_translations[
                                    'gr_chatinterface_ol.textbox.placeholder']['English'],
                                submit_btn=True,
                                stop_btn=True),
                            additional_inputs=[
                                gr_webcam_images, gr_counter,
                                gr_infer_history, gr_temperature_hidden
                            ],
                            additional_outputs=[
                                gr_counter, gr_infer_history
                            ],
                        )

                        def cache_webcam(recorded_image: str, recorded_images: list):
                            # Append each streamed frame to the gallery.
                            if not recorded_images:
                                recorded_images = []
                            return recorded_images + [recorded_image]

                        # Capture one webcam frame per second while recording.
                        gr_webcam_image.stream(
                            fn=cache_webcam,
                            inputs=[gr_webcam_image, gr_webcam_images],
                            outputs=[gr_webcam_images],
                            stream_every=1,
                            concurrency_limit=30,
                        )
                        with gr.Row():
                            gr_temperature_ol = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                step=0.1,
                                value=0.4,
                                label=label_translations['gr_temperature']['English'],
                                interactive=True)
                            gr_temperature_ol.change(
                                lambda x: x,
                                inputs=gr_temperature_ol,
                                outputs=gr_temperature_hidden)
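
    # Re-label every localized widget when the language changes; the tuple
    # order below must match the `outputs` list of the change() handler.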
    def update_lang(lang: str):
        # One gr.update per component in the change() outputs list, in order.
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
        )
    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_webcam_image,
                                gr_webcam_images,
                            ])
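
# Queue at most 2 concurrent jobs per event (inference is GPU-bound) and
# hold up to 50 waiting requests.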
demo.queue(default_concurrency_limit=2, max_size=50)

if __name__ == "__main__":
    demo.launch()
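
# To run locally (assumes infer.py and its dependencies are installed):
#   python app.py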