# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
import os

import gradio as gr
import spaces

from infer import MiMoVLInfer

# infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL")
infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL-2508")
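
# UI strings keyed by widget, then by language. Only English is populated
# here; update_lang() further down shows how extra languages would slot in.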
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
    },
    "gr_tab_ol": {
        "English": "Online",
    },
    "gr_tab_ofl": {
        "English": "Offline",
    },
    "gr_temperature": {
        "English": "Temperature",
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
    },
}
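
# offline_chat streams (chatbot_messages, infer_history) pairs: ChatInterface
# renders the first element, while the second flows back out through
# additional_outputs into the gr.State holding the raw model history.
# MiMo-VL wraps its chain-of-thought in <think>...</think> tags, which are
# peeled off into a collapsible "Thinking" bubble below.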
# ZeroGPU: reserve a GPU for the duration of each call. The default slot is
# 60s; bump `duration` if your requests take >60s. (The decorator and the
# 120s value are assumptions; the scraped snippet only carried this comment.)
@spaces.GPU(duration=120)
def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list, temperature: float):
infer.to_device("cuda") | |
try: | |
yield [{"role": "assistant", "content": "⏳ Reserving GPU & preparing inference…"}], infer_history | |
for response_text, infer_history in infer(inputs=gr_inputs, | |
history=infer_history, | |
temperature=temperature): | |
if response_text.startswith('<think>') and '</think>' not in response_text: | |
reasoning_text = response_text.lstrip('<think>') | |
response_message = [{ | |
"role": "assistant", | |
"content": reasoning_text, | |
'metadata': {'title': '🤔 Thinking'} | |
}] | |
yield response_message, infer_history | |
elif '<think>' in response_text and '</think>' in response_text: | |
reasoning_text, response_text2 = response_text.split('</think>', 1) | |
reasoning_text = reasoning_text.lstrip('<think>') | |
response_message = [{ | |
"role": "assistant", | |
"content": reasoning_text, | |
'metadata': {'title': '🤔 Thinking'} | |
}, { | |
"role": "assistant", | |
"content": response_text2 | |
}] | |
yield response_message, infer_history | |
else: | |
yield [{"role": "assistant", "content": response_text}], infer_history | |
finally: | |
infer.to_device("cpu") | |
@spaces.GPU(duration=120)  # assumed, as above: this entry point also touches CUDA
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list, gr_counter: int,
                       infer_history: list, temperature: float):
    infer.to_device("cuda")
    try:
        if not gr_webcam_images:
            gr_webcam_images = []
        gr_webcam_images = gr_webcam_images[gr_counter:]
        inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
        # Send an immediate chunk so the user sees progress right away.
        yield f'received {len(gr_webcam_images)} new frames, processing…', gr_counter + len(gr_webcam_images), infer_history
        for response_message, infer_history in offline_chat(
                inputs, gr_history, infer_history, temperature):
            yield response_message, gr.skip(), infer_history
    finally:
        infer.to_device("cpu")
with gr.Blocks() as demo:
    gr.Markdown("""<center><font size=8>MiMo-VL-7B</center>""")
    with gr.Column():
        # gr_title = gr.Markdown('# MiMo-VL')
        with gr.Row():
            gr_lang_selector = gr.Dropdown(choices=["English"],
                                           value="English",
                                           label="🌐 Interface",
                                           interactive=True,
                                           min_width=250,
                                           scale=0)
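
        # Pattern used in both tabs: a hidden Slider mirrors the visible
        # temperature control so the value can ride along as an
        # additional_input to the ChatInterface callbacks.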
        with gr.Tabs():
            with gr.Tab("Offline") as gr_tab_ofl:
                gr_infer_history = gr.State([])
                gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                  maximum=2.0,
                                                  step=0.1,
                                                  value=1.0,
                                                  interactive=True,
                                                  visible=False)
                gr_chatinterface_ofl = gr.ChatInterface(
                    fn=offline_chat,
                    type="messages",
                    multimodal=True,
                    chatbot=gr.Chatbot(height=800),
                    textbox=gr.MultimodalTextbox(
                        file_count="multiple",
                        file_types=["image", ".mp4"],
                        sources=["upload"],
                        stop_btn=True,
                        placeholder=label_translations[
                            'gr_chatinterface_ofl.textbox.placeholder']['English'],
                    ),
                    additional_inputs=[
                        gr_infer_history, gr_temperature_hidden
                    ],
                    additional_outputs=[gr_infer_history],
                )
                gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                      fn=lambda: [],
                      outputs=[gr_infer_history])
                with gr.Row():
                    with gr.Column(scale=1, min_width=200):
                        gr_temperature_ofl = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.4,
                            label=label_translations['gr_temperature']['English'],
                            interactive=True)
                        gr_temperature_ofl.change(lambda x: x,
                                                  inputs=gr_temperature_ofl,
                                                  outputs=gr_temperature_hidden)
                    with gr.Column(scale=8):
                        with gr.Column(visible=True) as gr_examples_en:
                            gr.Examples(
                                examples=[
                                    {
                                        "text": "Who are you?",
                                        "files": []
                                    },
                                ],
                                inputs=[gr_chatinterface_ofl.textbox],
                            )
with gr.Tab("Online") as gr_tab_ol: | |
with gr.Row(): | |
with gr.Column(scale=1): | |
gr_infer_history = gr.State([]) | |
gr_temperature_hidden = gr.Slider(minimum=0.0, | |
maximum=2.0, | |
step=0.1, | |
value=1.0, | |
interactive=True, | |
visible=False) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
gr_webcam_image = gr.Image( | |
label=label_translations['gr_webcam_image'] | |
['English'], | |
sources="webcam", | |
height=250, | |
type='filepath') | |
gr_webcam_images = gr.Gallery( | |
label=label_translations['gr_webcam_images'] | |
['English'], | |
show_label=True, | |
format='webp', | |
columns=1, | |
height=250, | |
preview=True, | |
interactive=False) | |
gr_counter = gr.Number(value=0, visible=False) | |
with gr.Column(scale=3): | |
gr_chatinterface_ol = gr.ChatInterface( | |
fn=online_record_chat, | |
type="messages", | |
multimodal=False, | |
chatbot=gr.Chatbot(height=800), | |
textbox=gr. | |
Textbox(placeholder=label_translations[ | |
'gr_chatinterface_ol.textbox.placeholder'] | |
['English'], | |
submit_btn=True, | |
stop_btn=True), | |
additional_inputs=[ | |
gr_webcam_images, gr_counter, | |
gr_infer_history, gr_temperature_hidden | |
], | |
additional_outputs=[ | |
gr_counter, gr_infer_history | |
], | |
) | |
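
                        # Webcam snapshots stream into the gallery roughly
                        # once a second; cache_webcam just appends each frame.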
                        def cache_webcam(recorded_image: str,
                                         recorded_images: list):
                            if not recorded_images:
                                recorded_images = []
                            return recorded_images + [recorded_image]

                        gr_webcam_image.stream(
                            fn=cache_webcam,
                            inputs=[gr_webcam_image, gr_webcam_images],
                            outputs=[gr_webcam_images],
                            stream_every=1,
                            concurrency_limit=30,
                        )
                        with gr.Row():
                            gr_temperature_ol = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                step=0.1,
                                value=0.4,
                                label=label_translations['gr_temperature']['English'],
                                interactive=True)
                            gr_temperature_ol.change(
                                lambda x: x,
                                inputs=gr_temperature_ol,
                                outputs=gr_temperature_hidden)
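
    # Re-labels every visible widget when the language dropdown changes. The
    # tuple returned here must line up one-to-one with the outputs list passed
    # to gr_lang_selector.change() below.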
    def update_lang(lang: str):
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
        )
    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_webcam_image,
                                gr_webcam_images,
                            ])
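
# Cap concurrent generations so the single shared model isn't oversubscribed;
# up to 50 requests can wait in the queue.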
demo.queue(default_concurrency_limit=2, max_size=50)

if __name__ == "__main__":
    demo.launch()