# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
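"""Gradio demo for the XiaomiMiMo/MiMo-VL-7B-RL-2508 vision-language model.

Offers an "Offline" tab for multimodal chat (text, images, .mp4 videos) and
an "Online" tab that records webcam frames and chats about them, running
inference on a Hugging Face ZeroGPU Space.
"""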
import os
import gradio as gr
from infer import MiMoVLInfer
import spaces
# infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL")
infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL-2508")
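
# `MiMoVLInfer` (from the sibling infer.py) is used below as a callable
# generator: infer(inputs={'text': ..., 'files': [...]}, history=...,
# temperature=...) yields (partial_response_text, updated_history) tuples,
# and exposes to_device("cuda"/"cpu") to move weights on and off the GPU.

# UI strings keyed by widget, then by language. Only English is populated;
# add entries here (and to the language dropdown) to localize the interface.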
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
    },
    "gr_tab_ol": {
        "English": "Online",
    },
    "gr_tab_ofl": {
        "English": "Offline",
    },
    "gr_temperature": {
        "English": "Temperature",
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
    },
}
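
# On ZeroGPU Spaces, @spaces.GPU reserves a GPU only while the decorated
# function runs; `duration` is the per-call time budget in seconds.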
@spaces.GPU(duration=120)  # bump `duration` if a single request can exceed 120s
def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list, temperature: float):
    infer.to_device("cuda")
    try:
        # Immediate placeholder so the UI shows progress before the first token.
        yield [{"role": "assistant", "content": "⏳ Reserving GPU & preparing inference…"}], infer_history
        for response_text, infer_history in infer(inputs=gr_inputs,
                                                  history=infer_history,
                                                  temperature=temperature):
            if response_text.startswith('<think>') and '</think>' not in response_text:
                # Open <think> block: stream the chain of thought under a
                # collapsible "Thinking" title. removeprefix (not lstrip) so
                # only the literal tag is dropped, not any matching characters.
                reasoning_text = response_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    "metadata": {"title": "🤔 Thinking"}
                }]
                yield response_message, infer_history
            elif '<think>' in response_text and '</think>' in response_text:
                # Closed block: split the reasoning from the final answer.
                reasoning_text, response_text2 = response_text.split('</think>', 1)
                reasoning_text = reasoning_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    "metadata": {"title": "🤔 Thinking"}
                }, {
                    "role": "assistant",
                    "content": response_text2
                }]
                yield response_message, infer_history
            else:
                yield [{"role": "assistant", "content": response_text}], infer_history
    finally:
        # Always release the GPU copy of the weights.
        infer.to_device("cpu")
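
# The online tab reuses offline_chat: webcam frames collected since the last
# turn are attached as image files, and gr_counter tracks how many gallery
# frames have already been consumed.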
@spaces.GPU(duration=120)
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list, gr_counter: int,
                       infer_history: list, temperature: float):
    infer.to_device("cuda")
    try:
        if not gr_webcam_images:
            gr_webcam_images = []
        # gr.Number delivers a float; cast before slicing. Only frames captured
        # since the last turn (index >= gr_counter) are sent to the model.
        gr_counter = int(gr_counter)
        gr_webcam_images = gr_webcam_images[gr_counter:]
        inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
        # Send an immediate chunk so the UI responds before inference starts.
        yield f'received {len(gr_webcam_images)} new frames, processing…', gr_counter + len(gr_webcam_images), infer_history
        # Delegate to offline_chat, which streams (messages, history) pairs.
        for response_message, infer_history in offline_chat(
                inputs, gr_history, infer_history, temperature):
            yield response_message, gr.skip(), infer_history
    finally:
        infer.to_device("cpu")
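
# UI: a single Blocks app with a language selector and two tabs
# ("Offline" file-based chat, "Online" webcam chat).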
with gr.Blocks() as demo:
    gr.Markdown("""<center><font size=8>MiMo-VL-7B</font></center>""")
    with gr.Column():
        # gr_title = gr.Markdown('# MiMo-VL')
        with gr.Row():
            gr_lang_selector = gr.Dropdown(choices=["English"],
                                           value="English",
                                           label="🌐 Interface",
                                           interactive=True,
                                           min_width=250,
                                           scale=0)
        with gr.Tabs():
            with gr.Tab("Offline") as gr_tab_ofl:
                gr_infer_history = gr.State([])
                # Hidden slider mirrored from the visible one below:
                # ChatInterface additional_inputs must already exist when the
                # interface is constructed.
                gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                  maximum=2.0,
                                                  step=0.1,
                                                  value=1.0,
                                                  interactive=True,
                                                  visible=False)
                gr_chatinterface_ofl = gr.ChatInterface(
                    fn=offline_chat,
                    type="messages",
                    multimodal=True,
                    chatbot=gr.Chatbot(height=800),
                    textbox=gr.MultimodalTextbox(
                        file_count="multiple",
                        file_types=["image", ".mp4"],
                        sources=["upload"],
                        stop_btn=True,
                        placeholder=label_translations[
                            'gr_chatinterface_ofl.textbox.placeholder']['English'],
                    ),
                    additional_inputs=[
                        gr_infer_history, gr_temperature_hidden
                    ],
                    additional_outputs=[gr_infer_history],
                )
                # Reset the model-side history whenever the chat is cleared.
                gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                      fn=lambda: [],
                      outputs=[gr_infer_history])
                with gr.Row():
                    with gr.Column(scale=1, min_width=200):
                        gr_temperature_ofl = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.4,
                            label=label_translations['gr_temperature']['English'],
                            interactive=True)
                        gr_temperature_ofl.change(lambda x: x,
                                                  inputs=gr_temperature_ofl,
                                                  outputs=gr_temperature_hidden)
                    with gr.Column(scale=8):
                        with gr.Column(visible=True) as gr_examples_en:
                            gr.Examples(
                                examples=[
                                    {
                                        "text": "Who are you?",
                                        "files": []
                                    },
                                ],
                                inputs=[gr_chatinterface_ofl.textbox],
                            )
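
            # Online tab: the webcam streams one frame per second into a
            # gallery; each chat turn sends only the frames captured since
            # the previous turn.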
with gr.Tab("Online") as gr_tab_ol:
with gr.Row():
with gr.Column(scale=1):
gr_infer_history = gr.State([])
gr_temperature_hidden = gr.Slider(minimum=0.0,
maximum=2.0,
step=0.1,
value=1.0,
interactive=True,
visible=False)
with gr.Row():
with gr.Column(scale=1):
gr_webcam_image = gr.Image(
label=label_translations['gr_webcam_image']
['English'],
sources="webcam",
height=250,
type='filepath')
gr_webcam_images = gr.Gallery(
label=label_translations['gr_webcam_images']
['English'],
show_label=True,
format='webp',
columns=1,
height=250,
preview=True,
interactive=False)
gr_counter = gr.Number(value=0, visible=False)
                    with gr.Column(scale=3):
                        gr_chatinterface_ol = gr.ChatInterface(
                            fn=online_record_chat,
                            type="messages",
                            multimodal=False,
                            chatbot=gr.Chatbot(height=800),
                            textbox=gr.Textbox(
                                placeholder=label_translations[
                                    'gr_chatinterface_ol.textbox.placeholder']['English'],
                                submit_btn=True,
                                stop_btn=True),
                            additional_inputs=[
                                gr_webcam_images, gr_counter,
                                gr_infer_history, gr_temperature_hidden
                            ],
                            additional_outputs=[
                                gr_counter, gr_infer_history
                            ],
                        )

                        def cache_webcam(recorded_image: str, recorded_images: list):
                            # Append each streamed frame to the gallery.
                            if not recorded_images:
                                recorded_images = []
                            return recorded_images + [recorded_image]

                        # Capture one webcam frame per second while recording.
                        gr_webcam_image.stream(
                            fn=cache_webcam,
                            inputs=[gr_webcam_image, gr_webcam_images],
                            outputs=[gr_webcam_images],
                            stream_every=1,
                            concurrency_limit=30,
                        )
                        with gr.Row():
                            gr_temperature_ol = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                step=0.1,
                                value=0.4,
                                label=label_translations['gr_temperature']['English'],
                                interactive=True)
                            gr_temperature_ol.change(
                                lambda x: x,
                                inputs=gr_temperature_ol,
                                outputs=gr_temperature_hidden)
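
    # Re-label every localized widget when the language changes; the tuple
    # order below must match the `outputs` list of the change() handler.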
    def update_lang(lang: str):
        # One gr.update per component in the change() outputs list, in order.
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
        )
    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_webcam_image,
                                gr_webcam_images,
                            ])
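
# Queue at most 2 concurrent jobs per event (inference is GPU-bound) and
# hold up to 50 waiting requests.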
demo.queue(default_concurrency_limit=2, max_size=50)

if __name__ == "__main__":
    demo.launch()
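
# To run locally (assumes infer.py and its dependencies are installed):
#   python app.py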