# Spaces: ankandrew / MiMo-VL-7B
# Running on Zero
# File size: 12,379 Bytes

# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
import os
import gradio as gr
from infer import MiMoVLInfer
import spaces

# Single shared inference wrapper, created once when this module is imported.
# Previously used checkpoint: "XiaomiMiMo/MiMo-VL-7B-RL".
infer = MiMoVLInfer(
    checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL-2508",
)

# UI strings, keyed first by component identifier and then by interface
# language. Only English is provided, so the table is built from
# (component key, English text) pairs.
label_translations = {
    component_key: {"English": english_text}
    for component_key, english_text in (
        ("gr_chatinterface_ofl", "Chatbot"),
        ("gr_chatinterface_ol", "Chatbot"),
        ("gr_tab_ol", "Online"),
        ("gr_tab_ofl", "Offline"),
        ("gr_temperature", "Temperature"),
        ("gr_webcam_image", "🤳 Open Webcam"),
        ("gr_webcam_images", "📹 Recorded Frames"),
        ("gr_chatinterface_ofl.textbox.placeholder",
         "Ask me anything. You can also drop in images and .mp4 videos."),
        ("gr_chatinterface_ol.textbox.placeholder", "Ask me anything..."),
    )
}


def _split_reasoning(response_text: str) -> list:
    """Convert raw model text into a list of assistant chat messages.

    Text wrapped in ``<think>…</think>`` is returned as a separate message
    carrying a "Thinking" title; anything after ``</think>`` becomes the
    regular answer message. While the closing tag has not arrived yet, the
    whole text (minus the opening tag) is shown as reasoning.
    """
    open_tag, close_tag = '<think>', '</think>'

    if response_text.startswith(open_tag) and close_tag not in response_text:
        # Slice the tag off as a prefix. str.lstrip('<think>') would treat the
        # argument as a *set of characters* and also eat leading letters of
        # the reasoning itself (e.g. "<think>this" -> "s").
        return [{
            "role": "assistant",
            "content": response_text[len(open_tag):],
            'metadata': {'title': '🤔 Thinking'}
        }]

    if open_tag in response_text and close_tag in response_text:
        reasoning_text, answer_text = response_text.split(close_tag, 1)
        if reasoning_text.startswith(open_tag):
            reasoning_text = reasoning_text[len(open_tag):]
        return [{
            "role": "assistant",
            "content": reasoning_text,
            'metadata': {'title': '🤔 Thinking'}
        }, {
            "role": "assistant",
            "content": answer_text
        }]

    return [{"role": "assistant", "content": response_text}]


@spaces.GPU(duration=120)   # bump `duration` if your requests take longer than this
def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list, temperature: float):
    """Stream a model reply for the chat interface.

    Args:
        gr_inputs: Message payload forwarded unchanged to ``infer`` as ``inputs``.
        gr_history: Chat history supplied by the chat interface; not used here
            because the model-side history is tracked in ``infer_history``.
        infer_history: History object passed to and returned by ``infer``.
        temperature: Sampling temperature forwarded to ``infer``.

    Yields:
        ``(messages, infer_history)`` pairs, where ``messages`` is a list of
        assistant message dicts and ``infer_history`` is the latest history
        produced by ``infer``.
    """
    infer.to_device("cuda")
    try:
        # Emit a placeholder right away so the UI shows progress before the
        # first model chunk arrives.
        yield [{"role": "assistant", "content": "⏳ Reserving GPU & preparing inference…"}], infer_history
        for response_text, infer_history in infer(inputs=gr_inputs,
                                                  history=infer_history,
                                                  temperature=temperature):
            yield _split_reasoning(response_text), infer_history
    finally:
        # Always move the model back, even if streaming is interrupted.
        infer.to_device("cpu")


@spaces.GPU(duration=120)
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list, gr_counter: int,
                       infer_history: list, temperature: float):
    """Stream a model reply using the webcam frames recorded since the last turn.

    Args:
        text: The user's text message.
        gr_history: Chat history supplied by the chat interface; forwarded to
            ``offline_chat``.
        gr_webcam_images: Gallery contents as ``(file, caption)`` pairs; may be
            empty or ``None``.
        gr_counter: Number of gallery frames already consumed by earlier turns.
        infer_history: History object threaded through ``offline_chat``.
        temperature: Sampling temperature forwarded to ``offline_chat``.

    Yields:
        ``(message, counter_update, infer_history)`` triples. The first yield
        carries the new counter value; later yields leave the counter untouched
        via ``gr.skip()``.
    """
    infer.to_device("cuda")
    try:
        # The counter comes from a gr.Number, which may deliver a float (or
        # None); slice indices must be ints, so normalise it once here.
        consumed = int(gr_counter or 0)
        new_frames = (gr_webcam_images or [])[consumed:]
        inputs = {'text': text, 'files': [webp for webp, _ in new_frames]}
        # send an immediate chunk
        yield f'received {len(new_frames)} new frames, processing…', consumed + len(new_frames), infer_history
        for response_message, infer_history in offline_chat(
                inputs, gr_history, infer_history, temperature):
            yield response_message, gr.skip(), infer_history
    finally:
        infer.to_device("cpu")


# Default sampling temperature shown in the UI. The hidden sliders below are
# the components actually passed to the chat callbacks, so they must start at
# the same value the visible sliders display; otherwise the first requests run
# with a temperature the user never sees.
DEFAULT_TEMPERATURE = 0.4

with gr.Blocks() as demo:
    gr.Markdown("""<center><font size=8>MiMo-7b-VL</center>""")
    with gr.Column():
        with gr.Row():
            gr_lang_selector = gr.Dropdown(choices=["English"],
                                           value="English",
                                           label="🌐 Interface",
                                           interactive=True,
                                           min_width=250,
                                           scale=0)
    with gr.Tabs():
        with gr.Tab("Offline") as gr_tab_ofl:
            # Model-side conversation history threaded through offline_chat.
            gr_infer_history = gr.State([])
            # Invisible slider that feeds the temperature to offline_chat; the
            # visible slider further down mirrors its value into this one.
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=DEFAULT_TEMPERATURE,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=offline_chat,
                type="messages",
                multimodal=True,
                chatbot=gr.Chatbot(height=800),
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )
            # Clearing the chat must also drop the model-side history.
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                with gr.Column(scale=1, min_width=200):
                    gr_temperature_ofl = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        step=0.1,
                        value=DEFAULT_TEMPERATURE,
                        label=label_translations['gr_temperature']['English'],
                        interactive=True)
                    gr_temperature_ofl.change(lambda x: x,
                                              inputs=gr_temperature_ofl,
                                              outputs=gr_temperature_hidden)
                with gr.Column(scale=8):
                    with gr.Column(visible=True) as gr_examples_en:
                        gr.Examples(
                            examples=[
                                {
                                    "text": "Who are you?",
                                    "files": []
                                },
                            ],
                            inputs=[gr_chatinterface_ofl.textbox],
                        )
        with gr.Tab("Online") as gr_tab_ol:
            with gr.Row():
                with gr.Column(scale=1):
                    # These two names are rebound on purpose: the Offline tab
                    # has already been wired to its own components above, and
                    # the Online tab gets separate history/temperature state.
                    gr_infer_history = gr.State([])
                    gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                      maximum=2.0,
                                                      step=0.1,
                                                      value=DEFAULT_TEMPERATURE,
                                                      interactive=True,
                                                      visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr_webcam_image = gr.Image(
                                label=label_translations['gr_webcam_image']
                                ['English'],
                                sources="webcam",
                                height=250,
                                type='filepath')
                            gr_webcam_images = gr.Gallery(
                                label=label_translations['gr_webcam_images']
                                ['English'],
                                show_label=True,
                                format='webp',
                                columns=1,
                                height=250,
                                preview=True,
                                interactive=False)
                            # Number of gallery frames already sent to the
                            # model; updated by online_record_chat.
                            gr_counter = gr.Number(value=0, visible=False)
                        with gr.Column(scale=3):
                            gr_chatinterface_ol = gr.ChatInterface(
                                fn=online_record_chat,
                                type="messages",
                                multimodal=False,
                                chatbot=gr.Chatbot(height=800),
                                textbox=gr.
                                Textbox(placeholder=label_translations[
                                    'gr_chatinterface_ol.textbox.placeholder']
                                        ['English'],
                                        submit_btn=True,
                                        stop_btn=True),
                                additional_inputs=[
                                    gr_webcam_images, gr_counter,
                                    gr_infer_history, gr_temperature_hidden
                                ],
                                additional_outputs=[
                                    gr_counter, gr_infer_history
                                ],
                            )
                            # Same as the Offline tab: clearing the chat also
                            # drops the model-side history. The frame counter
                            # is left alone because the gallery is not cleared.
                            gr.on(triggers=[gr_chatinterface_ol.chatbot.clear],
                                  fn=lambda: [],
                                  outputs=[gr_infer_history])

                            def cache_webcam(recorded_image: str,
                                             recorded_images: list):
                                """Append the latest webcam frame to the gallery contents."""
                                if not recorded_images:
                                    recorded_images = []
                                return recorded_images + [recorded_image]

                            gr_webcam_image.stream(
                                fn=cache_webcam,
                                inputs=[gr_webcam_image, gr_webcam_images],
                                outputs=[gr_webcam_images],
                                stream_every=1,
                                concurrency_limit=30,
                            )
                            with gr.Row():
                                gr_temperature_ol = gr.Slider(
                                    minimum=0.0,
                                    maximum=2.0,
                                    step=0.1,
                                    value=DEFAULT_TEMPERATURE,
                                    label=label_translations['gr_temperature']
                                    ['English'],
                                    interactive=True)
                                gr_temperature_ol.change(
                                    lambda x: x,
                                    inputs=gr_temperature_ol,
                                    outputs=gr_temperature_hidden)

    def update_lang(lang: str):
        """Return one update per component wired to the language selector.

        The tuple must contain exactly one entry per component in the
        ``outputs`` list of ``gr_lang_selector.change`` below, in the same
        order; an extra or missing entry shifts every later update onto the
        wrong component.
        """
        return (
            # gr_chatinterface_ofl.chatbot
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            # gr_chatinterface_ol.chatbot
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            # gr_chatinterface_ofl.textbox
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            # gr_chatinterface_ol.textbox
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            # gr_tab_ofl
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            # gr_tab_ol
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            # gr_temperature_ofl
            gr.update(label=label_translations['gr_temperature'][lang]),
            # gr_temperature_ol
            gr.update(label=label_translations['gr_temperature'][lang]),
            # gr_examples_en
            gr.update(visible=lang == 'English'),
            # gr_webcam_image
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            # gr_webcam_images
            gr.update(label=label_translations['gr_webcam_images'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_webcam_image,
                                gr_webcam_images,
                            ])
# Enable request queuing: at most 50 pending requests, and a default
# concurrency limit of 2 for event handlers that do not set their own.
demo.queue(
    max_size=50,
    default_concurrency_limit=2,
)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()