File size: 4,260 Bytes
e3a0e6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233531a
e3a0e6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233531a
 
e3a0e6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from urllib.request import urlopen
import spaces

# Hugging Face Hub identifier of the checkpoint this demo serves.
model_path = "microsoft/Phi-4-multimodal-instruct"

# The processor turns the prompt text plus image/audio data into model inputs.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Loading options for the model, kept together so they are easy to audit.
_model_load_options = {
    "device_map": "auto",
    "torch_dtype": "auto",
    "trust_remote_code": True,
    # NOTE(review): private-looking keyword kept exactly as the original
    # spelled it; presumably the checkpoint's remote code reads it — confirm
    # before renaming to `attn_implementation`.
    "_attn_implementation": "eager",
}
model = AutoModelForCausalLM.from_pretrained(model_path, **_model_load_options)

# Chat-template markers used to assemble the single-turn prompt:
# <user marker> + media placeholder + question + <end marker> + <assistant marker>.
user_prompt, assistant_prompt, prompt_suffix = (
    "<|user|>",
    "<|assistant|>",
    "<|end|>",
)

# Define inference function
@spaces.GPU
def process_input(input_type, file, question):
    """Answer a question about an uploaded image or audio file.

    Builds a single-turn prompt around ``question`` with the matching media
    placeholder, runs the processor on the prompt plus the decoded file, and
    returns the text the model generates in reply.

    Args:
        input_type: ``"Image"`` or ``"Audio"``; selects how ``file`` is
            decoded and which media placeholder is put into the prompt.
        file: The uploaded file as delivered by the Gradio file component;
            it is passed to ``PIL.Image.open`` or ``soundfile.read``.
        question: The user's question about the file.

    Returns:
        The decoded model response, or a short human-readable message when
        the file or question is missing, the input type is unknown, or the
        file cannot be decoded as the selected type.
    """
    # A question made only of whitespace is as unusable as an empty one.
    if not file or not question or not question.strip():
        return "Please upload a file and provide a question."

    # Prepare the prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        try:
            # Decode inside the context manager so the file handle is released
            # immediately; the loaded pixel data stays usable afterwards.
            with Image.open(file) as image:
                image.load()
        except OSError:
            # Covers unidentified and truncated images, e.g. an audio file
            # uploaded while "Image" is selected.
            return "Could not read the uploaded file as an image."
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        try:
            audio, samplerate = sf.read(file)
        except RuntimeError:
            # soundfile reports undecodable input with RuntimeError subclasses.
            return "Could not read the uploaded file as audio."
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."

    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            # NOTE(review): kept from the original call; presumably consumed
            # by the checkpoint's remote code — confirm before changing.
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    # A single prompt was submitted, so the first decoded entry is the answer.
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return response

# Gradio interface: inputs on the left, the model's answer on the right,
# with a collapsible set of ready-made examples underneath.
soft_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="gray",
    radius_size="lg",
)

with gr.Blocks(theme=soft_theme, title="Demo of how GABI could use a Multimodal") as demo:
    # Page header and short description.
    gr.Markdown(
        """
        # This Space is using Phi-4 as the LLM for the Multimodal Demo
        Try uploading an **image** or **audio** file, ask a question, and get a response from the model!
        We want to leverage this to allow GABI to have the ability to interact and understand various contents.
        """
    )

    with gr.Row():
        # Left column: everything the user supplies.
        with gr.Column(scale=1):
            modality_choice = gr.Radio(
                label="Select Input Type",
                choices=["Image", "Audio"],
                value="Image",
            )
            upload = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_box = gr.Textbox(
                label="Your Question",
                lines=2,
                placeholder="e.g., 'Gabi, what is shown in this image?' or 'Gabi, transcribe this audio.'",
            )
            run_button = gr.Button("Submit", variant="primary")

        # Right column: read-only box that receives the model's answer.
        with gr.Column(scale=2):
            answer_box = gr.Textbox(
                label="Gabi's Response",
                lines=10,
                interactive=False,
                placeholder="Gabi's response will appear here...",
            )

    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        # Each row follows the input order: input type, file, question.
        example_rows = [
            ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "Gabi, what is shown in this image?"],
            ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Gabi, transcribe the audio to text."],
        ]
        gr.Examples(
            examples=example_rows,
            fn=process_input,
            inputs=[modality_choice, upload, question_box],
            outputs=answer_box,
            cache_examples=False,
        )

    # Connect the submit button to the inference function.
    run_button.click(
        fn=process_input,
        inputs=[modality_choice, upload, question_box],
        outputs=answer_box,
    )

# Launch the demo
if __name__ == "__main__":
    # Start the web server only when this file is run as a script, so that
    # importing the module (for tests or tooling) does not block on a
    # running server.
    demo.launch()