import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from urllib.request import urlopen
import spaces
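# `spaces` supplies the @spaces.GPU decorator used below; on Hugging Face ZeroGPU
# Spaces it allocates a GPU for the duration of the decorated call.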
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"
# Load model and processor
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
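# device_map="auto" and torch_dtype="auto" let transformers pick device placement and
# dtype from the checkpoint; _attn_implementation="eager" uses the plain PyTorch
# attention path instead of flash-attention.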
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
# Define prompt structure
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
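# Phi-4's processor expects <|image_1|> / <|audio_1|> placeholders in the text to mark
# where the image or audio features belong (see the prompts built in process_input).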
# Define inference function
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question."

    # Prepare the prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Open image from uploaded file
        image = Image.open(file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Read audio from uploaded file
        audio, samplerate = sf.read(file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."

    # Generate response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
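    # Drop the echoed prompt tokens so only the newly generated text is decoded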
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
# Gradio interface
with gr.Blocks(
    title="Demo of how GABI could use a Multimodal Model",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    gr.Markdown(
        """
        # This Space uses Phi-4 as the LLM for the Multimodal Demo
        Upload an **image** or **audio** file, ask a question, and get a response from the model!
        We want to leverage this to give GABI the ability to interact with and understand different kinds of content.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'Gabi, what is shown in this image?' or 'Gabi, transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Gabi's Response",
                placeholder="Gabi's response will appear here...",
                lines=10,
                interactive=False,
            )

    # Example section
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Try these examples:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "Gabi, what is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Gabi, transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=output_text,
            fn=process_input,
            cache_examples=False,
        )

    # Connect the submit button
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )
# Launch the demo
demo.launch()
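# Rough local-run sketch (an assumption, not spelled out in this file): installing
# gradio, spaces, torch, transformers, pillow, and soundfile and running `python app.py`
# should start the demo; the model's trust_remote_code path may pull in extra dependencies.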