# Phi-3.5 Vision Instruct (ONNX) — Gradio demo app (CPU int4 build)
import os
import time

import gradio as gr
import onnxruntime_genai as og
from huggingface_hub import snapshot_download

# --- One-time model setup (runs at import) ---
# Fetch only the CPU int4 RTN variant of the Phi-3.5-vision-instruct ONNX
# weights; snapshot_download returns the local cache directory.
local_dir = snapshot_download(
    repo_id="microsoft/Phi-3.5-vision-instruct-onnx",
    allow_patterns="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*",
)
model_folder = os.path.join(
    local_dir, "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
)

model = og.Model(model_folder)
processor = model.create_multimodal_processor()
# Streaming detokenizer: decodes tokens one at a time for incremental output.
tokenizer_stream = processor.create_stream()

# Phi-3.5 chat-template role markers.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"
# Inference function
def ask_phi(image, question, max_length):
    """Answer *question* about *image* with the Phi-3.5 vision ONNX model.

    Streams decoded tokens to stdout as they are generated, then returns
    the full response.

    Args:
        image: Filesystem path to the image (Gradio ``type="filepath"``).
        question: The user's text prompt about the image.
        max_length: Maximum total token length passed to the generator's
            search options.

    Returns:
        The model's decoded text response as a ``str``.
    """
    start_time = time.time()
    prompt = f"{user_prompt}<|image_1|>\n{question}{prompt_suffix}{assistant_prompt}"
    images = og.Images.open(image)
    # onnxruntime-genai's multimodal processor is not a HF Transformers
    # processor: it takes no ``return_tensors`` argument and always returns
    # og tensors, so that keyword is dropped here.
    inputs = processor(prompt=prompt, images=images)

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=max_length)
    params.set_inputs(inputs)
    generator = og.Generator(model, params)

    response = ""
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        chunk = tokenizer_stream.decode(new_token)
        print(chunk, end="", flush=True)  # live token streaming to console
        response += chunk

    # Release the generator (KV cache) promptly, as in the og examples.
    del generator
    print(f"\nInference took {time.time() - start_time:.2f} seconds")
    return response
# Gradio Interface: image + prompt + token budget in, model text out.
demo = gr.Interface(
    fn=ask_phi,
    inputs=[
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(label="Your Prompt"),
        gr.Slider(minimum=16, maximum=16384, step=16, value=4096, label="Tokens"),
    ],
    outputs=gr.Textbox(label="Phi-3.5 Response"),
    title="Phi-3.5 Vision Instruct (ONNX)",
    description="Ask a question about an image using Phi-3.5 ONNX on CPU",
)

# Guard the server launch so importing this module has no side effects.
if __name__ == "__main__":
    demo.launch(debug=True)