import os
import time

import gradio as gr
import onnxruntime_genai as og
from huggingface_hub import snapshot_download

# Download the INT4 CPU variant of Phi-3.5-vision-instruct and load it
local_dir = snapshot_download(
    repo_id="microsoft/Phi-3.5-vision-instruct-onnx",
    allow_patterns="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*",
)
model_folder = os.path.join(
    local_dir, "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
)
model = og.Model(model_folder)
processor = model.create_multimodal_processor()
tokenizer_stream = processor.create_stream()

# Phi-3.5 chat template markers
user_prompt = "<|user|>\n"
assistant_prompt = "<|assistant|>\n"
prompt_suffix = "<|end|>\n"


# Inference function: generates token by token, streaming each piece to stdout
def ask_phi(image, question, max_length):
    start_time = time.time()
    prompt = f"{user_prompt}<|image_1|>\n{question}{prompt_suffix}{assistant_prompt}"
    images = og.Images.open(image)
    # The onnxruntime-genai processor takes the prompt and images directly;
    # it has no `return_tensors` argument (that is a transformers idiom)
    inputs = processor(prompt, images=images)

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=max_length)
    params.set_inputs(inputs)
    generator = og.Generator(model, params)

    # This loop matches onnxruntime-genai >= 0.5, where generate_next_token()
    # also computes the logits (no separate compute_logits() call)
    response = ""
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        output = tokenizer_stream.decode(new_token)
        print(output, end="", flush=True)
        response += output

    print(f"\nInference took {time.time() - start_time:.2f} seconds")
    del generator
    return response


# Gradio interface
demo = gr.Interface(
    fn=ask_phi,
    inputs=[
        gr.Image(type="filepath", label="Upload Image"),
        gr.Textbox(label="Your Prompt"),
        # max_length is the total budget (prompt + generated tokens), so the
        # label reflects that rather than "context length"
        gr.Slider(
            minimum=16, maximum=16384, step=16, value=5000,
            label="Max Length (prompt + generated tokens)",
        ),
    ],
    outputs=gr.Textbox(label="Phi-3.5 Response"),
    title="Phi-3.5 Vision Instruct (ONNX)",
    description="Ask a question about an image using Phi-3.5 ONNX on CPU",
)

demo.launch(debug=True)
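
# --- Optional usage sketch ---
# ask_phi can also be called directly, without the Gradio UI; comment out
# demo.launch() above to do so. "photo.jpg" and the question below are
# illustrative placeholders (assumptions), not files shipped with this script.
#
#   answer = ask_phi("photo.jpg", "What objects are in this picture?", max_length=1024)
#   print(answer)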