File size: 2,051 Bytes
ab10ba2
 
 
 
e004a88
ab10ba2
 
 
 
 
 
03016fb
 
 
 
ab10ba2
 
03016fb
ab10ba2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import gradio as gr
import time
import onnxruntime_genai as og
from huggingface_hub import snapshot_download
import os

# --- One-time model setup (runs at import time) ---
# Download only the CPU int4 (RTN, block-32, acc-level-4) variant of the
# Phi-3.5 vision ONNX repo; snapshot_download caches and returns the local path.
local_dir = snapshot_download(
    repo_id="microsoft/Phi-3.5-vision-instruct-onnx",
    allow_patterns="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/*"
)
# The ONNX model files live in this subfolder of the snapshot.
model_folder = os.path.join(
    local_dir, "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
)
model = og.Model(model_folder)
# Multimodal processor: handles both text tokenization and image preprocessing.
processor = model.create_multimodal_processor()


print("Setting up the tokenizer stream")
# Streaming detokenizer: decodes generated token ids incrementally in ask_phi().
tokenizer_stream = processor.create_stream()
print("Tokenizer stream setup complete")

# Phi-3.5 chat-template delimiters used to assemble the prompt in ask_phi().
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

# Inference function
def ask_phi(image, question):
    """Answer *question* about *image* with Phi-3.5 vision (ONNX, CPU).

    Args:
        image: Path to the image file (Gradio's ``type="filepath"`` input).
        question: The user's text prompt about the image.

    Returns:
        The model's generated answer as a single string.
    """
    prompt = f"{user_prompt}<|image_1|>\n{question}{prompt_suffix}{assistant_prompt}"
    print(f">>> Prompt\n{prompt}")

    images = og.Images.open(image)
    # NOTE(review): onnxruntime-genai's multimodal processor accepts only
    # prompt/images; the HF-transformers-style return_tensors="pt" kwarg
    # does not belong here and was dropped.
    inputs = processor(prompt=prompt, images=images)

    params = og.GeneratorParams(model)
    params.set_inputs(inputs)
    params.set_search_options(max_length=7680)

    generator = og.Generator(model, params)
    start_time = time.time()

    # Stream-decode token by token. The streaming detokenizer already emits
    # any required whitespace, so decoded pieces are concatenated directly
    # (the previous code appended a stray " " after every token, mangling
    # the output). Pieces are joined once at the end instead of repeated
    # string concatenation.
    pieces = []
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        pieces.append(tokenizer_stream.decode(new_token))

    response = "".join(pieces)
    print(f">>> Response\n{response}")

    total_run_time = time.time() - start_time
    print(f"Total Time : {total_run_time:.2f}")

    # Delete the generator to free the captured graph before creating another one
    del generator

    return response

# Gradio UI: an image upload plus a free-text prompt, wired to ask_phi.
image_input = gr.Image(type="filepath", label="Upload Image")
prompt_input = gr.Textbox(label="Your Prompt")
answer_output = gr.Textbox(label="Phi-3.5 Response")

demo = gr.Interface(
    fn=ask_phi,
    inputs=[image_input, prompt_input],
    outputs=answer_output,
    title="Phi-3.5 Vision Instruct (ONNX)",
    description="Ask a question about an image using Phi-3.5 ONNX on CPU",
)

demo.launch(debug=True)