import time

import gradio as gr
import huggingface_hub as hf_hub
import openvino_genai as ov_genai

# Download the model
model_id = "OpenVINO/Qwen3-0.6B-int4-ov"
model_path = "Qwen3-0.6B-int4-ov"

hf_hub.snapshot_download(model_id, local_dir=model_path)  # downloads the INT4 OpenVINO model files into ./Qwen3-0.6B-int4-ov

# Build the inference pipeline
device = "CPU"
pipe = ov_genai.LLMPipeline(model_path, device)
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)
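
# Optional sketch (assumption, not part of the original script): generation settings
# can also be grouped in an ov_genai.GenerationConfig object and passed to
# generate() instead of using keyword arguments, e.g.:
#
#   config = ov_genai.GenerationConfig()
#   config.max_new_tokens = 1024
#   config.temperature = 0.7
#   config.do_sample = True
#   output = pipe.generate(["Hello"], config)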


def generate_response(prompt):
    """
    Generates a response using the OpenVINO LLM pipeline.

    Args:
        prompt (str): The input prompt.

    Returns:
        tuple[str, str]: The generated response and a performance summary.
    """
    start_time = time.time()
    output = pipe.generate([prompt], max_length=1024)
    end_time = time.time()

    generated_text = output.texts[0]  # DecodedResults stores one output string per input prompt

    performance_metrics = f"Total wall time: {end_time - start_time:.2f} s\n"
    performance_metrics += f"Generate duration: {output.perf_metrics.get_generate_duration().mean:.2f} ms\n"
    performance_metrics += f"Throughput: {output.perf_metrics.get_throughput().mean:.2f} tokens/s"

    return generated_text, performance_metrics
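
# Optional sketch (assumption): openvino_genai can stream partial output through a
# `streamer` callback passed to generate(); returning False from the callback lets
# generation continue. A hypothetical streaming variant of generate_response:
#
#   def generate_streaming(prompt):
#       chunks = []
#       def on_token(subword):
#           chunks.append(subword)
#           return False  # keep generating
#       pipe.generate(prompt, max_new_tokens=1024, streamer=on_token)
#       return "".join(chunks)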


def main():
    """
    Creates and launches the Gradio interface.
    """

    with gr.Blocks() as demo:
        gr.Markdown("# OpenVINO Qwen3-0.6B Demo")
        prompt_input = gr.Textbox(lines=3, label="Enter your prompt:")
        output_text = gr.Textbox(label="Generated Response")
        performance_text = gr.Textbox(label="Performance Metrics", visible=False) # Initially hidden

        # Generate when the user presses Enter, rather than on every keystroke.
        prompt_input.submit(
            fn=generate_response,
            inputs=prompt_input,
            outputs=[output_text, performance_text],
        )

        # Button to show/hide performance metrics. Component attributes cannot be
        # wired as event inputs/outputs, so visibility is tracked in gr.State and
        # applied with gr.update().
        metrics_visible = gr.State(False)
        show_metrics_button = gr.Button("Show/Hide Performance Metrics")
        show_metrics_button.click(
            fn=lambda visible: (gr.update(visible=not visible), not visible),
            inputs=metrics_visible,
            outputs=[performance_text, metrics_visible],
        )


    demo.launch()


if __name__ == "__main__":
    main()