import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import random

# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"

# Initialize model and tokenizer
print("Loading model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Model and tokenizer loaded!")

def simulate_typing(text, min_chars_per_sec=15, max_chars_per_sec=40):
    """Yield progressively longer prefixes of `text`, one word at a time,
    pausing in proportion to word length to mimic variable typing speed."""
    full_text = ""
    words = text.split()
    for i, word in enumerate(words):
        full_text += word
        if i < len(words) - 1:
            full_text += " "
        # Pick a random speed in the configured range and sleep long
        # enough to "type" this word at that many characters per second.
        chars_per_sec = random.uniform(min_chars_per_sec, max_chars_per_sec)
        time.sleep(len(word) / chars_per_sec)
        yield full_text
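
# Sketch of standalone usage (illustrative, not executed at import time):
#   for partial in simulate_typing("streaming feels more natural"):
#       print(partial)
# Each yield returns everything typed so far, so a streaming consumer can
# simply redraw the growing message.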

def generate_response(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens=512,
    temperature=0.7,
    top_p=0.95
):
    """Assemble the chat prompt from history, generate a reply with the
    model, and stream it back through the typing animation."""
    # Prepare conversation history
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    
    messages.append({"role": "user", "content": message})

    # Convert messages to model input format
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Generate response
    with torch.inference_mode():
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
        generated_ids = generated_ids[0, len(model_inputs.input_ids[0]):]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Return response with typing animation
    for partial_response in simulate_typing(response):
        yield partial_response
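
# Because generate_response is a generator, gr.ChatInterface streams it:
# each yielded string replaces the assistant message rendered so far,
# which is what makes the typing animation visible in the UI.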

# Custom CSS with typing cursor animation
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
body, .gradio-container {
    font-family: 'Inter', sans-serif;
}
.typing-cursor::after {
    content: '|';
    animation: blink 1s step-start infinite;
}
@keyframes blink {
    50% { opacity: 0; }
}
"""

# System message
system_message = """You are Qwen 2.5 14B, an advanced AI assistant created by Alibaba Cloud. 
You are knowledgeable, helpful, and strive to provide accurate and comprehensive responses."""

# Gradio chat interface
demo = gr.ChatInterface(
    generate_response,
    additional_inputs=[
        gr.Textbox(
            value=system_message,
            visible=False,
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ],
    css=custom_css,
    title="Qwen 2.5 14B Chat",
    description="An advanced AI assistant powered by Qwen 2.5 14B"
)

# Launch the demo
if __name__ == "__main__":
    demo.queue(max_size=40)
    demo.launch(max_threads=40)
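
# To run locally (assuming a GPU with enough memory for the 14B weights):
#   pip install gradio transformers torch accelerate
#   python app.py
# queue(max_size=40) bounds how many requests may wait in line, and
# launch(max_threads=40) caps the worker threads that serve them.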