File size: 3,873 Bytes
9c12531
 
 
 
 
8f31e52
c7f8a5d
9c12531
 
 
 
c181c4d
9c12531
c181c4d
 
c7f8a5d
8dad82c
 
 
 
 
 
c181c4d
8b66151
 
c7f8a5d
 
 
 
 
 
 
 
 
 
8b66151
c7f8a5d
8b66151
 
 
88a0b72
 
c7f8a5d
88a0b72
 
c7f8a5d
88a0b72
c7f8a5d
8b66151
c7f8a5d
8b66151
8dad82c
 
c7f8a5d
 
 
88a0b72
c7f8a5d
 
88a0b72
c7f8a5d
c181c4d
88a0b72
c181c4d
 
 
88a0b72
 
 
 
c7f8a5d
88a0b72
9c12531
0abdfaa
8b66151
 
c7f8a5d
 
8dad82c
c7f8a5d
 
8dad82c
c7f8a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8dad82c
9c12531
 
 
c7f8a5d
 
 
 
 
 
 
 
9c12531
 
c7f8a5d
9c12531
c7f8a5d
 
 
8dad82c
 
c7f8a5d
 
9c12531
 
c7f8a5d
8dad82c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
from openai import OpenAI
import os

# Hugging Face inference token, supplied via the HF_TOKEN environment variable
# (None if unset; requests will then fail with an auth error at call time).
ACCESS_TOKEN = os.environ.get("HF_TOKEN")
print("Access token loaded.")

# OpenAI-compatible client pointed at the Hugging Face serverless inference API.
client = OpenAI(base_url="https://api-inference.huggingface.co/v1/", api_key=ACCESS_TOKEN)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message="You are a helpful assistant.",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
    frequency_penalty=0.0,
    seed=-1
):
    """Stream a chat completion for *message*, given the prior *history*.

    Builds an OpenAI-style ``messages`` list (system prompt, alternating
    user/assistant turns, then the new user message), sends it to the
    fixed Llama model via the module-level ``client``, and yields the
    accumulated response text after each streamed token so Gradio can
    render it incrementally.

    Args:
        message: Latest user input.
        history: Prior turns as ``(user, assistant)`` pairs; either side
            may be empty/None and is then skipped.
        system_message: System prompt placed first in the context.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.
        frequency_penalty: Repetition penalty in [-2.0, 2.0].
        seed: RNG seed; -1 means "random" and is translated to ``None``.

    Yields:
        str: The response text accumulated so far.
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    messages = [{"role": "system", "content": system_message}]
    print("Initial messages array constructed.")

    # Add conversation history to the context
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})
    print("Latest user message appended.")

    # Set the model to "meta" by default
    model_to_use = "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Start with an empty string to build the response as tokens stream in
    response = ""
    print("Sending request to OpenAI API.")

    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Some stream chunks carry no choices (e.g. usage-only frames); skip them.
        if not message_chunk.choices:
            continue
        # delta.content is None on role/finish chunks; concatenating None to a
        # str raises TypeError, so coerce it to an empty string first.
        token_text = message_chunk.choices[0].delta.content or ""
        print(f"Received token: {token_text}")
        response += token_text
        yield response

    print("Completed response generation.")

# GRADIO UI

# Main chat display: tall panel layout with copy and like buttons enabled.
chatbot = gr.Chatbot(
    height=600,
    show_copy_button=True,
    placeholder="Start chatting!",
    likeable=True,
    layout="panel",
)
print("Chatbot interface created.")

# System prompt is fixed and hidden from the user.
system_message_box = gr.Textbox(value="You are a helpful assistant.", label="System Prompt", visible=False)

# Generation controls, wired in as additional ChatInterface inputs below.
max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
temperature_slider = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
frequency_penalty_slider = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
seed_slider = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")

# Removed the custom_model_box as the model is pre-set

# Chat application: `respond` is the streaming handler; the extra inputs are
# forwarded to it positionally after (message, history), so their order here
# must match respond's parameter order.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
        frequency_penalty_slider,
        seed_slider,
    ],
    fill_height=True,
    chatbot=chatbot,
    theme="Nymbo/Nymbo_Theme",
)
print("ChatInterface object created.")

# Entering the Blocks context without adding components; kept as a placeholder
# for future UI additions (a model picker used to live here).
with demo:
    # No need for a model selection accordion since the model is fixed to "meta-llama"
    pass

print("Gradio interface initialized.")

# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()