david-thrower committed on
Commit 15bd5c0 · verified · 1 Parent(s): a4822cd

Create app.py

Files changed (1)
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
+ import gradio as gr
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ print("Loading tokenizer & model…")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to(DEVICE)
+
+ # -------------------------------------------------
+ # Optional tool(s)
+ # -------------------------------------------------
+ TOOLS = [{
+     "name": "get_weather",
+     "description": "Get the current weather in a given city",
+     "parameters": {
+         "type": "object",
+         "properties": {
+             "city": {"type": "string", "description": "City name"}
+         },
+         "required": ["city"]
+     }
+ }]
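+ # The schema above is only advertised to the model through the chat template;
+ # this demo does not parse or execute any tool calls the model might emit.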
+
+ # -------------------------------------------------
+ # Helpers
+ # -------------------------------------------------
+ def build_messages(history, enable_thinking: bool):
+     """Convert Gradio history to the chat template."""
+     messages = []
+     for h in history:
+         messages.append({"role": h["role"], "content": h["content"]})
+     # Add system instruction for mode
+     system_flag = "/think" if enable_thinking else "/no_think"
+     messages.insert(0, {"role": "system", "content": system_flag})
+     return messages
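+ # SmolLM3 switches its extended-reasoning mode with a bare "/think" or "/no_think"
+ # system message, which is what the flag above injects.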
+
+ def chat_fn(history, enable_thinking, temperature, top_p):
+     """Generate a response and stream it back to the chatbot."""
+     messages = build_messages(history, enable_thinking)
+     text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True,
+         xml_tools=TOOLS  # SmolLM3's chat template accepts tool schemas via xml_tools
+     )
+     inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
+
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=1024,
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=20,
+         repetition_penalty=1.1,
+         pad_token_id=tokenizer.eos_token_id,
+         streamer=None  # no TextStreamer attached; we yield characters manually below
+     )
+     # Keep only the newly generated tokens (drop the prompt)
+     output_ids = generated[0][len(inputs.input_ids[0]):]
+     response = tokenizer.decode(output_ids, skip_special_tokens=True)
+
+     # stream the finished text back character by character
+     history.append({"role": "assistant", "content": ""})
+     for ch in response:
+         history[-1]["content"] += ch
+         yield history
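+ # Note: generation completes before the loop above runs, so the UI replays finished
+ # text; true token-by-token streaming would need transformers' TextIteratorStreamer
+ # with generate() running in a background thread.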
+
+ # -------------------------------------------------
+ # Blocks UI
+ # -------------------------------------------------
+ with gr.Blocks(title="SmolLM3-3B Chat") as demo:
+     gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
+     with gr.Row():
+         enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=False)
+         temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
+         top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
+     chatbot = gr.Chatbot(type="messages")
+     msg = gr.Textbox(placeholder="Type your message here…", lines=1)
+     clear = gr.Button("Clear")
+
+     def user_fn(user_msg, history):
+         return "", history + [{"role": "user", "content": user_msg}]
+
+     msg.submit(
+         user_fn, [msg, chatbot], [msg, chatbot], queue=False
+     ).then(
+         chat_fn, [chatbot, enable_think, temperature, top_p], chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+
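+ # Gradio requires the queue for generator (streaming) handlers such as chat_fn.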
+ demo.queue().launch()