zRzRzRzRzRzRzR committed on
Commit 67199da · 1 Parent(s): 2866134
Files changed (2)
  1. app.py +212 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,212 @@
+ import copy
+ import re
+ import time
+ import html
+ from openai import OpenAI
+ import gradio as gr
+
+ stop_generation = False
+
+
+ def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
+     global stop_generation
+     # The OpenAI SDK picks up OPENAI_API_KEY (and, if set, OPENAI_BASE_URL) from the environment.
+     client = OpenAI()
+
+     response = client.chat.completions.create(
+         model="glm-4.5",
+         messages=messages,
+         temperature=temperature,
+         stream=True,
+         max_tokens=32000,
+         # extra_body forwards the GLM-4.5 "thinking" switch through the OpenAI-compatible API.
+         extra_body={
+             "thinking": {
+                 "type": "enabled" if thinking_enabled else "disabled",
+             }
+         }
+     )
+
+     print(response)
+     for chunk in response:
+         if stop_generation:
+             break
+
+         if chunk.choices and chunk.choices[0].delta:
+             delta = chunk.choices[0].delta
+             yield delta
+
+
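+ # Turns streamed reasoning/content into one HTML fragment: the reasoning goes into a
+ # collapsible "Thinking" <details> block and the answer into a plain <div>, which the
+ # Gradio Chatbot renders as the assistant message.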
+ class GLM45Model:
+     def _strip_html(self, text: str) -> str:
+         return re.sub(r"<[^>]+>", "", text).strip()
+
+     def _wrap_text(self, text: str):
+         return [{"type": "text", "text": text}]
+
+     def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
+         think_html = ""
+         if reasoning_content and not skip_think:
+             think_content = html.escape(reasoning_content).replace("\n", "<br>")
+             think_html = (
+                 "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
+                 "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
+                 + think_content
+                 + "</div></details>"
+             )
+
+         answer_html = ""
+         if content:
+             content_escaped = html.escape(content)
+             content_formatted = content_escaped.replace("\n", "<br>")
+             answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"
+
+         return think_html + answer_html
+
+     def _build_messages(self, raw_hist, sys_prompt):
+         msgs = []
+         if sys_prompt.strip():
+             msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
+         for h in raw_hist:
+             if h["role"] == "user":
+                 msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
+             else:
+                 # Strip the rendered <details> "Thinking" block and any other HTML markup
+                 # before sending previous assistant turns back to the model.
+                 raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
+                 clean_content = self._strip_html(raw).strip()
+                 if clean_content:
+                     msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
+         return msgs
+
+     def stream_generate(self, raw_hist, sys_prompt: str, thinking_enabled: bool = True, temperature: float = 1.0):
+         global stop_generation
+         stop_generation = False
+         msgs = self._build_messages(raw_hist, sys_prompt)
+         reasoning_buffer = ""
+         content_buffer = ""
+
+         try:
+             for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
+                 if stop_generation:
+                     break
+
+                 if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
+                     reasoning_buffer += delta.reasoning_content
+                 elif hasattr(delta, 'content') and delta.content:
+                     content_buffer += delta.content
+                 else:
+                     if isinstance(delta, dict):
+                         if 'reasoning_content' in delta and delta['reasoning_content']:
+                             reasoning_buffer += delta['reasoning_content']
+                         if 'content' in delta and delta['content']:
+                             content_buffer += delta['content']
+                     elif hasattr(delta, 'content') and delta.content:
+                         content_buffer += delta.content
+
+                 # Re-render the full fragment on every delta so the UI always shows the
+                 # reasoning and answer accumulated so far.
+                 yield self._stream_fragment(reasoning_buffer, content_buffer, not thinking_enabled)
+
+         except Exception as e:
+             error_msg = f"Error during streaming: {str(e)}"
+             yield self._stream_fragment("", error_msg)
+
+
+ glm45 = GLM45Model()
+
+
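+ # Gradio streaming callback: yields (chat history, state copy, textbox value) tuples so the
+ # Chatbot updates as tokens arrive; the assistant placeholder appended to raw_hist is
+ # mutated in place on every chunk.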
+ def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
+     global stop_generation
+     stop_generation = False
+
+     if not msg.strip():
+         # chat is a generator, so yield (rather than return) the unchanged state.
+         yield raw_hist or [], copy.deepcopy(raw_hist or []), ""
+         return
+
+     user_rec = {"role": "user", "content": msg.strip()}
+     if raw_hist is None:
+         raw_hist = []
+     raw_hist.append(user_rec)
+     place = {"role": "assistant", "content": ""}
+     raw_hist.append(place)
+
+     yield raw_hist, copy.deepcopy(raw_hist), ""
+
+     try:
+         for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt, thinking_enabled, temperature):
+             if stop_generation:
+                 break
+             place["content"] = chunk
+             yield raw_hist, copy.deepcopy(raw_hist), ""
+     except Exception as e:
+         error_content = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
+         place["content"] = error_content
+         yield raw_hist, copy.deepcopy(raw_hist), ""
+
+     yield raw_hist, copy.deepcopy(raw_hist), ""
+
+
+ def reset():
+     global stop_generation
+     stop_generation = True
+     # Brief pause so any in-flight streaming loop can observe the stop flag.
+     time.sleep(0.1)
+     return [], [], ""
+
+
+ demo = gr.Blocks(title="GLM-4.5 API Space", theme=gr.themes.Soft())
+
+ with demo:
+     gr.Markdown(
+         "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.5 API Space</div>"
+         "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>"
+         "This Space uses the API version of the service for faster responses.<br>"
+         "Chat only. For tool use, MCP support, and web search, please refer to the API.</div>"
+         "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.5'>Model Hub</a> | "
+         "<a href='https://github.com/THUDM/GLM-4.5'>GitHub</a> | "
+         "<a href='https://www.bigmodel.cn'>API</a></div>"
+     )
+     raw_history = gr.State([])
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             chatbox = gr.Chatbot(
+                 label="Chat",
+                 type="messages",
+                 height=600,
+                 elem_classes="chatbot-container",
+                 sanitize_html=False,
+                 line_breaks=True
+             )
+             textbox = gr.Textbox(label="Message", lines=3)
+             with gr.Row():
+                 send = gr.Button("Send", variant="primary")
+                 clear = gr.Button("Clear")
+         with gr.Column(scale=1):
+             thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
+             gr.Markdown(
+                 "<div style='color:red;font-size:12px;margin-top:5px;margin-bottom:15px;'>"
+                 "ON: Enable model thinking.<br>"
+                 "OFF: Disable model thinking; the model answers directly without showing its reasoning."
+                 "</div>"
+             )
+             temperature_slider = gr.Slider(
+                 minimum=0.0,
+                 maximum=1.0,
+                 value=1.0,
+                 step=0.01,
+                 label="Temperature"
+             )
+             sys = gr.Textbox(label="System Prompt", lines=8)
+
+     send.click(
+         chat,
+         inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
+         outputs=[chatbox, raw_history, textbox]
+     )
+     textbox.submit(
+         chat,
+         inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
+         outputs=[chatbox, raw_history, textbox]
+     )
+     clear.click(
+         reset,
+         outputs=[chatbox, raw_history, textbox]
+     )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio==5.38.2
+ spaces>=0.37.1
+ torch==2.5.1
+ openai>=1.97.1
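
app.py constructs the client as OpenAI() with no arguments, so the API key and endpoint must come from the environment. Below is a minimal sketch of how the Space might be configured, assuming an OpenAI-compatible GLM-4.5 endpoint; the base URL and key are placeholders, not values from this commit.

import os
from openai import OpenAI

# Placeholder configuration; a real Space would set these as repository secrets.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key")
os.environ.setdefault("OPENAI_BASE_URL", "https://example.com/api/v1")  # OpenAI-compatible GLM endpoint (placeholder)

client = OpenAI()  # reads OPENAI_API_KEY and OPENAI_BASE_URL from the environment
resp = client.chat.completions.create(
    model="glm-4.5",
    messages=[{"role": "user", "content": "Hello"}],
    extra_body={"thinking": {"type": "enabled"}},  # same thinking switch app.py sends
)
print(resp.choices[0].message.content)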