import copy
import html
import re
import time

import gradio as gr
from openai import OpenAI

# Cooperative cancellation flag: checked inside the streaming loops so the
# "Clear" button can abort an in-flight generation.
stop_generation = False


def stream_from_vllm(messages, thinking_enabled=True, temperature=1.0):
    """Stream chat-completion deltas from the GLM-4.5 API.

    Args:
        messages: chat history in OpenAI message format.
        thinking_enabled: toggles the model's "thinking" mode via extra_body.
        temperature: sampling temperature.

    Yields:
        Each streamed ``delta`` object as it arrives; stops early when the
        global ``stop_generation`` flag is set.
    """
    global stop_generation
    client = OpenAI()
    response = client.chat.completions.create(
        model="GLM-4.5",
        messages=messages,
        temperature=temperature,
        stream=True,
        max_tokens=65536,
        extra_body={
            "thinking": {
                "type": "enabled" if thinking_enabled else "disabled",
            }
        },
    )
    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta


class GLM45Model:
    """Accumulates streamed model output and renders it as HTML.

    The model wraps its reasoning in ``<think>...</think>`` tags; this class
    splits that reasoning from the final answer and renders each part.
    """

    def __init__(self):
        self.reset_state()

    def reset_state(self):
        # Raw text accumulated from the stream so far; may contain an
        # unterminated <think> block while reasoning is still streaming.
        self.accumulated_text = ""

    def _strip_html(self, text: str) -> str:
        """Remove all HTML tags from *text* and trim whitespace."""
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        """Wrap plain text in the content-part list structure the API expects."""
        return [{"type": "text", "text": text}]

    def _parse_thinking_content(self, text: str):
        """Split *text* into ``(thinking_content, regular_content)``.

        A complete ``<think>...</think>`` block is extracted as thinking
        content.  If the closing tag has not arrived yet (mid-stream),
        everything after the opening tag is treated as in-progress thinking.
        """
        thinking_content = ""
        regular_content = ""
        if "<think>" in text:
            think_pattern = r"<think>(.*?)</think>"
            think_match = re.search(think_pattern, text, re.DOTALL)
            if think_match:
                thinking_content = think_match.group(1).strip()
                regular_content = re.sub(
                    think_pattern, "", text, flags=re.DOTALL
                ).strip()
            else:
                # Unterminated block: len("<think>") == 7, so everything after
                # the opening tag is reasoning still being streamed.
                think_start = text.find("<think>")
                thinking_content = text[think_start + 7:]
                regular_content = text[:think_start].strip()
        else:
            regular_content = text
        return thinking_content, regular_content

    def _render_response(self, thinking_content: str, regular_content: str,
                         skip_think: bool = False):
        """Render thinking (collapsible) and answer content as an HTML string.

        NOTE(review): the original inline styling markup was lost in this
        copy of the file; the tags below are a minimal reconstruction —
        confirm against the upstream demo for the exact classes/styles.
        """
        html_parts = []
        if thinking_content and not skip_think:
            thinking_escaped = html.escape(thinking_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary>💭 Thinking</summary>"
                "<div>" + thinking_escaped + "</div>"
                "</details>"
            )
            html_parts.append(think_html)
        if regular_content:
            content_escaped = html.escape(regular_content)
            content_formatted = content_escaped.replace("\n", "<br>")
            content_html = f"<div>{content_formatted}</div>"
            html_parts.append(content_html)
        return "".join(html_parts)

    def _build_messages(self, raw_hist, sys_prompt):
        """Convert UI history records into API messages.

        Assistant turns are stored as rendered HTML, so the thinking block is
        dropped and remaining tags are stripped to recover the plain answer.
        """
        msgs = []
        if sys_prompt.strip():
            msgs.append({
                "role": "system",
                "content": [{"type": "text", "text": sys_prompt.strip()}],
            })
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": self._wrap_text(h["content"])})
            else:
                # Drop the rendered thinking section before stripping tags.
                # NOTE(review): the original removal pattern was lost; this
                # reconstruction removes the <details> block produced by
                # _render_response — verify against the upstream demo.
                raw = re.sub(r"<details.*?</details>", "", h["content"],
                             flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({
                        "role": "assistant",
                        "content": self._wrap_text(clean_content),
                    })
        return msgs

    def stream_generate(self, raw_hist, sys_prompt: str,
                        thinking_enabled: bool = True, temperature: float = 1.0):
        """Stream rendered HTML snapshots of the growing assistant response."""
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        self.reset_state()
        try:
            for delta in stream_from_vllm(msgs, thinking_enabled, temperature):
                if stop_generation:
                    break
                # Deltas may be SDK objects or plain dicts depending on the
                # client version; support both.
                delta_content = ""
                if hasattr(delta, "content") and delta.content:
                    delta_content = delta.content
                elif isinstance(delta, dict) and delta.get("content"):
                    delta_content = delta["content"]
                if delta_content:
                    self.accumulated_text += delta_content
                    thinking_content, regular_content = \
                        self._parse_thinking_content(self.accumulated_text)
                    yield self._render_response(
                        thinking_content, regular_content, not thinking_enabled
                    )
        except Exception as e:
            error_msg = f"Error during streaming: {str(e)}"
            yield self._render_response("", error_msg)


glm45 = GLM45Model()


def chat(msg, raw_hist, sys_prompt, thinking_enabled, temperature):
    """Gradio handler: append the user turn, stream the assistant reply.

    Yields ``(chatbot_history, state_history, textbox_value)`` tuples; the
    textbox is cleared on the first yield.
    """
    global stop_generation
    stop_generation = False
    if raw_hist is None:
        raw_hist = []
    if not msg.strip():
        # Generator handler: a plain `return` of values is discarded by
        # Gradio, so yield the unchanged state instead.
        yield raw_hist, copy.deepcopy(raw_hist), ""
        return
    user_rec = {"role": "user", "content": msg.strip()}
    raw_hist.append(user_rec)
    # Placeholder record that is mutated in place as chunks arrive.
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)
    yield raw_hist, copy.deepcopy(raw_hist), ""
    try:
        # Exclude the empty placeholder from the history sent to the model.
        for chunk in glm45.stream_generate(raw_hist[:-1], sys_prompt,
                                           thinking_enabled, temperature):
            if stop_generation:
                break
            place["content"] = chunk
            yield raw_hist, copy.deepcopy(raw_hist), ""
    except Exception as e:
        error_content = f"<div>Error: {html.escape(str(e))}</div>"
        place["content"] = error_content
        yield raw_hist, copy.deepcopy(raw_hist), ""
    yield raw_hist, copy.deepcopy(raw_hist), ""


def reset():
    """Stop any in-flight generation and clear chat state."""
    global stop_generation
    stop_generation = True
    # Give the streaming loop a moment to observe the flag.
    time.sleep(0.1)
    return [], [], ""


demo = gr.Blocks(title="GLM-4.5 API Demo", theme=gr.themes.Soft())
with demo:
    # NOTE(review): the original header HTML (including <a href> links for
    # Model Hub / Github / Blog / API) was stripped from this copy; the
    # markup below is a minimal reconstruction — restore the real URLs.
    gr.HTML(
        "<div style='text-align:center'>"
        "<h1>GLM-4.5 API Demo</h1>"
        "<p>This demo uses the API version of the service for faster response.<br>"
        "Chat only. For tool use, MCP support, and web search, "
        "please refer to the API.</p>"
        "<p>Model Hub | Github | Blog | API</p>"
        "</div>"
    )
    raw_history = gr.State([])
    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            thinking_toggle = gr.Checkbox(label="Enable Thinking", value=True)
            gr.HTML(
                "<div>"
                "ON: Enable model thinking.<br>"
                "OFF: Not enable model thinking, the model will directly "
                "answer the question without reasoning."
                "</div>"
            )
            temperature_slider = gr.Slider(
                minimum=0.0, maximum=1.0, value=1.0, step=0.01,
                label="Temperature",
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    send.click(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    textbox.submit(
        chat,
        inputs=[textbox, raw_history, sys, thinking_toggle, temperature_slider],
        outputs=[chatbox, raw_history, textbox],
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, textbox],
    )

if __name__ == "__main__":
    demo.launch()