zRzRzRzRzRzRzR committed
Commit ec636e9 · 1 Parent(s): 526a117
Files changed (3)
  1. app.py +335 -0
  2. packages.txt +1 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,335 @@
import copy
import os
import re
import subprocess
import tempfile
import base64
import time
import html
from pathlib import Path

import fitz  # PyMuPDF, used to rasterize PDF pages
import gradio as gr
from openai import OpenAI

stop_generation = False


def stream_from_vllm(messages):
    global stop_generation
    # The OpenAI client falls back to the OPENAI_API_KEY environment
    # variable when api_key is not passed explicitly.
    client = OpenAI(base_url="https://open.bigmodel.cn/api/paas/v4")

    response = client.chat.completions.create(
        model="GLM-4.1V-Thinking-Flash",
        messages=messages,
        temperature=0.01,
        stream=True,
        max_tokens=8192,
    )

    for chunk in response:
        if stop_generation:
            break
        if chunk.choices and chunk.choices[0].delta:
            yield chunk.choices[0].delta
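
# A minimal consumption sketch, assuming the bigmodel.cn endpoint mirrors
# the OpenAI streaming schema (each delta may carry `reasoning_content`
# during the thinking phase and `content` for the final answer):
#
#     for delta in stream_from_vllm([{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]):
#         piece = getattr(delta, "reasoning_content", None) or getattr(delta, "content", None)
#         if piece:
#             print(piece, end="", flush=True)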

class GLM4VModel:
    def _strip_html(self, text: str) -> str:
        return re.sub(r"<[^>]+>", "", text).strip()

    def _wrap_text(self, text: str):
        return [{"type": "text", "text": text}]

    def _image_to_base64(self, image_path):
        # Encode the image as a data URL, e.g. "data:image/png;base64,...".
        with open(image_path, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
        mime_types = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".bmp": "image/bmp",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
            ".webp": "image/webp",
        }
        mime_type = mime_types.get(Path(image_path).suffix.lower(), "image/jpeg")
        return f"data:{mime_type};base64,{encoded_string}"

    def _pdf_to_imgs(self, pdf_path):
        # Rasterize each PDF page to a PNG under the system temp directory.
        doc = fitz.open(pdf_path)
        imgs = []
        for i in range(doc.page_count):
            pix = doc.load_page(i).get_pixmap(dpi=180)
            img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
            pix.save(img_p)
            imgs.append(img_p)
        doc.close()
        return imgs

    def _ppt_to_imgs(self, ppt_path):
        # Convert the deck to PDF with headless LibreOffice (installed via
        # packages.txt), then reuse the PDF-to-image path.
        tmp = tempfile.mkdtemp()
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
            check=True,
        )
        pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
        return self._pdf_to_imgs(pdf_path)

    def _files_to_content(self, media):
        out = []
        for f in media or []:
            # gr.File(type="filepath") yields plain path strings; older
            # Gradio versions yield tempfile wrappers with a .name attribute.
            path = f if isinstance(f, str) else f.name
            ext = Path(path).suffix.lower()
            if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
                out.append({"type": "video_url", "video_url": {"url": f"file://{path}"}})
            elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
                out.append({"type": "image_url", "image_url": {"url": self._image_to_base64(path)}})
            elif ext in [".ppt", ".pptx"]:
                for p in self._ppt_to_imgs(path):
                    out.append({"type": "image_url", "image_url": {"url": self._image_to_base64(p)}})
            elif ext == ".pdf":
                for p in self._pdf_to_imgs(path):
                    out.append({"type": "image_url", "image_url": {"url": self._image_to_base64(p)}})
        return out
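
    # Shape sketch: one uploaded image plus typed text (the text part is
    # appended later, in chat()) yields roughly
    #
    #     [{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #      {"type": "text", "text": "Describe this picture."}]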

    def _stream_fragment(self, reasoning_content: str = "", content: str = "", skip_think: bool = False):
        think_html = ""
        if reasoning_content and not skip_think:
            # Escape the thinking trace and render it as a collapsible block.
            think_content = html.escape(reasoning_content).replace("\n", "<br>")
            think_html = (
                "<details open><summary style='cursor:pointer;font-weight:bold;color:#007acc;'>💭 Thinking</summary>"
                "<div style='color:#555555;line-height:1.6;padding:15px;border-left:4px solid #007acc;margin:10px 0;background-color:#f0f7ff;border-radius:4px;'>"
                + think_content
                + "</div></details>"
            )

        answer_html = ""
        if content:
            # Escape the answer and convert newlines to HTML line breaks.
            content_formatted = html.escape(content).replace("\n", "<br>")
            answer_html = f"<div style='margin:0.5em 0; white-space: pre-wrap; line-height:1.6;'>{content_formatted}</div>"

        return think_html + answer_html
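
    # For example, _stream_fragment("step 1", "Answer") renders a collapsible
    # "💭 Thinking" <details> block followed by the escaped answer <div>.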

    def _build_messages(self, raw_hist, sys_prompt):
        msgs = []
        if sys_prompt.strip():
            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
        for h in raw_hist:
            if h["role"] == "user":
                msgs.append({"role": "user", "content": h["content"]})
            else:
                # Strip the rendered thinking block and any other HTML from
                # earlier assistant turns before replaying them to the API.
                raw = re.sub(r"<details.*?</details>", "", h["content"], flags=re.DOTALL)
                clean_content = self._strip_html(raw).strip()
                if clean_content:
                    msgs.append({"role": "assistant", "content": self._wrap_text(clean_content)})
        return msgs
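
    # After one full exchange the replayed context looks roughly like:
    #
    #     [{"role": "system", "content": [{"type": "text", "text": "..."}]},
    #      {"role": "user", "content": [...multimodal parts...]},
    #      {"role": "assistant", "content": [{"type": "text", "text": "answer, thinking stripped"}]}]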

    def stream_generate(self, raw_hist, sys_prompt: str, *, skip_special_tokens: bool = False):
        global stop_generation
        stop_generation = False
        msgs = self._build_messages(raw_hist, sys_prompt)
        reasoning_buffer = ""
        content_buffer = ""

        try:
            for delta in stream_from_vllm(msgs):
                if stop_generation:
                    break

                # Deltas usually arrive as objects carrying reasoning_content
                # (thinking phase) and content (answer phase); fall back to
                # dict access for clients that return plain dictionaries.
                if hasattr(delta, "reasoning_content") and delta.reasoning_content:
                    reasoning_buffer += delta.reasoning_content
                elif hasattr(delta, "content") and delta.content:
                    content_buffer += delta.content
                elif isinstance(delta, dict):
                    if delta.get("reasoning_content"):
                        reasoning_buffer += delta["reasoning_content"]
                    if delta.get("content"):
                        content_buffer += delta["content"]

                yield self._stream_fragment(reasoning_buffer, content_buffer)

        except Exception as e:
            yield self._stream_fragment("", f"Error during streaming: {e}")

def format_display_content(content):
    # Collapse a multimodal content list into a short text summary for the
    # chat window; plain string content passes through unchanged.
    if isinstance(content, list):
        text_parts = []
        file_count = 0
        for item in content:
            if item["type"] == "text":
                text_parts.append(item["text"])
            else:
                file_count += 1
        display_text = " ".join(text_parts)
        if file_count > 0:
            return f"[{file_count} file(s) uploaded]\n{display_text}"
        return display_text
    return content


def create_display_history(raw_hist):
    # Raw history keeps the full multimodal payloads; the Chatbot only needs
    # text summaries for user turns.
    display_hist = []
    for h in raw_hist:
        if h["role"] == "user":
            display_hist.append({"role": "user", "content": format_display_content(h["content"])})
        else:
            display_hist.append({"role": "assistant", "content": h["content"]})
    return display_hist
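
# For example, a user turn whose raw content is
# [{"type": "image_url", ...}, {"type": "text", "text": "hello"}]
# is shown in the chat window as "[1 file(s) uploaded]\nhello".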

glm4v = GLM4VModel()


def check_files(files):
    # Enforce the demo's upload limits: at most one video, one PPT, or one
    # PDF, at most 10 images, and no mixing of media types.
    vids = imgs = ppts = pdfs = 0
    for f in files or []:
        path = f if isinstance(f, str) else f.name
        ext = Path(path).suffix.lower()
        if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
            vids += 1
        elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
            imgs += 1
        elif ext in [".ppt", ".pptx"]:
            ppts += 1
        elif ext == ".pdf":
            pdfs += 1
    if vids > 1 or ppts > 1 or pdfs > 1:
        return False, "Only one video, one PPT, or one PDF is allowed"
    if imgs > 10:
        return False, "Maximum of 10 images allowed"
    if ((ppts or pdfs) and (vids or imgs)) or (vids and imgs):
        return False, "Cannot mix documents, videos, and images"
    return True, ""
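
# check_files returns (ok, message); for example, twelve uploaded images
# yield (False, "Maximum of 10 images allowed").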

def chat(files, msg, raw_hist, sys_prompt):
    global stop_generation
    stop_generation = False
    ok, err = check_files(files)
    if not ok:
        raw_hist.append({"role": "assistant", "content": err})
        yield create_display_history(raw_hist), copy.deepcopy(raw_hist), None, ""
        return

    # Build the user turn: uploaded files first, then the typed text.
    payload = glm4v._files_to_content(files) if files else None
    if msg.strip():
        if payload is None:
            payload = glm4v._wrap_text(msg.strip())
        else:
            payload.append({"type": "text", "text": msg.strip()})

    user_rec = {"role": "user", "content": payload if payload else msg.strip()}
    if raw_hist is None:
        raw_hist = []
    raw_hist.append(user_rec)
    place = {"role": "assistant", "content": ""}
    raw_hist.append(place)

    yield create_display_history(raw_hist), copy.deepcopy(raw_hist), None, ""

    try:
        # Stream into the placeholder turn; raw_hist[:-1] keeps the empty
        # placeholder out of the context sent to the model.
        for chunk in glm4v.stream_generate(raw_hist[:-1], sys_prompt):
            if stop_generation:
                break
            place["content"] = chunk
            yield create_display_history(raw_hist), copy.deepcopy(raw_hist), None, ""
    except Exception as e:
        place["content"] = f"<div style='color: red;'>Error: {html.escape(str(e))}</div>"
        yield create_display_history(raw_hist), copy.deepcopy(raw_hist), None, ""

    yield create_display_history(raw_hist), copy.deepcopy(raw_hist), None, ""

def reset():
    # Signal any in-flight stream to stop, give it a moment, then clear the UI.
    global stop_generation
    stop_generation = True
    time.sleep(0.1)
    return [], [], None, ""

demo = gr.Blocks(title="GLM-4.1V-9B-Thinking", theme=gr.themes.Soft())

with demo:
    gr.Markdown(
        "<div style='text-align:center;font-size:32px;font-weight:bold;margin-bottom:10px;'>GLM-4.1V-9B-Thinking</div>"
        "<div style='text-align:center;color:red;font-size:16px;margin-bottom:20px;'>This demo uses the API version of the service for faster responses.</div>"
        "<div style='text-align:center;'><a href='https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking'>Model Hub</a> | "
        "<a href='https://github.com/THUDM/GLM-4.1V-Thinking'>GitHub</a> | "
        "<a href='https://arxiv.org/abs/2507.01006'>Paper</a> | "
        "<a href='https://www.bigmodel.cn/dev/api/visual-reasoning-model/GLM-4.1V-Thinking'>API</a> | "
        "<a href='https://huggingface.co/spaces/THUDM/GLM-4.1V-9B-Thinking-Demo'>GPU Local Demo</a></div>"
    )
    raw_history = gr.State([])

    with gr.Row():
        with gr.Column(scale=7):
            chatbox = gr.Chatbot(
                label="Chat",
                type="messages",
                height=600,
                elem_classes="chatbot-container",
                sanitize_html=False,
                line_breaks=True,
            )
            textbox = gr.Textbox(label="Message", lines=3)
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=3):
            up = gr.File(label="Upload Files", file_count="multiple", file_types=["file"], type="filepath")
            gr.Markdown("Supports images / videos / PPT / PDF")
            gr.Markdown(
                "This demo supports at most 10 images or one video/PPT/PDF (fewer than 10 pages). "
                "Upload only one file type at a time (images, a video, a PDF, or a PPT)."
            )
            sys = gr.Textbox(label="System Prompt", lines=6)

    send.click(
        chat,
        inputs=[up, textbox, raw_history, sys],
        outputs=[chatbox, raw_history, up, textbox],
    )
    textbox.submit(
        chat,
        inputs=[up, textbox, raw_history, sys],
        outputs=[chatbox, raw_history, up, textbox],
    )
    clear.click(
        reset,
        outputs=[chatbox, raw_history, up, textbox],
    )

if __name__ == "__main__":
    demo.launch()
packages.txt ADDED
@@ -0,0 +1 @@
libreoffice
requirements.txt ADDED
@@ -0,0 +1,7 @@
gradio==5.25.0
spaces>=0.37.1
PyMuPDF>=1.26.1
torchvision==0.20.1
torch==2.5.1
av>=14.4.0
openai
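
To try this Space locally (a sketch, assuming a bigmodel.cn API key, which the OpenAI client reads from the OPENAI_API_KEY environment variable, and LibreOffice installed as listed in packages.txt): pip install -r requirements.txt, then OPENAI_API_KEY=<your-key> python app.py.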