Update app.py
app.py
CHANGED
@@ -1,14 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-Minimalist chat (a single send button):
-- Attachments via an icon in the input field (messenger-style).
-- Pipeline: Florence-2 (NIM API) → GPT-OSS (NVIDIA Integrate).
-- At the bottom: unprocessed raw Florence output for debugging.
-- No WebGPU/wasm.
-
-Required in Secrets: NV_API_KEY
-"""

 import os
 import io
@@ -94,12 +85,6 @@ def _deep_text_candidates(obj: Any) -> List[str]:
     return out

 def _debug_dump_from_response(resp: requests.Response) -> str:
-    """
-    Returns the rawest possible technical output:
-    - status, content-type, length
-    - if JSON: raw text
-    - if ZIP: the list of files and raw JSON/TXT contents
-    """
     lines = []
     data = resp.content
     ct = (resp.headers.get("content-type") or "").lower()
@@ -109,7 +94,6 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
     lines.append(f"content-type: {ct}")
     lines.append(f"bytes: {len(data)}")

-    # JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             raw = resp.text
@@ -119,14 +103,12 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
         lines.append(raw)
         return "\n".join(lines)

-    # ZIP
     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         lines.append("--- ZIP CONTENTS ---")
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
                 for name in z.namelist():
                     lines.append(f"* {name}")
-                # Raw JSON/TXT
                 for name in z.namelist():
                     low = name.lower()
                     if low.endswith(".json") or low.endswith(".txt"):
@@ -140,7 +122,6 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
             lines.append(f"[zip parse error: {e}]")
         return "\n".join(lines)

-    # Fallback: just dump the text contents
     try:
         txt = data.decode("utf-8", errors="ignore")
     except Exception:
@@ -150,13 +131,9 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
     return "\n".join(lines)

 def _parse_vlm_text(resp: requests.Response) -> str:
-    """
-    Extract the best text (if any).
-    """
     data = resp.content
     ct = (resp.headers.get("content-type") or "").lower()

-    # JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             obj = resp.json()
@@ -165,11 +142,9 @@ def _parse_vlm_text(resp: requests.Response) -> str:
         except Exception:
             return ""

-    # ZIP → look for JSON/TXT
     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
-                # JSON takes priority
                 for name in z.namelist():
                     if not name.lower().endswith(".json"):
                         continue
@@ -181,7 +156,6 @@ def _parse_vlm_text(resp: requests.Response) -> str:
                             return cands[0]
                     except Exception:
                         pass
-                # then TXT
                 for name in z.namelist():
                     if name.lower().endswith(".txt"):
                         try:
@@ -194,16 +168,12 @@ def _parse_vlm_text(resp: requests.Response) -> str:
         except Exception:
             return ""

-    # Fallback
     try:
         return data.decode("utf-8", errors="ignore").strip()
     except Exception:
         return ""

 def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, str]:
-    """
-    Returns (best_text, raw_debug_dump)
-    """
     content = _vlm_content(task_token, asset_id, text_prompt)
     payload = {"messages": [{"role": "user", "content": content}]}
     headers = {
@@ -214,7 +184,6 @@ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] =
         "NVCF-FUNCTION-ASSET-IDS": asset_id,
     }
     resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
-    # Raw debug dump, even if the status is not 200
     raw_dump = _debug_dump_from_response(resp) if resp is not None else "[no response]"
     if not resp.ok:
         return f"[VLM HTTP {resp.status_code}]", raw_dump
@@ -225,10 +194,6 @@ def _is_good(text: str) -> bool:
     return isinstance(text, str) and len(text.strip()) >= 3 and "изображений-результатов" not in text.lower()

 def get_caption_with_debug(image_path: str) -> Tuple[str, str, str]:
-    """
-    Try <MORE_DETAILED_CAPTION> → <DETAILED_CAPTION> → <CAPTION> → <OCR>.
-    Returns (caption, asset_id, debug_raw_all_attempts)
-    """
     asset_id = nvcf_upload_asset(image_path)
     attempts = [
         ("<MORE_DETAILED_CAPTION>", None),
@@ -271,15 +236,11 @@ def _extract_text_from_stream_chunk(chunk: Any) -> str:
 # --------------------- Chat logic ---------------------
 def respond(
     message: Dict[str, Any],
-    chat_history: List[
+    chat_history: List[Dict[str, str]],
     last_caption: str,
     last_asset_id: str,
     last_debug: str
 ):
-    """
-    message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
-    Pipeline: if there is an image → Florence (caption + raw) → GPT-OSS.
-    """
     text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
     files = (message or {}).get("files", []) if isinstance(message, dict) else []

@@ -296,7 +257,6 @@ def respond(

     img_path = first_image_path(files)

-    # The user's visible message
     parts = []
     if text and text.strip():
         parts.append(text.strip())
@@ -305,24 +265,22 @@ def respond(
     user_visible = "\n".join(parts) if parts else "🖐️"

     chat_history = chat_history or []
-    chat_history.append(
-
+    chat_history.append({"role": "user", "content": user_visible})
+    chat_history.append({"role": "assistant", "content": ""})
     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id, (last_debug or "")

     caption = last_caption or ""
     asset_id = last_asset_id or ""
     debug_raw = last_debug or ""

-    # Always: if the request includes an image, run Florence first
     if img_path:
-        chat_history[-1][
+        chat_history[-1]["content"] = "🔎 Обрабатываю изображение во Florence…"
         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
         try:
             caption, asset_id, debug_raw = get_caption_with_debug(img_path)
         except Exception as e:
             caption, debug_raw = "", f"[Florence error] {e}"

-    # System prompt
     if caption:
         system_prompt = (
             "You are a helpful multimodal assistant. "
@@ -338,10 +296,8 @@ def respond(
             "If the user refers to an image but no caption is available, ask them to reattach the image."
         )

-    # User text for the model
     user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")

-    # LLM streaming
     assistant_accum = ""
     try:
         stream = llm.chat.completions.create(
@@ -360,11 +316,9 @@ def respond(
             if not piece:
                 continue
             assistant_accum += piece
-            chat_history[-1][
-            # Show the raw Florence output at the bottom (fixed per request)
+            chat_history[-1]["content"] = assistant_accum
             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
-    except Exception
-        # Fallback without streaming
+    except Exception:
         try:
             resp = llm.chat.completions.create(
                 model="openai/gpt-oss-120b",
@@ -392,10 +346,10 @@ def respond(
                 final_text = str(resp)
             else:
                 final_text = str(resp)
-            chat_history[-1][
+            chat_history[-1]["content"] = final_text
             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
         except Exception as e2:
-            chat_history[-1][
+            chat_history[-1]["content"] = f"[Ошибка LLM: {e2}]"
             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")

 # --------------------- Interface ---------------------
@@ -410,7 +364,7 @@ messenger_css = """
 #bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
 #send { min-width: 42px; max-width: 42px; height: 42px; border-radius: 999px; }
 #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
-#raw-
+#raw-wrap .wrap>label { font-weight: 600; }
 """

 theme = gr.themes.Soft(
@@ -430,7 +384,7 @@ with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
     debug_state = gr.State(value="")

     with gr.Group(elem_id="chat-wrap"):
-        chatbot = gr.Chatbot(label="", height=520, elem_id="chat")
+        chatbot = gr.Chatbot(label="", height=520, elem_id="chat", type="messages")

     with gr.Row(elem_id="bottom-bar"):
         msg = gr.MultimodalTextbox(
@@ -440,16 +394,14 @@ with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
         )
         send = gr.Button("➤", variant="primary", elem_id="send")

-
-    with gr.Box(elem_id="raw-box"):
+    with gr.Accordion("Raw Florence output", open=True, elem_id="raw-wrap"):
         raw_out = gr.Textbox(
-            label="
+            label="",
             value="",
             lines=14,
             show_copy_button=True
         )

-    # Single submission path: via Enter and via the button (one visible send button)
     msg.submit(
         respond,
         inputs=[msg, chatbot, caption_state, asset_state, debug_state],