Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,13 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
"""
|
4 |
-
|
5 |
-
-
|
6 |
-
- Florence-2 (NIM API)
|
7 |
-
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
Требуется: NV_API_KEY в Secrets HF Space.
|
13 |
"""
|
14 |
|
15 |
import os
|
@@ -17,7 +15,6 @@ import io
|
|
17 |
import json
|
18 |
import zipfile
|
19 |
import mimetypes
|
20 |
-
import traceback
|
21 |
from typing import Any, Dict, List, Optional, Tuple
|
22 |
|
23 |
import requests
|
@@ -35,7 +32,7 @@ if not NV_API_KEY:
|
|
35 |
|
36 |
llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)
|
37 |
|
38 |
-
# --------------------- Florence
|
39 |
def _guess_mime(path: str) -> str:
|
40 |
return mimetypes.guess_type(path)[0] or "image/jpeg"
|
41 |
|
@@ -67,7 +64,6 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
|
|
67 |
return asset_id
|
68 |
|
69 |
def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> str:
|
70 |
-
# "<TASK_PROMPT><text_prompt (когда нужен)><img>"
|
71 |
parts = [task_token]
|
72 |
if text_prompt and text_prompt.strip():
|
73 |
parts.append(text_prompt.strip())
|
@@ -77,19 +73,15 @@ def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = No
|
|
77 |
PRIORITY_TEXT_KEYS = [
|
78 |
"more_detailed_caption", "detailed_caption", "caption",
|
79 |
"generated_text", "text", "ocr", "description",
|
80 |
-
"output_text", "result_text",
|
81 |
]
|
82 |
-
LABEL_KEYS = ["label", "name", "category", "class", "text"]
|
83 |
|
84 |
def _deep_text_candidates(obj: Any) -> List[str]:
|
85 |
out = []
|
86 |
def walk(o):
|
87 |
if isinstance(o, dict):
|
88 |
-
# Сначала — приоритетные ключи
|
89 |
for k in PRIORITY_TEXT_KEYS:
|
90 |
if k in o and isinstance(o[k], str) and o[k].strip():
|
91 |
out.append(o[k].strip())
|
92 |
-
# Затем любые строковые поля
|
93 |
for v in o.values():
|
94 |
walk(v)
|
95 |
elif isinstance(o, list):
|
@@ -101,118 +93,117 @@ def _deep_text_candidates(obj: Any) -> List[str]:
|
|
101 |
walk(obj)
|
102 |
return out
|
103 |
|
104 |
-
def
|
105 |
"""
|
106 |
-
|
107 |
-
|
|
|
|
|
108 |
"""
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
-
def
|
139 |
"""
|
140 |
-
|
141 |
-
Если нечего извлечь — best_text = "" (важно для фолбэков).
|
142 |
"""
|
143 |
-
listing = []
|
144 |
-
ct = (resp.headers.get("content-type") or "").lower()
|
145 |
data = resp.content
|
|
|
146 |
|
147 |
-
# JSON
|
148 |
if "application/json" in ct and not data.startswith(b"PK"):
|
149 |
try:
|
150 |
obj = resp.json()
|
151 |
cands = _deep_text_candidates(obj)
|
152 |
-
if cands
|
153 |
-
return cands[0], listing
|
154 |
-
synth = _synthesize_from_detections(obj)
|
155 |
-
return (synth or ""), listing
|
156 |
except Exception:
|
157 |
-
|
158 |
|
159 |
-
# ZIP
|
160 |
if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
|
161 |
try:
|
162 |
with zipfile.ZipFile(io.BytesIO(data), "r") as z:
|
163 |
-
|
164 |
-
|
165 |
-
synth_cand = None
|
166 |
-
# Сначала попробуем JSON
|
167 |
-
for name in listing:
|
168 |
if not name.lower().endswith(".json"):
|
169 |
continue
|
170 |
try:
|
171 |
with z.open(name) as f:
|
172 |
obj = json.loads(f.read().decode("utf-8", errors="ignore"))
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
except Exception:
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
# Затем TXT
|
181 |
-
for name in listing:
|
182 |
if name.lower().endswith(".txt"):
|
183 |
try:
|
184 |
with z.open(name) as f:
|
185 |
txt = f.read().decode("utf-8", errors="ignore").strip()
|
186 |
if txt:
|
187 |
-
return txt
|
188 |
except Exception:
|
189 |
-
|
190 |
-
# Если ничего — попробуем синтез из детекций
|
191 |
-
if synth_cand:
|
192 |
-
return synth_cand, listing
|
193 |
except Exception:
|
194 |
-
|
195 |
|
196 |
-
#
|
197 |
try:
|
198 |
-
|
199 |
-
return (txt if txt else ""), listing
|
200 |
except Exception:
|
201 |
-
return ""
|
202 |
-
|
203 |
-
def _is_good_caption(text: str) -> bool:
|
204 |
-
if not text:
|
205 |
-
return False
|
206 |
-
t = text.strip()
|
207 |
-
if not t or len(t) < 3:
|
208 |
-
return False
|
209 |
-
# Отбросим наши старые плейсхолдеры
|
210 |
-
bad_markers = [
|
211 |
-
"Получено", "изображений-результатов", "[Result empty]", "[Результат пуст]"
|
212 |
-
]
|
213 |
-
return not any(m.lower() in t.lower() for m in bad_markers)
|
214 |
|
215 |
-
def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str,
|
|
|
|
|
|
|
216 |
content = _vlm_content(task_token, asset_id, text_prompt)
|
217 |
payload = {"messages": [{"role": "user", "content": content}]}
|
218 |
headers = {
|
@@ -223,15 +214,20 @@ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] =
|
|
223 |
"NVCF-FUNCTION-ASSET-IDS": asset_id,
|
224 |
}
|
225 |
resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
|
|
|
|
|
226 |
if not resp.ok:
|
227 |
-
|
228 |
-
text
|
229 |
-
return text,
|
230 |
|
231 |
-
def
|
|
|
|
|
|
|
232 |
"""
|
233 |
-
|
234 |
-
Возвращает (caption, asset_id,
|
235 |
"""
|
236 |
asset_id = nvcf_upload_asset(image_path)
|
237 |
attempts = [
|
@@ -240,17 +236,13 @@ def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
|
|
240 |
("<CAPTION>", None),
|
241 |
("<OCR>", None),
|
242 |
]
|
243 |
-
|
244 |
-
for
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
except Exception:
|
251 |
-
continue
|
252 |
-
# Если совсем ничего — пустая строка (важно для чата)
|
253 |
-
return "", asset_id, last_listing
|
254 |
|
255 |
# --------------------- LLM streaming utils ---------------------
|
256 |
def _extract_text_from_stream_chunk(chunk: Any) -> str:
|
@@ -281,10 +273,12 @@ def respond(
|
|
281 |
message: Dict[str, Any],
|
282 |
chat_history: List[List[str]],
|
283 |
last_caption: str,
|
284 |
-
last_asset_id: str
|
|
|
285 |
):
|
286 |
"""
|
287 |
message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
|
|
|
288 |
"""
|
289 |
text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
|
290 |
files = (message or {}).get("files", []) if isinstance(message, dict) else []
|
@@ -302,7 +296,7 @@ def respond(
|
|
302 |
|
303 |
img_path = first_image_path(files)
|
304 |
|
305 |
-
#
|
306 |
parts = []
|
307 |
if text and text.strip():
|
308 |
parts.append(text.strip())
|
@@ -312,32 +306,27 @@ def respond(
|
|
312 |
|
313 |
chat_history = chat_history or []
|
314 |
chat_history.append([user_visible, ""])
|
315 |
-
|
|
|
316 |
|
317 |
-
# Подпись к изображению
|
318 |
caption = last_caption or ""
|
319 |
asset_id = last_asset_id or ""
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
caption = ""
|
331 |
-
# Лаконично сигналим об ошибке в подкапоте
|
332 |
-
chat_history[-1][1] = f"⚠️ Не удалось получить подпись: {e}"
|
333 |
-
yield {"text": "", "files": []}, chat_history, caption, asset_id
|
334 |
|
335 |
-
# Системный промпт
|
336 |
if caption:
|
337 |
system_prompt = (
|
338 |
"You are a helpful multimodal assistant. "
|
339 |
"Use the provided 'More Detailed Caption' as visual context. "
|
340 |
-
"Do not reveal your chain-of-thought. "
|
341 |
"If something is not visible or uncertain, say so.\n\n"
|
342 |
"Image Caption START >>>\n"
|
343 |
f"{caption}\n"
|
@@ -346,11 +335,10 @@ def respond(
|
|
346 |
else:
|
347 |
system_prompt = (
|
348 |
"You are a helpful assistant. "
|
349 |
-
"If the user refers to an image but no caption is available, ask them to reattach the image.
|
350 |
-
"Do not reveal your chain-of-thought."
|
351 |
)
|
352 |
|
353 |
-
#
|
354 |
user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")
|
355 |
|
356 |
# Стрим LLM
|
@@ -373,10 +361,10 @@ def respond(
|
|
373 |
continue
|
374 |
assistant_accum += piece
|
375 |
chat_history[-1][1] = assistant_accum
|
376 |
-
|
377 |
-
|
378 |
-
except Exception:
|
379 |
-
#
|
380 |
try:
|
381 |
resp = llm.chat.completions.create(
|
382 |
model="openai/gpt-oss-120b",
|
@@ -405,73 +393,72 @@ def respond(
|
|
405 |
else:
|
406 |
final_text = str(resp)
|
407 |
chat_history[-1][1] = final_text
|
408 |
-
yield {"text": "", "files": []}, chat_history, caption, asset_id
|
409 |
except Exception as e2:
|
410 |
chat_history[-1][1] = f"[Ошибка LLM: {e2}]"
|
411 |
-
yield {"text": "", "files": []}, chat_history, caption, asset_id
|
412 |
|
413 |
# --------------------- Интерфейс ---------------------
|
414 |
messenger_css = """
|
415 |
:root {
|
416 |
--radius-xl: 16px;
|
417 |
-
--radius-lg: 14px;
|
418 |
}
|
419 |
-
.gradio-container { max-width:
|
420 |
-
#title { text-align: center; padding: 8px 0 10px; font-size:
|
421 |
-
#chat-wrap { border: 1px solid rgba(0,0,0,0.
|
422 |
-
#chat { height:
|
423 |
#bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
|
424 |
-
#send { min-width:
|
425 |
#msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
|
426 |
-
|
427 |
"""
|
428 |
|
429 |
theme = gr.themes.Soft(
|
430 |
primary_hue="cyan",
|
431 |
neutral_hue="slate",
|
432 |
).set(
|
433 |
-
body_text_color_subdued="#6b7280",
|
434 |
button_large_radius="999px",
|
435 |
button_small_radius="999px",
|
436 |
block_radius="16px",
|
437 |
)
|
438 |
|
439 |
with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
|
440 |
-
gr.Markdown("✨ <div id='title'
|
441 |
|
442 |
caption_state = gr.State(value="")
|
443 |
asset_state = gr.State(value="")
|
|
|
444 |
|
445 |
with gr.Group(elem_id="chat-wrap"):
|
446 |
-
chatbot = gr.Chatbot(label="", height=
|
447 |
|
448 |
with gr.Row(elem_id="bottom-bar"):
|
449 |
msg = gr.MultimodalTextbox(
|
450 |
show_label=False,
|
451 |
-
placeholder="Напишите
|
452 |
elem_id="msg",
|
453 |
)
|
454 |
send = gr.Button("➤", variant="primary", elem_id="send")
|
455 |
|
456 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
457 |
msg.submit(
|
458 |
respond,
|
459 |
-
inputs=[msg, chatbot, caption_state, asset_state],
|
460 |
-
outputs=[msg, chatbot, caption_state, asset_state]
|
461 |
)
|
462 |
send.click(
|
463 |
respond,
|
464 |
-
inputs=[msg, chatbot, caption_state, asset_state],
|
465 |
-
outputs=[msg, chatbot, caption_state, asset_state]
|
466 |
-
)
|
467 |
-
|
468 |
-
# Очистка
|
469 |
-
def clear_all():
|
470 |
-
return {"text": "", "files": []}, [], "", ""
|
471 |
-
gr.Button("Очистить", variant="secondary").click(
|
472 |
-
clear_all,
|
473 |
-
inputs=None,
|
474 |
-
outputs=[msg, chatbot, caption_state, asset_state]
|
475 |
)
|
476 |
|
477 |
if __name__ == "__main__":
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
"""
|
4 |
+
Минималистичный чат (одна кнопка отправки):
|
5 |
+
- Вложения через иконку в поле ввода (как в мессенджерах).
|
6 |
+
- Пайплайн: Florence-2 (NIM API) → GPT-OSS (NVIDIA Integrate).
|
7 |
+
- Внизу: необработанный сырой вывод Florence для отладки.
|
8 |
+
- Без WebGPU/wasm.
|
9 |
+
|
10 |
+
Требуется в Secrets: NV_API_KEY
|
|
|
|
|
11 |
"""
|
12 |
|
13 |
import os
|
|
|
15 |
import json
|
16 |
import zipfile
|
17 |
import mimetypes
|
|
|
18 |
from typing import Any, Dict, List, Optional, Tuple
|
19 |
|
20 |
import requests
|
|
|
32 |
|
33 |
llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)
|
34 |
|
35 |
+
# --------------------- Florence utils ---------------------
|
36 |
def _guess_mime(path: str) -> str:
    """Return the MIME type guessed from the file name in ``path``.

    Falls back to ``"image/jpeg"`` when ``mimetypes`` cannot determine a type.
    """
    mime_type, _encoding = mimetypes.guess_type(path)
    if mime_type:
        return mime_type
    return "image/jpeg"
|
38 |
|
|
|
64 |
return asset_id
|
65 |
|
66 |
def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> str:
|
|
|
67 |
parts = [task_token]
|
68 |
if text_prompt and text_prompt.strip():
|
69 |
parts.append(text_prompt.strip())
|
|
|
73 |
PRIORITY_TEXT_KEYS = [
|
74 |
"more_detailed_caption", "detailed_caption", "caption",
|
75 |
"generated_text", "text", "ocr", "description",
|
|
|
76 |
]
|
|
|
77 |
|
78 |
def _deep_text_candidates(obj: Any) -> List[str]:
|
79 |
out = []
|
80 |
def walk(o):
|
81 |
if isinstance(o, dict):
|
|
|
82 |
for k in PRIORITY_TEXT_KEYS:
|
83 |
if k in o and isinstance(o[k], str) and o[k].strip():
|
84 |
out.append(o[k].strip())
|
|
|
85 |
for v in o.values():
|
86 |
walk(v)
|
87 |
elif isinstance(o, list):
|
|
|
93 |
walk(obj)
|
94 |
return out
|
95 |
|
96 |
+
def _debug_dump_from_response(resp: "requests.Response") -> str:
    """Build a raw, human-readable dump of a Florence HTTP response.

    The dump always starts with the status code, content type and body size,
    followed by exactly one of:

    * the raw JSON text, for ``application/json`` bodies;
    * the archive listing plus the raw contents of every ``.json``/``.txt``
      member, for ZIP bodies (detected by the ``PK`` signature or by a
      ``zip``/``octet-stream`` content type);
    * the body decoded as UTF-8, or the ``[binary body]`` placeholder when
      the body is not valid UTF-8.

    Args:
        resp: Response object; only ``status_code``, ``headers``, ``content``
            and ``text`` are read.

    Returns:
        The dump as a single newline-joined string.
    """
    data = resp.content
    ct = (resp.headers.get("content-type") or "").lower()
    has_zip_signature = data.startswith(b"PK")

    lines = [
        "=== Florence HTTP Response ===",
        f"status: {resp.status_code}",
        f"content-type: {ct}",
        f"bytes: {len(data)}",
    ]

    # JSON body: show it verbatim.
    if "application/json" in ct and not has_zip_signature:
        try:
            raw = resp.text
        except Exception:
            raw = data.decode("utf-8", errors="ignore")
        lines.append("--- RAW JSON ---")
        lines.append(raw)
        return "\n".join(lines)

    # ZIP body: list every member, then dump the textual ones.
    if has_zip_signature or "zip" in ct or "octet-stream" in ct:
        lines.append("--- ZIP CONTENTS ---")
        try:
            with zipfile.ZipFile(io.BytesIO(data), "r") as z:
                names = z.namelist()
                lines.extend(f"* {name}" for name in names)
                for name in names:
                    if not name.lower().endswith((".json", ".txt")):
                        continue
                    try:
                        with z.open(name) as f:
                            raw = f.read().decode("utf-8", errors="ignore")
                        lines.append(f"\n--- FILE: {name} ---\n{raw}")
                    except Exception as e:
                        # Debug helper: report the failure, keep dumping.
                        lines.append(f"\n--- FILE: {name} --- [read error: {e}]")
        except Exception as e:
            lines.append(f"[zip parse error: {e}]")
        return "\n".join(lines)

    # Fallback: dump the body as text. Decode strictly so that binary
    # payloads (images, etc.) produce the placeholder instead of mojibake;
    # with errors="ignore" the placeholder branch could never be reached.
    try:
        txt = data.decode("utf-8")
    except UnicodeDecodeError:
        txt = "[binary body]"
    lines.append("--- RAW BODY ---")
    lines.append(txt)
    return "\n".join(lines)
|
151 |
|
152 |
+
def _parse_vlm_text(resp: "requests.Response") -> str:
    """Extract the best text answer from a Florence response, if any.

    Handling depends on the body:

    * ``application/json``: the first candidate found by
      ``_deep_text_candidates`` in the parsed JSON.
    * ZIP (``PK`` signature, or a ``zip``/``octet-stream`` content type):
      the first candidate from any ``.json`` member, otherwise the first
      non-empty ``.txt`` member.
    * anything else: the body decoded as UTF-8 and stripped.

    Args:
        resp: Response object; only ``headers``, ``content`` and ``json()``
            are used.

    Returns:
        The extracted text, or ``""`` when nothing usable is present.
        Callers rely on the empty string to trigger their fallbacks.
    """
    data = resp.content
    ct = (resp.headers.get("content-type") or "").lower()
    has_zip_signature = data.startswith(b"PK")

    # JSON body.
    if "application/json" in ct and not has_zip_signature:
        try:
            cands = _deep_text_candidates(resp.json())
        except Exception:
            return ""
        return cands[0] if cands else ""

    # ZIP body: JSON members take priority over TXT members.
    if has_zip_signature or "zip" in ct or "octet-stream" in ct:
        try:
            with zipfile.ZipFile(io.BytesIO(data), "r") as z:
                names = z.namelist()
                for name in names:
                    if not name.lower().endswith(".json"):
                        continue
                    try:
                        with z.open(name) as f:
                            obj = json.loads(f.read().decode("utf-8", errors="ignore"))
                        cands = _deep_text_candidates(obj)
                        if cands:
                            return cands[0]
                    except Exception:
                        # Best-effort: an unreadable member must not hide
                        # a usable one later in the archive.
                        continue
                for name in names:
                    if not name.lower().endswith(".txt"):
                        continue
                    try:
                        with z.open(name) as f:
                            txt = f.read().decode("utf-8", errors="ignore").strip()
                        if txt:
                            return txt
                    except Exception:
                        continue
        except Exception:
            return ""
        # The archive held no usable text (e.g. images only). Stop here:
        # falling through would decode the archive bytes themselves and
        # return binary junk as if it were a caption.
        return ""

    # Fallback: treat the body as plain text.
    return data.decode("utf-8", errors="ignore").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
+
def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, str]:
|
204 |
+
"""
|
205 |
+
Возвращает (best_text, raw_debug_dump)
|
206 |
+
"""
|
207 |
content = _vlm_content(task_token, asset_id, text_prompt)
|
208 |
payload = {"messages": [{"role": "user", "content": content}]}
|
209 |
headers = {
|
|
|
214 |
"NVCF-FUNCTION-ASSET-IDS": asset_id,
|
215 |
}
|
216 |
resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
|
217 |
+
# Сырой дамп для отладки — даже если статус не 200
|
218 |
+
raw_dump = _debug_dump_from_response(resp) if resp is not None else "[no response]"
|
219 |
if not resp.ok:
|
220 |
+
return f"[VLM HTTP {resp.status_code}]", raw_dump
|
221 |
+
text = _parse_vlm_text(resp)
|
222 |
+
return text, raw_dump
|
223 |
|
224 |
+
def _is_good(text: str) -> bool:
    """Return True when ``text`` is usable as an image caption.

    Rejected values:

    * anything that is not a string;
    * strings shorter than three characters after stripping;
    * the legacy "изображений-результатов" placeholder;
    * the ``[VLM HTTP <status>]`` marker that ``_call_florence`` returns for
      a failed request. Accepting it would turn an HTTP error into the
      caption and stop the remaining fallback tokens from being tried.
    """
    if not isinstance(text, str):
        return False
    stripped = text.strip()
    if len(stripped) < 3:
        return False
    lowered = stripped.lower()
    if lowered.startswith("[vlm http"):
        return False
    return "изображений-результатов" not in lowered
|
226 |
+
|
227 |
+
def get_caption_with_debug(image_path: str) -> Tuple[str, str, str]:
|
228 |
"""
|
229 |
+
Пробуем <MORE_DETAILED_CAPTION> → <DETAILED_CAPTION> → <CAPTION> → <OCR>.
|
230 |
+
Возвращает (caption, asset_id, debug_raw_all_attempts)
|
231 |
"""
|
232 |
asset_id = nvcf_upload_asset(image_path)
|
233 |
attempts = [
|
|
|
236 |
("<CAPTION>", None),
|
237 |
("<OCR>", None),
|
238 |
]
|
239 |
+
debug_parts = []
|
240 |
+
for token, prompt in attempts:
|
241 |
+
text, raw_dump = _call_florence(token, asset_id, prompt)
|
242 |
+
debug_parts.append(f"=== Attempt {token} ===\n{raw_dump}\n")
|
243 |
+
if _is_good(text):
|
244 |
+
return text, asset_id, "\n".join(debug_parts)
|
245 |
+
return "", asset_id, "\n".join(debug_parts)
|
|
|
|
|
|
|
|
|
246 |
|
247 |
# --------------------- LLM streaming utils ---------------------
|
248 |
def _extract_text_from_stream_chunk(chunk: Any) -> str:
|
|
|
273 |
message: Dict[str, Any],
|
274 |
chat_history: List[List[str]],
|
275 |
last_caption: str,
|
276 |
+
last_asset_id: str,
|
277 |
+
last_debug: str
|
278 |
):
|
279 |
"""
|
280 |
message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
|
281 |
+
Пайплайн: если есть изображение → Florence (капшен + raw) → GPT-OSS.
|
282 |
"""
|
283 |
text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
|
284 |
files = (message or {}).get("files", []) if isinstance(message, dict) else []
|
|
|
296 |
|
297 |
img_path = first_image_path(files)
|
298 |
|
299 |
+
# Видимый месседж пользователя
|
300 |
parts = []
|
301 |
if text and text.strip():
|
302 |
parts.append(text.strip())
|
|
|
306 |
|
307 |
chat_history = chat_history or []
|
308 |
chat_history.append([user_visible, ""])
|
309 |
+
# Первое обновление UI (очищаем поле ввода)
|
310 |
+
yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id, (last_debug or "")
|
311 |
|
|
|
312 |
caption = last_caption or ""
|
313 |
asset_id = last_asset_id or ""
|
314 |
+
debug_raw = last_debug or ""
|
315 |
+
|
316 |
+
# Всегда: если есть изображение в запросе — сначала Florence
|
317 |
+
if img_path:
|
318 |
+
chat_history[-1][1] = "🔎 Обрабатываю изображение во Florence…"
|
319 |
+
yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
|
320 |
+
try:
|
321 |
+
caption, asset_id, debug_raw = get_caption_with_debug(img_path)
|
322 |
+
except Exception as e:
|
323 |
+
caption, debug_raw = "", f"[Florence error] {e}"
|
|
|
|
|
|
|
|
|
324 |
|
325 |
+
# Системный промпт
|
326 |
if caption:
|
327 |
system_prompt = (
|
328 |
"You are a helpful multimodal assistant. "
|
329 |
"Use the provided 'More Detailed Caption' as visual context. "
|
|
|
330 |
"If something is not visible or uncertain, say so.\n\n"
|
331 |
"Image Caption START >>>\n"
|
332 |
f"{caption}\n"
|
|
|
335 |
else:
|
336 |
system_prompt = (
|
337 |
"You are a helpful assistant. "
|
338 |
+
"If the user refers to an image but no caption is available, ask them to reattach the image."
|
|
|
339 |
)
|
340 |
|
341 |
+
# User text для модели
|
342 |
user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")
|
343 |
|
344 |
# Стрим LLM
|
|
|
361 |
continue
|
362 |
assistant_accum += piece
|
363 |
chat_history[-1][1] = assistant_accum
|
364 |
+
# Показываем сырой вывод Florence внизу (фиксированный на запрос)
|
365 |
+
yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
|
366 |
+
except Exception as e:
|
367 |
+
# Фоллбэк без стрима
|
368 |
try:
|
369 |
resp = llm.chat.completions.create(
|
370 |
model="openai/gpt-oss-120b",
|
|
|
393 |
else:
|
394 |
final_text = str(resp)
|
395 |
chat_history[-1][1] = final_text
|
396 |
+
yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
|
397 |
except Exception as e2:
|
398 |
chat_history[-1][1] = f"[Ошибка LLM: {e2}]"
|
399 |
+
yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
|
400 |
|
401 |
# --------------------- Интерфейс ---------------------
|
402 |
messenger_css = """
|
403 |
:root {
|
404 |
--radius-xl: 16px;
|
|
|
405 |
}
|
406 |
+
.gradio-container { max-width: 780px !important; margin: auto; }
|
407 |
+
#title { text-align: center; padding: 8px 0 10px; font-size: 18px; }
|
408 |
+
#chat-wrap { border: 1px solid rgba(0,0,0,0.07); border-radius: var(--radius-xl); overflow: hidden; }
|
409 |
+
#chat { height: 520px; }
|
410 |
#bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
|
411 |
+
#send { min-width: 42px; max-width: 42px; height: 42px; border-radius: 999px; }
|
412 |
#msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
|
413 |
+
#raw-box .wrap>label { font-weight: 600; }
|
414 |
"""
|
415 |
|
416 |
theme = gr.themes.Soft(
|
417 |
primary_hue="cyan",
|
418 |
neutral_hue="slate",
|
419 |
).set(
|
|
|
420 |
button_large_radius="999px",
|
421 |
button_small_radius="999px",
|
422 |
block_radius="16px",
|
423 |
)
|
424 |
|
425 |
# NOTE(review): the original indentation of this block was lost; the
# bottom-bar Row is assumed to be a sibling of the chat Group — confirm.
with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
    gr.Markdown("✨ <div id='title'>Визуальный чат: Florence → GPT‑OSS</div>")

    # Per-session values carried between turns: the last caption and the
    # asset id that `respond` yields back.
    caption_state = gr.State(value="")
    asset_state = gr.State(value="")

    with gr.Group(elem_id="chat-wrap"):
        chatbot = gr.Chatbot(label="", height=520, elem_id="chat")

    with gr.Row(elem_id="bottom-bar"):
        msg = gr.MultimodalTextbox(
            show_label=False,
            placeholder="Напишите сообщение… (иконка слева — добавить изображение)",
            elem_id="msg",
        )
        send = gr.Button("➤", variant="primary", elem_id="send")

    # Bottom panel: raw Florence output for debugging.
    # gr.Box does not exist in the Gradio releases that provide
    # MultimodalTextbox, so gr.Group is used as the container.
    with gr.Group(elem_id="raw-box"):
        raw_out = gr.Textbox(
            label="Raw Florence output",
            value="",
            lines=14,
            show_copy_button=True,
        )

    # `respond` yields five values, the last being the debug dump shown in
    # `raw_out`. The same textbox is fed back as the `last_debug` input so the
    # previous dump survives text-only follow-up messages; a separate State
    # was never written to and therefore always supplied an empty string.
    respond_inputs = [msg, chatbot, caption_state, asset_state, raw_out]
    respond_outputs = [msg, chatbot, caption_state, asset_state, raw_out]

    # One submission path: Enter in the textbox or the single send button.
    msg.submit(
        respond,
        inputs=respond_inputs,
        outputs=respond_outputs,
    )
    send.click(
        respond,
        inputs=respond_inputs,
        outputs=respond_outputs,
    )
|
463 |
|
464 |
if __name__ == "__main__":
|