Serg4451D committed
Commit 152bf38 · verified · 1 parent: ebe16a8

Update app.py

Files changed (1): app.py (+153 -166)
app.py CHANGED
@@ -1,15 +1,13 @@
  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  """
- Elegant messenger-style chat:
- - An image-attach button right in the input row.
- - Florence-2 (NIM API) produces the caption (<MORE_DETAILED_CAPTION>) server-side.
- - Robust parser: pulls text out of ZIP/JSON, synthesizes a summary from detections,
-   and falls back <DETAILED_CAPTION> → <CAPTION> → <OCR>.
- - LLM streaming via NVIDIA Integrate (OpenAI-compatible API).
- - No WebGPU.
-
- Requires: NV_API_KEY in the HF Space Secrets.
+ Minimalist chat (a single send button):
+ - Attachments via the icon in the input field (messenger-style).
+ - Pipeline: Florence-2 (NIM API) → GPT-OSS (NVIDIA Integrate).
+ - At the bottom: the unprocessed raw Florence output, for debugging.
+ - No WebGPU/wasm.
+
+ Required in Secrets: NV_API_KEY
  """

  import os
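
Note: the hunks below lean on the unchanged head of the file, which defines the two endpoints and the client. A minimal sketch of that wiring; the URL values are assumptions here, not lines from this commit:

    import os
    from openai import OpenAI

    NV_API_KEY = os.environ.get("NV_API_KEY", "")
    NV_BASE_URL = "https://integrate.api.nvidia.com/v1"  # NVIDIA Integrate, OpenAI-compatible (assumed)
    NV_VLM_URL = "https://ai.api.nvidia.com/v1/vlm/microsoft/florence-2"  # Florence-2 NIM (assumed)

    llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)
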
@@ -17,7 +15,6 @@ import io
  import json
  import zipfile
  import mimetypes
- import traceback
  from typing import Any, Dict, List, Optional, Tuple

  import requests
@@ -35,7 +32,7 @@ if not NV_API_KEY:

  llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)

- # --------------------- Florence-2 utils ---------------------
+ # --------------------- Florence utils ---------------------
  def _guess_mime(path: str) -> str:
      return mimetypes.guess_type(path)[0] or "image/jpeg"

@@ -67,7 +64,6 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
      return asset_id

  def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> str:
-     # "<TASK_PROMPT><text_prompt (when needed)><img>"
      parts = [task_token]
      if text_prompt and text_prompt.strip():
          parts.append(text_prompt.strip())
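
Note: the deleted comment documented the layout of the content string `_vlm_content` assembles. A sketch of the finished string, assuming the unchanged tail of the function appends the NVCF asset-reference `<img>` tag; the tag syntax is an assumption, not part of this diff:

    asset_id = "123e4567-e89b-12d3-a456-426614174000"  # hypothetical UUID from nvcf_upload_asset
    content = (
        "<CAPTION_TO_PHRASE_GROUNDING>"                       # task token
        "a black cat"                                         # optional text prompt
        f'<img src="data:image/jpeg;asset_id,{asset_id}" />'  # image reference (assumed syntax)
    )
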
@@ -77,19 +73,15 @@ def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = No
  PRIORITY_TEXT_KEYS = [
      "more_detailed_caption", "detailed_caption", "caption",
      "generated_text", "text", "ocr", "description",
-     "output_text", "result_text",
  ]
- LABEL_KEYS = ["label", "name", "category", "class", "text"]

  def _deep_text_candidates(obj: Any) -> List[str]:
      out = []
      def walk(o):
          if isinstance(o, dict):
-             # Priority keys first
              for k in PRIORITY_TEXT_KEYS:
                  if k in o and isinstance(o[k], str) and o[k].strip():
                      out.append(o[k].strip())
-             # then any string fields
              for v in o.values():
                  walk(v)
          elif isinstance(o, list):
@@ -101,118 +93,117 @@ def _deep_text_candidates(obj: Any) -> List[str]:
      walk(obj)
      return out

- def _synthesize_from_detections(obj: Any) -> Optional[str]:
+ def _debug_dump_from_response(resp: requests.Response) -> str:
      """
-     If detections/objects came back, build a short summary like:
-     'Detected: person×2, dog×1'
+     Returns the rawest possible technical output:
+     - status, content-type, length
+     - if JSON: the raw text
+     - if ZIP: the file listing and the raw JSON/TXT contents
      """
-     labels = []
-     def walk(o):
-         if isinstance(o, dict):
-             # detection lists under known keys
-             for key in ["detections", "predictions", "objects", "results"]:
-                 if key in o and isinstance(o[key], list):
-                     for it in o[key]:
-                         if isinstance(it, dict):
-                             label = None
-                             for lk in LABEL_KEYS:
-                                 if lk in it and isinstance(it[lk], str):
-                                     label = it[lk]
-                                     break
-                             if label:
-                                 labels.append(label)
-             for v in o.values():
-                 walk(v)
-         elif isinstance(o, list):
-             for it in o:
-                 walk(it)
-     walk(obj)
-     if not labels:
-         return None
-     # Tally the labels
-     from collections import Counter
-     c = Counter(labels)
-     parts = [f"{k}×{v}" for k, v in c.most_common()]
-     return "Detected: " + ", ".join(parts)
+     lines = []
+     data = resp.content
+     ct = (resp.headers.get("content-type") or "").lower()
+
+     lines.append("=== Florence HTTP Response ===")
+     lines.append(f"status: {resp.status_code}")
+     lines.append(f"content-type: {ct}")
+     lines.append(f"bytes: {len(data)}")
+
+     # JSON
+     if "application/json" in ct and not data.startswith(b"PK"):
+         try:
+             raw = resp.text
+         except Exception:
+             raw = data.decode("utf-8", errors="ignore")
+         lines.append("--- RAW JSON ---")
+         lines.append(raw)
+         return "\n".join(lines)
+
+     # ZIP
+     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
+         lines.append("--- ZIP CONTENTS ---")
+         try:
+             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
+                 for name in z.namelist():
+                     lines.append(f"* {name}")
+                 # Raw JSON/TXT
+                 for name in z.namelist():
+                     low = name.lower()
+                     if low.endswith(".json") or low.endswith(".txt"):
+                         try:
+                             with z.open(name) as f:
+                                 raw = f.read().decode("utf-8", errors="ignore")
+                             lines.append(f"\n--- FILE: {name} ---\n{raw}")
+                         except Exception as e:
+                             lines.append(f"\n--- FILE: {name} --- [read error: {e}]")
+         except Exception as e:
+             lines.append(f"[zip parse error: {e}]")
+         return "\n".join(lines)
+
+     # Fallback: just dump the text content
+     try:
+         txt = data.decode("utf-8", errors="ignore")
+     except Exception:
+         txt = "[binary body]"
+     lines.append("--- RAW BODY ---")
+     lines.append(txt)
+     return "\n".join(lines)

- def _parse_vlm_response_to_text(resp: requests.Response) -> Tuple[str, List[str]]:
+ def _parse_vlm_text(resp: requests.Response) -> str:
      """
-     Returns (best_text, zip_listing).
-     If nothing can be extracted, best_text = "" (this matters for the fallbacks).
+     Extract the best text, if there is any.
      """
-     listing = []
-     ct = (resp.headers.get("content-type") or "").lower()
      data = resp.content
+     ct = (resp.headers.get("content-type") or "").lower()

-     # JSON inline
+     # JSON
      if "application/json" in ct and not data.startswith(b"PK"):
          try:
              obj = resp.json()
              cands = _deep_text_candidates(obj)
-             if cands:
-                 return cands[0], listing
-             synth = _synthesize_from_detections(obj)
-             return (synth or ""), listing
+             return cands[0] if cands else ""
          except Exception:
-             pass
+             return ""

-     # ZIP
+     # ZIP → look for JSON/TXT
      if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
          try:
              with zipfile.ZipFile(io.BytesIO(data), "r") as z:
-                 listing = z.namelist()
-                 text_cands = []
-                 synth_cand = None
-                 # Try JSON first
-                 for name in listing:
+                 # JSON takes priority
+                 for name in z.namelist():
                      if not name.lower().endswith(".json"):
                          continue
                      try:
                          with z.open(name) as f:
                              obj = json.loads(f.read().decode("utf-8", errors="ignore"))
-                             text_cands += _deep_text_candidates(obj)
-                             synth = _synthesize_from_detections(obj)
-                             synth_cand = synth_cand or synth
+                         cands = _deep_text_candidates(obj)
+                         if cands:
+                             return cands[0]
                      except Exception:
-                         continue
-                 if text_cands:
-                     return text_cands[0], listing
-                 # Then TXT
-                 for name in listing:
+                         pass
+                 # then TXT
+                 for name in z.namelist():
                      if name.lower().endswith(".txt"):
                          try:
                              with z.open(name) as f:
                                  txt = f.read().decode("utf-8", errors="ignore").strip()
                                  if txt:
-                                     return txt, listing
+                                     return txt
                          except Exception:
-                             continue
-                 # If nothing, try to synthesize from detections
-                 if synth_cand:
-                     return synth_cand, listing
+                             pass
          except Exception:
-             pass
+             return ""

-     # Fallback: try it as plain text
+     # Fallback
      try:
-         txt = data.decode("utf-8", errors="ignore").strip()
-         return (txt if txt else ""), listing
+         return data.decode("utf-8", errors="ignore").strip()
      except Exception:
-         return "", listing
-
- def _is_good_caption(text: str) -> bool:
-     if not text:
-         return False
-     t = text.strip()
-     if not t or len(t) < 3:
-         return False
-     # Filter out our old placeholders
-     bad_markers = [
-         "Получено", "изображений-результатов", "[Result empty]", "[Результат пуст]"
-     ]
-     return not any(m.lower() in t.lower() for m in bad_markers)
+         return ""

- def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, List[str]]:
+ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, str]:
+     """
+     Returns (best_text, raw_debug_dump)
+     """
      content = _vlm_content(task_token, asset_id, text_prompt)
      payload = {"messages": [{"role": "user", "content": content}]}
      headers = {
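
Note: `_parse_vlm_text` and `_debug_dump_from_response` can be exercised without hitting the API by faking a ZIP response. A self-check sketch, not part of the commit:

    import io, json, zipfile
    import requests

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as z:
        z.writestr("result.json", json.dumps({"more_detailed_caption": "A cat on a sofa."}))

    resp = requests.Response()
    resp.status_code = 200
    resp._content = buf.getvalue()
    resp.headers["content-type"] = "application/zip"

    assert _parse_vlm_text(resp) == "A cat on a sofa."
    print(_debug_dump_from_response(resp))  # listing plus the raw result.json body
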
@@ -223,15 +214,20 @@ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] =
          "NVCF-FUNCTION-ASSET-IDS": asset_id,
      }
      resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
+     # Raw dump for debugging, even when the status is not 200
+     raw_dump = _debug_dump_from_response(resp) if resp is not None else "[no response]"
      if not resp.ok:
-         raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
-     text, listing = _parse_vlm_response_to_text(resp)
-     return text, listing
+         return f"[VLM HTTP {resp.status_code}]", raw_dump
+     text = _parse_vlm_text(resp)
+     return text, raw_dump

- def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
+ def _is_good(text: str) -> bool:
+     return isinstance(text, str) and len(text.strip()) >= 3 and "изображений-результатов" not in text.lower()
+
+ def get_caption_with_debug(image_path: str) -> Tuple[str, str, str]:
      """
-     Try to obtain a meaningful caption.
-     Returns (caption, asset_id, zip_listing)
+     Try <MORE_DETAILED_CAPTION> → <DETAILED_CAPTION> → <CAPTION> → <OCR>.
+     Returns (caption, asset_id, debug_raw_all_attempts)
      """
      asset_id = nvcf_upload_asset(image_path)
      attempts = [
@@ -240,17 +236,13 @@ def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
          ("<CAPTION>", None),
          ("<OCR>", None),
      ]
-     last_listing: List[str] = []
-     for task, txt in attempts:
-         try:
-             caption, listing = _call_florence(task, asset_id, txt)
-             last_listing = listing or last_listing
-             if _is_good_caption(caption):
-                 return caption, asset_id, listing
-         except Exception:
-             continue
-     # If nothing at all, an empty string (this matters for the chat)
-     return "", asset_id, last_listing
+     debug_parts = []
+     for token, prompt in attempts:
+         text, raw_dump = _call_florence(token, asset_id, prompt)
+         debug_parts.append(f"=== Attempt {token} ===\n{raw_dump}\n")
+         if _is_good(text):
+             return text, asset_id, "\n".join(debug_parts)
+     return "", asset_id, "\n".join(debug_parts)

  # --------------------- LLM streaming utils ---------------------
  def _extract_text_from_stream_chunk(chunk: Any) -> str:
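
Note: the new ladder tries the four task tokens in order and accumulates one raw dump per attempt, so even a total failure leaves a usable trace. Hypothetical direct use (requires NV_API_KEY and a readable image):

    caption, asset_id, debug = get_caption_with_debug("photo.jpg")
    if caption:
        print("caption:", caption)
    else:
        print("all four task tokens failed; raw dumps follow:")
        print(debug)
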
@@ -281,10 +273,12 @@ def respond(
      message: Dict[str, Any],
      chat_history: List[List[str]],
      last_caption: str,
-     last_asset_id: str
+     last_asset_id: str,
+     last_debug: str
  ):
      """
      message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
+     Pipeline: if an image is present → Florence (caption + raw) → GPT-OSS.
      """
      text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
      files = (message or {}).get("files", []) if isinstance(message, dict) else []
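
Note: per the docstring, the MultimodalTextbox payload looks roughly like this (illustrative values; entries in files may be plain paths or dicts such as {"path": "..."}):

    message = {"text": "What breed is the dog?", "files": ["/tmp/gradio/ab12cd/dog.jpg"]}
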
@@ -302,7 +296,7 @@ def respond(

      img_path = first_image_path(files)

-     # The user message (concise)
+     # The user's visible message
      parts = []
      if text and text.strip():
          parts.append(text.strip())
@@ -312,32 +306,27 @@ def respond(

      chat_history = chat_history or []
      chat_history.append([user_visible, ""])
-     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id
+     # First UI update (clears the input field)
+     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id, (last_debug or "")

-     # Caption for the image
      caption = last_caption or ""
      asset_id = last_asset_id or ""
-     try:
-         if img_path:
-             # Show the user that the caption is being generated
-             chat_history[-1][1] = "🔎 Generating an image caption…"
-             yield {"text": "", "files": []}, chat_history, caption, asset_id
-
-             caption, asset_id, _ = get_robust_caption(img_path)
-             if not _is_good_caption(caption):
-                 caption = ""  # don't feed the LLM a dud
-     except Exception as e:
-         caption = ""
-         # Tersely flag the under-the-hood error
-         chat_history[-1][1] = f"⚠️ Could not get a caption: {e}"
-     yield {"text": "", "files": []}, chat_history, caption, asset_id
+     debug_raw = last_debug or ""
+
+     # Always: if the request contains an image, run Florence first
+     if img_path:
+         chat_history[-1][1] = "🔎 Processing the image in Florence…"
+         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
+         try:
+             caption, asset_id, debug_raw = get_caption_with_debug(img_path)
+         except Exception as e:
+             caption, debug_raw = "", f"[Florence error] {e}"

-     # System prompt (no "reasoning")
+     # System prompt
      if caption:
          system_prompt = (
              "You are a helpful multimodal assistant. "
              "Use the provided 'More Detailed Caption' as visual context. "
-             "Do not reveal your chain-of-thought. "
              "If something is not visible or uncertain, say so.\n\n"
              "Image Caption START >>>\n"
              f"{caption}\n"
@@ -346,11 +335,10 @@ def respond(
      else:
          system_prompt = (
              "You are a helpful assistant. "
-             "If the user refers to an image but no caption is available, ask them to reattach the image. "
-             "Do not reveal your chain-of-thought."
+             "If the user refers to an image but no caption is available, ask them to reattach the image."
          )

-     # Text for the model (in case nothing was typed but an image is attached)
+     # User text for the model
      user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")

      # Stream the LLM
@@ -373,10 +361,10 @@ def respond(
                  continue
              assistant_accum += piece
              chat_history[-1][1] = assistant_accum
-             yield {"text": "", "files": []}, chat_history, caption, asset_id
-
-     except Exception:
-         # Fallback without streaming
+             # Show the raw Florence output at the bottom (pinned per request)
+             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
+     except Exception as e:
+         # Fallback without streaming
          try:
              resp = llm.chat.completions.create(
                  model="openai/gpt-oss-120b",
@@ -405,73 +393,72 @@ def respond(
          else:
              final_text = str(resp)
          chat_history[-1][1] = final_text
-         yield {"text": "", "files": []}, chat_history, caption, asset_id
+         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
      except Exception as e2:
          chat_history[-1][1] = f"[LLM error: {e2}]"
-         yield {"text": "", "files": []}, chat_history, caption, asset_id
+         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")

  # --------------------- UI ---------------------
  messenger_css = """
  :root {
    --radius-xl: 16px;
-   --radius-lg: 14px;
  }
- .gradio-container { max-width: 800px !important; margin: auto; }
- #title { text-align: center; padding: 8px 0 10px; font-size: 20px; }
- #chat-wrap { border: 1px solid rgba(0,0,0,0.06); border-radius: var(--radius-xl); overflow: hidden; }
- #chat { height: 560px; }
+ .gradio-container { max-width: 780px !important; margin: auto; }
+ #title { text-align: center; padding: 8px 0 10px; font-size: 18px; }
+ #chat-wrap { border: 1px solid rgba(0,0,0,0.07); border-radius: var(--radius-xl); overflow: hidden; }
+ #chat { height: 520px; }
  #bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
- #send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
+ #send { min-width: 42px; max-width: 42px; height: 42px; border-radius: 999px; }
  #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
- .gr-chatbot { border-radius: 0 !important; }
+ #raw-box .wrap>label { font-weight: 600; }
  """

  theme = gr.themes.Soft(
      primary_hue="cyan",
      neutral_hue="slate",
  ).set(
-     body_text_color_subdued="#6b7280",
      button_large_radius="999px",
      button_small_radius="999px",
      block_radius="16px",
  )

  with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
-     gr.Markdown("✨ <div id='title'>Elegant visual chat</div>")
+     gr.Markdown("✨ <div id='title'>Visual chat: Florence → GPT-OSS</div>")

      caption_state = gr.State(value="")
      asset_state = gr.State(value="")
+     debug_state = gr.State(value="")

      with gr.Group(elem_id="chat-wrap"):
-         chatbot = gr.Chatbot(label="", height=560, elem_id="chat")
+         chatbot = gr.Chatbot(label="", height=520, elem_id="chat")

      with gr.Row(elem_id="bottom-bar"):
          msg = gr.MultimodalTextbox(
              show_label=False,
-             placeholder="Type a message... (the icon on the left adds an image)",
+             placeholder="Type a message… (the icon on the left adds an image)",
              elem_id="msg",
          )
          send = gr.Button("➤", variant="primary", elem_id="send")

-     # Send on Enter and via the button
+     # Bottom block: raw Florence output
+     with gr.Box(elem_id="raw-box"):
+         raw_out = gr.Textbox(
+             label="Raw Florence output",
+             value="",
+             lines=14,
+             show_copy_button=True
+         )
+
+     # Single submission path: Enter and the button (one visible send button)
      msg.submit(
          respond,
-         inputs=[msg, chatbot, caption_state, asset_state],
-         outputs=[msg, chatbot, caption_state, asset_state]
+         inputs=[msg, chatbot, caption_state, asset_state, debug_state],
+         outputs=[msg, chatbot, caption_state, asset_state, raw_out]
      )
      send.click(
          respond,
-         inputs=[msg, chatbot, caption_state, asset_state],
-         outputs=[msg, chatbot, caption_state, asset_state]
-     )
-
-     # Clear everything
-     def clear_all():
-         return {"text": "", "files": []}, [], "", ""
-     gr.Button("Clear", variant="secondary").click(
-         clear_all,
-         inputs=None,
-         outputs=[msg, chatbot, caption_state, asset_state]
+         inputs=[msg, chatbot, caption_state, asset_state, debug_state],
+         outputs=[msg, chatbot, caption_state, asset_state, raw_out]
      )

  if __name__ == "__main__":
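
Note: a hedged smoke test of the new pipeline outside Gradio, assuming NV_API_KEY is exported and an example.jpg sits next to the script:

    caption, asset_id, debug = get_caption_with_debug("example.jpg")
    print(debug)  # the same text the "Raw Florence output" box would show
    stream = llm.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "system", "content": f"Visual context: {caption}"},
            {"role": "user", "content": "Describe the attached image."},
        ],
        stream=True,
    )
    for chunk in stream:
        print(_extract_text_from_stream_chunk(chunk), end="", flush=True)
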