Serg4451D committed
Commit 152bf38 · verified · 1 parent: ebe16a8

Update app.py

Files changed (1): app.py (+153 -166)
app.py CHANGED
@@ -1,15 +1,13 @@
  #!/usr/bin/env python3
  # -*- coding: utf-8 -*-
  """
- Elegant messenger-style chat:
- - An image-attach button right in the input row.
- - Florence-2 (NIM API) produces the caption (<MORE_DETAILED_CAPTION>) server-side.
- - Robust parser: pulls text out of ZIP/JSON, synthesizes a summary from detections,
-   and falls back <DETAILED_CAPTION> → <CAPTION> → <OCR>.
- - LLM streaming via NVIDIA Integrate (OpenAI-compatible API).
- - No WebGPU.
-
- Requires: NV_API_KEY in the HF Space Secrets.
+ Minimalist chat (a single send button):
+ - Attachments via the icon in the input field (messenger-style).
+ - Pipeline: Florence-2 (NIM API) → GPT-OSS (NVIDIA Integrate).
+ - At the bottom: the unprocessed raw Florence output, for debugging.
+ - No WebGPU/wasm.
+
+ Required in Secrets: NV_API_KEY
  """

  import os
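
Note: the hunks below lean on the unchanged head of the file, which defines the two endpoints and the client. A minimal sketch of that wiring; the URL values are assumptions here, not lines from this commit:

    import os
    from openai import OpenAI

    NV_API_KEY = os.environ.get("NV_API_KEY", "")
    NV_BASE_URL = "https://integrate.api.nvidia.com/v1"  # NVIDIA Integrate, OpenAI-compatible (assumed)
    NV_VLM_URL = "https://ai.api.nvidia.com/v1/vlm/microsoft/florence-2"  # Florence-2 NIM (assumed)

    llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)
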
@@ -17,7 +15,6 @@ import io
  import json
  import zipfile
  import mimetypes
- import traceback
  from typing import Any, Dict, List, Optional, Tuple

  import requests
@@ -35,7 +32,7 @@ if not NV_API_KEY:

  llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)

- # --------------------- Florence-2 utils ---------------------
+ # --------------------- Florence utils ---------------------
  def _guess_mime(path: str) -> str:
      return mimetypes.guess_type(path)[0] or "image/jpeg"

@@ -67,7 +64,6 @@ def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
      return asset_id

  def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> str:
-     # "<TASK_PROMPT><text_prompt (when needed)><img>"
      parts = [task_token]
      if text_prompt and text_prompt.strip():
          parts.append(text_prompt.strip())
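
Note: the deleted comment documented the layout of the content string `_vlm_content` assembles. A sketch of the finished string, assuming the unchanged tail of the function appends the NVCF asset-reference `<img>` tag; the tag syntax is an assumption, not part of this diff:

    asset_id = "123e4567-e89b-12d3-a456-426614174000"  # hypothetical UUID from nvcf_upload_asset
    content = (
        "<CAPTION_TO_PHRASE_GROUNDING>"                       # task token
        "a black cat"                                         # optional text prompt
        f'<img src="data:image/jpeg;asset_id,{asset_id}" />'  # image reference (assumed syntax)
    )
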
@@ -77,19 +73,15 @@ def _vlm_content(task_token: str, asset_id: str, text_prompt: Optional[str] = No
  PRIORITY_TEXT_KEYS = [
      "more_detailed_caption", "detailed_caption", "caption",
      "generated_text", "text", "ocr", "description",
-     "output_text", "result_text",
  ]
- LABEL_KEYS = ["label", "name", "category", "class", "text"]

  def _deep_text_candidates(obj: Any) -> List[str]:
      out = []
      def walk(o):
          if isinstance(o, dict):
-             # Priority keys first
              for k in PRIORITY_TEXT_KEYS:
                  if k in o and isinstance(o[k], str) and o[k].strip():
                      out.append(o[k].strip())
-             # then any string fields
              for v in o.values():
                  walk(v)
          elif isinstance(o, list):
@@ -101,118 +93,117 @@ def _deep_text_candidates(obj: Any) -> List[str]:
      walk(obj)
      return out

- def _synthesize_from_detections(obj: Any) -> Optional[str]:
+ def _debug_dump_from_response(resp: requests.Response) -> str:
      """
-     If detections/objects came back, build a short summary like:
-     'Detected: person×2, dog×1'
+     Returns the rawest possible technical output:
+     - status, content-type, length
+     - if JSON: the raw text
+     - if ZIP: the file listing and the raw JSON/TXT contents
      """
-     labels = []
-     def walk(o):
-         if isinstance(o, dict):
-             # detection lists under known keys
-             for key in ["detections", "predictions", "objects", "results"]:
-                 if key in o and isinstance(o[key], list):
-                     for it in o[key]:
-                         if isinstance(it, dict):
-                             label = None
-                             for lk in LABEL_KEYS:
-                                 if lk in it and isinstance(it[lk], str):
-                                     label = it[lk]
-                                     break
-                             if label:
-                                 labels.append(label)
-             for v in o.values():
-                 walk(v)
-         elif isinstance(o, list):
-             for it in o:
-                 walk(it)
-     walk(obj)
-     if not labels:
-         return None
-     # Tally the labels
-     from collections import Counter
-     c = Counter(labels)
-     parts = [f"{k}×{v}" for k, v in c.most_common()]
-     return "Detected: " + ", ".join(parts)
+     lines = []
+     data = resp.content
+     ct = (resp.headers.get("content-type") or "").lower()
+
+     lines.append("=== Florence HTTP Response ===")
+     lines.append(f"status: {resp.status_code}")
+     lines.append(f"content-type: {ct}")
+     lines.append(f"bytes: {len(data)}")
+
+     # JSON
+     if "application/json" in ct and not data.startswith(b"PK"):
+         try:
+             raw = resp.text
+         except Exception:
+             raw = data.decode("utf-8", errors="ignore")
+         lines.append("--- RAW JSON ---")
+         lines.append(raw)
+         return "\n".join(lines)
+
+     # ZIP
+     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
+         lines.append("--- ZIP CONTENTS ---")
+         try:
+             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
+                 for name in z.namelist():
+                     lines.append(f"* {name}")
+                 # Raw JSON/TXT
+                 for name in z.namelist():
+                     low = name.lower()
+                     if low.endswith(".json") or low.endswith(".txt"):
+                         try:
+                             with z.open(name) as f:
+                                 raw = f.read().decode("utf-8", errors="ignore")
+                             lines.append(f"\n--- FILE: {name} ---\n{raw}")
+                         except Exception as e:
+                             lines.append(f"\n--- FILE: {name} --- [read error: {e}]")
+         except Exception as e:
+             lines.append(f"[zip parse error: {e}]")
+         return "\n".join(lines)
+
+     # Fallback: just dump the text content
+     try:
+         txt = data.decode("utf-8", errors="ignore")
+     except Exception:
+         txt = "[binary body]"
+     lines.append("--- RAW BODY ---")
+     lines.append(txt)
+     return "\n".join(lines)

- def _parse_vlm_response_to_text(resp: requests.Response) -> Tuple[str, List[str]]:
+ def _parse_vlm_text(resp: requests.Response) -> str:
      """
-     Returns (best_text, zip_listing).
-     If nothing can be extracted, best_text = "" (this matters for the fallbacks).
+     Extract the best text, if there is any.
      """
-     listing = []
-     ct = (resp.headers.get("content-type") or "").lower()
      data = resp.content
+     ct = (resp.headers.get("content-type") or "").lower()

-     # JSON inline
+     # JSON
      if "application/json" in ct and not data.startswith(b"PK"):
          try:
              obj = resp.json()
              cands = _deep_text_candidates(obj)
-             if cands:
-                 return cands[0], listing
-             synth = _synthesize_from_detections(obj)
-             return (synth or ""), listing
+             return cands[0] if cands else ""
          except Exception:
-             pass
+             return ""

-     # ZIP
+     # ZIP → look for JSON/TXT
      if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
          try:
              with zipfile.ZipFile(io.BytesIO(data), "r") as z:
-                 listing = z.namelist()
-                 text_cands = []
-                 synth_cand = None
-                 # Try JSON first
-                 for name in listing:
+                 # JSON takes priority
+                 for name in z.namelist():
                      if not name.lower().endswith(".json"):
                          continue
                      try:
                          with z.open(name) as f:
                              obj = json.loads(f.read().decode("utf-8", errors="ignore"))
-                             text_cands += _deep_text_candidates(obj)
-                             synth = _synthesize_from_detections(obj)
-                             synth_cand = synth_cand or synth
+                         cands = _deep_text_candidates(obj)
+                         if cands:
+                             return cands[0]
                      except Exception:
-                         continue
-                 if text_cands:
-                     return text_cands[0], listing
-                 # Then TXT
-                 for name in listing:
+                         pass
+                 # then TXT
+                 for name in z.namelist():
                      if name.lower().endswith(".txt"):
                          try:
                              with z.open(name) as f:
                                  txt = f.read().decode("utf-8", errors="ignore").strip()
                                  if txt:
-                                     return txt, listing
+                                     return txt
                          except Exception:
-                             continue
-                 # If nothing, try to synthesize from detections
-                 if synth_cand:
-                     return synth_cand, listing
+                             pass
          except Exception:
-             pass
+             return ""

-     # Fallback: try it as plain text
+     # Fallback
      try:
-         txt = data.decode("utf-8", errors="ignore").strip()
-         return (txt if txt else ""), listing
+         return data.decode("utf-8", errors="ignore").strip()
      except Exception:
-         return "", listing
-
- def _is_good_caption(text: str) -> bool:
-     if not text:
-         return False
-     t = text.strip()
-     if not t or len(t) < 3:
-         return False
-     # Filter out our old placeholders
-     bad_markers = [
-         "Получено", "изображений-результатов", "[Result empty]", "[Результат пуст]"
-     ]
-     return not any(m.lower() in t.lower() for m in bad_markers)
+         return ""

- def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, List[str]]:
+ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, str]:
+     """
+     Returns (best_text, raw_debug_dump)
+     """
      content = _vlm_content(task_token, asset_id, text_prompt)
      payload = {"messages": [{"role": "user", "content": content}]}
      headers = {
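
Note: `_parse_vlm_text` and `_debug_dump_from_response` can be exercised without hitting the API by faking a ZIP response. A self-check sketch, not part of the commit:

    import io, json, zipfile
    import requests

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as z:
        z.writestr("result.json", json.dumps({"more_detailed_caption": "A cat on a sofa."}))

    resp = requests.Response()
    resp.status_code = 200
    resp._content = buf.getvalue()
    resp.headers["content-type"] = "application/zip"

    assert _parse_vlm_text(resp) == "A cat on a sofa."
    print(_debug_dump_from_response(resp))  # listing plus the raw result.json body
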
@@ -223,15 +214,20 @@ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] =
          "NVCF-FUNCTION-ASSET-IDS": asset_id,
      }
      resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
+     # Raw dump for debugging, even when the status is not 200
+     raw_dump = _debug_dump_from_response(resp) if resp is not None else "[no response]"
      if not resp.ok:
-         raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
-     text, listing = _parse_vlm_response_to_text(resp)
-     return text, listing
+         return f"[VLM HTTP {resp.status_code}]", raw_dump
+     text = _parse_vlm_text(resp)
+     return text, raw_dump

- def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
+ def _is_good(text: str) -> bool:
+     return isinstance(text, str) and len(text.strip()) >= 3 and "изображений-результатов" not in text.lower()
+
+ def get_caption_with_debug(image_path: str) -> Tuple[str, str, str]:
      """
-     Try to obtain a meaningful caption.
-     Returns (caption, asset_id, zip_listing)
+     Try <MORE_DETAILED_CAPTION> → <DETAILED_CAPTION> → <CAPTION> → <OCR>.
+     Returns (caption, asset_id, debug_raw_all_attempts)
      """
      asset_id = nvcf_upload_asset(image_path)
      attempts = [
@@ -240,17 +236,13 @@ def get_robust_caption(image_path: str) -> Tuple[str, str, List[str]]:
          ("<CAPTION>", None),
          ("<OCR>", None),
      ]
-     last_listing: List[str] = []
-     for task, txt in attempts:
-         try:
-             caption, listing = _call_florence(task, asset_id, txt)
-             last_listing = listing or last_listing
-             if _is_good_caption(caption):
-                 return caption, asset_id, listing
-         except Exception:
-             continue
-     # If nothing at all, an empty string (this matters for the chat)
-     return "", asset_id, last_listing
+     debug_parts = []
+     for token, prompt in attempts:
+         text, raw_dump = _call_florence(token, asset_id, prompt)
+         debug_parts.append(f"=== Attempt {token} ===\n{raw_dump}\n")
+         if _is_good(text):
+             return text, asset_id, "\n".join(debug_parts)
+     return "", asset_id, "\n".join(debug_parts)

  # --------------------- LLM streaming utils ---------------------
  def _extract_text_from_stream_chunk(chunk: Any) -> str:
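
Note: the new ladder tries the four task tokens in order and accumulates one raw dump per attempt, so even a total failure leaves a usable trace. Hypothetical direct use (requires NV_API_KEY and a readable image):

    caption, asset_id, debug = get_caption_with_debug("photo.jpg")
    if caption:
        print("caption:", caption)
    else:
        print("all four task tokens failed; raw dumps follow:")
        print(debug)
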
@@ -281,10 +273,12 @@ def respond(
      message: Dict[str, Any],
      chat_history: List[List[str]],
      last_caption: str,
-     last_asset_id: str
+     last_asset_id: str,
+     last_debug: str
  ):
      """
      message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
+     Pipeline: if an image is present → Florence (caption + raw) → GPT-OSS.
      """
      text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
      files = (message or {}).get("files", []) if isinstance(message, dict) else []
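
Note: per the docstring, the MultimodalTextbox payload looks roughly like this (illustrative values; entries in files may be plain paths or dicts such as {"path": "..."}):

    message = {"text": "What breed is the dog?", "files": ["/tmp/gradio/ab12cd/dog.jpg"]}
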
@@ -302,7 +296,7 @@ def respond(

      img_path = first_image_path(files)

-     # The user message (concise)
+     # The user's visible message
      parts = []
      if text and text.strip():
          parts.append(text.strip())
@@ -312,32 +306,27 @@ def respond(

      chat_history = chat_history or []
      chat_history.append([user_visible, ""])
-     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id
+     # First UI update (clears the input field)
+     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id, (last_debug or "")

-     # Caption for the image
      caption = last_caption or ""
      asset_id = last_asset_id or ""
-     try:
-         if img_path:
-             # Show the user that the caption is being generated
-             chat_history[-1][1] = "🔎 Generating an image caption…"
-             yield {"text": "", "files": []}, chat_history, caption, asset_id
-
-             caption, asset_id, _ = get_robust_caption(img_path)
-             if not _is_good_caption(caption):
-                 caption = ""  # don't feed the LLM a dud
-     except Exception as e:
-         caption = ""
-         # Tersely flag the under-the-hood error
-         chat_history[-1][1] = f"⚠️ Could not get a caption: {e}"
-     yield {"text": "", "files": []}, chat_history, caption, asset_id
+     debug_raw = last_debug or ""
+
+     # Always: if the request contains an image, run Florence first
+     if img_path:
+         chat_history[-1][1] = "🔎 Processing the image in Florence…"
+         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
+         try:
+             caption, asset_id, debug_raw = get_caption_with_debug(img_path)
+         except Exception as e:
+             caption, debug_raw = "", f"[Florence error] {e}"

-     # System prompt (no "reasoning")
+     # System prompt
      if caption:
          system_prompt = (
              "You are a helpful multimodal assistant. "
              "Use the provided 'More Detailed Caption' as visual context. "
-             "Do not reveal your chain-of-thought. "
              "If something is not visible or uncertain, say so.\n\n"
              "Image Caption START >>>\n"
              f"{caption}\n"
@@ -346,11 +335,10 @@ def respond(
      else:
          system_prompt = (
              "You are a helpful assistant. "
-             "If the user refers to an image but no caption is available, ask them to reattach the image. "
-             "Do not reveal your chain-of-thought."
+             "If the user refers to an image but no caption is available, ask them to reattach the image."
          )

-     # Text for the model (in case nothing was typed but an image is attached)
+     # User text for the model
      user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")

      # Stream the LLM
@@ -373,10 +361,10 @@ def respond(
                  continue
              assistant_accum += piece
              chat_history[-1][1] = assistant_accum
-             yield {"text": "", "files": []}, chat_history, caption, asset_id
-
-     except Exception:
-         # Fallback without streaming
+             # Show the raw Florence output at the bottom (pinned per request)
+             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
+     except Exception as e:
+         # Fallback without streaming
          try:
              resp = llm.chat.completions.create(
                  model="openai/gpt-oss-120b",
@@ -405,73 +393,72 @@ def respond(
          else:
              final_text = str(resp)
          chat_history[-1][1] = final_text
-         yield {"text": "", "files": []}, chat_history, caption, asset_id
+         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
      except Exception as e2:
          chat_history[-1][1] = f"[LLM error: {e2}]"
-         yield {"text": "", "files": []}, chat_history, caption, asset_id
+         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")

  # --------------------- UI ---------------------
  messenger_css = """
  :root {
    --radius-xl: 16px;
-   --radius-lg: 14px;
  }
- .gradio-container { max-width: 800px !important; margin: auto; }
- #title { text-align: center; padding: 8px 0 10px; font-size: 20px; }
- #chat-wrap { border: 1px solid rgba(0,0,0,0.06); border-radius: var(--radius-xl); overflow: hidden; }
- #chat { height: 560px; }
+ .gradio-container { max-width: 780px !important; margin: auto; }
+ #title { text-align: center; padding: 8px 0 10px; font-size: 18px; }
+ #chat-wrap { border: 1px solid rgba(0,0,0,0.07); border-radius: var(--radius-xl); overflow: hidden; }
+ #chat { height: 520px; }
  #bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
- #send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
+ #send { min-width: 42px; max-width: 42px; height: 42px; border-radius: 999px; }
  #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
- .gr-chatbot { border-radius: 0 !important; }
+ #raw-box .wrap>label { font-weight: 600; }
  """

  theme = gr.themes.Soft(
      primary_hue="cyan",
      neutral_hue="slate",
  ).set(
-     body_text_color_subdued="#6b7280",
      button_large_radius="999px",
      button_small_radius="999px",
      block_radius="16px",
  )

  with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
-     gr.Markdown("✨ <div id='title'>Elegant visual chat</div>")
+     gr.Markdown("✨ <div id='title'>Visual chat: Florence → GPT-OSS</div>")

      caption_state = gr.State(value="")
      asset_state = gr.State(value="")
+     debug_state = gr.State(value="")

      with gr.Group(elem_id="chat-wrap"):
-         chatbot = gr.Chatbot(label="", height=560, elem_id="chat")
+         chatbot = gr.Chatbot(label="", height=520, elem_id="chat")

      with gr.Row(elem_id="bottom-bar"):
          msg = gr.MultimodalTextbox(
              show_label=False,
-             placeholder="Type a message... (the icon on the left adds an image)",
+             placeholder="Type a message… (the icon on the left adds an image)",
              elem_id="msg",
          )
          send = gr.Button("➤", variant="primary", elem_id="send")

-     # Send on Enter and via the button
+     # Bottom block: raw Florence output
+     with gr.Box(elem_id="raw-box"):
+         raw_out = gr.Textbox(
+             label="Raw Florence output",
+             value="",
+             lines=14,
+             show_copy_button=True
+         )
+
+     # Single submission path: Enter and the button (one visible send button)
      msg.submit(
          respond,
-         inputs=[msg, chatbot, caption_state, asset_state],
-         outputs=[msg, chatbot, caption_state, asset_state]
+         inputs=[msg, chatbot, caption_state, asset_state, debug_state],
+         outputs=[msg, chatbot, caption_state, asset_state, raw_out]
      )
      send.click(
          respond,
-         inputs=[msg, chatbot, caption_state, asset_state],
-         outputs=[msg, chatbot, caption_state, asset_state]
-     )
-
-     # Clear everything
-     def clear_all():
-         return {"text": "", "files": []}, [], "", ""
-     gr.Button("Clear", variant="secondary").click(
-         clear_all,
-         inputs=None,
-         outputs=[msg, chatbot, caption_state, asset_state]
+         inputs=[msg, chatbot, caption_state, asset_state, debug_state],
+         outputs=[msg, chatbot, caption_state, asset_state, raw_out]
      )

  if __name__ == "__main__":
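
Note: a hedged smoke test of the new pipeline outside Gradio, assuming NV_API_KEY is exported and an example.jpg sits next to the script:

    caption, asset_id, debug = get_caption_with_debug("example.jpg")
    print(debug)  # the same text the "Raw Florence output" box would show
    stream = llm.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "system", "content": f"Visual context: {caption}"},
            {"role": "user", "content": "Describe the attached image."},
        ],
        stream=True,
    )
    for chunk in stream:
        print(_extract_text_from_stream_chunk(chunk), end="", flush=True)
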