Update app.py
app.py
CHANGED
@@ -1,14 +1,5 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-"""
-Minimalist chat (a single send button):
-- Attachments via an icon in the input field (messenger-style).
-- Pipeline: Florence-2 (NIM API) → GPT-OSS (NVIDIA Integrate).
-- At the bottom: unprocessed raw Florence output for debugging.
-- No WebGPU/wasm.
-
-Required in Secrets: NV_API_KEY
-"""

 import os
 import io
@@ -94,12 +85,6 @@ def _deep_text_candidates(obj: Any) -> List[str]:
     return out

 def _debug_dump_from_response(resp: requests.Response) -> str:
-    """
-    Returns the rawest possible technical output:
-    - status, content-type, length
-    - if JSON: raw text
-    - if ZIP: the list of files and raw JSON/TXT contents
-    """
     lines = []
     data = resp.content
     ct = (resp.headers.get("content-type") or "").lower()
@@ -109,7 +94,6 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
     lines.append(f"content-type: {ct}")
     lines.append(f"bytes: {len(data)}")

-    # JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             raw = resp.text
@@ -119,14 +103,12 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
         lines.append(raw)
         return "\n".join(lines)

-    # ZIP
     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         lines.append("--- ZIP CONTENTS ---")
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
                 for name in z.namelist():
                     lines.append(f"* {name}")
-                # Raw JSON/TXT
                 for name in z.namelist():
                     low = name.lower()
                     if low.endswith(".json") or low.endswith(".txt"):
@@ -140,7 +122,6 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
             lines.append(f"[zip parse error: {e}]")
         return "\n".join(lines)

-    # Fallback: just dump the text contents
     try:
         txt = data.decode("utf-8", errors="ignore")
     except Exception:
@@ -150,13 +131,9 @@ def _debug_dump_from_response(resp: requests.Response) -> str:
     return "\n".join(lines)

 def _parse_vlm_text(resp: requests.Response) -> str:
-    """
-    Extract the best text (if any).
-    """
     data = resp.content
     ct = (resp.headers.get("content-type") or "").lower()

-    # JSON
     if "application/json" in ct and not data.startswith(b"PK"):
         try:
             obj = resp.json()
@@ -165,11 +142,9 @@ def _parse_vlm_text(resp: requests.Response) -> str:
         except Exception:
             return ""

-    # ZIP → look for JSON/TXT
     if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
         try:
             with zipfile.ZipFile(io.BytesIO(data), "r") as z:
-                # JSON takes priority
                 for name in z.namelist():
                     if not name.lower().endswith(".json"):
                         continue
@@ -181,7 +156,6 @@ def _parse_vlm_text(resp: requests.Response) -> str:
                             return cands[0]
                     except Exception:
                         pass
-                # then TXT
                 for name in z.namelist():
                     if name.lower().endswith(".txt"):
                         try:
@@ -194,16 +168,12 @@ def _parse_vlm_text(resp: requests.Response) -> str:
         except Exception:
             return ""

-    # Fallback
     try:
         return data.decode("utf-8", errors="ignore").strip()
     except Exception:
         return ""

 def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] = None) -> Tuple[str, str]:
-    """
-    Returns (best_text, raw_debug_dump)
-    """
     content = _vlm_content(task_token, asset_id, text_prompt)
     payload = {"messages": [{"role": "user", "content": content}]}
     headers = {
@@ -214,7 +184,6 @@ def _call_florence(task_token: str, asset_id: str, text_prompt: Optional[str] =
         "NVCF-FUNCTION-ASSET-IDS": asset_id,
     }
     resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
-    # Raw debug dump, even if the status is not 200
     raw_dump = _debug_dump_from_response(resp) if resp is not None else "[no response]"
     if not resp.ok:
         return f"[VLM HTTP {resp.status_code}]", raw_dump
@@ -225,10 +194,6 @@ def _is_good(text: str) -> bool:
     return isinstance(text, str) and len(text.strip()) >= 3 and "изображений-результатов" not in text.lower()

 def get_caption_with_debug(image_path: str) -> Tuple[str, str, str]:
-    """
-    Try <MORE_DETAILED_CAPTION> → <DETAILED_CAPTION> → <CAPTION> → <OCR>.
-    Returns (caption, asset_id, debug_raw_all_attempts)
-    """
     asset_id = nvcf_upload_asset(image_path)
     attempts = [
         ("<MORE_DETAILED_CAPTION>", None),
@@ -271,15 +236,11 @@ def _extract_text_from_stream_chunk(chunk: Any) -> str:
 # --------------------- Chat logic ---------------------
 def respond(
     message: Dict[str, Any],
-    chat_history: List[
+    chat_history: List[Dict[str, str]],
     last_caption: str,
     last_asset_id: str,
     last_debug: str
 ):
-    """
-    message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
-    Pipeline: if there is an image → Florence (caption + raw) → GPT-OSS.
-    """
     text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
     files = (message or {}).get("files", []) if isinstance(message, dict) else []

@@ -296,7 +257,6 @@ def respond(

     img_path = first_image_path(files)

-    # The user's visible message
     parts = []
     if text and text.strip():
         parts.append(text.strip())
@@ -305,24 +265,22 @@ def respond(
     user_visible = "\n".join(parts) if parts else "🖐️"

     chat_history = chat_history or []
-    chat_history.append(
-
+    chat_history.append({"role": "user", "content": user_visible})
+    chat_history.append({"role": "assistant", "content": ""})
     yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id, (last_debug or "")

     caption = last_caption or ""
     asset_id = last_asset_id or ""
     debug_raw = last_debug or ""

-    # Always: if the request includes an image, run Florence first
     if img_path:
-        chat_history[-1][
+        chat_history[-1]["content"] = "🔎 Обрабатываю изображение во Florence…"
         yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
         try:
             caption, asset_id, debug_raw = get_caption_with_debug(img_path)
         except Exception as e:
             caption, debug_raw = "", f"[Florence error] {e}"

-    # System prompt
     if caption:
         system_prompt = (
             "You are a helpful multimodal assistant. "
@@ -338,10 +296,8 @@ def respond(
             "If the user refers to an image but no caption is available, ask them to reattach the image."
         )

-    # User text for the model
     user_text_for_llm = text or ("Describe the attached image." if caption else "Hi")

-    # LLM streaming
     assistant_accum = ""
     try:
         stream = llm.chat.completions.create(
@@ -360,11 +316,9 @@ def respond(
             if not piece:
                 continue
             assistant_accum += piece
-            chat_history[-1][
-            # Show the raw Florence output at the bottom (fixed per request)
+            chat_history[-1]["content"] = assistant_accum
             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
-    except Exception
-        # Fallback without streaming
+    except Exception:
         try:
             resp = llm.chat.completions.create(
                 model="openai/gpt-oss-120b",
@@ -392,10 +346,10 @@ def respond(
                 final_text = str(resp)
             else:
                 final_text = str(resp)
-            chat_history[-1][
+            chat_history[-1]["content"] = final_text
             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")
         except Exception as e2:
-            chat_history[-1][
+            chat_history[-1]["content"] = f"[Ошибка LLM: {e2}]"
             yield {"text": "", "files": []}, chat_history, caption, asset_id, (debug_raw or "")

 # --------------------- Interface ---------------------
@@ -410,7 +364,7 @@ messenger_css = """
 #bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
 #send { min-width: 42px; max-width: 42px; height: 42px; border-radius: 999px; }
 #msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
-#raw-
+#raw-wrap .wrap>label { font-weight: 600; }
 """

 theme = gr.themes.Soft(
@@ -430,7 +384,7 @@ with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
     debug_state = gr.State(value="")

     with gr.Group(elem_id="chat-wrap"):
-        chatbot = gr.Chatbot(label="", height=520, elem_id="chat")
+        chatbot = gr.Chatbot(label="", height=520, elem_id="chat", type="messages")

     with gr.Row(elem_id="bottom-bar"):
         msg = gr.MultimodalTextbox(
@@ -440,16 +394,14 @@ with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
         )
         send = gr.Button("➤", variant="primary", elem_id="send")

-
-    with gr.Box(elem_id="raw-box"):
+    with gr.Accordion("Raw Florence output", open=True, elem_id="raw-wrap"):
         raw_out = gr.Textbox(
-            label="
+            label="",
             value="",
             lines=14,
             show_copy_button=True
         )

-    # Single submission path: via Enter and via the button (one visible send button)
     msg.submit(
         respond,
         inputs=[msg, chatbot, caption_state, asset_state, debug_state],