Spaces:

Serg4451D
/

gpt-oss-multimodal

Sleeping

App Files Files Community

Serg4451D committed 3 days ago

Commit

ce3af93

verified ·

1 Parent(s): f16156d

Update app.py

Browse files

Files changed (1) hide show

app.py +220 -77

app.py CHANGED Viewed

@@ -1,97 +1,213 @@
 import os
-from typing import Generator, List, Tuple
 import gradio as gr
 from gradio_client import Client, handle_file
 from openai import OpenAI
-# --- Конфигурация (в HF Spaces добавь NV_API_KEY в Secrets) ---
-NV_API_KEY = os.environ.get("NV_API_KEY")
 if not NV_API_KEY:
-    raise RuntimeError("Добавьте NV_API_KEY в Secrets Hugging Face Space")
-# Florence-2 (публичный wrapper)
-florence = Client("gokaygokay/Florence-2")
-# OpenAI-compatible client (NVIDIA integrate)
-llm = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NV_API_KEY)
-def get_caption(image_path: str) -> str:
-    """Запрос 'More Detailed Caption' к Florence-2. image_path может быть URL или локальный путь."""
     try:
-        # handle_file поддерживает URL и локальные файлы
-        result = florence.predict(
-            image=handle_file(image_path),
             task_prompt="More Detailed Caption",
             text_input=None,
             model_id="microsoft/Florence-2-large",
-            api_name="/process_image",
         )
-        # result может быть строкой или структурой — нормализуем
-        return result if isinstance(result, str) else str(result)
     except Exception as e:
         return f"[Ошибка при генерации подписи: {e}]"
-def _extract_text_from_chunk(chunk) -> str:
-    """Универсальная попытка извлечь текстовый фрагмент из stream-chunk."""
     try:
-        # объект-атрибутный стиль
         if hasattr(chunk, "choices"):
-            choice = chunk.choices[0]
-            delta = getattr(choice, "delta", None)
-            if delta is not None:
-                txt = getattr(delta, "content", None) or getattr(delta, "reasoning_content", None)
-                return txt or ""
-        # dict-стиль
         if isinstance(chunk, dict):
-            choices = chunk.get("choices", [])
             if choices:
-                delta = choices[0].get("delta", {})
-                return delta.get("content") or delta.get("reasoning_content") or ""
     except Exception:
-        return ""
     return ""
-def chat_stream(image_path: str, user_message: str, history: List[Tuple[str, str]]):
     """
-    Generator для Gradio: сначала возвращает caption, затем по мере прихода токенов
-    обновляет последний ответ ассистента.
-    Возвращаемые значения — кортежи (history, caption) соответствующие outputs.
     """
     history = history or []
-    if not image_path:
-        history.append([user_message, "Пожалуйста, загрузите изображение."])
-        yield history, ""
         return
-    # Получаем подробную подпись
-    caption = get_caption(image_path)
-    # Сборка системного промпта
     system_prompt = (
-        "You are 'multimodal gpt-oss 120b'. Use the provided 'More Detailed Caption' as authoritative visual context.\n\n"
         "Image Caption START >>>\n"
         f"{caption}\n"
         "<<< Image Caption END.\n"
-        "When answering, mention visible details and be explicit when uncertain."
     )
-    # Добавляем сообщение пользователя
-    history.append([user_message, ""])
-    # Первый yield — чтобы UI сразу показал пользовательское сообщение и подпись
     yield history, caption
-    assistant_text = ""
     try:
         stream = llm.chat.completions.create(
             model="openai/gpt-oss-120b",
             messages=[
                 {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_message},
             ],
             temperature=0.8,
             top_p=1.0,
@@ -100,62 +216,89 @@ def chat_stream(image_path: str, user_message: str, history: List[Tuple[str, str
         )
         for chunk in stream:
-            piece = _extract_text_from_chunk(chunk)
             if not piece:
                 continue
-            assistant_text += piece
-            history[-1][1] = assistant_text
             yield history, caption
     except Exception as e:
-        # В случае ошибки — покажем её в чате
-        history[-1][1] = f"[Ошибка стриминга LLM: {e}]"
-        yield history, caption
-    # Финальный yield (гарантируем состояние завершения)
     yield history, caption
-# --- UI (для HF Spaces) ---
 EXAMPLE_IMAGES = [
-    # список простых строк (URL или локальные пути). НИКАКИХ вложенных списков!
     "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
 ]
 css = """
-#title {text-align:center; margin-bottom: -18px;}
 .gradio-container { max-width: 1100px; margin: auto; }
 """
-with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     gr.Markdown("<h2 id='title'>🖼️ multimodal gpt-oss 120b — визуальный чат</h2>")
     with gr.Row():
         with gr.Column(scale=4):
-            image_input = gr.Image(label="Загрузите картинку или выберите из галереи", type="filepath", tool="editor")
-            raw_caption = gr.Textbox(label="More Detailed Caption (Florence-2)", interactive=False)
-            user_input = gr.Textbox(label="Вопрос по изображению", placeholder="Например: 'Что происходит на фото?'")
             send_btn = gr.Button("Отправить")
             clear_btn = gr.Button("Очистить чат")
-            gr.Markdown("**Галерея примеров (клик — подставить в загрузчик)**")
-            gallery = gr.Gallery(value=EXAMPLE_IMAGES, columns=4, label="Примеры", show_label=False).style(grid=[4], height="auto")
         with gr.Column(scale=6):
-            chatbot = gr.Chatbot(label="Чат с моделью", height=600)
-    # Клик по картинке в галерее -> вставляем URL/путь в image_input
-    def pick_example(img_url: str):
-        return img_url
-    gallery.select(fn=pick_example, inputs=[gallery], outputs=[image_input])
-    # Кнопка отправки: привязываем генератор, который возвращает (chat_history, caption)
-    send_btn.click(fn=chat_stream, inputs=[image_input, user_input, chatbot], outputs=[chatbot, raw_caption])
-    clear_btn.click(lambda: [], None, chatbot)
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

+#!/usr/bin/env python3
+"""
+multimodal gpt-oss 120b — Gradio app for Hugging Face Spaces
+Функции:
+- Загрузка собственной картинки (type="filepath")
+- Галерея примеров (клик -> подставляет в загрузчик)
+- Автогенерация "More Detailed Caption" через gradio_client Florence-2
+- Streaming ответов от openai/gpt-oss-120b (через NVIDIA integrate / OpenAI-compatible)
+- Кеширование подписи для одной и той же картинки
+"""
 import os
+import traceback
+from typing import Any, Dict, List, Tuple, Optional
 import gradio as gr
 from gradio_client import Client, handle_file
 from openai import OpenAI
+# Optional: load a local .env file when running locally.
+# Any exception raised here (e.g. python-dotenv not installed) is swallowed.
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except Exception:
+    pass
+# --------------------- Configuration ---------------------
+NV_API_KEY = os.environ.get("NV_API_KEY")  # REQUIRED: must be set in the HF Spaces Secrets
+NV_BASE_URL = os.environ.get("NV_BASE_URL", "https://integrate.api.nvidia.com/v1")
 if not NV_API_KEY:
+    # Fail at import time: the key is passed to the OpenAI-compatible client created below.
+    raise RuntimeError(
+        "NV_API_KEY не задан. В Hugging Face Space зайди в Settings → Secrets и добавь NV_API_KEY."
+    )
+# Florence-2 Gradio wrapper (public Space)
+FLORENCE_WRAPPER = "gokaygokay/Florence-2"
+# --------------------- Clients ---------------------
+# Both clients are created once, at module import.
+florence = Client(FLORENCE_WRAPPER)
+llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)
+# --------------------- Хелперы ---------------------
+def _normalize_florence_result(res: Any) -> str:
+    """Convert a Florence-2 ``predict`` result into plain caption text.
+
+    Several result shapes are accepted because the wrapper's return type is
+    not pinned down here:
+
+    * ``None``  -> ``""``
+    * ``str``   -> returned unchanged
+    * ``dict``  -> the first string found under one of the well-known keys
+      (``caption``, ``text``, ``generated_text``, ``output``, ``result``),
+      otherwise the first string value, otherwise ``str(res)``
+    * ``list`` / ``tuple`` -> every element is normalized with these same
+      rules and the non-empty pieces are joined with newlines
+    * anything else -> ``str(res)``
+
+    Args:
+        res: Raw value returned by ``florence.predict``.
+
+    Returns:
+        The caption text, or a bracketed error message containing the
+        traceback if normalization itself raises.
+    """
+    try:
+        if res is None:
+            return ""
+        if isinstance(res, str):
+            return res
+        if isinstance(res, dict):
+            # Well-known keys take priority over arbitrary string values.
+            for key in ("caption", "text", "generated_text", "output", "result"):
+                value = res.get(key)
+                if isinstance(value, str):
+                    return value
+            for value in res.values():
+                if isinstance(value, str):
+                    return value
+            return str(res)
+        if isinstance(res, (list, tuple)):
+            # Normalize each element recursively and drop empty pieces, so a
+            # None element (e.g. an absent second output - TODO confirm the
+            # wrapper's exact return shape) does not leak the literal text
+            # "None" into the caption, and nested dicts are unpacked instead
+            # of being rendered as their repr.
+            pieces = [_normalize_florence_result(item) for item in res]
+            return "\n".join(piece for piece in pieces if piece)
+        return str(res)
+    except Exception:
+        return f"[Ошибка нормализации подписи: {traceback.format_exc()}]"
+def get_caption_for_image(image_path_or_url: str, safety_note: bool = False) -> str:
+    """Request a "More Detailed Caption" for one image from Florence-2.
+
+    Args:
+        image_path_or_url: Local file path or URL of the image; it is wrapped
+            with ``handle_file`` before being sent.
+        safety_note: Not used by this function; kept so existing callers
+            that pass it keep working.
+
+    Returns:
+        The normalized caption text, ``""`` when no image is given, or a
+        bracketed error message when the request raises.
+    """
     try:
+        if not image_path_or_url:
+            return ""
+        # handle_file accepts both URLs and local paths
+        res = florence.predict(
+            image=handle_file(image_path_or_url),
             task_prompt="More Detailed Caption",
             text_input=None,
             model_id="microsoft/Florence-2-large",
+            api_name="/process_image"
         )
+        return _normalize_florence_result(res)
     except Exception as e:
+        # Log the failure (the message goes to stdout, the traceback to stderr).
+        print("Ошибка Florence-2 predict:", e)
+        traceback.print_exc()
+        # The message previously contained two U+FFFD replacement characters
+        # in place of a letter; the intended word is restored here.
+        return f"[Ошибка при генерации подписи: {e}]"
+def _extract_text_from_stream_chunk(chunk: Any) -> str:
+    """Pull the text fragment out of one streaming chunk from the LLM.
+
+    Both SDK-style objects (``chunk.choices[0].delta.content``) and plain
+    dicts (``chunk["choices"][0]["delta"]["content"]``) are supported and are
+    resolved with the same priority:
+
+    1. ``delta.content``
+    2. ``delta.reasoning_content``
+    3. ``choices[0].text``
+
+    ``content`` is checked first in both forms so that answer text is never
+    discarded in favour of reasoning text when a chunk carries both.
+
+    Args:
+        chunk: One item yielded by the streaming completion call.
+
+    Returns:
+        The text fragment as ``str``, or ``""`` when the chunk carries no
+        text or has an unexpected shape.
+    """
+    def _field(container: Any, name: str) -> Any:
+        # Dicts are read by key, everything else by attribute; missing -> None.
+        if isinstance(container, dict):
+            return container.get(name)
+        return getattr(container, name, None)
+
+    try:
+        choices = _field(chunk, "choices")
+        if not choices:
+            return ""
+        first = choices[0]
+        delta = _field(first, "delta")
+        candidates = (
+            _field(delta, "content"),
+            _field(delta, "reasoning_content"),
+            _field(first, "text"),
+        )
+        for text in candidates:
+            if text:
+                return str(text)
+    except Exception:
+        # Best effort: a malformed chunk must not abort the whole stream,
+        # so it is treated as carrying no text.
+        return ""
+    return ""
+# --------------------- UI-логика ---------------------
+# Кеш подписи (чтобы не вызывать Florence снова для той же картинки)
+# Храним словарь: {"image_path": "...", "caption": "..."}
+# Будем использовать gr.State для хранения этого словаря в сессии
+def generate_and_cache_caption(image, cache: Optional[Dict[str, str]]):
+    """Return the caption for ``image``, reusing the cached one when possible.
+
+    Args:
+        image: A path/URL string, or an object whose ``name`` attribute holds
+            the path; a falsy value means "no image".
+        cache: Dict with ``image_path`` and ``caption`` keys, or ``None``.
+
+    Returns:
+        A ``(caption_text, cache_dict)`` pair. With no image, or when an
+        exception is raised, the returned cache dict has both fields set to
+        ``None``.
+    """
+    try:
+        if image:
+            # Resolve the path/URL used as the cache key.
+            if isinstance(image, str):
+                source = image
+            else:
+                source = getattr(image, "name", None) or image
+            # Cache hit: same image and a non-empty stored caption.
+            if cache and cache.get("image_path") == source and cache.get("caption"):
+                return cache.get("caption"), cache
+            # Cache miss: ask Florence-2 and build a fresh cache entry.
+            text = get_caption_for_image(source)
+            return text, {"image_path": source, "caption": text}
+        return "", {"image_path": None, "caption": None}
+    except Exception as err:
+        print("generate_and_cache_caption exception:", err)
+        traceback.print_exc()
+        return f"[Ошибка генерации подписи: {err}]", {"image_path": None, "caption": None}
+def chat_stream(image, user_message, history, cache: Dict[str, str]):
     """
+    Main generator for the Send button / submit:
+    - Uses the cached caption when it matches the image, otherwise generates a new one
+    - Yields (history, caption) as the stream progresses, matching outputs=[chatbot, raw_caption]
     """
     history = history or []
+    # Input checks
+    if not user_message:
+        # nothing to do: just return the current state
+        yield history, (cache.get("caption") if cache else "")
+        return
+    if not image:
+        # no image: tell the user
+        history.append([user_message, "Пожалуйста, загрузите изображение или выберите из галереи."])
+        yield history, (cache.get("caption") if cache else "")
         return
+    # resolve the path and the caption (the cache is used when it matches)
+    img_path = image if isinstance(image, str) else getattr(image, "name", None) or image
+    if cache and cache.get("image_path") == img_path and cache.get("caption"):
+        caption = cache.get("caption")
+    else:
+        caption = get_caption_for_image(img_path)
+        # update the cache locally (not the gr.State, only for this request);
+        # the new dict is not part of the yielded values
+        cache = {"image_path": img_path, "caption": caption}
+    # system prompt: provides the caption as context and asks the model to flag uncertainty
     system_prompt = (
+        "You are 'multimodal gpt-oss 120b', a helpful multimodal assistant. "
+        "Use the provided 'More Detailed Caption' as authoritative visual context. "
+        "If something is not visible or certain, say so explicitly.\n\n"
         "Image Caption START >>>\n"
         f"{caption}\n"
         "<<< Image Caption END.\n"
+        "Answer the user's question based on the caption and general knowledge. "
+        "Be concise unless asked for details."
     )
+    # append the user's message to the history (assistant reply is empty for now)
+    history.append([user_message, ""])  # the assistant text is filled in as the stream arrives
+    # first yield so the UI shows the user's message and the caption right away
     yield history, caption
+    assistant_accum = ""
     try:
+        # Start the streaming call
         stream = llm.chat.completions.create(
             model="openai/gpt-oss-120b",
             messages=[
                 {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_message}
             ],
             temperature=0.8,
             top_p=1.0,
+            # NOTE(review): a diff hunk boundary falls here, so further arguments
+            # (presumably max_tokens / stream=True) are not visible in this view - confirm.
         )
         for chunk in stream:
+            piece = _extract_text_from_stream_chunk(chunk)
             if not piece:
                 continue
+            assistant_accum += piece
+            # update the last history entry (the assistant part)
+            history[-1][1] = assistant_accum
             yield history, caption
     except Exception as e:
+        # Streaming error: try to get a final answer without streaming, otherwise show the error
+        print("Streaming error:", e)
+        traceback.print_exc()
+        # Attempt a non-streaming call (fallback); its result replaces any
+        # partial text already written to the last history entry
+        try:
+            resp = llm.chat.completions.create(
+                model="openai/gpt-oss-120b",
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_message}
+                ],
+                temperature=0.8,
+                top_p=1.0,
+                max_tokens=1024,
+                stream=False,
+            )
+            # normalize the possible response formats
+            final_text = ""
+            # the SDK may return an object-like resp.choices[0].message.content
+            if hasattr(resp, "choices"):
+                try:
+                    final_text = getattr(resp.choices[0].message, "content", "") or getattr(resp.choices[0], "text", "") or ""
+                except Exception:
+                    final_text = str(resp)
+            elif isinstance(resp, dict):
+                choices = resp.get("choices", [])
+                if choices:
+                    m = choices[0].get("message") or choices[0]
+                    final_text = m.get("content") or m.get("text") or str(m)
+                else:
+                    final_text = str(resp)
+            else:
+                final_text = str(resp)
+            history[-1][1] = final_text
+            yield history, caption
+        except Exception as e2:
+            history[-1][1] = f"[Ошибка LLM: {e2}]"
+            yield history, caption
+    # final yield (guarantees the final state is emitted)
     yield history, caption
+# --------------------- Gallery examples (a flat list of strings) ---------------------
 EXAMPLE_IMAGES = [
     "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
 ]
+# --------------------- UI ---------------------
+# Custom stylesheet passed to gr.Blocks(css=css) below.
 css = """
 .gradio-container { max-width: 1100px; margin: auto; }
+#title { text-align: center; }
 """
+with gr.Blocks(css=css, analytics_enabled=False) as demo:
     gr.Markdown("<h2 id='title'>🖼️ multimodal gpt-oss 120b — визуальный чат</h2>")
     with gr.Row():
         with gr.Column(scale=4):
+            image_input = gr.Image(label="Загрузите картинку (файл / drag-n-drop / камера)", type="filepath")
+            raw_caption = gr.Textbox(label="More Detailed Caption (Florence-2)", interactive=False, lines=6)
+            user_input = gr.Textbox(label="Вопрос по изображению", placeholder="Например: Что происходит на фото?")
             send_btn = gr.Button("Отправить")
             clear_btn = gr.Button("Очистить чат")
+            gr.Markdown("**Галерея примеров (клик — подставить в загрузчик и получить подпись)**")
+            gallery = gr.Gallery(value=EXAMPLE_IMAGES, label="Примеры", columns=4, show_label=False).style(grid=[4])
         with gr.Column(scale=6):
+            chatbot = gr.Chatbot(label="Чат с моделью", height=640)
+    # gr.State для кеша подписи
+    caption_cache = gr.State(value={"image_path": None, "caption": None})
+    # обработчик клика по галерее: сразу подставляет картинку, генерирует подпись и обновляет кеш
+    def on_gallery_select(elem, cach