Spaces:

Serg4451D
/

gpt-oss-multimodal

Running

App Files Files Community

Serg4451D committed 9 days ago

Commit

4dc8140

verified ·

1 Parent(s): e83b61c

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -72

app.py CHANGED Viewed

@@ -1,118 +1,161 @@
 import os
 import gradio as gr
 from gradio_client import Client, handle_file
 from openai import OpenAI
# --- Configuration ---
NV_API_KEY = os.getenv("NV_API_KEY")
if NV_API_KEY is None or NV_API_KEY == "":
    # Fail at import time: nothing below can work without the API key.
    raise ValueError("В Secrets Hugging Face Spaces нужно задать NV_API_KEY")

# Florence-2 model, reached through the gradio_client wrapper.
florence = Client("gokaygokay/Florence-2")

# NVIDIA GPT-OSS-120B model, reached through the OpenAI-compatible client.
llm = OpenAI(
    api_key=NV_API_KEY,
    base_url="https://integrate.api.nvidia.com/v1",
)
-# --- Функции ---
def get_caption(image_path):
    """Build a detailed caption for an image through Florence-2.

    The image is wrapped with ``handle_file`` and sent to the
    ``/process_image`` endpoint with the "More Detailed Caption" task.

    Returns:
        The caption as a string (a non-string result is converted with
        ``str()``). If anything raises, a bracketed error message is
        returned instead of propagating the exception.
    """
    try:
        outcome = florence.predict(
            image=handle_file(image_path),
            task_prompt="More Detailed Caption",
            text_input=None,
            model_id="microsoft/Florence-2-large",
            api_name="/process_image"
        )
        if not isinstance(outcome, str):
            outcome = str(outcome)
        return outcome
    except Exception as err:
        return f"[Ошибка при генерации подписи: {err}]"
def chat_with_image(image_path, user_message, history):
    """Stream an LLM answer about an image, grounded in its Florence-2 caption.

    This is a generator: every yielded value is the full chat history (a list
    of ``[user, assistant]`` pairs) with the last assistant entry grown by the
    newest streamed fragment.

    Args:
        image_path: Path of the image to discuss; a falsy value means no image.
        user_message: The user's question.
        history: Existing chat history, or ``None`` for an empty chat.

    Yields:
        The updated chat history.
    """
    # Normalise before any branch uses it: the chat value may be None.
    history = history or []

    if not image_path:
        # Because this function is a generator, the hint has to be yielded;
        # a value handed to `return` is never seen by code iterating over it.
        history.append([user_message, "Пожалуйста, загрузите изображение."])
        yield history
        return

    caption = get_caption(image_path)
    system_prompt = (
        "Ты — 'multimodal gpt-oss 120b', умный ассистент, который видит изображение.\n"
        f"Подробная подпись к картинке:\n{caption}\n"
        "Используй её, чтобы отвечать на вопросы пользователя."
    )
    history.append([user_message, ""])

    # Stream the answer, republishing the history after every fragment.
    response_text = ""
    for chunk in llm.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ],
        temperature=0.8,
        top_p=1,
        max_tokens=1024,
        stream=True
    ):
        # Skip chunks that carry no choices instead of failing on choices[0].
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if delta.content:
            response_text += delta.content
            history[-1][1] = response_text
            yield history
-# --- UI ---
# Example pictures for the gallery; each entry is a one-element list holding a URL.
example_images = [
    [url]
    for url in (
        "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
    )
]
# Page layout: image picker, example gallery and question box on the left,
# the chat transcript on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "<h1 style='text-align:center'>🖼️ multimodal gpt-oss 120b</h1>"
        "<p style='text-align:center'>Загружайте изображение или выберите из галереи — модель увидит его и ответит на вопросы.</p>"
    )
    with gr.Row():
        with gr.Column(scale=4):
            image_input = gr.Image(type="filepath", label="Загрузите или выберите картинку")
            gallery = gr.Gallery(
                # Hand the gallery plain URLs: a list item is presumably read
                # as an (image, caption) pair, which a one-element list is
                # not — TODO confirm against the installed Gradio version.
                value=[row[0] for row in example_images],
                label="Примеры",
                columns=4,
                height="auto",
                preview=True
            )
            user_input = gr.Textbox(label="Ваш вопрос", placeholder="Например: Что изображено на фото?")
            send_btn = gr.Button("Отправить")
        with gr.Column(scale=6):
            chatbot = gr.Chatbot(label="Чат", height=500)
            clear_btn = gr.Button("Очистить чат")

    def select_example(example):
        """Return the image URL stored in one ``example_images`` row."""
        return example[0]

    def _on_gallery_select(evt: gr.SelectData):
        """Resolve the clicked gallery tile to its URL.

        Wiring the gallery itself as the input passes the whole gallery value,
        so the first example was chosen no matter which tile was clicked; the
        select event's index identifies the tile actually clicked.
        """
        return select_example(example_images[evt.index])

    gallery.select(_on_gallery_select, inputs=None, outputs=[image_input])
    send_btn.click(
        chat_with_image,
        inputs=[image_input, user_input, chatbot],
        outputs=[chatbot]
    )
    clear_btn.click(lambda: None, None, chatbot)
-# Запуск
 if __name__ == "__main__":
-    demo.launch()

 import os
+from typing import Generator, List, Tuple
 import gradio as gr
 from gradio_client import Client, handle_file
 from openai import OpenAI
# --- Configuration (on HF Spaces, add NV_API_KEY to the Space secrets) ---
NV_API_KEY = os.getenv("NV_API_KEY")
if NV_API_KEY is None or NV_API_KEY == "":
    # Fail at import time: the LLM client below cannot work without the key.
    raise RuntimeError("Добавьте NV_API_KEY в Secrets Hugging Face Space")

# Florence-2 (public wrapper)
florence = Client("gokaygokay/Florence-2")

# OpenAI-compatible client (NVIDIA integrate)
llm = OpenAI(
    api_key=NV_API_KEY,
    base_url="https://integrate.api.nvidia.com/v1",
)
def get_caption(image_path: str) -> str:
    """Request a "More Detailed Caption" for one image from Florence-2.

    Args:
        image_path: Location of the image. It is wrapped with ``handle_file``
            before being sent; per the original author's note this may be a
            URL or a local path.

    Returns:
        The caption text. A non-string prediction is converted with
        ``str()``. If anything raises, a bracketed error message is returned
        instead of propagating the exception.
    """
    try:
        prediction = florence.predict(
            image=handle_file(image_path),
            task_prompt="More Detailed Caption",
            text_input=None,
            model_id="microsoft/Florence-2-large",
            api_name="/process_image",
        )
        # The result may be a string or some other structure; normalise it.
        if not isinstance(prediction, str):
            prediction = str(prediction)
        return prediction
    except Exception as err:
        return f"[Ошибка при генерации подписи: {err}]"
def _read_field(obj, name):
    """Read ``name`` from a dict key or, failing that, from an attribute."""
    if isinstance(obj, dict):
        return obj.get(name)
    return getattr(obj, name, None)


def _extract_text_from_chunk(chunk) -> str:
    """Return the text fragment carried by one streaming chunk, or ``""``.

    The chunk, its first choice and that choice's delta may each be either an
    attribute-style object or a plain dict, in any combination. The delta's
    ``content`` is preferred; ``reasoning_content`` is used when ``content``
    is empty or missing.

    Args:
        chunk: One item produced by the streaming response.

    Returns:
        The non-empty string found in the first choice's delta, otherwise an
        empty string (no choices, no delta, or a non-string payload).
    """
    choices = _read_field(chunk, "choices")
    try:
        first_choice = choices[0] if choices else None
    except (TypeError, KeyError, IndexError):
        # `choices` is present but cannot be indexed; treat as "no text".
        return ""

    delta = _read_field(first_choice, "delta")
    for field in ("content", "reasoning_content"):
        text = _read_field(delta, field)
        # Only hand back real strings so callers can concatenate safely.
        if isinstance(text, str) and text:
            return text
    return ""
def chat_stream(
    image_path: str, user_message: str, history: List[Tuple[str, str]]
) -> Generator[Tuple[list, str], None, None]:
    """Stream one chat turn about an image to the Gradio UI.

    The first yield shows the user's message together with the image caption;
    each later yield carries the assistant reply grown by the newest streamed
    fragment.

    Args:
        image_path: Path of the image to discuss; a falsy value means no image.
        user_message: The user's question.
        history: Existing chat history as ``[user, assistant]`` pairs, or
            ``None`` for an empty chat.

    Yields:
        ``(history, caption)`` tuples matching the two wired outputs. Without
        an image, a single ``(history, "")`` carrying an upload hint is
        yielded.
    """
    history = history or []
    if not image_path:
        history.append([user_message, "Пожалуйста, загрузите изображение."])
        yield history, ""
        return

    # Fetch the detailed caption and embed it in the system prompt.
    caption = get_caption(image_path)
    system_prompt = (
        "You are 'multimodal gpt-oss 120b'. Use the provided 'More Detailed Caption' as authoritative visual context.\n\n"
        "Image Caption START >>>\n"
        f"{caption}\n"
        "<<< Image Caption END.\n"
        "When answering, mention visible details and be explicit when uncertain."
    )

    # Show the user's message and the caption before the model starts replying.
    history.append([user_message, ""])
    yield history, caption

    assistant_text = ""
    try:
        stream = llm.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
            temperature=0.8,
            top_p=1.0,
            max_tokens=1024,
            stream=True,
        )
        for chunk in stream:
            piece = _extract_text_from_chunk(chunk)
            if not piece:
                continue
            assistant_text += piece
            history[-1][1] = assistant_text
            yield history, caption
    except Exception as e:
        # Surface the failure in the chat, but keep whatever text was already
        # streamed rather than replacing it with the error message.
        error_note = f"[Ошибка стриминга LLM: {e}]"
        history[-1][1] = f"{assistant_text}\n\n{error_note}" if assistant_text else error_note
        yield history, caption
    # No trailing yield is needed: the last loop iteration (or the first
    # yield, when nothing was streamed) already published the final state.
+# --- UI (для HF Spaces) ---
# Gallery examples: a flat list of plain strings (URLs or local paths) — no nested lists.
EXAMPLE_IMAGES = list(
    (
        "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
        "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
    )
)

# Page-level CSS: centre the title and cap the width of the app container.
css = (
    "\n"
    "#title {text-align:center; margin-bottom: -18px;}\n"
    ".gradio-container { max-width: 1100px; margin: auto; }\n"
)
# Page layout: image picker, caption, question box and example gallery on the
# left; the chat transcript on the right.
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("<h2 id='title'>🖼️ multimodal gpt-oss 120b — визуальный чат</h2>")
    with gr.Row():
        with gr.Column(scale=4):
            # `tool="editor"` is not passed: it is a legacy argument that
            # newer Gradio releases reject (NOTE(review): on the older
            # releases it is presumably the default for uploads — confirm).
            image_input = gr.Image(label="Загрузите картинку или выберите из галереи", type="filepath")
            raw_caption = gr.Textbox(label="More Detailed Caption (Florence-2)", interactive=False)
            user_input = gr.Textbox(label="Вопрос по изображению", placeholder="Например: 'Что происходит на фото?'")
            send_btn = gr.Button("Отправить")
            clear_btn = gr.Button("Очистить чат")
            gr.Markdown("**Галерея примеров (клик — подставить в загрузчик)**")
            # The grid and height go through constructor arguments; the
            # legacy `.style(grid=..., height=...)` call duplicated `columns=4`.
            gallery = gr.Gallery(
                value=EXAMPLE_IMAGES,
                columns=4,
                label="Примеры",
                show_label=False,
                height="auto",
            )
        with gr.Column(scale=6):
            chatbot = gr.Chatbot(label="Чат с моделью", height=600)

    def pick_example(img_url: str):
        """Return the URL/path that should be placed into the image input."""
        return img_url

    def _on_gallery_select(evt: gr.SelectData):
        """Resolve the clicked gallery tile to its URL.

        Wiring the gallery itself as the input hands the handler the whole
        gallery value rather than the clicked item; the select event's index
        identifies the tile actually clicked.
        """
        return pick_example(EXAMPLE_IMAGES[evt.index])

    # Clicking a gallery tile puts that example's URL/path into image_input.
    gallery.select(fn=_on_gallery_select, inputs=None, outputs=[image_input])
    # Send button: bind the generator that yields (chat_history, caption).
    send_btn.click(fn=chat_stream, inputs=[image_input, user_input, chatbot], outputs=[chatbot, raw_caption])
    # Clearing the chat also clears the caption so no stale text remains.
    clear_btn.click(lambda: ([], ""), None, [chatbot, raw_caption])
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))