Spaces:

Serg4451D
/

gpt-oss-multimodal

Sleeping

File size: 7,070 Bytes

e83b61c
4dc8140
 
e83b61c
 
 
 
4dc8140
e83b61c
 
4dc8140
e83b61c
4dc8140
e83b61c
 
4dc8140
 
 
e83b61c
4dc8140
 
e83b61c
4dc8140
e83b61c
 
 
 
 
4dc8140
e83b61c
4dc8140
e83b61c
 
 
 
4dc8140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e83b61c
4dc8140
 
 
e83b61c
4dc8140
e83b61c
 
4dc8140
e83b61c
4dc8140
 
 
 
 
e83b61c
 
4dc8140
e83b61c
4dc8140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e83b61c
4dc8140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e83b61c
 
4dc8140
 
 
 
e83b61c
4dc8140
 
e83b61c
 
4dc8140
 
 
e83b61c
4dc8140
 
 
e83b61c
 
4dc8140
e83b61c
4dc8140
 
 
e83b61c
4dc8140
e83b61c
4dc8140
 
e83b61c
4dc8140
e83b61c
 
4dc8140
e83b61c

import os
from typing import Generator, List, Tuple

import gradio as gr
from gradio_client import Client, handle_file
from openai import OpenAI

# --- Configuration (on HF Spaces, add NV_API_KEY to the Space secrets) ---
NV_API_KEY = os.getenv("NV_API_KEY")
if not NV_API_KEY:
    # Fail fast at import time: the LLM client below needs the key.
    raise RuntimeError("Добавьте NV_API_KEY в Secrets Hugging Face Space")

# Florence-2 (public wrapper Space), reached through gradio_client
florence = Client("gokaygokay/Florence-2")

# OpenAI-compatible client pointed at the NVIDIA integrate endpoint
llm = OpenAI(
    api_key=NV_API_KEY,
    base_url="https://integrate.api.nvidia.com/v1",
)


def get_caption(image_path: str) -> str:
    """Ask Florence-2 for a 'More Detailed Caption' of an image.

    ``image_path`` is wrapped with ``handle_file`` before being sent, so it
    may be a URL or a local path. Any exception raised while building or
    sending the request is caught and returned as a bracketed error string
    rather than propagated, so callers always get a string back.
    """
    try:
        # handle_file accepts both URLs and local files
        request = {
            "image": handle_file(image_path),
            "task_prompt": "More Detailed Caption",
            "text_input": None,
            "model_id": "microsoft/Florence-2-large",
            "api_name": "/process_image",
        }
        answer = florence.predict(**request)
        # The result may be a string or some other structure: normalise it
        if isinstance(answer, str):
            return answer
        return str(answer)
    except Exception as e:
        return f"[Ошибка при генерации подписи: {e}]"


def _extract_text_from_chunk(chunk) -> str:
    """Универсальная попытка извлечь текстовый фрагмент из stream-chunk."""
    try:
        # объект-атрибутный стиль
        if hasattr(chunk, "choices"):
            choice = chunk.choices[0]
            delta = getattr(choice, "delta", None)
            if delta is not None:
                txt = getattr(delta, "content", None) or getattr(delta, "reasoning_content", None)
                return txt or ""
        # dict-стиль
        if isinstance(chunk, dict):
            choices = chunk.get("choices", [])
            if choices:
                delta = choices[0].get("delta", {})
                return delta.get("content") or delta.get("reasoning_content") or ""
    except Exception:
        return ""
    return ""


def chat_stream(
    image_path: str, user_message: str, history: List[Tuple[str, str]]
) -> Generator[Tuple[List[List[str]], str], None, None]:
    """Stream one chat turn about an image to the Gradio UI.

    The generator first yields the chat with the user's message and the
    image caption, then re-yields it each time a new text fragment arrives
    from the LLM, updating the last assistant reply in place.

    Args:
        image_path: Path or URL of the image; a falsy value short-circuits
            with a "please upload an image" reply and an empty caption.
        user_message: The user's question; sent to the LLM as the only
            user message (earlier turns are displayed but not sent).
        history: Previous ``[user, assistant]`` pairs, or ``None``/empty.
            The passed-in list is never modified.

    Yields:
        ``(history, caption)`` tuples matching the ``[chatbot, raw_caption]``
        outputs. If the LLM call or stream raises, the last assistant reply
        is replaced by an error message and yielded once.
    """
    # Work on a shallow copy so the caller's list is never mutated,
    # regardless of whether it was empty or not.
    history = list(history) if history else []

    if not image_path:
        history.append([user_message, "Пожалуйста, загрузите изображение."])
        yield history, ""
        return

    # Fetch the detailed caption (returns an error string on failure)
    caption = get_caption(image_path)

    # Build the system prompt around the caption
    system_prompt = (
        "You are 'multimodal gpt-oss 120b'. Use the provided 'More Detailed Caption' as authoritative visual context.\n\n"
        "Image Caption START >>>\n"
        f"{caption}\n"
        "<<< Image Caption END.\n"
        "When answering, mention visible details and be explicit when uncertain."
    )

    # Add the user's message with an empty assistant slot to fill in
    history.append([user_message, ""])
    # First yield: lets the UI show the user message and caption right away
    yield history, caption

    assistant_text = ""
    try:
        stream = llm.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
            temperature=0.8,
            top_p=1.0,
            max_tokens=1024,
            stream=True,
        )

        for chunk in stream:
            piece = _extract_text_from_chunk(chunk)
            if not piece:
                continue
            assistant_text += piece
            history[-1][1] = assistant_text
            yield history, caption

    except Exception as e:
        # Surface the failure in the chat; the final yield below emits it,
        # so the error state is sent exactly once.
        history[-1][1] = f"[Ошибка стриминга LLM: {e}]"

    # Final yield: guarantees the UI receives the completed (or error) state
    yield history, caption


# --- UI (for HF Spaces) ---
# Flat list of plain strings (URLs or local paths) - no nested lists!
EXAMPLE_IMAGES: List[str] = [
    "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
]

# Custom stylesheet handed to gr.Blocks: centres the title and caps the page width.
css = (
    "\n"
    "#title {text-align:center; margin-bottom: -18px;}\n"
    ".gradio-container { max-width: 1100px; margin: auto; }\n"
)

# Two-column layout: image, caption, question box and example gallery on the
# left; the chat transcript on the right.
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("<h2 id='title'>🖼️ multimodal gpt-oss 120b — визуальный чат</h2>")
    with gr.Row():
        with gr.Column(scale=4):
            # No `tool=` argument: it is not accepted by newer Gradio releases.
            # NOTE(review): "editor" is believed to be the upload default on 3.x - confirm.
            image_input = gr.Image(label="Загрузите картинку или выберите из галереи", type="filepath")
            raw_caption = gr.Textbox(label="More Detailed Caption (Florence-2)", interactive=False)
            user_input = gr.Textbox(label="Вопрос по изображению", placeholder="Например: 'Что происходит на фото?'")
            send_btn = gr.Button("Отправить")
            clear_btn = gr.Button("Очистить чат")
            gr.Markdown("**Галерея примеров (клик — подставить в загрузчик)**")
            # Layout options go to the constructor; the deprecated `.style()`
            # call duplicated `columns=4` and is dropped.
            gallery = gr.Gallery(value=EXAMPLE_IMAGES, columns=4, height="auto", label="Примеры", show_label=False)

        with gr.Column(scale=6):
            chatbot = gr.Chatbot(label="Чат с моделью", height=600)

    # Clicking a gallery image puts its URL/path into image_input.
    # The handler takes the select event rather than the gallery value:
    # passing the gallery as an input would hand over the whole image list,
    # not the clicked item.
    def pick_example(evt: gr.SelectData):
        """Return the example URL/path at the position that was clicked."""
        return EXAMPLE_IMAGES[evt.index]

    gallery.select(fn=pick_example, inputs=None, outputs=[image_input])

    # Send button: bound to the generator that yields (chat_history, caption)
    send_btn.click(fn=chat_stream, inputs=[image_input, user_input, chatbot], outputs=[chatbot, raw_caption])

    # Clear button: resets the chat transcript to an empty list
    clear_btn.click(lambda: [], None, chatbot)

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)