#!/usr/bin/env python3
"""
multimodal gpt-oss 120b — Gradio app с Florence-2 в браузере (WebGPU)

Что изменилось:
- Подпись к изображению генерим на стороне пользователя (WebGPU) через Transformers.js.
- Сервер больше не грузит Florence/torch.
- LLM остаётся через NVIDIA Integrate (OpenAI-compatible), как и было.
"""

import os
import traceback
from typing import Any, Optional, List
import gradio as gr
from openai import OpenAI

# Optional: load a local .env file when the app is run locally.
# Best-effort by design: if python-dotenv cannot be imported, or load_dotenv()
# raises, the error is ignored and the existing process environment is used.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

# --------------------- Configuration ---------------------
# NV_API_KEY is mandatory (on Hugging Face Spaces it is set under Settings → Secrets).
# NV_BASE_URL optionally overrides the default NVIDIA Integrate endpoint.
NV_API_KEY = os.getenv("NV_API_KEY")
NV_BASE_URL = os.getenv("NV_BASE_URL", "https://integrate.api.nvidia.com/v1")

# Fail fast at import time when the key is missing or empty.
if not NV_API_KEY:
    raise RuntimeError(
        "NV_API_KEY не задан. В Hugging Face Space зайди в Settings → Secrets и добавь NV_API_KEY."
    )

# OpenAI-compatible client used for the LLM requests.
llm = OpenAI(api_key=NV_API_KEY, base_url=NV_BASE_URL)


def _extract_text_from_stream_chunk(chunk: Any) -> str:
    """Return the text fragment carried by one streaming chunk, or "" if none.

    Two chunk shapes are understood:

    * attribute-style objects exposing ``choices[0].delta``: the delta's
      ``reasoning_content`` is used when non-empty, otherwise its ``content``,
      otherwise ``choices[0].text``;
    * plain dicts with the same layout: here ``content`` is checked before
      ``reasoning_content``, and ``choices[0]["text"]`` comes last.

    Any exception raised while probing the chunk is swallowed and "" is
    returned, so probing a malformed chunk never raises.
    """
    try:
        # Attribute-style chunk (objects exposing `.choices`).
        object_choices = getattr(chunk, "choices", None)
        if object_choices:
            head = object_choices[0]
            delta_obj = getattr(head, "delta", None)
            if delta_obj is not None:
                for field in ("reasoning_content", "content"):
                    value = getattr(delta_obj, field, None)
                    if value:
                        return str(value)
            legacy_text = getattr(head, "text", None)
            if legacy_text:
                return str(legacy_text)

        # Dict-style chunk; also reached when the attribute probe found no text.
        if isinstance(chunk, dict):
            dict_choices = chunk.get("choices") or []
            if dict_choices:
                first = dict_choices[0]
                delta_map = first.get("delta") or {}
                text = delta_map.get("content")
                if not text:
                    text = delta_map.get("reasoning_content")
                if not text:
                    text = first.get("text")
                return str(text) if text else ""
    except Exception:
        # Deliberate best-effort: an unexpected chunk shape yields no text.
        pass
    return ""


def _completion_text(resp: Any) -> str:
    """Pull the assistant text out of a non-streaming completion response."""
    if hasattr(resp, "choices"):
        try:
            first = resp.choices[0]
            return getattr(first.message, "content", "") or getattr(first, "text", "") or ""
        except Exception:
            # Unexpected object layout: show the raw response rather than nothing.
            return str(resp)
    if isinstance(resp, dict):
        choices = resp.get("choices", [])
        if not choices:
            return str(resp)
        msg = choices[0].get("message") or choices[0]
        return msg.get("content") or msg.get("text") or str(msg)
    return str(resp)


def chat_stream(image, user_message: str, history: Optional[List[List[str]]], caption_text: str):
    """Stream the LLM answer for one user question about the current image.

    Generator used as a Gradio event handler. Every yielded value is a
    ``(history, caption)`` pair: the chat history as ``[user, assistant]``
    pairs and the caption text to show in the caption box.

    Args:
        image: Value of the image component; only checked for truthiness.
        user_message: The question. Empty or whitespace-only input is a no-op.
        history: Existing chat history or ``None``. It is copied, never
            mutated in place.
        caption_text: Image caption supplied by the browser; ``None`` is
            treated as an empty string. It is embedded into the system prompt.

    Yields:
        ``(history, caption)`` after the user turn is appended, after every
        non-empty streamed fragment, and once after the fallback request if
        streaming raised.
    """
    # Work on a shallow copy so the caller's list is left untouched.
    history = list(history or [])
    caption = caption_text or ""

    # Nothing to ask: leave the chat as it is and do not call the LLM.
    if not user_message or not user_message.strip():
        yield history, caption
        return

    if not image:
        history.append([user_message, "Пожалуйста, загрузите изображение или выберите из галереи."])
        yield history, caption
        return

    # System prompt carrying the caption as the visual context.
    system_prompt = (
        "You are 'multimodal gpt-oss 120b', a helpful multimodal assistant. "
        "Use the provided 'More Detailed Caption' as authoritative visual context. "
        "If something is not visible or certain, say so explicitly.\n\n"
        "Image Caption START >>>\n"
        f"{caption}\n"
        "<<< Image Caption END.\n"
        "Answer the user's question based on the caption and general knowledge. "
        "Be concise unless asked for details."
    )

    # Shared by the streaming request and the non-streaming fallback.
    request_kwargs = {
        "model": "openai/gpt-oss-120b",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.8,
        "top_p": 1.0,
        "max_tokens": 1024,
    }

    # Show the user turn immediately, with an empty assistant slot to fill in.
    history.append([user_message, ""])
    yield history, caption

    assistant_accum = ""
    try:
        for chunk in llm.chat.completions.create(stream=True, **request_kwargs):
            piece = _extract_text_from_stream_chunk(chunk)
            if not piece:
                continue
            assistant_accum += piece
            history[-1][1] = assistant_accum
            yield history, caption
    except Exception as e:
        print(f"Streaming error: {e}")
        traceback.print_exc()
        # Fallback: retry once without streaming; its answer replaces any
        # partial text that was already streamed.
        try:
            resp = llm.chat.completions.create(stream=False, **request_kwargs)
            history[-1][1] = _completion_text(resp)
        except Exception as e2:
            history[-1][1] = f"[Ошибка LLM: {e2}]"
        yield history, caption


# --------------------- Gallery example images ---------------------
# Remote image URLs offered in the examples gallery; on_gallery_select indexes
# into this list with the position of the selected item.
EXAMPLE_IMAGES = [
    "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
]

# --------------------- UI ---------------------
# Custom CSS handed to gr.Blocks: caps the container width at 1100px and
# centers it, and centers the text of the element with id "title".
css = """
.gradio-container { max-width: 1100px; margin: auto; }
#title { text-align: center; }
"""

# JavaScript function executed in the browser: produces the image caption on
# the client via WebGPU (Transformers.js) and returns it as a string.
#
# Contract: called with (image value, use_client flag); returns null when the
# checkbox is off, otherwise the caption text or a bracketed error message.
#
# Notes on the implementation:
# - Transformers.js v3 (the release line with WebGPU support) is published on
#   npm as @huggingface/transformers, so that is the package imported here.
# - Florence-2 is driven through Florence2ForConditionalGeneration together
#   with AutoProcessor / AutoTokenizer, following the library's Florence-2
#   usage example, with the task token '<MORE_DETAILED_CAPTION>'.
# - The image value may be a plain URL string or an object; for objects the
#   `url` field is tried first, then `image` and `data`.
# - The loaded model/processor/tokenizer are cached on window.__webgpu_captioner
#   so they are initialised only once per page.
WEBGPU_CAPTION_JS = r"""
async (image, use_client) => {
  try {
    if (!use_client) return null;

    if (!('gpu' in navigator)) {
      return "[WebGPU недоступен в браузере. Chrome/Edge 113+ (на Linux — chrome://flags/#enable-unsafe-webgpu), Safari TP.]";
    }

    // Resolve a fetchable source (URL or data URI) from the Gradio Image value.
    // Done before any download so a missing image fails fast.
    const toImageSrc = (imgVal) => {
      if (!imgVal) throw new Error("Нет изображения");
      let src = null;
      if (typeof imgVal === 'string') {
        src = imgVal;
      } else {
        src = imgVal.url ?? imgVal.image ?? imgVal.data ?? null;
      }
      if (!src) throw new Error("Не удалось прочитать изображение");
      return src;
    };
    const src = toImageSrc(image);

    // Load Transformers.js v3 from the CDN.
    const {
      Florence2ForConditionalGeneration,
      AutoProcessor,
      AutoTokenizer,
      RawImage,
      env,
    } = await import("https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0");

    env.allowRemoteModels = true;
    env.useBrowserCache = true; // cache downloaded model files in the browser

    // One-time initialisation; the larger model is tried first.
    if (!window.__webgpu_captioner) {
      const candidates = [
        'onnx-community/Florence-2-large-ft',
        'onnx-community/Florence-2-base-ft'
      ];
      let lastErr = null;
      for (const modelId of candidates) {
        try {
          const loaded = await Promise.all([
            Florence2ForConditionalGeneration.from_pretrained(modelId, { device: 'webgpu', dtype: 'fp16' }),
            AutoProcessor.from_pretrained(modelId),
            AutoTokenizer.from_pretrained(modelId),
          ]);
          window.__webgpu_captioner = { model: loaded[0], processor: loaded[1], tokenizer: loaded[2] };
          break;
        } catch (e) {
          lastErr = e;
          console.warn('Failed to load', modelId, e);
        }
      }
      if (!window.__webgpu_captioner) throw lastErr || new Error("Не удалось инициализировать captioner");
    }

    const { model, processor, tokenizer } = window.__webgpu_captioner;

    // Vision inputs.
    const rawImage = await RawImage.fromURL(src);
    const visionInputs = await processor(rawImage);

    // Text inputs: Florence-2 selects the task through a special task token.
    const task = '<MORE_DETAILED_CAPTION>';
    const prompts = processor.construct_prompts(task);
    const textInputs = tokenizer(prompts);

    const generatedIds = await model.generate({
      ...textInputs,
      ...visionInputs,
      max_new_tokens: 256,
    });

    // Special tokens are kept for decoding; post-processing strips them.
    const generatedText = tokenizer.batch_decode(generatedIds, { skip_special_tokens: false })[0];
    const result = processor.post_process_generation(generatedText, task, rawImage.size);

    const text = result?.[task];
    return typeof text === 'string' ? text : JSON.stringify(result);
  } catch (e) {
    // `e` may be a primitive, so do not use the `in` operator on it.
    return `[WebGPU caption error: ${e?.message ?? String(e)}]`;
  }
}
"""

def on_gallery_select(evt: gr.SelectData):
    """Gallery click: return the selected example URL and an empty caption."""
    return EXAMPLE_IMAGES[evt.index], ""


def clear_all():
    """Reset values for the chatbot (empty history) and the caption box."""
    return [], ""


with gr.Blocks(css=css, analytics_enabled=False) as demo:
    gr.Markdown("<h2 id='title'>🖼️ multimodal gpt-oss 120b — визуальный чат (Florence в браузере / WebGPU)</h2>")

    with gr.Row():
        # Left column: image, caption, question and the examples gallery.
        with gr.Column(scale=4):
            image_input = gr.Image(type="filepath", label="Загрузите картинку")
            use_webgpu = gr.Checkbox(label="Генерировать подпись к изображению в браузере (WebGPU)", value=True)
            raw_caption = gr.Textbox(
                label="More Detailed Caption (WebGPU)",
                placeholder="Подпись появится тут (если включён WebGPU-капшенер)",
                lines=6,
                interactive=True,
            )
            user_input = gr.Textbox(
                label="Вопрос по изображению",
                placeholder="Например: Что происходит на фото?",
            )
            with gr.Row():
                send_btn = gr.Button("Отправить", variant="primary")
                clear_btn = gr.Button("Очистить чат")

            gr.Markdown("**Галерея примеров (клик — подставить в загрузчик, подпись посчитается в браузере)**")
            gallery = gr.Gallery(
                value=EXAMPLE_IMAGES,
                label="Примеры",
                show_label=False,
                columns=4,
                rows=1,
                height="auto",
                object_fit="contain",
            )

        # Right column: the conversation.
        with gr.Column(scale=6):
            chatbot = gr.Chatbot(label="Чат с моделью", height=640)

    # Gallery click: put the example into the uploader and blank the caption;
    # the image change handler below then produces the new caption.
    gallery.select(on_gallery_select, inputs=None, outputs=[image_input, raw_caption])

    # Image changed: the caption is computed on the client by the JS snippet
    # (no Python function is attached to this event).
    image_input.change(
        None,
        inputs=[image_input, use_webgpu],
        outputs=[raw_caption],
        js=WEBGPU_CAPTION_JS,
    )

    # Sending a message (button click or Enter in the question box): the
    # caption is read from the textbox, it is not generated on the server.
    for trigger in (send_btn.click, user_input.submit):
        trigger(
            chat_stream,
            inputs=[image_input, user_input, chatbot, raw_caption],
            outputs=[chatbot, raw_caption],
        )

    # Clear both the chat and the caption.
    clear_btn.click(clear_all, inputs=None, outputs=[chatbot, raw_caption])

# Entry point: listen on all interfaces; the port is taken from the PORT
# environment variable and defaults to 7860.
if __name__ == "__main__":
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port, share=False)