Spaces:

Serg4451D
/

gpt-oss-multimodal

Sleeping

App Files Files Community

gpt-oss-multimodal / app.py

Serg4451D

Update app.py

4dc8140 verified 24 days ago

raw

history blame

7.07 kB

	import os
	from typing import Generator, List, Tuple

	import gradio as gr
	from gradio_client import Client, handle_file
	from openai import OpenAI

	# --- Configuration (on HF Spaces, add NV_API_KEY to the Space secrets) ---
	NV_API_KEY = os.getenv("NV_API_KEY")
	if not NV_API_KEY:
	    # Fail at import time: nothing below can work without the API key.
	    raise RuntimeError("Добавьте NV_API_KEY в Secrets Hugging Face Space")

	# Florence-2 captioning client (public Space wrapper)
	florence = Client("gokaygokay/Florence-2")

	# OpenAI-compatible client pointed at the NVIDIA integrate endpoint
	llm = OpenAI(
	    api_key=NV_API_KEY,
	    base_url="https://integrate.api.nvidia.com/v1",
	)


	def get_caption(image_path: str) -> str:
	    """Request a 'More Detailed Caption' for an image from the Florence-2 Space.

	    Args:
	        image_path: URL or local path of the image; it is wrapped with
	            ``handle_file`` before being sent to the remote endpoint.

	    Returns:
	        The caption text. When the endpoint returns several outputs (a tuple
	        or list), only the first one is used. On any failure a bracketed error
	        message is returned instead of raising, so the caller can still
	        display something.
	    """
	    try:
	        result = florence.predict(
	            image=handle_file(image_path),
	            task_prompt="More Detailed Caption",
	            text_input=None,
	            model_id="microsoft/Florence-2-large",
	            api_name="/process_image",
	        )
	        # An endpoint with several outputs comes back as a tuple/list; keep
	        # only the first output rather than stringifying the whole container
	        # (which would leak its repr, e.g. a trailing None, into the prompt).
	        if isinstance(result, (list, tuple)):
	            result = result[0] if result else ""
	        # Anything that is still not a string is normalised via str().
	        return result if isinstance(result, str) else str(result)
	    except Exception as e:
	        # Deliberate best-effort: report the failure as the caption text.
	        return f"[Ошибка при генерации подписи: {e}]"


	def _extract_text_from_chunk(chunk) -> str:
	    """Best-effort extraction of the text fragment carried by one stream chunk.

	    Two chunk shapes are understood: an object exposing ``choices`` whose
	    first item has a ``delta`` attribute, and a plain dict with the same
	    layout. In both cases ``content`` is preferred and ``reasoning_content``
	    is the fallback. Anything else, or any error while reading the chunk,
	    yields an empty string.
	    """
	    text_fields = ("content", "reasoning_content")
	    try:
	        # Attribute-style chunk
	        if hasattr(chunk, "choices"):
	            first_delta = getattr(chunk.choices[0], "delta", None)
	            if first_delta is not None:
	                for field in text_fields:
	                    value = getattr(first_delta, field, None)
	                    if value:
	                        return value
	                return ""
	        # Dict-style chunk
	        if isinstance(chunk, dict):
	            choice_list = chunk.get("choices", [])
	            if choice_list:
	                first_delta = choice_list[0].get("delta", {})
	                for field in text_fields:
	                    value = first_delta.get(field)
	                    if value:
	                        return value
	                return ""
	    except Exception:
	        return ""
	    return ""


	def chat_stream(
	    image_path: str, user_message: str, history: List[Tuple[str, str]]
	) -> Generator[Tuple[List[List[str]], str], None, None]:
	    """Stream one chat turn about an image to the Gradio UI.

	    The generator first yields the history with the user's message and the
	    image caption, then re-yields the history each time new text arrives
	    from the LLM so the last assistant message grows in place.

	    Args:
	        image_path: Path or URL of the image; when empty, a prompt to upload
	            an image is added to the chat and the generator stops.
	        user_message: The user's question about the image.
	        history: Existing chat history as ``[user, assistant]`` pairs; may be
	            ``None`` or empty. It is appended to in place.

	    Yields:
	        ``(history, caption)`` tuples matching the two bound outputs.
	    """
	    history = history or []

	    if not image_path:
	        history.append([user_message, "Пожалуйста, загрузите изображение."])
	        yield history, ""
	        return

	    # Fetch the detailed caption that stands in for the image itself.
	    caption = get_caption(image_path)

	    # Build the system prompt around the caption.
	    system_prompt = (
	        "You are 'multimodal gpt-oss 120b'. Use the provided 'More Detailed Caption' as authoritative visual context.\n\n"
	        "Image Caption START >>>\n"
	        f"{caption}\n"
	        "<<< Image Caption END.\n"
	        "When answering, mention visible details and be explicit when uncertain."
	    )

	    # Add the user's turn with an empty assistant slot and yield right away
	    # so the UI shows the question and the caption before the LLM answers.
	    history.append([user_message, ""])
	    yield history, caption

	    assistant_text = ""
	    try:
	        # Only the system prompt and the current question are sent; earlier
	        # turns in `history` are not part of the request.
	        stream = llm.chat.completions.create(
	            model="openai/gpt-oss-120b",
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": user_message},
	            ],
	            temperature=0.8,
	            top_p=1.0,
	            max_tokens=1024,
	            stream=True,
	        )

	        for chunk in stream:
	            piece = _extract_text_from_chunk(chunk)
	            if not piece:
	                continue
	            assistant_text += piece
	            history[-1][1] = assistant_text
	            yield history, caption

	    except Exception as e:
	        # Show the error in the chat, but keep whatever text was already
	        # streamed instead of replacing it with the error message.
	        error_note = f"[Ошибка стриминга LLM: {e}]"
	        history[-1][1] = f"{assistant_text}\n\n{error_note}" if assistant_text else error_note

	    # Final yield: guarantees the UI ends on the completed (or failed) state.
	    yield history, caption


	# --- UI (for HF Spaces) ---
	# Sample images offered in the examples gallery.
	EXAMPLE_IMAGES = [
	# A flat list of plain strings (URLs or local paths). NO nested lists!
	"https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
	"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
	"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
	"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
	]

	# Custom stylesheet handed to gr.Blocks: centers the #title heading (with a
	# negative bottom margin) and caps .gradio-container at 1100px, centered.
	css = """
	#title {text-align:center; margin-bottom: -18px;}
	.gradio-container { max-width: 1100px; margin: auto; }
	"""

	# Build the two-column layout: image, caption, question and examples on the
	# left; the chat transcript on the right.
	with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
	    gr.Markdown("<h2 id='title'>🖼️ multimodal gpt-oss 120b — визуальный чат</h2>")
	    with gr.Row():
	        with gr.Column(scale=4):
	            # No `tool="editor"`: that argument only exists on legacy Gradio 3
	            # and raises TypeError on Gradio 4+.
	            image_input = gr.Image(label="Загрузите картинку или выберите из галереи", type="filepath")
	            raw_caption = gr.Textbox(label="More Detailed Caption (Florence-2)", interactive=False)
	            user_input = gr.Textbox(label="Вопрос по изображению", placeholder="Например: 'Что происходит на фото?'")
	            send_btn = gr.Button("Отправить")
	            clear_btn = gr.Button("Очистить чат")
	            gr.Markdown("Галерея примеров (клик — подставить в загрузчик)")
	            # The column count is set in the constructor; the old
	            # `.style(grid=..., height=...)` call is gone in Gradio 4+.
	            gallery = gr.Gallery(value=EXAMPLE_IMAGES, columns=4, label="Примеры", show_label=False)

	        with gr.Column(scale=6):
	            chatbot = gr.Chatbot(label="Чат с моделью", height=600)

	    def pick_example(evt: gr.SelectData):
	        """Return the example URL/path matching the gallery item that was clicked.

	        Gradio injects the selection event through the ``gr.SelectData``
	        annotation; ``evt.index`` is the position of the clicked item, which
	        maps directly onto ``EXAMPLE_IMAGES``.
	        """
	        return EXAMPLE_IMAGES[evt.index]

	    # Gallery click -> put the selected example into image_input. The gallery
	    # is not passed as an input: that would deliver the whole list of images
	    # rather than the one that was clicked.
	    gallery.select(fn=pick_example, inputs=None, outputs=[image_input])

	    # Send button: bind the generator that yields (chat_history, caption).
	    send_btn.click(fn=chat_stream, inputs=[image_input, user_input, chatbot], outputs=[chatbot, raw_caption])

	    # Clear button: reset the chat transcript to an empty history.
	    clear_btn.click(lambda: [], None, chatbot)

	if __name__ == "__main__":
	    # The send handler is a generator; on Gradio 3.x generators only work with
	    # the request queue enabled. On newer versions the queue is already on by
	    # default, so this call is harmless there.
	    demo.queue()
	    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
	    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))