Update app.py

app.py CHANGED
@@ -1,287 +1,161 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-
-
-
-
-
-
-
-- Runner for all 14 Florence-2 tasks, with image upload, a text prompt and (when required)
-  region coordinates given as normalized 0..999 values.
-- Output as JSON/TXT plus a gallery of result images (when the model returns image predictions).
 """

 import os
 import io
 import json
-import time
-import traceback
 import zipfile
 import mimetypes
 from typing import Any, Dict, List, Optional, Tuple

 import requests
 import gradio as gr
-from PIL import Image
 from openai import OpenAI

 # --------------------- Configuration ---------------------
-NV_API_KEY = os.environ.get("NV_API_KEY")
 NV_BASE_URL = os.environ.get("NV_BASE_URL", "https://integrate.api.nvidia.com/v1")
-# Official Florence-2 VLM endpoint (NIM API)
 NV_VLM_URL = os.environ.get("NV_VLM_URL", "https://ai.api.nvidia.com/v1/vlm/microsoft/florence-2")
-# Asset upload endpoint (NVCF assets)
 NVCF_ASSETS_URL = "https://api.nvcf.nvidia.com/v2/nvcf/assets"

 if not NV_API_KEY:
-    raise RuntimeError(
-        "NV_API_KEY is not set. In your Hugging Face Space, open Settings → Secrets and add NV_API_KEY."
-    )

-# OpenAI-compatible client for the LLM (NVIDIA Integrate)
 llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)

-# --------------------- Florence-2 tasks ---------------------
-
-FLORENCE_TASKS = [
-    ("Caption", "<CAPTION>"),
-    ("Detailed Caption", "<DETAILED_CAPTION>"),
-    ("More Detailed Caption", "<MORE_DETAILED_CAPTION>"),
-    ("Object Detection (OD)", "<OD>"),
-    ("Dense Region Caption", "<DENSE_REGION_CAPTION>"),
-    ("Region Proposal", "<REGION_PROPOSAL>"),
-    ("Caption to Phrase Grounding", "<CAPTION_TO_PHRASE_GROUNDING>"),
-    ("Referring Expression Segmentation", "<REFERRING_EXPRESSION_SEGMENTATION>"),
-    ("Region to Segmentation", "<REGION_TO_SEGMENTATION>"),
-    ("Open Vocabulary Detection", "<OPEN_VOCABULARY_DETECTION>"),
-    ("Region to Category", "<REGION_TO_CATEGORY>"),
-    ("Region to Description", "<REGION_TO_DESCRIPTION>"),
-    ("OCR", "<OCR>"),
-    ("OCR with Region", "<OCR_WITH_REGION>"),
-]
-TASK_LABEL_TO_TOKEN = {label: token for (label, token) in FLORENCE_TASKS}
-
-# Tasks that require a text prompt
-TEXT_REQUIRED_TASKS = {
-    "<CAPTION_TO_PHRASE_GROUNDING>",
-    "<REFERRING_EXPRESSION_SEGMENTATION>",
-    "<OPEN_VOCABULARY_DETECTION>",
-}
-# Tasks that require a region (normalized 0..999 coordinates)
-REGION_REQUIRED_TASKS = {
-    "<REGION_TO_SEGMENTATION>",
-    "<REGION_TO_CATEGORY>",
-    "<REGION_TO_DESCRIPTION>",
-    "<OCR_WITH_REGION>",
-}
-
-# --------------------- Helper functions ---------------------
-def guess_mime_from_path(path: str) -> str:
-    mime, _ = mimetypes.guess_type(path)
-    if mime is None:
-        # Default to JPEG
-        return "image/jpeg"
-    return mime

-def nvcf_upload_asset(image_path: str, description: str = "
-    """
-    Uploads the image to NVCF assets and returns the asset_id.
-    """
-    content_type = guess_mime_from_path(image_path)
-    auth_resp = requests.post(
         NVCF_ASSETS_URL,
         headers={
             "Authorization": f"Bearer {NV_API_KEY}",
             "Content-Type": "application/json",
             "accept": "application/json",
         },
-        json={"contentType": content_type, "description": description},
         timeout=30,
     )
-    auth_resp.raise_for_status()
-    up_url = auth_resp.json()["uploadUrl"]
-    asset_id = str(auth_resp.json()["assetId"])
-
     with open(image_path, "rb") as f:
-        put_resp = requests.put(
             up_url,
             data=f,
             headers={
                 "x-amz-meta-nvcf-asset-description": description,
-                "content-type": content_type,
             },
             timeout=300,
         )
-    put_resp.raise_for_status()
     return asset_id

-def build_region_prompt(x1: int, y1: int, x2: int, y2: int) -> str:
-    """
-    Encodes a region as <loc_x1><loc_y1><loc_x2><loc_y2> from normalized 0..999 coordinates.
-    """
-    for v in [x1, y1, x2, y2]:
-        if not (0 <= int(v) <= 999):
-            raise ValueError("Coordinates must be in the range 0..999")
-    return f"<loc_{int(x1)}><loc_{int(y1)}><loc_{int(x2)}><loc_{int(y2)}>"
-
-def build_vlm_content(
-    task_token: str,
-    asset_id: str,
-    text_prompt: Optional[str] = None,
-    region: Optional[Tuple[int, int, int, int]] = None,
-) -> str:
-    """
-    Builds the content string for Florence-2:
-    "<TASK_PROMPT><text_prompt (only when needed)><img>"
-    For REGION_* tasks the coordinate format is substituted instead of text_prompt.
-    """
-    parts = [task_token]
-    if region is not None:
-        parts.append(build_region_prompt(*region))
-    if (text_prompt is not None) and (text_prompt.strip()):
-        parts.append(text_prompt.strip())
-    parts.append(f'<img src="data:image/jpeg;asset_id,{asset_id}" />')
-    return "".join(parts)
-
-def call_florence_vlm(content: str, asset_id: str) -> Tuple[str, List[Image.Image], Dict[str, str]]:
     """
-    Calls Florence-2 and returns:
-    - primary_text: the best extracted text answer/description
-    - images_list: a list of PIL.Image (when images are returned)
-    - text_files_dict: a dict {filename: text/json_str} from the archive
     """
-    payload = {"messages": [{"role": "user", "content": content}]}
-    headers = {
-        "Authorization": f"Bearer {NV_API_KEY}",
-        "Accept": "application/json, application/zip, */*",
-        "Content-Type": "application/json",
-        # Pass the asset_id through the headers:
-        "NVCF-INPUT-ASSET-REFERENCES": asset_id,
-        "NVCF-FUNCTION-ASSET-IDS": asset_id,
-    }
-
-    resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
-    if not resp.ok:
-        # Try to give a more informative message
-        try:
-            return f"[VLM HTTP {resp.status_code}] {resp.text}", [], {}
-        except Exception:
-            resp.raise_for_status()
-
     ct = (resp.headers.get("content-type") or "").lower()
     data = resp.content

-    def _extract_primary_from_json(obj: Any) -> Optional[str]:
-        # Recursively look for informative text values
-        keys_priority = ["more_detailed_caption", "detailed_caption", "caption", "text", "ocr", "description"]
         def walk(o):
-            res = []
             if isinstance(o, dict):
-                for k in keys_priority:
                     if k in o and isinstance(o[k], str) and o[k].strip():
-                        res.append(o[k].strip())
-                # otherwise recurse
                 for v in o.values():
-                    res.extend(walk(v))
             elif isinstance(o, list):
                 for it in o:
-                    res.extend(walk(it))
             elif isinstance(o, str):
                 if o.strip():
-                    res.append(o.strip())
-            return res
-
         arr = walk(obj)
         return arr[0] if arr else None

-    # ZIP archive handling:
-    def _handle_zip(zbytes: bytes) -> Tuple[str, List[Image.Image], Dict[str, str]]:
-        texts: Dict[str, str] = {}
-        primary_text: Optional[str] = None
-        images: List[Image.Image] = []
-        with zipfile.ZipFile(io.BytesIO(zbytes), "r") as z:
-            for name in z.namelist():
-                try:
-                    with z.open(name) as f:
-                        raw = f.read()
-                except Exception:
-                    continue
-
-                lower = name.lower()
-                if lower.endswith((".png", ".jpg", ".jpeg", ".bmp", ".webp")):
-                    try:
-                        img = Image.open(io.BytesIO(raw)).convert("RGBA")
-                        images.append(img)
-                    except Exception:
-                        pass
-                elif lower.endswith(".json"):
-                    try:
-                        obj = json.loads(raw.decode("utf-8", errors="ignore"))
-                        texts[name] = json.dumps(obj, ensure_ascii=False, indent=2)
-                        if primary_text is None:
-                            cand = _extract_primary_from_json(obj)
-                            if cand:
-                                primary_text = cand
-                    except Exception:
-                        texts[name] = raw.decode("utf-8", errors="ignore")
-                elif lower.endswith(".txt"):
-                    txt = raw.decode("utf-8", errors="ignore").strip()
-                    texts[name] = txt
-                    if primary_text is None and txt:
-                        primary_text = txt
-
-        if primary_text is None:
-            # If nothing "meaningful" was found, assemble an overview
-            if texts:
-                primary_text = next(iter(texts.values()))
-            elif images:
-                primary_text = f"[Received {len(images)} result images]"
-            else:
-                primary_text = "[Empty result]"
-
-        return primary_text, images, texts
-
-    # If JSON:
-    if "application/json" in ct and not (data[:2] == b"PK"):
         try:
             obj = resp.json()
-            primary_text = _extract_primary_from_json(obj)
-            return primary_text, [], {"response.json": json.dumps(obj, ensure_ascii=False, indent=2)}
         except Exception:
-            # fallback: try it as a ZIP
             pass

-    # If a ZIP archive:
-    if data[:2] == b"PK":
-        return _handle_zip(data)

-    # Fallback: plain text
     try:
-        text = data.decode("utf-8", errors="ignore")
     except Exception:
-        text = str(data)
-    return text, [], {"raw.txt": text}

-def get_more_detailed_caption(image_path: str) -> Tuple[str, str]:
     """
-    Returns (caption_text, asset_id).
     """
-    asset_id = nvcf_upload_asset(image_path)
-    content = build_vlm_content("<MORE_DETAILED_CAPTION>", asset_id)
-    caption, _imgs, _texts = call_florence_vlm(content, asset_id)
-    return caption, asset_id

 def _extract_text_from_stream_chunk(chunk: Any) -> str:
-    """
-    Generically extracts text fragments from an LLM streaming chunk.
-    """
     try:
         if hasattr(chunk, "choices"):
             choices = getattr(chunk, "choices")
@@ -304,74 +178,105 @@ def _extract_text_from_stream_chunk(chunk: Any) -> str:
         pass
     return ""

-# ---------------------
-def chat_stream(image, user_message: str, history: Optional[List[List[str]]], caption: str):
     """
-    Streaming chat: answers questions about the image, using the server-side caption as context.
     """
     assistant_accum = ""
     try:
         stream = llm.chat.completions.create(
             model="openai/gpt-oss-120b",
             messages=[
                 {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_message},
             ],
-            temperature=0.
             top_p=1.0,
-            max_tokens=
             stream=True,
         )
         for chunk in stream:
             piece = _extract_text_from_stream_chunk(chunk)
             if not piece:
                 continue
             assistant_accum += piece
-            yield

     except Exception as e:
-        traceback.print_exc()
         try:
             resp = llm.chat.completions.create(
                 model="openai/gpt-oss-120b",
                 messages=[
                     {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_message},
                 ],
-                temperature=0.
                 top_p=1.0,
-                max_tokens=
                 stream=False,
             )
             final_text = ""
@@ -389,228 +294,81 @@ def chat_stream(image, user_message: str, history: Optional[List[List[str]]], ca
             final_text = str(resp)
         else:
             final_text = str(resp)
-            yield
         except Exception as e2:
-            yield

-        except Exception:
-            size = None
-        return caption, asset_id, size
-    except Exception as e:
-        return f"[Auto-caption error: {e}]", "", None

-def update_task_inputs(selected_label: str):
-    """
-    Controls the visibility of the text prompt / region fields for the selected task.
-    """
-    token = TASK_LABEL_TO_TOKEN.get(selected_label, "")
-    need_text = token in TEXT_REQUIRED_TASKS
-    need_region = token in REGION_REQUIRED_TASKS
-
-    return (
-        gr.update(visible=need_text),    # text prompt
-        gr.update(visible=need_region),  # x1
-        gr.update(visible=need_region),  # y1
-        gr.update(visible=need_region),  # x2
-        gr.update(visible=need_region),  # y2
-        gr.update(visible=True),         # run button
-    )
-
-def run_florence_task(
-    image_path: Optional[str],
-    asset_id: str,
-    selected_label: str,
-    text_prompt: str,
-    x1: int, y1: int, x2: int, y2: int
-):
-    """
-    Runs an arbitrary Florence-2 task on the current image.
-    Returns: an image gallery and a text result.
-    """
-    if not image_path:
-        return [], "[Error] Upload an image."
-    try:
-        token = TASK_LABEL_TO_TOKEN.get(selected_label, "<MORE_DETAILED_CAPTION>")
-
-        # If asset_id is empty, upload right now
-        if not asset_id:
-            asset_id = nvcf_upload_asset(image_path, f"Task: {selected_label}")
-
-        region = None
-        if token in REGION_REQUIRED_TASKS:
-            region = (int(x1), int(y1), int(x2), int(y2))
-
-        # For tasks that require text, avoid substituting an empty string
-        effective_text = text_prompt if (token in TEXT_REQUIRED_TASKS) else None
-
-        content = build_vlm_content(token, asset_id, text_prompt=effective_text, region=region)
-        primary_text, imgs, texts = call_florence_vlm(content, asset_id)
-
-        # Image gallery: a list (numpy/PIL/urls); PIL works fine
-        gallery_items = imgs

-        return gallery_items, primary_text
-    except Exception as e:
-        return [], f"[Error] {e}"

-# --------------------- Gallery examples ---------------------
-EXAMPLE_IMAGES = [
-    "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cheetah.jpg",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/flowers.png",
-]
-
-# --------------------- UI ---------------------
-css = """
-.gradio-container { max-width: 1100px; margin: auto; }
-#title { text-align: center; }
-"""

-with gr.Blocks(css=css) as demo:
-    asset_state = gr.State(value="")
-    img_size_state = gr.State(value=None)
-
-    with gr.Row():
-        with gr.Column(scale=4):
-            image_input = gr.Image(label="Upload an image", type="filepath")
-            raw_caption = gr.Textbox(
-                label="More Detailed Caption (server-side Florence-2)",
-                interactive=True,
-                lines=6,
-                placeholder="The caption will appear here (server-side Florence-2)"
-            )
-            user_input = gr.Textbox(
-                label="Question about the image",
-                placeholder="For example: What is happening in the photo?"
-            )
-            with gr.Row():
-                send_btn = gr.Button("Send", variant="primary")
-                clear_btn = gr.Button("Clear chat")
-
-            gr.Markdown("Example gallery (click to load an image into the uploader; the caption is computed on the server)")
-            gallery = gr.Gallery(
-                value=EXAMPLE_IMAGES,
-                label="Examples",
-                columns=4,
-                rows=1,
                 show_label=False,
-            )

-            task_dropdown = gr.Dropdown(
-                choices=[label for (label, _) in FLORENCE_TASKS],
-                label="Florence-2 task",
-            )
-            task_text_prompt = gr.Textbox(
-                label="Text prompt (for some tasks)",
-                placeholder="For example: a black and brown dog",
-                visible=False
-            )
-            with gr.Row():
-                x1_in = gr.Slider(0, 999, step=1, value=100, label="x1 (0..999)", visible=False)
-                y1_in = gr.Slider(0, 999, step=1, value=100, label="y1 (0..999)", visible=False)
-                x2_in = gr.Slider(0, 999, step=1, value=800, label="x2 (0..999)", visible=False)
-                y2_in = gr.Slider(0, 999, step=1, value=800, label="y2 (0..999)", visible=False)
-            run_task_btn = gr.Button("Run task", visible=True)
-            task_gallery = gr.Gallery(label="Result images", columns=3, height=320)
-            task_text_out = gr.Textbox(label="Result (JSON/TXT)", lines=16)
-
-        with gr.Column(scale=6):
-            chatbot = gr.Chatbot(label="Chat with the model", height=640)
-
-    # Gallery: selecting an example puts its URL into the uploader
-    def on_gallery_select(evt: gr.SelectData):
-        img = EXAMPLE_IMAGES[evt.index]
-        # reset the caption and state
-        return img, "", "", None
-
-    gallery.select(
-        on_gallery_select,
-        inputs=None,
-        outputs=[image_input, raw_caption, asset_state, img_size_state]
     )
-
-    # A new image in the uploader recomputes the server-side caption
-    image_input.change(
-        inputs=[image_input],
-        outputs=[raw_caption, asset_state, img_size_state]
-    )
-
-    # Task selection change shows/hides the extra fields
-    task_dropdown.change(
-        update_task_inputs,
-        inputs=[task_dropdown],
-        outputs=[task_text_prompt, x1_in, y1_in, x2_in, y2_in, run_task_btn]
     )

-    # Run the selected Florence-2 task
-    run_task_btn.click(
-        run_florence_task,
-        inputs=[image_input, asset_state, task_dropdown, task_text_prompt, x1_in, y1_in, x2_in, y2_in],
-        outputs=[task_gallery, task_text_out]
-    )
-
-    # Send a message to the chat
-    send_btn.click(
-        chat_stream,
-        inputs=[image_input, user_input, chatbot, raw_caption],
-        outputs=[chatbot, raw_caption]
-    )
-    user_input.submit(
-        chat_stream,
-        inputs=[image_input, user_input, chatbot, raw_caption],
-        outputs=[chatbot, raw_caption]
-    )
-
-    # Clear the chat + caption
     def clear_all():
-        return [], "", ""
-
-    clear_btn.click(
         clear_all,
         inputs=None,
-        outputs=[chatbot, user_input, raw_caption]
     )

-# Launch
 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=int(os.environ.get("PORT", 7860)),
-        share=False
-    )
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+A minimalist, messenger-style visual chat:
+- At the bottom: a compact input row with a small button for attaching images.
+- Automatic image captioning (<MORE_DETAILED_CAPTION>) via NVIDIA Florence-2 (NIM API).
+- Streaming LLM replies via NVIDIA Integrate (OpenAI-compatible API).
+- No WebGPU/wasm, no bulky panels.
+
+Required in the HF Space Secrets: NV_API_KEY
 """

 import os
 import io
 import json
 import zipfile
 import mimetypes
+import traceback
 from typing import Any, Dict, List, Optional, Tuple

 import requests
 import gradio as gr
 from openai import OpenAI

 # --------------------- Configuration ---------------------
+NV_API_KEY = os.environ.get("NV_API_KEY")
 NV_BASE_URL = os.environ.get("NV_BASE_URL", "https://integrate.api.nvidia.com/v1")
 NV_VLM_URL = os.environ.get("NV_VLM_URL", "https://ai.api.nvidia.com/v1/vlm/microsoft/florence-2")
 NVCF_ASSETS_URL = "https://api.nvcf.nvidia.com/v2/nvcf/assets"

 if not NV_API_KEY:
+    raise RuntimeError("NV_API_KEY is not set. In the HF Space: Settings → Secrets → NV_API_KEY")

 llm = OpenAI(base_url=NV_BASE_URL, api_key=NV_API_KEY)

+# --------------------- Florence-2 utils ---------------------
+def _guess_mime(path: str) -> str:
+    return mimetypes.guess_type(path)[0] or "image/jpeg"
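+# e.g. _guess_mime("photo.png") -> "image/png"; unknown extensions fall back to "image/jpeg".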

+def nvcf_upload_asset(image_path: str, description: str = "Chat image") -> str:
+    # 1) authorize the upload
+    auth = requests.post(
         NVCF_ASSETS_URL,
         headers={
             "Authorization": f"Bearer {NV_API_KEY}",
             "Content-Type": "application/json",
             "accept": "application/json",
         },
+        json={"contentType": _guess_mime(image_path), "description": description},
         timeout=30,
     )
+    auth.raise_for_status()
+    up_url = auth.json()["uploadUrl"]
+    asset_id = str(auth.json()["assetId"])
+    # 2) upload the binary
     with open(image_path, "rb") as f:
+        put = requests.put(
             up_url,
             data=f,
             headers={
                 "x-amz-meta-nvcf-asset-description": description,
+                "content-type": _guess_mime(image_path),
             },
             timeout=300,
         )
+    put.raise_for_status()
     return asset_id

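+# Usage sketch (hypothetical path): asset_id = nvcf_upload_asset("/tmp/cat.jpg")
+# The returned asset_id is referenced again in the VLM request headers below.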
+def _vlm_content_more_detailed_caption(asset_id: str) -> str:
+    # Format: "<TASK_PROMPT><img>"
+    return f'<MORE_DETAILED_CAPTION><img src="data:image/jpeg;asset_id,{asset_id}" />'
+
+
def _parse_vlm_response(resp: requests.Response) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
"""
|
75 |
+
Возвращает извлечённый текст (caption/ocr/description), если доступен.
|
76 |
+
Florence-2 может отдавать JSON или ZIP с файлами.
|
|
|
|
|
|
|
77 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
ct = (resp.headers.get("content-type") or "").lower()
|
79 |
data = resp.content
|
80 |
|
81 |
+
def extract_text_from_json(obj: Any) -> Optional[str]:
|
82 |
+
keys = ["more_detailed_caption", "detailed_caption", "caption", "text", "ocr", "description"]
|
|
|
|
|
83 |
def walk(o):
|
84 |
+
res = []
|
85 |
if isinstance(o, dict):
|
86 |
+
for k in keys:
|
|
|
87 |
if k in o and isinstance(o[k], str) and o[k].strip():
|
88 |
+
res.append(o[k].strip())
|
|
|
89 |
for v in o.values():
|
90 |
+
res.extend(walk(v))
|
91 |
elif isinstance(o, list):
|
92 |
for it in o:
|
93 |
+
res.extend(walk(it))
|
94 |
elif isinstance(o, str):
|
95 |
if o.strip():
|
96 |
+
res.append(o.strip())
|
97 |
+
return res
|
|
|
98 |
arr = walk(obj)
|
99 |
return arr[0] if arr else None
|
100 |
|
101 |
+
# JSON
|
102 |
+
if "application/json" in ct and not data.startswith(b"PK"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
try:
|
104 |
obj = resp.json()
|
105 |
+
return extract_text_from_json(obj) or json.dumps(obj, ensure_ascii=False)
|
|
|
106 |
except Exception:
|
|
|
107 |
pass
|
108 |
|
109 |
+
# ZIP
|
110 |
+
if data.startswith(b"PK") or "zip" in ct or "octet-stream" in ct:
|
111 |
+
try:
|
112 |
+
with zipfile.ZipFile(io.BytesIO(data), "r") as z:
|
113 |
+
primary = None
|
114 |
+
for name in z.namelist():
|
115 |
+
with z.open(name) as f:
|
116 |
+
raw = f.read()
|
117 |
+
if name.lower().endswith(".json"):
|
118 |
+
try:
|
119 |
+
obj = json.loads(raw.decode("utf-8", errors="ignore"))
|
120 |
+
primary = extract_text_from_json(obj) or primary
|
121 |
+
except Exception:
|
122 |
+
pass
|
123 |
+
elif name.lower().endswith(".txt") and primary is None:
|
124 |
+
txt = raw.decode("utf-8", errors="ignore").strip()
|
125 |
+
if txt:
|
126 |
+
primary = txt
|
127 |
+
return primary or "[Нет текстового результата]"
|
128 |
+
except Exception:
|
129 |
+
pass
|
130 |
|
131 |
+
# Фоллбэк: текст
|
132 |
try:
|
133 |
+
return data.decode("utf-8", errors="ignore")
|
134 |
except Exception:
|
135 |
+
return "[Не удалось разобра��ь ответ Florence-2]"
|
|
|
136 |
|
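+# Parse order: a JSON body is tried first, then a ZIP archive (Florence-2 may pack its
+# results into one), then raw text. E.g. an illustrative JSON body like
+# {"more_detailed_caption": "A dog on a beach..."} would return that caption string.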
+def get_more_detailed_caption(image_path: str) -> Tuple[str, str]:
     """
+    Returns (caption, asset_id) for the given image.
     """
+    asset_id = nvcf_upload_asset(image_path)
+    content = _vlm_content_more_detailed_caption(asset_id)
+    payload = {"messages": [{"role": "user", "content": content}]}
+    headers = {
+        "Authorization": f"Bearer {NV_API_KEY}",
+        "Accept": "application/json, application/zip, */*",
+        "Content-Type": "application/json",
+        "NVCF-INPUT-ASSET-REFERENCES": asset_id,
+        "NVCF-FUNCTION-ASSET-IDS": asset_id,
+    }
+    resp = requests.post(NV_VLM_URL, headers=headers, json=payload, timeout=300)
+    if not resp.ok:
+        raise RuntimeError(f"VLM HTTP {resp.status_code}: {resp.text}")
+    caption = _parse_vlm_response(resp)
+    return caption, asset_id

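+# Usage sketch (hypothetical path):
+#   caption, asset_id = get_more_detailed_caption("/tmp/cat.jpg")
+# respond() below injects this caption into the LLM system prompt.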
+# --------------------- LLM streaming utils ---------------------
 def _extract_text_from_stream_chunk(chunk: Any) -> str:
     try:
         if hasattr(chunk, "choices"):
             choices = getattr(chunk, "choices")

         pass
     return ""

+# --------------------- Chat logic ---------------------
+def respond(
+    message: Dict[str, Any],
+    chat_history: List[List[str]],
+    last_caption: str,
+    last_asset_id: str
+):
     """
+    message: MultimodalTextbox -> {"text": str, "files": [<paths or dicts>]}
+    Yields the streamed LLM reply as a generator.
     """
+    text = (message or {}).get("text", "") if isinstance(message, dict) else str(message or "")
+    files = (message or {}).get("files", []) if isinstance(message, dict) else []
+
+    def first_image_path(files) -> Optional[str]:
+        for f in files:
+            if isinstance(f, dict) and f.get("path"):
+                # Gradio file dict
+                mt = f.get("mime_type") or _guess_mime(f["path"])
+                if mt.startswith("image/"):
+                    return f["path"]
+            elif isinstance(f, str):
+                if _guess_mime(f).startswith("image/"):
+                    return f
+        return None
+
+    img_path = first_image_path(files)
+
+    # Build the user's visible message (clean and concise)
+    parts = []
+    if text and text.strip():
+        parts.append(text.strip())
+    if img_path:
+        parts.append("🖼️ [image]")
+    user_visible = "\n".join(parts) if parts else "🖐️"
+
+    chat_history = chat_history or []
+    chat_history.append([user_visible, ""])
+    yield {"text": "", "files": []}, chat_history, last_caption, last_asset_id
+
+    # Caption the image (if a new one was attached)
+    caption = last_caption or ""
+    asset_id = last_asset_id or ""
+    try:
+        if img_path:
+            caption, asset_id = get_more_detailed_caption(img_path)
+    except Exception as e:
+        caption = f"[Auto-caption error: {e}]"
+
+    # System prompt
+    if caption:
+        system_prompt = (
+            "You are a helpful multimodal assistant.\n"
+            "Use the provided 'More Detailed Caption' as authoritative visual context.\n"
+            "If something is not visible or uncertain, say so.\n\n"
+            "Image Caption START >>>\n"
+            f"{caption}\n"
+            "<<< Image Caption END."
+        )
+    else:
+        system_prompt = (
+            "You are a helpful assistant. The user may have sent a text-only message. "
+            "If they refer to an image but no caption is available, ask them to attach an image."
+        )

+    # Stream the LLM reply
     assistant_accum = ""
     try:
         stream = llm.chat.completions.create(
             model="openai/gpt-oss-120b",
             messages=[
                 {"role": "system", "content": system_prompt},
+                {"role": "user", "content": text or "Describe the attached image."}
             ],
+            temperature=0.7,
             top_p=1.0,
+            max_tokens=768,
             stream=True,
         )
         for chunk in stream:
             piece = _extract_text_from_stream_chunk(chunk)
             if not piece:
                 continue
             assistant_accum += piece
+            chat_history[-1][1] = assistant_accum
+            yield {"text": "", "files": []}, chat_history, caption, asset_id

     except Exception as e:
+        # Fallback without streaming
         try:
             resp = llm.chat.completions.create(
                 model="openai/gpt-oss-120b",
                 messages=[
                     {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": text or "Describe the attached image."}
                 ],
+                temperature=0.7,
                 top_p=1.0,
+                max_tokens=768,
                 stream=False,
             )
             final_text = ""

             final_text = str(resp)
         else:
             final_text = str(resp)
+            chat_history[-1][1] = final_text
+            yield {"text": "", "files": []}, chat_history, caption, asset_id
         except Exception as e2:
+            chat_history[-1][1] = f"[LLM error: {e2}]"
+            yield {"text": "", "files": []}, chat_history, caption, asset_id
+
+
# --------------------- Интерфейс ---------------------
|
304 |
+
messenger_css = """
|
305 |
+
:root {
|
306 |
+
--radius-xl: 16px;
|
307 |
+
--radius-lg: 14px;
|
308 |
+
}
|
309 |
+
.gradio-container { max-width: 800px !important; margin: auto; }
|
310 |
+
#title { text-align: center; padding: 8px 0 10px; font-size: 20px; }
|
311 |
+
#chat-wrap { border: 1px solid rgba(0,0,0,0.06); border-radius: var(--radius-xl); overflow: hidden; }
|
312 |
+
#chat { height: 560px; }
|
313 |
+
#bottom-bar { position: sticky; bottom: 0; background: var(--body-background-fill); border-top: 1px solid rgba(0,0,0,0.06); padding: 8px; display: flex; gap: 8px; align-items: center; }
|
314 |
+
#send { min-width: 44px; max-width: 44px; height: 44px; border-radius: 999px; }
|
315 |
+
#msg .mm-wrap { border: 1px solid rgba(0,0,0,0.08); border-radius: 999px; }
|
316 |
+
.gr-chatbot { border-radius: 0 !important; }
|
317 |
+
.gr-chatbot .wrap.svelte-1cl0v3x { padding: 12px !important; } /* мягкие отступы (селектор может отличаться по версии) */
|
318 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
|
320 |
+
theme = gr.themes.Soft(
|
321 |
+
primary_hue="cyan",
|
322 |
+
neutral_hue="slate",
|
323 |
+
).set(
|
324 |
+
body_text_color_subdued="#6b7280",
|
325 |
+
button_large_radius="999px",
|
326 |
+
button_small_radius="999px",
|
327 |
+
block_radius="16px",
|
328 |
+
)
|
329 |
|
330 |
+
with gr.Blocks(theme=theme, css=messenger_css, analytics_enabled=False) as demo:
|
331 |
+
gr.Markdown("✨ <div id='title'>Элегантный визуальный чат</div>")
|
332 |
|
333 |
+
caption_state = gr.State(value="")
|
334 |
+
asset_state = gr.State(value="")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
|
336 |
+
with gr.Group(elem_id="chat-wrap"):
|
337 |
+
chatbot = gr.Chatbot(
|
338 |
+
label="",
|
339 |
+
height=560,
|
340 |
+
elem_id="chat"
|
341 |
+
)
|
342 |
|
343 |
+
# Нижняя компактная строка ввода с маленькой кнопкой вложений внутри
|
344 |
+
with gr.Row(elem_id="bottom-bar"):
|
345 |
+
msg = gr.MultimodalTextbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
show_label=False,
|
347 |
+
placeholder="Напишите сообщение... (иконка слева — добавить изображение)",
|
348 |
+
elem_id="msg",
|
349 |
)
|
350 |
+
send = gr.Button("➤", variant="primary", elem_id="send")
|
351 |
|
352 |
+
# Отправка по Enter и по кнопке
|
353 |
+
msg.submit(
|
354 |
+
respond,
|
355 |
+
inputs=[msg, chatbot, caption_state, asset_state],
|
356 |
+
outputs=[msg, chatbot, caption_state, asset_state]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
)
|
358 |
+
send.click(
|
359 |
+
respond,
|
360 |
+
inputs=[msg, chatbot, caption_state, asset_state],
|
361 |
+
outputs=[msg, chatbot, caption_state, asset_state]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
362 |
)
|
363 |
|
364 |
+
# Очистка
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
def clear_all():
|
366 |
+
return {"text": "", "files": []}, [], "", ""
|
367 |
+
gr.Button("Очистить", variant="secondary").click(
|
|
|
368 |
clear_all,
|
369 |
inputs=None,
|
370 |
+
outputs=[msg, chatbot, caption_state, asset_state]
|
371 |
)
|
372 |
|
|
|
373 |
if __name__ == "__main__":
|
374 |
+
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), share=False)
|
|
|
|
|
|
|
|