update
- app.py +10 -10
- eagle_vl/serve/chat_utils.py +79 -7
- requirements.txt +4 -1
app.py
CHANGED

@@ -98,7 +98,7 @@ def predict(
     history,
     top_p,
     temperature,
-
+    max_generate_length,
     max_context_length_tokens,
     video_nframes,
     chunk_size: int = 512,

@@ -113,7 +113,7 @@ def predict(
         top_p (float): The top-p value.
         temperature (float): The temperature value.
         repetition_penalty (float): The repetition penalty value.
-
+        max_generate_length (int): The max length tokens.
         max_context_length_tokens (int): The max context length tokens.
         chunk_size (int): The chunk size.
     """

@@ -171,7 +171,7 @@ def predict(
         model=model,
         processor=processor,
         stop_words=stop_words,
-        max_length=
+        max_length=max_generate_length,
         temperature=temperature,
         top_p=top_p,
         video_nframes=video_nframes,

@@ -196,7 +196,7 @@ def predict(
     print(
         f"temperature: {temperature}, "
         f"top_p: {top_p}, "
-        f"
+        f"max_generate_length: {max_generate_length}"
     )
 
     yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"

@@ -209,7 +209,7 @@ def retry(
     history,
     top_p,
     temperature,
-
+    max_generate_length,
     max_context_length_tokens,
     video_nframes,
     chunk_size: int = 512,

@@ -234,7 +234,7 @@ def retry(
         history,
         top_p,
         temperature,
-
+        max_generate_length,
         max_context_length_tokens,
         video_nframes,
         chunk_size,

@@ -286,11 +286,11 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
             temperature = gr.Slider(
                 minimum=0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature"
             )
-
-                minimum=512, maximum=
+            max_generate_length = gr.Slider(
+                minimum=512, maximum=8192, value=4096, step=64, interactive=True, label="Max Generate Length"
             )
             max_context_length_tokens = gr.Slider(
-                minimum=512, maximum=
+                minimum=512, maximum=65536, value=16384, step=64, interactive=True, label="Max Context Length Tokens"
             )
             video_nframes = gr.Slider(
                 minimum=1, maximum=128, value=16, step=1, interactive=True, label="Video Nframes"

@@ -310,7 +310,7 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
         history,
         top_p,
         temperature,
-
+        max_generate_length,
         max_context_length_tokens,
         video_nframes
     ]
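Taken together, the app.py changes thread one new value, max_generate_length, from a Gradio slider through predict() and retry() and into the generation call as max_length, so the UI can cap how many tokens a single reply may produce. A minimal sketch of that wiring pattern, assuming a placeholder respond() function instead of the Space's actual eagle_vl generation code (slider defaults other than Max Generate Length are illustrative):

import gradio as gr

def respond(prompt, temperature, top_p, max_generate_length):
    # Placeholder for the real model call; in app.py the slider value is forwarded
    # to the generator as max_length to bound the length of one reply.
    return f"(would generate up to {max_generate_length} tokens for: {prompt!r})"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    temperature = gr.Slider(minimum=0, maximum=1.0, value=0.8, step=0.1, label="Temperature")
    top_p = gr.Slider(minimum=0, maximum=1.0, value=0.9, step=0.05, label="Top P")
    max_generate_length = gr.Slider(
        minimum=512, maximum=8192, value=4096, step=64, label="Max Generate Length"
    )
    answer = gr.Textbox(label="Response")
    gr.Button("Send").click(
        respond,
        inputs=[prompt, temperature, top_p, max_generate_length],
        outputs=answer,
    )

demo.launch()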
eagle_vl/serve/chat_utils.py
CHANGED

@@ -17,6 +17,44 @@ import mimetypes
 IMAGE_TOKEN = "<image>"
 logger = logging.getLogger("gradio_logger")
 
+import cv2
+import base64
+import tempfile
+import os
+import imageio
+
+def compress_video_to_base64(video_path: str, max_frames=128, resolution=(960, 540)) -> str:
+    cap = cv2.VideoCapture(video_path)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    step = max(1, total_frames // max_frames)
+
+    frames = []
+    count = 0
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if count % step == 0:
+            frame_resized = cv2.resize(frame, resolution)
+            frames.append(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
+        count += 1
+    cap.release()
+
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+        tmp_path = tmp.name
+
+    writer = imageio.get_writer(tmp_path, fps=10, codec='libx264', quality=8)  # quality: 0(worst) - 10(best)
+    for f in frames:
+        writer.append_data(f)
+    writer.close()
+
+    with open(tmp_path, "rb") as f:
+        video_data = f.read()
+    os.remove(tmp_path)
+
+    return base64.b64encode(video_data).decode("utf-8")
+
+
 
 class SeparatorStyle(IntEnum):
     """Separator styles."""
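The new helper sub-samples the clip to roughly max_frames frames, down-scales each frame to resolution, re-encodes the result as an H.264 MP4 via imageio, and returns it base64-encoded for inline display. Because step uses floor division, slightly more frames than max_frames can survive; a worked example of the sampling arithmetic (the frame count is illustrative):

total_frames, max_frames = 1000, 128
step = max(1, total_frames // max_frames)   # 1000 // 128 == 7
kept = len(range(0, total_frames, step))    # 143 frames kept, a little above max_frames

Ceiling division, -(-total_frames // max_frames), would enforce the cap strictly if that ever matters.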

@@ -342,6 +380,40 @@ def convert_conversation_to_prompts(conversation: Conversation):
     return conv_prompts, last_image
 
 
+def to_gradio_chatbot2(conversation: Conversation) -> list:
+    """Convert the conversation to gradio chatbot format."""
+    ret = []
+    for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
+        if i % 2 == 0:
+            if type(msg) is tuple:
+                msg, images = copy.deepcopy(msg)
+
+                if isinstance(images, list):
+                    img_str = ""
+                    for j, image in enumerate(images):
+                        if isinstance(image, str):
+                            with open(image, "rb") as f:
+                                data = f.read()
+                            img_b64_str = base64.b64encode(data).decode()
+                            image_str = (
+                                f'<img src="data:image/png;base64,{img_b64_str}" '
+                                f'alt="user upload image" style="max-width: 300px; height: auto;" />'
+                            )
+                        else:
+                            image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)
+
+                        img_str += image_str
+                    msg = img_str + msg
+                else:
+                    pass
+
+            ret.append([msg, None])
+        else:
+            ret[-1][-1] = msg
+    return ret
+
+
+
 def to_gradio_chatbot(conversation: Conversation) -> list:
     """Convert the conversation to gradio chatbot format, supporting images and video."""
     ret = []

@@ -360,7 +432,7 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
 
             for j, item in enumerate(items):
                 # If string path, determine type
-                if isinstance(item, str):
+                if isinstance(item, str) and (not item.endswith((".mp4", ".mov", ".avi", ".webm"))):
                     mime, _ = mimetypes.guess_type(item)
                     with open(item, "rb") as f:
                         data = f.read()

@@ -372,15 +444,15 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
                             f'alt="user upload image_{j}" '
                             f'style="max-width:300px;height:auto;" />'
                         )
-                    elif mime and mime.startswith("video/"):
-                        media_str += (
-                            f'<video controls '
-                            f'style="max-width:300px;height:auto;" '
-                            f'src="data:{mime};base64,{b64}"></video>'
-                        )
                     else:
                         # Fallback to link
                         media_str += f'<a href="{item}" target="_blank">{item}</a>'
+                elif isinstance(item, str) and (item.endswith((".mp4", ".mov", ".avi", ".webm"))):
+                    b64 = compress_video_to_base64(item)
+                    media_str += (
+                        f'<video controls style="max-width:300px;height:auto;" '
+                        f'src="data:video/mp4;base64,{b64}"></video>'
+                    )
 
                 # If PIL image
                 else:
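With this change a video path in the conversation is no longer embedded verbatim: it is routed past the image branch, re-encoded by compress_video_to_base64, and inlined as a data: URI. A short usage sketch of the same pattern outside the chatbot code (the file name is hypothetical, and the import assumes the module is importable as eagle_vl.serve.chat_utils):

from eagle_vl.serve.chat_utils import compress_video_to_base64

b64 = compress_video_to_base64("sample_clip.mp4", max_frames=64, resolution=(640, 360))
html = (
    '<video controls style="max-width:300px;height:auto;" '
    f'src="data:video/mp4;base64,{b64}"></video>'
)
# base64 inflates the payload by about one third, so the down-scaling and frame
# sub-sampling inside the helper are what keep the chatbot HTML manageable.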
requirements.txt
CHANGED

@@ -22,4 +22,7 @@ SentencePiece
 
 # eagle
 peft
-decord
+decord
+opencv-python
+imageio
+imageio-ffmpeg
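opencv-python and imageio back the new frame extraction and re-encoding in chat_utils.py, and imageio-ffmpeg supplies the ffmpeg binary that imageio's libx264 writer calls into. A quick environment check, as a sketch assuming the packages installed cleanly:

import cv2
import imageio
import imageio_ffmpeg
import numpy as np

print(cv2.__version__)
print(imageio_ffmpeg.get_ffmpeg_exe())  # path to the bundled ffmpeg binary

# Write one black frame through the same writer settings chat_utils.py uses;
# this fails fast if the ffmpeg plugin or the libx264 codec is unavailable.
writer = imageio.get_writer("probe.mp4", fps=10, codec="libx264", quality=8)
writer.append_data(np.zeros((540, 960, 3), dtype=np.uint8))
writer.close()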