ankandrew committed
Commit ac35ded · verified · 1 Parent(s): f389324

Upload 2 files
qwen_vl_utils/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .vision_process import (
+     extract_vision_info,
+     fetch_image,
+     fetch_video,
+     process_vision_info,
+     smart_resize,
+ )
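
This `__init__.py` simply re-exports the public helpers from `vision_process`. A minimal usage sketch of the re-exported API, assuming the package is importable as `qwen_vl_utils` (the message content below is an illustrative placeholder, not part of the commit):

```python
# Minimal sketch; the image path is a hypothetical placeholder.
from qwen_vl_utils import process_vision_info

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/demo.jpg"},  # placeholder path
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Returns (list of PIL images or None, list of video tensors or None).
image_inputs, video_inputs = process_vision_info(messages)
```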
qwen_vl_utils/vision_process.py ADDED
@@ -0,0 +1,494 @@
+ # modified from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
+ from __future__ import annotations
+
+ import base64
+ import copy
+ import logging
+ import math
+ import os
+ import sys
+ import time
+ import warnings
+ from functools import lru_cache
+ from io import BytesIO
+ from typing import Optional
+
+ import requests
+ import torch
+ import torchvision
+ from packaging import version
+ from PIL import Image
+ from torchvision import io, transforms
+ from torchvision.transforms import InterpolationMode
+
+
+ logger = logging.getLogger(__name__)
+
+ IMAGE_FACTOR = 28
+ MIN_PIXELS = 4 * 28 * 28
+ MAX_PIXELS = 16384 * 28 * 28
+ MAX_RATIO = 200
+
+ VIDEO_MIN_PIXELS = 128 * 28 * 28
+ VIDEO_MAX_PIXELS = 768 * 28 * 28
+ FRAME_FACTOR = 2
+ FPS = 2.0
+ FPS_MIN_FRAMES = 4
+ FPS_MAX_FRAMES = 768
+
+ # Set the maximum number of video token inputs.
+ # Here, 128K represents the maximum number of input tokens for the VLLM model.
+ # Remember to adjust it according to your own configuration.
+ VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))
+ logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
+
+
+ def round_by_factor(number: int, factor: int) -> int:
+     """Returns the closest integer to 'number' that is divisible by 'factor'."""
+     return round(number / factor) * factor
+
+
+ def ceil_by_factor(number: int, factor: int) -> int:
+     """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+     return math.ceil(number / factor) * factor
+
+
+ def floor_by_factor(number: int, factor: int) -> int:
+     """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+     return math.floor(number / factor) * factor
+
+
+ def smart_resize(
+     height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+ ) -> tuple[int, int]:
+     """
+     Rescales the image so that the following conditions are met:
+
+     1. Both dimensions (height and width) are divisible by 'factor'.
+
+     2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+     3. The aspect ratio of the image is maintained as closely as possible.
+     """
+     if max(height, width) / min(height, width) > MAX_RATIO:
+         raise ValueError(
+             f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+         )
+     h_bar = max(factor, round_by_factor(height, factor))
+     w_bar = max(factor, round_by_factor(width, factor))
+     if h_bar * w_bar > max_pixels:
+         beta = math.sqrt((height * width) / max_pixels)
+         h_bar = max(factor, floor_by_factor(height / beta, factor))
+         w_bar = max(factor, floor_by_factor(width / beta, factor))
+     elif h_bar * w_bar < min_pixels:
+         beta = math.sqrt(min_pixels / (height * width))
+         h_bar = ceil_by_factor(height * beta, factor)
+         w_bar = ceil_by_factor(width * beta, factor)
+     return h_bar, w_bar
+
+
+ def to_rgb(pil_image: Image.Image) -> Image.Image:
+     if pil_image.mode == 'RGBA':
+         white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
+         white_background.paste(pil_image, mask=pil_image.split()[3])  # Use alpha channel as mask
+         return white_background
+     else:
+         return pil_image.convert("RGB")
+
+
+ def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
+     if "image" in ele:
+         image = ele["image"]
+     else:
+         image = ele["image_url"]
+     image_obj = None
+     if isinstance(image, Image.Image):
+         image_obj = image
+     elif image.startswith("http://") or image.startswith("https://"):
+         # fix memory leak issue while using BytesIO
+         with requests.get(image, stream=True) as response:
+             response.raise_for_status()
+             with BytesIO(response.content) as bio:
+                 image_obj = copy.deepcopy(Image.open(bio))
+     elif image.startswith("file://"):
+         image_obj = Image.open(image[7:])
+     elif image.startswith("data:image"):
+         if "base64," in image:
+             _, base64_data = image.split("base64,", 1)
+             data = base64.b64decode(base64_data)
+             # fix memory leak issue while using BytesIO
+             with BytesIO(data) as bio:
+                 image_obj = copy.deepcopy(Image.open(bio))
+     else:
+         image_obj = Image.open(image)
+     if image_obj is None:
+         raise ValueError(f"Unrecognized image input: supported inputs are a local path, http url, base64 data or PIL.Image, got {image}")
+     image = to_rgb(image_obj)
+     ## resize
+     if "resized_height" in ele and "resized_width" in ele:
+         resized_height, resized_width = smart_resize(
+             ele["resized_height"],
+             ele["resized_width"],
+             factor=size_factor,
+         )
+     else:
+         width, height = image.size
+         min_pixels = ele.get("min_pixels", MIN_PIXELS)
+         max_pixels = ele.get("max_pixels", MAX_PIXELS)
+         resized_height, resized_width = smart_resize(
+             height,
+             width,
+             factor=size_factor,
+             min_pixels=min_pixels,
+             max_pixels=max_pixels,
+         )
+     image = image.resize((resized_width, resized_height))
+
+     return image
+
+
+ def smart_nframes(
+     ele: dict,
+     total_frames: int,
+     video_fps: int | float,
+ ) -> int:
+     """Calculate the number of video frames used for model inputs.
+
+     Args:
+         ele (dict): a dict containing the video configuration.
+             Supports either `fps` or `nframes`:
+                 - nframes: the number of frames to extract for model inputs.
+                 - fps: the fps used to extract frames for model inputs.
+                     - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                     - max_frames: the maximum number of frames of the video, only used when fps is provided.
+         total_frames (int): the original total number of frames of the video.
+         video_fps (int | float): the original fps of the video.
+
+     Raises:
+         ValueError: nframes should be in the interval [FRAME_FACTOR, total_frames].
+
+     Returns:
+         int: the number of video frames used for model inputs.
+     """
+     assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
+     if "nframes" in ele:
+         nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+     else:
+         fps = ele.get("fps", FPS)
+         min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+         max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
+         nframes = total_frames / video_fps * fps
+         if nframes > total_frames:
+             logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
+         nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+         nframes = floor_by_factor(nframes, FRAME_FACTOR)
+     if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+         raise ValueError(f"nframes should be in the interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
+     return nframes
+
+
+ def _read_video_torchvision(
+     ele: dict,
+ ) -> tuple[torch.Tensor, float]:
+     """Read video using torchvision.io.read_video.
+
+     Args:
+         ele (dict): a dict containing the video configuration.
+             Supported keys:
+                 - video: the path of the video. Supports "file://", "http://", "https://" and local paths.
+                 - video_start: the start time of the video.
+                 - video_end: the end time of the video.
+     Returns:
+         tuple: the video tensor with shape (T, C, H, W) and the sampled fps.
+     """
+     video_path = ele["video"]
+     if version.parse(torchvision.__version__) < version.parse("0.19.0"):
+         if "http://" in video_path or "https://" in video_path:
+             warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
+         if "file://" in video_path:
+             video_path = video_path[7:]
+     st = time.time()
+     video, audio, info = io.read_video(
+         video_path,
+         start_pts=ele.get("video_start", 0.0),
+         end_pts=ele.get("video_end", None),
+         pts_unit="sec",
+         output_format="TCHW",
+     )
+     total_frames, video_fps = video.size(0), info["video_fps"]
+     logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+     nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+     idx = torch.linspace(0, total_frames - 1, nframes).round().long()
+     sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+     video = video[idx]
+     return video, sample_fps
+
+
+ def is_decord_available() -> bool:
+     import importlib.util
+
+     return importlib.util.find_spec("decord") is not None
+
+
+ def calculate_video_frame_range(
+     ele: dict,
+     total_frames: int,
+     video_fps: float,
+ ) -> tuple[int, int, int]:
+     """
+     Calculate the start and end frame indices based on the given time range.
+
+     Args:
+         ele (dict): A dictionary containing optional 'video_start' and 'video_end' keys (in seconds).
+         total_frames (int): Total number of frames in the video.
+         video_fps (float): Frames per second of the video.
+
+     Returns:
+         tuple: A tuple containing (start_frame, end_frame, frame_count).
+
+     Raises:
+         ValueError: If input parameters are invalid or the time range is inconsistent.
+     """
+     # Validate essential parameters
+     if video_fps <= 0:
+         raise ValueError("video_fps must be a positive number")
+     if total_frames <= 0:
+         raise ValueError("total_frames must be a positive integer")
+
+     # Get start and end time in seconds
+     video_start = ele.get("video_start", None)
+     video_end = ele.get("video_end", None)
+     if video_start is None and video_end is None:
+         return 0, total_frames - 1, total_frames
+
+     max_duration = total_frames / video_fps
+     # Process start frame
+     if video_start is not None:
+         video_start_clamped = max(0.0, min(video_start, max_duration))
+         start_frame = math.ceil(video_start_clamped * video_fps)
+     else:
+         start_frame = 0
+     # Process end frame
+     if video_end is not None:
+         video_end_clamped = max(0.0, min(video_end, max_duration))
+         end_frame = math.floor(video_end_clamped * video_fps)
+         end_frame = min(end_frame, total_frames - 1)
+     else:
+         end_frame = total_frames - 1
+
+     # Validate frame order
+     if start_frame >= end_frame:
+         raise ValueError(
+             f"Invalid time range: Start frame {start_frame} (at {video_start_clamped if video_start is not None else 0}s) "
+             f"exceeds end frame {end_frame} (at {video_end_clamped if video_end is not None else max_duration}s). "
+             f"Video duration: {max_duration:.2f}s ({total_frames} frames @ {video_fps}fps)"
+         )
+
+     logger.info(f"calculate video frame range: {start_frame=}, {end_frame=}, {total_frames=} from {video_start=}, {video_end=}, {video_fps=:.3f}")
+     return start_frame, end_frame, end_frame - start_frame + 1
+
+
+ def _read_video_decord(
+     ele: dict,
+ ) -> tuple[torch.Tensor, float]:
+     """Read video using decord.VideoReader.
+
+     Args:
+         ele (dict): a dict containing the video configuration.
+             Supported keys:
+                 - video: the path of the video. Supports "file://", "http://", "https://" and local paths.
+                 - video_start: the start time of the video.
+                 - video_end: the end time of the video.
+     Returns:
+         tuple: the video tensor with shape (T, C, H, W) and the sampled fps.
+     """
+     import decord
+     video_path = ele["video"]
+     st = time.time()
+     vr = decord.VideoReader(video_path)
+     total_frames, video_fps = len(vr), vr.get_avg_fps()
+     start_frame, end_frame, total_frames = calculate_video_frame_range(
+         ele,
+         total_frames,
+         video_fps,
+     )
+     nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+     idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()
+     video = vr.get_batch(idx).asnumpy()
+     video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+     logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+     sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+     return video, sample_fps
+
+
+ def is_torchcodec_available() -> bool:
+     """Check if torchcodec is available and properly installed."""
+     try:
+         import importlib.util
+         if importlib.util.find_spec("torchcodec") is None:
+             return False
+         from torchcodec.decoders import VideoDecoder  # noqa: F401 -- verify the module actually imports
+         return True
+     except Exception:
+         return False
+
+
+ def _read_video_torchcodec(
+     ele: dict,
+ ) -> tuple[torch.Tensor, float]:
+     """Read video using torchcodec.decoders.VideoDecoder.
+
+     Args:
+         ele (dict): a dict containing the video configuration.
+             Supported keys:
+                 - video: the path of the video. Supports "file://", "http://", "https://" and local paths.
+                 - video_start: the start time of the video.
+                 - video_end: the end time of the video.
+     Returns:
+         tuple: the video tensor with shape (T, C, H, W) and the sampled fps.
+     """
+     from torchcodec.decoders import VideoDecoder
+     TORCHCODEC_NUM_THREADS = int(os.environ.get('TORCHCODEC_NUM_THREADS', 8))
+     logger.info(f"set TORCHCODEC_NUM_THREADS: {TORCHCODEC_NUM_THREADS}")
+     video_path = ele["video"]
+     st = time.time()
+     decoder = VideoDecoder(video_path, num_ffmpeg_threads=TORCHCODEC_NUM_THREADS)
+     video_fps = decoder.metadata.average_fps
+     total_frames = decoder.metadata.num_frames
+     start_frame, end_frame, total_frames = calculate_video_frame_range(
+         ele,
+         total_frames,
+         video_fps,
+     )
+     nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+     idx = torch.linspace(start_frame, end_frame, nframes).round().long().tolist()
+     sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+     video = decoder.get_frames_at(indices=idx).data
+     logger.info(f"torchcodec: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+     return video, sample_fps
+
+
+ VIDEO_READER_BACKENDS = {
+     "decord": _read_video_decord,
+     "torchvision": _read_video_torchvision,
+     "torchcodec": _read_video_torchcodec,
+ }
+
+ FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)
+
+
+ @lru_cache(maxsize=1)
+ def get_video_reader_backend() -> str:
+     if FORCE_QWENVL_VIDEO_READER is not None:
+         video_reader_backend = FORCE_QWENVL_VIDEO_READER
+     elif is_torchcodec_available():
+         video_reader_backend = "torchcodec"
+     elif is_decord_available():
+         video_reader_backend = "decord"
+     else:
+         video_reader_backend = "torchvision"
+     print(f"qwen-vl-utils using {video_reader_backend} to read video.", file=sys.stderr)
+     return video_reader_backend
+
+
+ def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
+     if isinstance(ele["video"], str):
+         video_reader_backend = get_video_reader_backend()
+         try:
+             video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele)
+         except Exception as e:
+             logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
+             video, sample_fps = VIDEO_READER_BACKENDS["torchvision"](ele)
+
+         nframes, _, height, width = video.shape
+         min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+         total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+         max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
+         max_pixels_supposed = ele.get("max_pixels", max_pixels)
+         if max_pixels_supposed > max_pixels:
+             logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
+         max_pixels = min(max_pixels_supposed, max_pixels)
+         if "resized_height" in ele and "resized_width" in ele:
+             resized_height, resized_width = smart_resize(
+                 ele["resized_height"],
+                 ele["resized_width"],
+                 factor=image_factor,
+             )
+         else:
+             resized_height, resized_width = smart_resize(
+                 height,
+                 width,
+                 factor=image_factor,
+                 min_pixels=min_pixels,
+                 max_pixels=max_pixels,
+             )
+         video = transforms.functional.resize(
+             video,
+             [resized_height, resized_width],
+             interpolation=InterpolationMode.BICUBIC,
+             antialias=True,
+         ).float()
+         if return_video_sample_fps:
+             return video, sample_fps
+         return video
+     else:
+         assert isinstance(ele["video"], (list, tuple))
+         process_info = ele.copy()
+         process_info.pop("type", None)
+         process_info.pop("video", None)
+         images = [
+             fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
+             for video_element in ele["video"]
+         ]
+         nframes = ceil_by_factor(len(images), FRAME_FACTOR)
+         if len(images) < nframes:
+             images.extend([images[-1]] * (nframes - len(images)))
+         if return_video_sample_fps:
+             return images, process_info.pop("fps", 2.0)
+         return images
+
+
+ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
+     vision_infos = []
+     if isinstance(conversations[0], dict):
+         conversations = [conversations]
+     for conversation in conversations:
+         for message in conversation:
+             if isinstance(message["content"], list):
+                 for ele in message["content"]:
+                     if (
+                         "image" in ele
+                         or "image_url" in ele
+                         or "video" in ele
+                         or ele.get("type", "") in ("image", "image_url", "video")
+                     ):
+                         vision_infos.append(ele)
+     return vision_infos
+
+
+ def process_vision_info(
+     conversations: list[dict] | list[list[dict]],
+     return_video_kwargs: bool = False,
+ ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
+
+     vision_infos = extract_vision_info(conversations)
+     ## Read images or videos
+     image_inputs = []
+     video_inputs = []
+     video_sample_fps_list = []
+     for vision_info in vision_infos:
+         if "image" in vision_info or "image_url" in vision_info:
+             image_inputs.append(fetch_image(vision_info))
+         elif "video" in vision_info:
+             video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True)
+             video_sample_fps_list.append(video_sample_fps)
+             video_inputs.append(video_input)
+         else:
+             raise ValueError("image, image_url or video should be in content.")
+     if len(image_inputs) == 0:
+         image_inputs = None
+     if len(video_inputs) == 0:
+         video_inputs = None
+     if return_video_kwargs:
+         return image_inputs, video_inputs, {'fps': video_sample_fps_list}
+     return image_inputs, video_inputs
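
To make the resizing rules concrete, here is a hand-checked sketch of how `smart_resize` behaves with the defaults defined above (both results follow directly from the rounding rules; treat them as illustrations, not tests shipped with the commit):

```python
from qwen_vl_utils import smart_resize

# Within [MIN_PIXELS, MAX_PIXELS], each side is just rounded to the nearest
# multiple of IMAGE_FACTOR = 28:
#   round(1080 / 28) * 28 = 39 * 28 = 1092
#   round(1920 / 28) * 28 = 69 * 28 = 1932
assert smart_resize(1080, 1920) == (1092, 1932)

# Over budget, the image is shrunk by beta = sqrt(height * width / max_pixels)
# and each side is floored to a multiple of 28:
#   beta = sqrt(1_000_000 / (512 * 28 * 28)) ≈ 1.578
#   floor((1000 / 1.578) / 28) * 28 = 22 * 28 = 616
assert smart_resize(1000, 1000, max_pixels=512 * 28 * 28) == (616, 616)
```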