# RynnEC / rynnec / model / processor.py
# Uploaded by lixin4ever: "Upload (#2)" (commit 372785b, verified)
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for VideoLLaMA3.
"""
from abc import ABCMeta, abstractmethod
import copy
import warnings
from collections import defaultdict
from typing import List, Union, Dict, Optional, Any
import json
import torch
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, VideoInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from rynnec.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX
from rynnec.mm_utils import load_video, load_images
from rynnec.model.videollama3_encoder.image_processing_videollama3 import is_valid_image, is_valid_video
class Videollama3ProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema for the processor; tokenizer padding defaults to off."""

    _defaults = {"text_kwargs": {"padding": False}}
class Videollama3BaseProcessor(ProcessorMixin, metaclass=ABCMeta):
    r"""
    Abstract base processor that bundles an image processor and a tokenizer.

    Modified from Qwen2VLProcessor. Concrete subclasses must implement `_process_text_with_label`, which is
    used when `return_labels=True`.

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        image_merge_size (`int`, *optional*, defaults to 1):
            Stored as `self.image_merge_size`; not read by the methods of this base class.
        video_merge_size (`int`, *optional*, defaults to 2):
            Stored as `self.video_merge_size`; not read by the methods of this base class.
        fps (*optional*, defaults to 1):
            Stored as `self.fps`; not read by the methods of this base class.
        max_frames (*optional*, defaults to 180):
            Stored as `self.max_frames`; not read by the methods of this base class.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "image_merge_size", "video_merge_size", "fps", "max_frames"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = None
    chat_template = None

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_merge_size: int = 1,
        video_merge_size: int = 2,
        fps=1,
        max_frames=180,
        **kwargs
    ):
        # An explicit template overrides the class-level default (None).
        if chat_template is not None:
            self.chat_template = chat_template
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.image_merge_size = image_merge_size
        self.video_merge_size = video_merge_size
        self.fps = fps
        self.max_frames = max_frames

        # Keep the tokenizer's template in sync with the processor's one.
        if self.chat_template is not None:
            self.tokenizer.chat_template = self.chat_template

        self.image_token = DEFAULT_IMAGE_TOKEN
        self.think_start_token = "<think>"
        self.think_end_token = "</think>"

        # The image token is registered as a special token, the think markers as regular tokens.
        self.tokenizer.add_tokens([self.image_token], special_tokens=True)
        self.tokenizer.add_tokens([self.think_start_token, self.think_end_token], special_tokens=False)

        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        self.think_start_token_id = self.tokenizer.convert_tokens_to_ids(self.think_start_token)
        self.think_end_token_id = self.tokenizer.convert_tokens_to_ids(self.think_end_token)
        self.newline_token_id = self.tokenizer.encode("\n")[0]

    def load_video(self, *args, **kwargs):
        """Forward all arguments to the module-level `load_video` helper."""
        return load_video(*args, **kwargs)

    def load_images(self, *args, **kwargs):
        """Forward all arguments to the module-level `load_images` helper."""
        return load_images(*args, **kwargs)

    def _get_downsampled_grid_sizes(self, image_inputs: Dict[str, Any]):
        """
        Compute one downsampled grid per visual unit from the image processor output.

        Each entry of `image_inputs["grid_sizes"]` is paired with the matching entry of
        `image_inputs["merge_sizes"]`. Element `[0]` of a grid is used as a repeat count and elements `[1:]`
        are divided by the merge size (presumably a `(t, h, w)` layout -- TODO confirm against the image
        processor).

        Returns:
            `List[torch.Tensor]`: `grid[1:] / merge_size`, repeated `grid[0]` times for every input grid.
            Empty when either key is missing.
        """
        grid_sizes = []
        for grid_size, merge_size in zip(image_inputs.get("grid_sizes", []), image_inputs.get("merge_sizes", [])):
            if not torch.all(grid_size[1:] % merge_size == 0):
                warnings.warn(f"Grid size {grid_size} is not divisible by merge size. Some undesired errors may occur.")
            downsampled = grid_size[1:] / merge_size
            num_repeats = int(grid_size[0])
            # A count below 1 contributes nothing, exactly like the `== 1` / `> 1` branches it replaces.
            if num_repeats >= 1:
                grid_sizes.extend([downsampled] * num_repeats)
        return grid_sizes

    def _get_visual_seq_len(self, grid_size: torch.Tensor):
        """Return the product of all entries of `grid_size`, truncated to a Python `int`."""
        num_tokens = int(grid_size.prod().item())
        return num_tokens

    @abstractmethod
    def _process_text_with_label(
        self,
        text: List[Dict],
        grid_sizes: Optional[List[torch.Tensor]] = None,
        **kwargs,
    ):
        """
        Tokenize a conversation together with training labels. Must be implemented by subclasses.

        Called by `process_text` with the same positional arguments as `_process_text_without_label`
        when `return_labels=True`.
        """
        return {}

    def _process_text_without_label(
        self,
        text: Union[List[str], List[Dict]],
        grid_sizes: Optional[List[torch.Tensor]] = None,
        **kwargs,
    ):
        """
        Expand image tokens in `text` and tokenize the result.

        Args:
            text: A prompt string, or a list of chat messages (dicts), which is first rendered with
                `apply_chat_template(..., add_generation_prompt=True)`.
            grid_sizes: Downsampled grids as returned by `_get_downsampled_grid_sizes`. When `None` or
                empty, `text` is tokenized unchanged.
            **kwargs: Forwarded to the tokenizer.

        Returns:
            The tokenizer output for the expanded text.

        Raises:
            ValueError: If `grid_sizes` is non-empty and its length differs from the number of image tokens
                found in `text`.
        """
        if isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], dict):
            warnings.warn("Input text is a list of messages. Automatically convert it to a string with 'apply_chat_template' with generation prompt.")
            text = self.apply_chat_template(text, tokenize=False, add_generation_prompt=True)

        if grid_sizes is not None and len(grid_sizes) > 0:
            mismatch_message = "Number of images does not match the number of image tokens in the text."
            image_idx = 0
            # Each image token is replaced by N temporary placeholders so that already expanded tokens are
            # not matched again by the `in` test; placeholders are turned back into image tokens afterwards.
            while self.image_token in text:
                if image_idx >= len(grid_sizes):
                    raise ValueError(mismatch_message)
                num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
                text = text.replace(self.image_token, "<placeholder>" * num_tokens, 1)
                image_idx += 1
            text = text.replace("<placeholder>", self.image_token)
            if image_idx != len(grid_sizes):
                raise ValueError(mismatch_message)

        text_inputs = self.tokenizer(text, **kwargs)
        return text_inputs

    def process_text(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[Dict]],
        image_inputs: Optional[Dict[str, torch.Tensor]] = None,
        return_labels: bool = False,
        **kwargs,
    ):
        """
        Tokenize `text`, expanding image tokens according to `image_inputs`.

        Args:
            text: Prompt string or list of chat messages.
            image_inputs: Output of `process_images`; `None` is treated like an empty dict.
            return_labels: When `True`, dispatch to `_process_text_with_label`, otherwise to
                `_process_text_without_label`.
            **kwargs: Tokenizer arguments. `padding` and `padding_side` are removed before dispatching.
        """
        kwargs.pop("padding", None)
        kwargs.pop("padding_side", None)

        grid_sizes = self._get_downsampled_grid_sizes({} if image_inputs is None else image_inputs)

        if return_labels:
            return self._process_text_with_label(text, grid_sizes, **kwargs)
        return self._process_text_without_label(text, grid_sizes, **kwargs)

    def process_images(
        self,
        images: ImageInput = None,
        merge_size: Optional[int] = 1,
        **kwargs,
    ):
        """
        Run the image processor on `images`.

        Returns:
            An empty dict when `images` is `None`, otherwise the result of
            `self.image_processor(images=images, merge_size=merge_size, **kwargs)`.
        """
        if images is None:
            return {}
        image_inputs = self.image_processor(images=images, merge_size=merge_size, **kwargs)
        return image_inputs

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[Dict]] = None,
        images: ImageInput = None,
        merge_size: Optional[int] = 1,
        return_labels: bool = False,
        **kwargs: Unpack[Videollama3ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards `images`
        and the image kwargs to `process_images` (which calls the image processor when `images` is not `None`), and
        forwards `text`, the resulting image inputs and the text kwargs to `process_text` (which calls the
        tokenizer).

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            merge_size (`int`, *optional*, defaults to 1):
                Forwarded to the image processor.
            return_labels (`bool`, *optional*, defaults to `False`):
                Whether to build the text inputs with `_process_text_with_label` instead of
                `_process_text_without_label`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **grid_sizes** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Videollama3ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # Padding options are removed so they never reach the tokenizer call.
        output_kwargs["text_kwargs"].pop("padding", None)
        output_kwargs["text_kwargs"].pop("padding_side", None)

        image_inputs = self.process_images(images, merge_size, **output_kwargs["images_kwargs"])
        text_inputs = self.process_text(text, image_inputs, return_labels, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def _load_multimodal_data(self, conversation: List[Dict[str, Any]]):
        """
        Return a copy of `conversation` in which image/video load specifications are replaced by loaded data.

        A content item is loaded when it is a dict whose `"type"` is `"image"` or `"video"` and whose
        value under that same key is a dict of loader arguments. Items that share the same type and the
        same loader arguments (ignoring `start_time` / `end_time`) are loaded once and each receives its
        own copy of the data. All other messages and content items are passed through unchanged.

        For videos, the file is loaded once over the union of all requested clip windows and each content
        then keeps only the frames whose timestamp lies in its own `[start_time, end_time]`; a missing or
        `None` bound means unbounded. Video contents additionally get `"num_frames"` and `"timestamps"`.

        Raises:
            ValueError: If a dict content item has no `"type"` field.
        """
        clip_keys = ("start_time", "end_time")
        multimodal_info = defaultdict(list)
        new_conversation = []

        for message in conversation:
            new_message = {"role": message["role"]}
            if not isinstance(message["content"], (list, tuple)):
                new_message["content"] = message["content"]
                new_conversation.append(new_message)
                continue

            new_contents = []
            for content in message["content"]:
                if not isinstance(content, dict):
                    new_contents.append(content)
                    continue
                if "type" not in content:
                    raise ValueError("Content must have 'type' field.")
                content_type = content["type"]
                if content_type in ("image", "video") and isinstance(content.get(content_type), dict):
                    # TODO: support other types which are not compatible with json
                    load_args = content[content_type]
                    shared_args = {k: v for k, v in load_args.items() if k not in clip_keys}
                    # The type is part of the key so an image and a video never share one loader call.
                    data_id = json.dumps([content_type, shared_args])
                    # Deep copy so the caller's conversation is not modified when data is attached below.
                    new_content = copy.deepcopy(content)
                    multimodal_info[data_id].append(new_content)
                    new_contents.append(new_content)
                else:
                    new_contents.append(content)

            new_message["content"] = new_contents
            new_conversation.append(new_message)

        for contents in multimodal_info.values():
            data_type = contents[0]["type"]
            if data_type == "image":
                image = self.load_images(contents[0][data_type]["image_path"])[0]
                for content in contents:
                    content["image"] = image.copy()
            elif data_type == "video":
                raw_starts = [content["video"].get("start_time") for content in contents]
                raw_ends = [content["video"].get("end_time") for content in contents]
                start_times = [0. if start is None else start for start in raw_starts]
                end_times = [float("inf") if end is None else end for end in raw_ends]

                # The loader window must cover every clip. The first content's own bounds are always
                # overwritten or removed; leaving them in place would truncate wider sibling clips.
                load_args = dict(contents[0][data_type])
                if any(start is None for start in raw_starts):
                    load_args.pop("start_time", None)
                else:
                    load_args["start_time"] = min(raw_starts)
                if any(end is None for end in raw_ends):
                    load_args.pop("end_time", None)
                else:
                    load_args["end_time"] = max(raw_ends)

                images, timestamps = self.load_video(**load_args)

                for content, clip_start, clip_end in zip(contents, start_times, end_times):
                    cur_images, cur_timestamps = [], []
                    for image, timestamp in zip(images, timestamps):
                        if clip_start <= timestamp <= clip_end:
                            cur_images.append(image.copy())
                            cur_timestamps.append(timestamp)
                    content[data_type] = cur_images
                    content["num_frames"] = len(cur_images)
                    content["timestamps"] = cur_timestamps

        return new_conversation

    def _gather_multimodal_data(self, conversation: List[Dict[str, Any]]):
        """
        Collect the visual payloads of a conversation in order of appearance.

        For every dict content item, the value under `"video"` (type `"video"`) or `"image"` (type
        `"image"`) is validated with `is_valid_video` / `is_valid_image` and appended to the result.

        Returns:
            The list of collected payloads, or `None` when there are none.

        Raises:
            ValueError: If a payload fails validation.
        """
        images = []
        for message in conversation:
            if not isinstance(message["content"], (list, tuple)):
                continue
            for content in message["content"]:
                if not isinstance(content, dict):
                    continue
                if content["type"] == "video":
                    video = content["video"]
                    if not is_valid_video(video):
                        raise ValueError(f"Invalid video data: {video}.")
                    images.append(video)
                elif content["type"] == "image":
                    image = content["image"]
                    if not is_valid_image(image):
                        raise ValueError(f"Invalid image data: {image}.")
                    images.append(image)
        images = images if len(images) > 0 else None
        return images

    def apply_chat_template(
        self,
        conversation: List[Dict[str, Any]],
        chat_template: Optional[str] = None,
        tokenize: bool = False,
        add_system_prompt: bool = False,
        add_generation_prompt: bool = False,
        add_think_prompt: bool = False,
        return_dict: bool = False,
        **kwargs,
    ) -> Union[str, Dict[str, Any]]:
        """
        Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
        conversations to turn them into a single tokenizable string.

        Args:
            conversation (`List[Dict, str, str]`):
                The conversation to format.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the processor's
                `chat_template` attribute is used.
            tokenize (`bool`, *optional*, defaults to `False`):
                Whether to tokenize the output or not.
            add_system_prompt (`bool`, *optional*, defaults to `False`):
                Whether to add the system prompt to the output or not.
            add_generation_prompt (`bool`, *optional*, defaults to `False`):
                Whether to add the generation prompt to the output or not.
            add_think_prompt (`bool`, *optional*, defaults to `False`):
                Forwarded to the tokenizer's `apply_chat_template` as `add_think_prompt`.
            return_dict (`bool`, *optional*, defaults to `False`):
                When `True`, image/video specifications in the conversation are loaded first and a dict
                `{"text": ..., "images": ...}` is returned instead of the prompt alone.
            **kwargs:
                Additional keyword arguments forwarded to the tokenizer's `apply_chat_template`.

        Returns:
            The output of the tokenizer's `apply_chat_template`, or a dict with keys `"text"` (that output)
            and `"images"` (gathered visual data or `None`) when `return_dict=True`.

        Raises:
            ValueError: If neither `chat_template` nor `self.chat_template` is set.
        """
        if chat_template is None:
            if self.chat_template is not None:
                chat_template = self.chat_template
            else:
                raise ValueError(
                    "No chat template is set for this processor. Please either set the `chat_template` attribute, "
                    "or provide a chat template as an argument. See "
                    "https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
                )

        images = None
        if return_dict:
            # Visual data is only loaded when the caller asked for it to be returned.
            conversation = self._load_multimodal_data(conversation)
            images = self._gather_multimodal_data(conversation)

        prompt = self.tokenizer.apply_chat_template(
            conversation,
            chat_template=chat_template,
            tokenize=tokenize,
            add_system_prompt=add_system_prompt,
            add_generation_prompt=add_generation_prompt,
            add_think_prompt=add_think_prompt,
            image_token=self.image_token,
            **kwargs
        )

        if return_dict:
            return {"text": prompt, "images": images}
        return prompt

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))