# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for VideoLLaMA3.
"""

import copy
import json
import warnings
from abc import ABCMeta, abstractmethod
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union

import torch
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, VideoInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

from rynnec.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX
from rynnec.mm_utils import load_images, load_video
from rynnec.model.videollama3_encoder.image_processing_videollama3 import is_valid_image, is_valid_video


class Videollama3ProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
    }


class Videollama3BaseProcessor(ProcessorMixin, metaclass=ABCMeta):
    r"""
    Modified from Qwen2VLProcessor.

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a
            tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "image_merge_size", "video_merge_size", "fps", "max_frames"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = None
    chat_template = None

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_merge_size: int = 1,
        video_merge_size: int = 2,
        fps: int = 1,
        max_frames: int = 180,
        **kwargs,
    ):
        if chat_template is not None:
            self.chat_template = chat_template
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.image_merge_size = image_merge_size
        self.video_merge_size = video_merge_size
        self.fps = fps
        self.max_frames = max_frames

        if self.chat_template is not None:
            self.tokenizer.chat_template = self.chat_template

        self.image_token = DEFAULT_IMAGE_TOKEN
        self.think_start_token = "<think>"
        self.think_end_token = "</think>"
        self.tokenizer.add_tokens([self.image_token], special_tokens=True)
        self.tokenizer.add_tokens([self.think_start_token, self.think_end_token], special_tokens=False)
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        self.think_start_token_id = self.tokenizer.convert_tokens_to_ids(self.think_start_token)
        self.think_end_token_id = self.tokenizer.convert_tokens_to_ids(self.think_end_token)
        self.newline_token_id = self.tokenizer.encode("\n")[0]

    def load_video(self, *args, **kwargs):
        return load_video(*args, **kwargs)

    def load_images(self, *args, **kwargs):
        return load_images(*args, **kwargs)

    def _get_downsampled_grid_sizes(self, image_inputs: Dict[str, Any]):
        """Convert raw patch grids to post-merge grids, one entry per frame."""
        grid_sizes = []
        for grid_size, merge_size in zip(image_inputs.get("grid_sizes", []), image_inputs.get("merge_sizes", [])):
            if not torch.all(grid_size[1:] % merge_size == 0):
                warnings.warn(f"Grid size {grid_size} is not divisible by merge size. Some undesired errors may occur.")
            if grid_size[0] == 1:
                grid_sizes.append(grid_size[1:] / merge_size)
            elif grid_size[0] > 1:
                # Videos store the temporal dimension in grid_size[0]; replicate the
                # downsampled spatial grid once per frame.
                grid_sizes.extend([grid_size[1:] / merge_size] * grid_size[0])
        return grid_sizes
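
    # Worked example of the downsampling above (hypothetical numbers): an image whose
    # patch grid is grid_size = (1, 28, 28) with merge_size = 2 yields one (14., 14.)
    # grid, i.e. 14 * 14 = 196 visual tokens; a 4-frame video with grid_size = (4, 28, 28)
    # yields four such grids, i.e. 4 * 196 = 784 tokens.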

    def _get_visual_seq_len(self, grid_size: torch.Tensor):
        num_tokens = int(grid_size.prod().item())
        return num_tokens

    @abstractmethod
    def _process_text_with_label(
        self,
        text: List[Dict],
        grid_sizes: Optional[List[torch.Tensor]] = None,
        **kwargs,
    ):
        raise NotImplementedError("Label-aware text processing must be implemented by subclasses.")

    def _process_text_without_label(
        self,
        text: Union[List[str], List[Dict]],
        grid_sizes: Optional[List[torch.Tensor]] = None,
        **kwargs,
    ):
        if isinstance(text, (list, tuple)) and isinstance(text[0], dict):
            warnings.warn(
                "Input text is a list of messages. Automatically converting it to a string "
                "with 'apply_chat_template' and a generation prompt."
            )
            text = self.apply_chat_template(text, tokenize=False, add_generation_prompt=True)

        if grid_sizes:
            image_idx = 0
            while self.image_token in text:
                thw = grid_sizes[image_idx]
                # Expand each image token into one placeholder per visual token, then swap
                # the placeholders back so repeated replacement does not recurse.
                text = text.replace(self.image_token, "<placeholder>" * int(thw.prod()), 1)
                image_idx += 1
            text = text.replace("<placeholder>", self.image_token)
            assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."

        text_inputs = self.tokenizer(text, **kwargs)
        return text_inputs
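
    # Example of the expansion above (hypothetical sizes): with a single image whose
    # downsampled grid is (14, 14), the prompt "Describe <image>." is tokenized as
    # "Describe " + "<image>" * 196 + ".".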

    def process_text(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[Dict]],
        image_inputs: Optional[Dict[str, torch.Tensor]] = None,
        return_labels: bool = False,
        **kwargs,
    ):
        """Tokenize text, expanding each image token to match its visual sequence length."""
        # Padding is handled by the caller (e.g. a collator), so drop any stray settings.
        kwargs.pop("padding", None)
        kwargs.pop("padding_side", None)
        grid_sizes = self._get_downsampled_grid_sizes(image_inputs or {})
        if return_labels:
            return self._process_text_with_label(text, grid_sizes, **kwargs)
        return self._process_text_without_label(text, grid_sizes, **kwargs)

    def process_images(
        self,
        images: ImageInput = None,
        merge_size: Optional[int] = 1,
        **kwargs,
    ):
        if images is None:
            return {}
        image_inputs = self.image_processor(images=images, merge_size=merge_size, **kwargs)
        return image_inputs

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], List[Dict]] = None,
        images: ImageInput = None,
        merge_size: Optional[int] = 1,
        return_labels: bool = False,
        **kwargs: Unpack[Videollama3ProcessorKwargs],
    ) -> BatchFeature:
""" | |
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` | |
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode | |
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to | |
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`. | |
Args: | |
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): | |
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch | |
tensor. Both channels-first and channels-last formats are supported. | |
text (`str`, `List[str]`, `List[List[str]]`): | |
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings | |
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set | |
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). | |
return_tensors (`str` or [`~utils.TensorType`], *optional*): | |
If set, will return tensors of a particular framework. Acceptable values are: | |
- `'tf'`: Return TensorFlow `tf.constant` objects. | |
- `'pt'`: Return PyTorch `torch.Tensor` objects. | |
- `'np'`: Return NumPy `np.ndarray` objects. | |
- `'jax'`: Return JAX `jnp.ndarray` objects. | |
Returns: | |
[`BatchFeature`]: A [`BatchFeature`] with the following fields: | |
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. | |
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when | |
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not | |
`None`). | |
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. | |
- **grid_sizes** -- List of image 3D grid in LLM. Returned when `images` is not `None`. | |
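
        Example (a minimal, illustrative sketch; the checkpoint path and image are placeholders,
        not part of this repo):

        ```python
        from PIL import Image
        from transformers import AutoProcessor

        # Hypothetical checkpoint path; substitute a real RynnEC / VideoLLaMA3 checkpoint.
        processor = AutoProcessor.from_pretrained("path/to/checkpoint", trust_remote_code=True)

        image = Image.new("RGB", (336, 336))  # stand-in for a real input image
        inputs = processor(text="Describe <image>.", images=[image], return_tensors="pt")
        # Expected keys: input_ids, attention_mask, pixel_values, grid_sizes.
        ```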
""" | |
        output_kwargs = self._merge_kwargs(
            Videollama3ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        output_kwargs["text_kwargs"].pop("padding", None)
        output_kwargs["text_kwargs"].pop("padding_side", None)

        image_inputs = self.process_images(images, merge_size, **output_kwargs["images_kwargs"])
        text_inputs = self.process_text(text, image_inputs, return_labels, **output_kwargs["text_kwargs"])
        return BatchFeature(data={**text_inputs, **image_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def _load_multimodal_data(self, conversation: List[Dict[str, Any]]):
        """Load image/video data referenced by the conversation, loading each distinct source only once."""
        multimodal_info = defaultdict(list)
        new_conversation = []
        for message in conversation:
            new_message = {"role": message["role"]}
            if not isinstance(message["content"], (list, tuple)):
                new_message["content"] = message["content"]
                new_conversation.append(new_message)
                continue

            new_contents = []
            for content in message["content"]:
                if not isinstance(content, dict):
                    new_contents.append(content)
                    continue
                assert "type" in content, "Content must have 'type' field."
                if content["type"] in ["image", "video"] and content["type"] in content and isinstance(content[content["type"]], dict):
                    # TODO: support other types which are not compatible with json
                    load_args = content[content["type"]]
                    # Contents that differ only in start/end time share the same source.
                    data_id = json.dumps({k: v for k, v in load_args.items() if k not in ["start_time", "end_time"]})
                    new_content = copy.deepcopy(content)
                    multimodal_info[data_id].append(new_content)
                    new_contents.append(new_content)
                else:
                    new_contents.append(content)
            new_message["content"] = new_contents
            new_conversation.append(new_message)

        for data_id, contents in multimodal_info.items():
            data_type = contents[0]["type"]
            if data_type == "image":
                image = self.load_images(contents[0][data_type]["image_path"])[0]
                for content in contents:
                    content["image"] = image.copy()

            elif data_type == "video":
                # TODO: start_time is None?
                start_times = [content["video"].get("start_time", 0.) for content in contents]
                end_times = [content["video"].get("end_time", float("inf")) for content in contents]

                # Load the union of all requested spans once, then slice per request.
                # Drop per-request bounds inherited from the first content so stale
                # values cannot narrow the union span.
                load_args = dict(contents[0][data_type])
                load_args.pop("start_time", None)
                load_args.pop("end_time", None)
                start_time, end_time = min(start_times), max(end_times)
                if start_time > 0:
                    load_args["start_time"] = start_time
                if end_time < float("inf"):
                    load_args["end_time"] = end_time
                images, timestamps = self.load_video(**load_args)

                for content, start_time, end_time in zip(contents, start_times, end_times):
                    cur_images, cur_timestamps = [], []
                    for image, timestamp in zip(images, timestamps):
                        if start_time <= timestamp <= end_time:
                            cur_images.append(image.copy())
                            cur_timestamps.append(timestamp)
                    content[data_type] = cur_images
                    content["num_frames"] = len(cur_images)
                    content["timestamps"] = cur_timestamps

        return new_conversation
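
    # Illustrative conversation consumed by the loader above (the file path and the
    # "video_path" key are placeholders; the real keys are whatever `load_video` accepts
    # in this repo):
    #
    #   [{"role": "user", "content": [
    #       {"type": "video", "video": {"video_path": "clip.mp4", "start_time": 0.0, "end_time": 8.0}},
    #       {"type": "text", "text": "What happens in this clip?"},
    #   ]}]
    #
    # After loading, the "video" entry holds the decoded frames, with "num_frames" and
    # "timestamps" attached for _gather_multimodal_data below.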

    def _gather_multimodal_data(self, conversation: List[Dict[str, Any]]):
        """Collect loaded images and videos from a conversation in order of appearance."""
        images = []
        for message in conversation:
            if not isinstance(message["content"], (list, tuple)):
                continue
            for content in message["content"]:
                if not isinstance(content, dict):
                    continue
                if content["type"] == "video":
                    video = content["video"]
                    assert is_valid_video(video), f"Invalid video data: {video}."
                    images.append(video)
                if content["type"] == "image":
                    image = content["image"]
                    assert is_valid_image(image), f"Invalid image data: {image}."
                    images.append(image)
        images = images if len(images) > 0 else None
        return images

    def apply_chat_template(
        self,
        conversation: List[Dict[str, Any]],
        chat_template: Optional[str] = None,
        tokenize: bool = False,
        add_system_prompt: bool = False,
        add_generation_prompt: bool = False,
        add_think_prompt: bool = False,
        return_dict: bool = False,
        **kwargs,
    ) -> Union[str, Dict[str, Any]]:
""" | |
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input | |
conversations to turn them into a single tokenizable string. | |
Args: | |
conversation (`List[Dict, str, str]`): | |
The conversation to format. | |
chat_template (`Optional[str]`, *optional*): | |
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's | |
chat template is used. | |
tokenize (`bool`, *optional*, defaults to `False`): | |
Whether to tokenize the output or not. | |
add_system_prompt (`bool`, *optional*, defaults to `False`): | |
Whether to add the system prompt to the output or not. | |
add_generation_prompt (`bool`, *optional*, defaults to `False`): | |
Whether to add the generation prompt to the output or not. | |
image_token (`Optional[str]`, *optional*, defaults to `<image>`): | |
The token to use for indicating images in the conversation. | |
**kwargs: | |
Additional keyword arguments | |
""" | |
        if chat_template is None:
            if self.chat_template is not None:
                chat_template = self.chat_template
            else:
                raise ValueError(
                    "No chat template is set for this processor. Please either set the `chat_template` attribute, "
                    "or provide a chat template as an argument. See "
                    "https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
                )

        images = None
        if return_dict:
            conversation = self._load_multimodal_data(conversation)
            images = self._gather_multimodal_data(conversation)

        prompt = self.tokenizer.apply_chat_template(
            conversation,
            chat_template=chat_template,
            tokenize=tokenize,
            add_system_prompt=add_system_prompt,
            add_generation_prompt=add_generation_prompt,
            add_think_prompt=add_think_prompt,
            image_token=self.image_token,
            **kwargs,
        )

        if return_dict:
            return {"text": prompt, "images": images}
        return prompt

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
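

# Minimal end-to-end usage sketch (assumptions: the checkpoint path is a placeholder, and
# `AutoProcessor` resolves a concrete subclass of this processor via `trust_remote_code=True`):
#
#   from transformers import AutoProcessor
#
#   processor = AutoProcessor.from_pretrained("path/to/rynnec-checkpoint", trust_remote_code=True)
#   conversation = [{"role": "user", "content": [
#       {"type": "image", "image": {"image_path": "demo.jpg"}},
#       {"type": "text", "text": "What is in this image?"},
#   ]}]
#   out = processor.apply_chat_template(conversation, add_generation_prompt=True, return_dict=True)
#   inputs = processor(text=out["text"], images=out["images"], return_tensors="pt")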