Spaces:

Gatsby767
/

AbrahamicSolver

Running

App Files Files Community

Gatsby767 committed on Aug 14

Commit

c040236

verified ·

1 Parent(s): da64666

Upload 10 files

Browse files

Files changed (10) hide show

__init__.cpython-310.pyc +0 -0
__init__.cpython-311.pyc +0 -0
__init__.py +13 -0
flash_attention_utils.cpython-310.pyc +0 -0
flash_attention_utils.py +191 -0
monkey_patch.cpython-310.pyc +0 -0
monkey_patch.py +32 -0
protocol.py +689 -0
qwen2_vl.cpython-310.pyc +0 -0
qwen2_vl.py +189 -0

__init__.cpython-310.pyc ADDED Viewed

Binary file (171 Bytes). View file

__init__.cpython-311.pyc ADDED Viewed

Binary file (187 Bytes). View file

__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

flash_attention_utils.cpython-310.pyc ADDED Viewed

Binary file (4.29 kB). View file

flash_attention_utils.py ADDED Viewed

	@@ -0,0 +1,191 @@

+# Copyright 2024 The Fairseq Authors and the HuggingFace Inc. team
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Based on https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/modeling_flash_attention_utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import os
+from typing import Optional, Tuple
+import torch
+import torch.distributed as dist
+from transformers.modeling_flash_attention_utils import _flash_attention_forward, fa_peft_integration_check
+from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10
+from ...utils.ulysses import (
+    gather_heads_scatter_seq,
+    gather_seq_scatter_heads,
+    get_ulysses_sequence_parallel_group,
+    get_ulysses_sequence_parallel_world_size,
+)
+# Probe the installed flash-attn build once at import time. Every name bound inside this branch
+# exists only when is_flash_attn_2_available() is True; the functions below read them without a fallback.
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    # Feature flags derived from the keyword parameters that flash_attn_func accepts.
+    _flash_supports_window_size = "window_size" in inspect.signature(flash_attn_func).parameters
+    _flash_supports_deterministic = "deterministic" in inspect.signature(flash_attn_func).parameters
+    # Default for the `deterministic` kwarg: on only when FLASH_ATTENTION_DETERMINISTIC is exactly "1".
+    _flash_deterministic_enabled = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
+    # True when is_flash_attn_greater_or_equal_2_10() reports False.
+    _flash_use_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+def prepare_fa2_from_position_ids(
+    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, position_ids: torch.Tensor
+):
+    query = query.view(-1, query.size(-2), query.size(-1))
+    key = key.contiguous().view(-1, key.size(-2), key.size(-1))
+    value = value.contiguous().view(-1, value.size(-2), value.size(-1))
+    position_ids = position_ids.flatten()
+    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
+    cu_seqlens = torch.cat(
+        (
+            indices_q[position_ids == 0],
+            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
+        )
+    )
+    max_length = cu_seqlens.diff().max()  # use cu_seqlens to infer max_length for qwen2vl mrope
+    return (query, key, value, indices_q, (cu_seqlens, cu_seqlens), (max_length, max_length))
+def _custom_flash_attention_forward(
+    query_states: torch.Tensor,
+    key_states: torch.Tensor,
+    value_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    query_length: int,
+    is_causal: bool = True,
+    position_ids: Optional[torch.Tensor] = None,
+    sliding_window: Optional[int] = None,
+    use_top_left_mask: bool = False,
+    deterministic: Optional[bool] = None,
+    **kwargs,
+):
+    """
+    Patches flash attention forward to handle 3D position ids in mrope. (3, batch_size, seq_length)
+    """
+    if not use_top_left_mask:
+        causal = is_causal
+    else:
+        causal = is_causal and query_length != 1
+    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
+    use_sliding_windows = (
+        _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
+    )
+    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}
+    if _flash_supports_deterministic:
+        flash_kwargs["deterministic"] = deterministic if deterministic is not None else _flash_deterministic_enabled
+    if kwargs.get("softcap") is not None:
+        flash_kwargs["softcap"] = kwargs.pop("softcap")
+    query_states, key_states, value_states = fa_peft_integration_check(
+        query_states, key_states, value_states, target_dtype=torch.bfloat16
+    )
+    sp_size = get_ulysses_sequence_parallel_world_size()
+    if sp_size > 1:
+        # (batch_size, seq_length, num_head, head_size)
+        query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2)
+        key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2)
+        value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2)
+        position_ids_lst = [torch.empty_like(position_ids) for _ in range(sp_size)]
+        position_ids = dist.all_gather(position_ids_lst, position_ids, group=get_ulysses_sequence_parallel_group())
+        position_ids = torch.cat(position_ids_lst, dim=-1)  # (..., batch_size, seq_length)
+    if position_ids is not None and position_ids.dim() == 3:  # qwen2vl mrope
+        position_ids = position_ids[0]
+    if position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
+        batch_size = query_states.size(0)
+        query_states, key_states, value_states, _, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
+            query_states, key_states, value_states, position_ids
+        )
+        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+        attn_output = flash_attn_varlen_func(
+            query_states,
+            key_states,
+            value_states,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_in_batch_q,
+            max_seqlen_k=max_seqlen_in_batch_k,
+            dropout_p=kwargs.pop("dropout", 0.0),
+            softmax_scale=kwargs.pop("softmax_scale", None),
+            causal=causal,
+            **flash_kwargs,
+        )
+        attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))
+    else:
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            query_length,
+            is_causal=is_causal,
+            sliding_window=sliding_window,
+            use_top_left_mask=use_top_left_mask,
+            deterministic=deterministic,
+            **kwargs,
+        )  # do not pass position_ids to old flash_attention_forward
+    if sp_size > 1:
+        # (batch_size, seq_length, num_head, head_size)
+        attn_output = gather_heads_scatter_seq(attn_output, head_dim=2, seq_dim=1)
+    return attn_output
+def flash_attention_forward(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    sliding_window: Optional[int] = None,
+    softcap: Optional[float] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, None]:
+    # This is before the transpose
+    q_len = query.shape[2]
+    # FA2 uses non-transposed inputs
+    query = query.transpose(1, 2)
+    key = key.transpose(1, 2)
+    value = value.transpose(1, 2)
+    # FA2 always relies on the value set in the module, so remove it if present in kwargs to avoid passing it twice
+    kwargs.pop("is_causal", None)
+    attn_output = _custom_flash_attention_forward(
+        query,
+        key,
+        value,
+        attention_mask,
+        query_length=q_len,
+        is_causal=True,
+        dropout=dropout,
+        softmax_scale=scaling,
+        sliding_window=sliding_window,
+        softcap=softcap,
+        use_top_left_mask=_flash_use_top_left_mask,
+        **kwargs,
+    )
+    return attn_output, None

monkey_patch.cpython-310.pyc ADDED Viewed

Binary file (1.01 kB). View file

monkey_patch.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+from .transformers.flash_attention_utils import flash_attention_forward
+from .transformers.qwen2_vl import qwen2_vl_attn_forward
+def apply_ulysses_patch(model_type: str) -> None:
+    if model_type in ("llama", "gemma", "gemma2", "mistral", "qwen2", "qwen3", "qwen3_moe"):
+        ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
+    elif model_type in ("qwen2_vl", "qwen2_5_vl"):
+        from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLFlashAttention2
+        from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLFlashAttention2
+        Qwen2VLFlashAttention2.forward = qwen2_vl_attn_forward
+        Qwen2_5_VLFlashAttention2.forward = qwen2_vl_attn_forward
+    else:
+        raise NotImplementedError(f"Model architecture {model_type} is not supported yet.")

protocol.py ADDED Viewed

	@@ -0,0 +1,689 @@

+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Implement base data transfer protocol between any two functions, modules.
+We can subclass Protocol to define more detailed batch info with specific keys
+"""
+import copy
+import io
+import pickle
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import numpy as np
+import ray
+import torch
+from numpy.typing import NDArray
+from tensordict import TensorDict
+from torch.distributed import ProcessGroup
+from torch.utils.data import DataLoader
+from .utils.py_functional import union_two_dict
+# Best-effort switch of tensordict's lazy-legacy mode to False at import time. Any exception raised
+# here is swallowed, so a tensordict build without this API leaves the library default in place.
+try:
+    import tensordict
+    tensordict.set_lazy_legacy(False).set()
+except Exception:
+    pass
+# Names exported by a star-import of this module.
+__all__ = ["DataProto", "union_tensor_dict"]
+def pad_dataproto_to_divisor(data: "DataProto", size_divisor: int) -> Tuple["DataProto", int]:
+    """Pad a DataProto to size divisible by size_divisor
+    Args:
+        data (DataProto): the unpadded DataProto
+        size_divisor (int): size divisor
+    Returns:
+        data (DataProto): the padded DataProto
+        pad_size (int)
+    """
+    assert isinstance(data, DataProto), "data must be a DataProto"
+    if len(data) % size_divisor != 0:
+        pad_size = size_divisor - len(data) % size_divisor
+        padding_protos = []
+        remaining_pad = pad_size
+        while remaining_pad > 0:
+            take_size = min(remaining_pad, len(data))
+            padding_protos.append(data[:take_size])
+            remaining_pad -= take_size
+        data_padded = DataProto.concat([data] + padding_protos)
+    else:
+        pad_size = 0
+        data_padded = data
+    return data_padded, pad_size
+def unpad_dataproto(data: "DataProto", pad_size: int) -> "DataProto":
+    if pad_size != 0:
+        data = data[:-pad_size]
+    return data
+def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> TensorDict:
+    """Union two tensordicts."""
+    if tensor_dict1.batch_size != tensor_dict2.batch_size:
+        raise ValueError(
+            f"Two tensor dict must have identical batch size. Got {tensor_dict1.batch_size} and {tensor_dict2.batch_size}"
+        )
+    for key in tensor_dict2.keys():
+        if key in tensor_dict1 and not torch.equal(tensor_dict1[key], tensor_dict2[key]):
+            raise ValueError(f"Key already exists: {key}.")
+        tensor_dict1[key] = tensor_dict2[key]
+    return tensor_dict1
+def union_numpy_dict(tensor_dict1: Dict[str, NDArray], tensor_dict2: Dict[str, NDArray]) -> Dict[str, NDArray]:
+    for key in tensor_dict2.keys():
+        if key in tensor_dict1:
+            assert isinstance(tensor_dict2[key], np.ndarray)
+            assert isinstance(tensor_dict1[key], np.ndarray)
+            if not np.all(tensor_dict1[key] == tensor_dict2[key]):
+                raise ValueError(f"Key already exists: {key}.")
+        tensor_dict1[key] = tensor_dict2[key]
+    return tensor_dict1
+def batch_collate(features: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
+    if len(features) == 0:
+        return {}
+    batch_features = defaultdict(list)
+    for feature in features:
+        for key, value in feature.items():
+            batch_features[key].append(value)
+    return batch_features
+def fold_batch_dim(data: "DataProto", new_batch_size: int):
+    """
+    Fold a batch dim from [bsz, xxx] into [new_bsz, bsz // new_bsz, xxx]
+    """
+    batch_size = data.batch.batch_size[0]
+    assert batch_size % new_batch_size == 0
+    tensor: TensorDict = data.batch
+    non_tensor = data.non_tensor_batch
+    tensor = tensor.view(new_batch_size, -1)
+    tensor.auto_batch_size_(batch_dims=1)
+    for key, value in non_tensor.items():
+        non_tensor[key] = np.reshape(value, newshape=(new_batch_size, -1, *value.shape[1:]))
+    return DataProto(batch=tensor, non_tensor_batch=non_tensor, meta_info=data.meta_info)
+def collate_fn(data_items: list["DataProtoItem"]):
+    batch = []
+    non_tensor_batch = []
+    for data in data_items:
+        batch.append(data.batch)
+        non_tensor_batch.append(data.non_tensor_batch)
+    batch = torch.stack(batch).contiguous()
+    non_tensor_batch = batch_collate(non_tensor_batch)
+    non_tensor_batch = {key: np.array(value, dtype=object) for key, value in non_tensor_batch.items()}
+    return DataProto(batch=batch, non_tensor_batch=non_tensor_batch)
+@dataclass
+class DataProtoItem:
+    """One element of a DataProto, as produced by indexing a DataProto with a non-slice index."""
+    # Tensor data for this element, or None when the parent carried no tensors.
+    batch: Optional[TensorDict] = None
+    # Per-key non-tensor values for this element.
+    non_tensor_batch: Dict[str, NDArray] = field(default_factory=dict)
+    # Metadata; DataProto.__getitem__ passes its own meta_info dict through without copying.
+    meta_info: Dict[str, Any] = field(default_factory=dict)
+@dataclass
+class DataProto:
+    """
+    A DataProto is a data structure that aims to provide a standard protocol for data exchange between functions.
+    It contains a batch (TensorDict) and a meta_info (Dict). The batch is a TensorDict https://pytorch.org/tensordict/.
+    TensorDict allows you to manipulate a dictionary of Tensors like a single Tensor. Ideally, the tensors with the
+    same batch size should be put inside batch.
+    """
+    batch: Optional[TensorDict] = None
+    non_tensor_batch: Dict[str, NDArray] = field(default_factory=dict)
+    meta_info: Dict[str, Any] = field(default_factory=dict)
+    def __post_init__(self):
+        self.check_consistency()  # perform necessary checking
+    def __len__(self) -> int:
+        if self.batch is not None:
+            return self.batch.batch_size[0]
+        elif self.non_tensor_batch is not None and len(self.non_tensor_batch) > 0:
+            pivot_key = list(self.non_tensor_batch.keys())[0]
+            return self.non_tensor_batch[pivot_key].shape[0]
+        else:
+            return 0
+    def __getitem__(self, item: Union[int, slice]) -> Union["DataProto", "DataProtoItem"]:
+        tensor_data = self.batch[item]
+        non_tensor_data = {key: value[item] for key, value in self.non_tensor_batch.items()}
+        return_type = DataProto if isinstance(item, slice) else DataProtoItem
+        return return_type(batch=tensor_data, non_tensor_batch=non_tensor_data, meta_info=self.meta_info)
+    def __getstate__(self) -> Tuple[bytes, Dict[str, NDArray], Dict[str, Any]]:
+        buffer = io.BytesIO()
+        if self.batch is not None:
+            self.batch: TensorDict = self.batch.contiguous()
+            self.batch: TensorDict = self.batch.consolidate()
+        torch.save(self.batch, buffer)
+        buffer_bytes = buffer.getvalue()
+        return buffer_bytes, self.non_tensor_batch, self.meta_info
+    def __setstate__(self, data: Tuple[bytes, Dict[str, NDArray], Dict[str, Any]]) -> None:
+        batch_deserialized_bytes, non_tensor_batch, meta_info = data
+        batch_deserialized = io.BytesIO(batch_deserialized_bytes)
+        batch = torch.load(batch_deserialized, weights_only=False, map_location="cpu")
+        self.batch = batch
+        self.non_tensor_batch = non_tensor_batch
+        self.meta_info = meta_info
+    def save_to_disk(self, filepath: str) -> None:
+        with open(filepath, "wb") as f:
+            pickle.dump(self, f)
+    @staticmethod
+    def load_from_disk(filepath: str) -> "DataProto":
+        with open(filepath, "rb") as f:
+            data = pickle.load(f)
+            return data
+    def print_size(self, prefix: str = "") -> None:
+        size_of_tensordict = 0
+        if self.batch is not None:
+            for tensor in self.batch.values():
+                if isinstance(tensor, torch.Tensor):
+                    size_of_tensordict += tensor.element_size() * tensor.numel()
+        size_of_numpy_array = 0
+        for value in self.non_tensor_batch.values():
+            size_of_numpy_array += value.nbytes
+        size_of_numpy_array /= 1024**3
+        size_of_tensordict /= 1024**3
+        message = f"Size of tensordict: {size_of_tensordict} GB, size of non_tensor_batch: {size_of_numpy_array} GB."
+        print({prefix}, {message})
+    def check_consistency(self):
+        """Check the consistency of the DataProto. Mainly for batch and non_tensor_batch
+        We expose this function as a public one so that user can call themselves directly
+        """
+        if self.batch is not None:
+            assert len(self.batch.batch_size) == 1, "only support num_batch_dims=1"
+        if self.batch is not None and len(self.non_tensor_batch) != 0:
+            # TODO: we can actually lift this restriction if needed
+            assert len(self.batch.batch_size) == 1, "only support num_batch_dims=1 when non_tensor_batch is not empty."
+            batch_size = self.batch.batch_size[0]
+            for key, value in self.non_tensor_batch.items():
+                assert len(value) == batch_size, f"key {key} length {len(value)} is not equal to bsz {batch_size}."
+    @classmethod
+    def from_single_dict(
+        cls,
+        data: Dict[str, Union[torch.Tensor, NDArray]],
+        meta_info: Optional[Dict[str, Any]] = None,
+    ) -> "DataProto":
+        tensors, non_tensors = {}, {}
+        for key, value in data.items():
+            if isinstance(value, torch.Tensor):
+                tensors[key] = value
+            elif isinstance(value, np.ndarray):
+                non_tensors[key] = value
+            else:
+                raise ValueError(f"Unsupported type in data {type(value)}")
+        return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
+    @classmethod
+    def from_dict(
+        cls,
+        tensors: Dict[str, torch.Tensor],
+        non_tensors: Dict[str, NDArray] = None,
+        meta_info: Optional[Dict[str, Any]] = None,
+        num_batch_dims: int = 1,
+    ) -> "DataProto":
+        """Create a DataProto from a dict of tensors. This assumes that
+        1. All the tensor in tensors have the same dim0
+        2. Only dim0 is the batch dim
+        """
+        assert len(tensors) > 0, "tensors must not be empty"
+        assert num_batch_dims > 0, "num_batch_dims must be greater than zero"
+        if non_tensors is not None:
+            assert num_batch_dims == 1, "only support num_batch_dims=1 when non_tensors is not None."
+        meta_info = meta_info or {}
+        non_tensors = non_tensors or {}
+        assert isinstance(non_tensors, dict), "non_tensors should be a dictionary."
+        # get and check batch size
+        batch_size = None
+        pivot_key = None
+        for key, tensor in tensors.items():
+            if batch_size is None:
+                batch_size = tensor.shape[:num_batch_dims]
+                pivot_key = key
+            else:
+                current_batch = tensor.shape[:num_batch_dims]
+                assert batch_size == current_batch, (
+                    f"Not all the tensor in tensors have the same batch size with batch_dims={num_batch_dims}. "
+                    f"Got {pivot_key} has {batch_size}, {key} has {current_batch}"
+                )
+        tensor_dict = TensorDict(source=tensors, batch_size=batch_size)
+        return cls(batch=tensor_dict, non_tensor_batch=non_tensors, meta_info=meta_info)
+    def to(self, device: torch.device) -> "DataProto":
+        """move the batch to device
+        Args:
+            device (torch.device, str): torch device
+        Returns:
+            DataProto: the current DataProto
+        """
+        if self.batch is not None:
+            self.batch = self.batch.to(device)
+        return self
+    def select(
+        self,
+        batch_keys: Optional[List[str]] = None,
+        non_tensor_batch_keys: Optional[List[str]] = None,
+        meta_info_keys: Optional[List[str]] = None,
+        deepcopy: bool = False,
+    ) -> "DataProto":
+        """Select a subset of the DataProto via batch_keys and meta_info_keys
+        Args:
+            batch_keys (list, optional): a list of strings indicating the keys in batch to select
+            meta_info_keys (list, optional): a list of keys indicating the meta info to select
+        Returns:
+            DataProto: the DataProto with the selected batch_keys and meta_info_keys
+        """
+        # TODO (zhangchi.usc1992) whether to copy
+        if batch_keys is not None:
+            batch_keys = tuple(batch_keys)
+            sub_batch = self.batch.select(*batch_keys)
+        else:
+            sub_batch = self.batch
+        if non_tensor_batch_keys is not None:
+            non_tensor_batch = {k: v for k, v in self.non_tensor_batch.items() if k in non_tensor_batch_keys}
+        else:
+            non_tensor_batch = self.non_tensor_batch
+        if deepcopy:
+            non_tensor_batch = copy.deepcopy(non_tensor_batch)
+        if meta_info_keys is not None:
+            sub_meta_info = {k: v for k, v in self.meta_info.items() if k in meta_info_keys}
+        else:
+            sub_meta_info = self.meta_info
+        if deepcopy:
+            sub_meta_info = copy.deepcopy(sub_meta_info)
+        return DataProto(batch=sub_batch, non_tensor_batch=non_tensor_batch, meta_info=sub_meta_info)
+    def pop(
+        self,
+        batch_keys: Optional[List[str]] = None,
+        non_tensor_batch_keys: Optional[List[str]] = None,
+        meta_info_keys: Optional[List[str]] = None,
+    ) -> "DataProto":
+        """Pop a subset of the DataProto via `batch_keys` and `meta_info_keys`
+        Args:
+            batch_keys (list, optional): a list of strings indicating the keys in batch to pop
+            meta_info_keys (list, optional): a list of keys indicating the meta info to pop
+        Returns:
+            DataProto: the DataProto with the poped batch_keys and meta_info_keys
+        """
+        assert batch_keys is not None
+        non_tensor_batch_keys = non_tensor_batch_keys or []
+        meta_info_keys = meta_info_keys or []
+        tensors = {}
+        for key in batch_keys:
+            tensors[key] = self.batch.pop(key)
+        non_tensors = {}
+        for key in non_tensor_batch_keys:
+            non_tensors[key] = self.non_tensor_batch.pop(key)
+        meta_info = {}
+        for key in meta_info_keys:
+            meta_info[key] = self.meta_info.pop(key)
+        return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
+    def rename(
+        self, old_keys: Optional[Union[str, List[str]]] = None, new_keys: Optional[Union[str, List[str]]] = None
+    ) -> "DataProto":
+        """
+        Note that this function only rename the key in the batch
+        """
+        def validate_input(keys):
+            if keys is not None:
+                if isinstance(keys, str):
+                    keys = [keys]
+                elif isinstance(keys, list):
+                    pass
+                else:
+                    raise TypeError(f"keys must be a list or a string, but got {type(keys)}")
+            return keys
+        old_keys = validate_input(old_keys)
+        new_keys = validate_input(new_keys)
+        if len(new_keys) != len(old_keys):
+            raise ValueError(
+                f"new_keys and old_keys must have the same length, but got {len(new_keys)} and {len(old_keys)}"
+            )
+        self.batch.rename_key_(tuple(old_keys), tuple(new_keys))
+        return self
+    def union(self, other: "DataProto") -> "DataProto":
+        """Union with another DataProto. Union batch and meta_info separately.
+        Throw an error if
+        - there are conflict keys in batch and they are not equal
+        - the batch size of two data batch is not the same
+        - there are conflict keys in meta_info and they are not the same.
+        Args:
+            other (DataProto): another DataProto to union
+        Returns:
+            DataProto: the DataProto after union
+        """
+        self.batch = union_tensor_dict(self.batch, other.batch)
+        self.non_tensor_batch = union_numpy_dict(self.non_tensor_batch, other.non_tensor_batch)
+        self.meta_info = union_two_dict(self.meta_info, other.meta_info)
+        return self
+    def make_iterator(
+        self, mini_batch_size: int, epochs: int, seed: int = None, dataloader_kwargs: Dict[str, Any] = None
+    ):
+        """Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
+        dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.
+        Args:
+            mini_batch_size (int): mini-batch size when iterating the dataset. We require that
+                ``batch.batch_size[0] % mini_batch_size == 0``
+            epochs (int): number of epochs when iterating the dataset.
+            dataloader_kwargs: internally, it returns a DataLoader over the batch.
+                The dataloader_kwargs is the kwargs passed to the DataLoader
+        Returns:
+            Iterator: an iterator that yields a mini-batch data at a time. The total number of iteration steps is
+            ``self.batch.batch_size * epochs // mini_batch_size``
+        """
+        assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0"
+        # we can directly create a dataloader from TensorDict
+        if dataloader_kwargs is None:
+            dataloader_kwargs = {}
+        if seed is not None:
+            generator = torch.Generator()
+            generator.manual_seed(seed)
+        else:
+            generator = None
+        assert isinstance(dataloader_kwargs, Dict)
+        train_dataloader = DataLoader(
+            dataset=self, batch_size=mini_batch_size, collate_fn=collate_fn, generator=generator, **dataloader_kwargs
+        )
+        def get_data():
+            for _ in range(epochs):
+                for d in train_dataloader:
+                    d.meta_info = self.meta_info
+                    yield d
+        return iter(get_data())
+    def chunk(self, chunks: int) -> List["DataProto"]:
+        """Split the batch among dim=0 into chunks. The meta_info is passed to each DataProto after split.
+        Args:
+            chunks (int): the number of chunks to split on dim=0
+        Returns:
+            List[DataProto]: a list of DataProto after splitting
+        """
+        assert len(self) % chunks == 0, (
+            f"only support equal chunk. Got size of DataProto {len(self)} and chunk {chunks}."
+        )
+        if self.batch is not None:
+            batch_lst = self.batch.chunk(chunks=chunks, dim=0)
+        else:
+            batch_lst = [None for _ in range(chunks)]
+        non_tensor_batch_lst = [{} for _ in range(chunks)]
+        for key, value in self.non_tensor_batch.items():
+            assert isinstance(value, np.ndarray)
+            non_tensor_lst = np.array_split(value, chunks)
+            assert len(non_tensor_lst) == chunks
+            for i in range(chunks):
+                non_tensor_batch_lst[i][key] = non_tensor_lst[i]
+        output = []
+        for i in range(chunks):
+            output.append(
+                DataProto(batch=batch_lst[i], non_tensor_batch=non_tensor_batch_lst[i], meta_info=self.meta_info)
+            )
+        return output
+    def split(self, split_size: int) -> List["DataProto"]:
+        chunks = len(self) // split_size
+        return self.chunk(chunks)
+    @staticmethod
+    def concat(data: List["DataProto"]) -> "DataProto":
+        """Concat a list of DataProto. The batch is concatenated among dim=0.
+        The meta_info is assumed to be identical and will use the first one.
+        Args:
+            data (List[DataProto]): list of DataProto
+        Returns:
+            DataProto: concatenated DataProto
+        """
+        batch_lst = [batch.batch for batch in data]
+        if batch_lst[0] is not None:
+            new_batch = torch.cat(batch_lst, dim=0)
+        else:
+            new_batch = None
+        non_tensor_batch = batch_collate([d.non_tensor_batch for d in data])
+        for key, value in non_tensor_batch.items():
+            non_tensor_batch[key] = np.concatenate(value, axis=0)
+        return DataProto(batch=new_batch, non_tensor_batch=non_tensor_batch, meta_info=data[0].meta_info)
+    def reorder(self, indices: torch.Tensor) -> None:
+        """
+        Note that this operation is in-place
+        """
+        indices_np = indices.detach().numpy()
+        self.batch = self.batch[indices]
+        self.non_tensor_batch = {key: value[indices_np] for key, value in self.non_tensor_batch.items()}
+    def repeat(self, repeat_times: int = 2, interleave: bool = True) -> "DataProto":
+        """
+        Repeat the batch data a specified number of times.
+        Args:
+            repeat_times (int): Number of times to repeat the data.
+            interleave (bool): Whether to interleave the repeated data.
+        Returns:
+            DataProto: A new DataProto with repeated data.
+        """
+        if self.batch is not None:
+            if interleave:
+                # Interleave the data
+                repeated_tensors = {
+                    key: tensor.repeat_interleave(repeat_times, dim=0) for key, tensor in self.batch.items()
+                }
+            else:
+                # Stack the data
+                repeated_tensors = {
+                    key: tensor.unsqueeze(0).expand(repeat_times, *tensor.shape).reshape(-1, *tensor.shape[1:])
+                    for key, tensor in self.batch.items()
+                }
+            repeated_batch = TensorDict(
+                source=repeated_tensors,
+                batch_size=(self.batch.batch_size[0] * repeat_times,),
+            )
+        else:
+            repeated_batch = None
+        repeated_non_tensor_batch = {}
+        for key, value in self.non_tensor_batch.items():
+            if interleave:
+                repeated_non_tensor_batch[key] = np.repeat(value, repeat_times, axis=0)
+            else:
+                repeated_non_tensor_batch[key] = np.tile(value, (repeat_times,) + (1,) * (value.ndim - 1))
+        return DataProto(
+            batch=repeated_batch,
+            non_tensor_batch=repeated_non_tensor_batch,
+            meta_info=self.meta_info,
+        )
+@dataclass
+class DataProtoFuture:
+    """
+    DataProtoFuture aims to eliminate actual data fetching on driver. By doing so, the driver doesn't have to wait
+    for data so that asynchronous execution becomes possible.
+    DataProtoFuture contains a list of futures from another WorkerGroup of size world_size.
+    - collect_fn is a Callable that reduces the list of futures to a DataProto
+    - dispatch_fn is a Callable that partitions the DataProto into a list of DataProto of size world_size and then select
+    Potential issue: we can optimize dispatch_fn(collect_fn) such that only needed data is fetched on destination
+    - DataProtoFuture only supports directly passing from the output of a method to another input. You can't perform any
+    operation on the DataProtoFuture in driver.
+    """
+    collect_fn: Callable
+    futures: List[ray.ObjectRef]
+    dispatch_fn: Callable = None
+    @staticmethod
+    def concat(data: List[ray.ObjectRef]) -> "DataProtoFuture":
+        output = DataProtoFuture(collect_fn=DataProto.concat, futures=data)
+        return output
+    def chunk(self, chunks: int) -> List["DataProtoFuture"]:
+        from functools import partial
+        arg_future_lst = []
+        for i in range(chunks):
+            # note that we can't directly pass i and chunks
+            def dispatch_fn(x, i, chunks):
+                return x.chunk(chunks=chunks)[i]
+            arg_future = DataProtoFuture(
+                collect_fn=self.collect_fn, dispatch_fn=partial(dispatch_fn, i=i, chunks=chunks), futures=self.futures
+            )
+            arg_future_lst.append(arg_future)
+        return arg_future_lst
+    def get(self):
+        outputs = ray.get(self.futures)  # dp_size.
+        for output in outputs:
+            assert isinstance(output, DataProto)
+        outputs = self.collect_fn(outputs)  # select dp, concat
+        if self.dispatch_fn is not None:
+            outputs = self.dispatch_fn(outputs)  # split in batch dim, select using dp
+        return outputs
+def allgather_dict_tensors(
+    tensors: Union[Dict[str, torch.Tensor], TensorDict], size: int, group: ProcessGroup, dim: int = 0
+) -> Union[Dict[str, torch.Tensor], TensorDict]:
+    """
+    TODO: optimize this.
+    - We can use async ops
+    - We can use only one allgather
+    """
+    if isinstance(tensors, TensorDict):
+        is_tensor_dict = True
+        tensors_as_dict = tensors.to_dict()
+    else:
+        tensors_as_dict = tensors
+        is_tensor_dict = False
+    output = {}
+    sorted_keys = sorted(tensors_as_dict.keys())
+    for key in sorted_keys:
+        value = tensors_as_dict[key]
+        output[key] = [torch.empty_like(value) for _ in range(size)]
+        torch.distributed.all_gather(output[key], value, group=group, async_op=False)
+        output[key] = torch.cat(output[key], dim=dim)
+    if is_tensor_dict:
+        output = TensorDict(source=output, batch_size=tensors.batch_size[0] * size)
+    return output
+def all_gather_data_proto(data: DataProto, size: int, group: ProcessGroup) -> None:
+    # Note that this is an inplace operator just like torch.distributed.all_gather
+    prev_device = data.batch.device
+    data.batch = data.batch.cuda(device=torch.cuda.current_device())
+    data.batch = allgather_dict_tensors(data.batch.contiguous(), size=size, group=group, dim=0)
+    data.batch = data.batch.to(prev_device)
+    # all gather non_tensor_batch
+    all_non_tensor_batch = [None for _ in range(size)]
+    torch.distributed.all_gather_object(all_non_tensor_batch, data.non_tensor_batch, group=group)
+    data.non_tensor_batch = {k: np.concatenate([d[k] for d in all_non_tensor_batch]) for k in data.non_tensor_batch}

qwen2_vl.cpython-310.pyc ADDED Viewed

Binary file (4.45 kB). View file

qwen2_vl.py ADDED Viewed

	@@ -0,0 +1,189 @@

+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Based on:
+# https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple
+import torch
+from .flash_attention_utils import flash_attention_forward
+try:
+    from transformers.models.qwen2_vl.modeling_qwen2_vl import (
+        Qwen2VLAttention,
+        apply_multimodal_rotary_pos_emb,
+        repeat_kv,
+    )
+    from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
+except ImportError:
+    pass
+def get_rope_index(
+    processor: "Qwen2VLProcessor",
+    input_ids: torch.Tensor,
+    image_grid_thw: Optional[torch.Tensor] = None,
+    video_grid_thw: Optional[torch.Tensor] = None,
+    second_per_grid_ts: Optional[torch.Tensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    Gets the position ids for Qwen2-VL, it should be generated before sharding the sequence.
+    The batch dim has been removed and the input_ids should be a 1D tensor representing a single example.
+    https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1546
+    """
+    spatial_merge_size = processor.image_processor.merge_size
+    tokens_per_second = 2
+    image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image_pad|>")
+    video_token_id = processor.tokenizer.convert_tokens_to_ids("<|video_pad|>")
+    vision_start_token_id = processor.tokenizer.convert_tokens_to_ids("<|vision_start|>")
+    if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        position_ids = torch.ones(3, input_ids.size(0), dtype=input_ids.dtype, device=input_ids.device)  # (3, seqlen)
+        image_index, video_index = 0, 0
+        input_ids = input_ids[attention_mask == 1]
+        image_nums, video_nums = 0, 0
+        vision_start_indices = torch.argwhere(input_ids == vision_start_token_id)
+        vision_tokens = input_ids[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        input_tokens = input_ids.tolist()
+        llm_pos_ids_list: list = []
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+        for _ in range(image_nums + video_nums):
+            if image_token_id in input_tokens and remain_images > 0:
+                ed_image = input_tokens.index(image_token_id, st)
+            else:
+                ed_image = len(input_tokens) + 1
+            if video_token_id in input_tokens and remain_videos > 0:
+                ed_video = input_tokens.index(video_token_id, st)
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                second_per_grid_t = 0
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                if second_per_grid_ts is not None:
+                    second_per_grid_t = second_per_grid_ts[video_index]
+                else:
+                    second_per_grid_t = 1.0
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t.item(),
+                h.item() // spatial_merge_size,
+                w.item() // spatial_merge_size,
+            )
+            text_len = ed - st
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
+            t_index = (t_index * second_per_grid_t * tokens_per_second).long().flatten()
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        position_ids[..., attention_mask == 1] = llm_positions.to(position_ids.device)
+    else:
+        if attention_mask is not None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            position_ids = position_ids.unsqueeze(0).expand(3, -1).to(input_ids.device)
+        else:
+            position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).view(1, -1).expand(3, -1)
+    return position_ids
+def qwen2_vl_attn_forward(
+    self: "Qwen2VLAttention",
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    **kwargs,
+) -> Tuple[torch.Tensor, None, None]:
+    bsz, q_len, _ = hidden_states.size()  # q_len = seq_length / sp_size
+    query_states = self.q_proj(hidden_states)  # (batch_size, seq_length / sp_size, num_heads * head_size)
+    key_states = self.k_proj(hidden_states)
+    value_states = self.v_proj(hidden_states)
+    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+    # Because the input can be padded, the absolute sequence length depends on the max position id.
+    if position_embeddings is None:
+        cos, sin = self.rotary_emb(value_states, position_ids)
+    else:
+        cos, sin = position_embeddings
+    query_states, key_states = apply_multimodal_rotary_pos_emb(
+        query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+    )
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+    dropout_rate = 0.0 if not self.training else self.attention_dropout
+    sliding_window = None
+    if (
+        self.config.use_sliding_window
+        and getattr(self.config, "sliding_window", None) is not None
+        and self.layer_idx >= self.config.max_window_layers
+    ):
+        sliding_window = self.config.sliding_window
+    attn_output, _ = flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        dropout=dropout_rate,
+        sliding_window=sliding_window,
+        position_ids=position_ids,  # important: pass position ids
+    )  # (batch_size, seq_length, num_head / sp_size, head_size)
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+    attn_output = self.o_proj(attn_output)
+    return attn_output, None, None