JetLM
/

SDAR-4B-Chat

@@ -21,10 +21,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Callable, Optional, Tuple, Union
 import torch
 from torch import nn
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
@@ -43,8 +44,9 @@ from transformers.modeling_outputs import (
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
-from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
 from .configuration_sdar import SDARConfig
 from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
@@ -69,6 +71,10 @@ if is_torch_flex_attn_available():
 logger = logging.get_logger(__name__)
 @use_kernel_forward_from_hub("RMSNorm")
 class SDARRMSNorm(nn.Module):
@@ -272,34 +278,21 @@ class SDARAttention(nn.Module):
             value_states = torch.cat(
                 [past_value_states, value_states], dim=-2)
-        attention_mask = attention_mask.bool() if attention_mask is not None else None
-        if torch.all(attention_mask):  # decoding
-            query_states = query_states.transpose(1, 2)
-            key_states = key_states.transpose(1, 2)
-            value_states = value_states.transpose(1, 2)
-            attn_output = flash_attn_func(
-                query_states,
-                key_states,
-                value_states,
-                causal=False,
-                softmax_scale=self.scaling
-            )
-        else:  # prefilling
-            attn_output = F.scaled_dot_product_attention(
-                query=query_states,
-                key=key_states,
-                value=value_states,
-                attn_mask=attention_mask,
-                is_causal=False,
-                scale=self.scaling,
-                enable_gqa=True
-            )
-            attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
-        return attn_output, None  # , attn_weights
 class SDARDecoderLayer(GradientCheckpointingLayer):
@@ -733,10 +726,6 @@ class SDARModel(SDARPreTrainedModel):
         return causal_mask
-class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
-    ...
 @auto_docstring
 class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
@@ -771,6 +760,49 @@ class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
     def get_decoder(self):
         return self.model
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -785,8 +817,8 @@ class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        logits_to_keep: Union[int, torch.Tensor] = 0,
-        **kwargs: Unpack[KwargsForCausalLM],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -814,40 +846,33 @@ class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs: BaseModelOutputWithPast = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             cache_position=cache_position,
-            **kwargs,
-        )
         hidden_states = outputs.last_hidden_state
-        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        slice_indices = slice(-logits_to_keep,
-                              None) if isinstance(logits_to_keep, int) else logits_to_keep
-        hidden_states = hidden_states[:, slice_indices, :].contiguous()
-        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training
-        if fuse_linear_and_cross_entropy:
-            # When using fused_linear_ce_loss, we do not compute the whole logits on HBM
-            logits = None
-        else:
-            logits = self.lm_head(hidden_states)
         loss = None
-        if labels is not None:
-            # FusedLinearCrossEntropyLoss will be implemented by monkey patch when training
-            # We don't use it when inferencing
-            loss_fct = nn.CrossEntropyLoss()  # nn.CE
-            loss = loss_fct(
-                logits.view(-1, self.config.vocab_size), labels.view(-1))
         return CausalLMOutputWithPast(
             loss=loss,
@@ -862,4 +887,4 @@ __all__ = [
     "SDARForCausalLM",
     "SDARModel",
     "SDARPreTrainedModel",
-]

 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Callable, Optional, Tuple, Union, List
 import torch
 from torch import nn
+from einops import rearrange
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
 from .configuration_sdar import SDARConfig
+from .fused_linear_diffusion_cross_entropy import FusedLinearDiffusionCrossEntropyLoss
 from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
 logger = logging.get_logger(__name__)
+@torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")  # wrap the helper with torch.compile; fullgraph=True asks for a single graph with no graph breaks
+def fused_flex_attention(query, key, value, attention_mask, **kwargs):  # thin wrapper around flex_attention; `attention_mask` is forwarded as its `block_mask` argument
+    return flex_attention(query, key, value, block_mask=attention_mask, **kwargs)  # extra kwargs pass through unchanged; returns whatever flex_attention returns for them
 @use_kernel_forward_from_hub("RMSNorm")
 class SDARRMSNorm(nn.Module):
             value_states = torch.cat(
                 [past_value_states, value_states], dim=-2)
+        attn_output, attn_weights = fused_flex_attention(
+            query=query_states,
+            key=key_states,
+            value=value_states,
+            attention_mask=attention_mask,
+            enable_gqa=True,
+            scale=self.scaling,
+            return_lse=True
+        )
+        attn_weights = attn_weights.to(
+            value_states.dtype) if attn_weights is not None else None
+        attn_output = rearrange(attn_output, 'b h l d -> b l (h d)')
         attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights  # , attn_weights
 class SDARDecoderLayer(GradientCheckpointingLayer):
         return causal_mask
 @auto_docstring
 class SDARForCausalLM(SDARPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     def get_decoder(self):
         return self.model
+    def prepare_for_bd_training(self, inputs_ids, position_ids, prompt_mask):  # build doubled-length (noisy + clean) ids/positions, keep-masks and a flex-attention block mask
+        bsz, seq_len = inputs_ids.shape  # inputs_ids must be 2-D: (bsz, seq_len)
+        num_tokens = calculate_token_nums(position_ids) # List[torch.Tensor], indexed per batch row; helper defined elsewhere - presumably per-segment token counts (TODO confirm)
+        noisy_inputs_ids, logits_to_keep_half, p_mask = forward_add_noise_packed(  # helper defined elsewhere; exact semantics of its three outputs are not visible here
+            inputs_ids=inputs_ids,
+            num_tokens_list=num_tokens,
+            prompt_mask=prompt_mask,
+            mask_id=self.config.mask_token_id,
+        )
+        router_noisy_part_list = []
+        for i in range(bsz):
+            cur_router_noisy_part = (torch.arange(num_tokens[i].shape[0] *2) % 2 == 0).to(inputs_ids.device)  # alternating True/False flags, two per entry of num_tokens[i]
+            cur_router_noisy_part = cur_router_noisy_part.repeat_interleave(num_tokens[i].repeat_interleave(2))  # expand to [True x n0, False x n0, True x n1, False x n1, ...]
+            router_noisy_part_list.append(cur_router_noisy_part)
+        router_noisy_part = torch.stack(router_noisy_part_list, dim=0)  # stacking needs equal row lengths; the indexing below needs each row to be 2 * seq_len long
+        # concatenated inputs_ids: (bsz, seq_len x 2); every slot is overwritten in the loop below
+        concat_inputs_ids = inputs_ids.repeat(1, 2)
+        # concatenated logits_to_keep: (bsz, seq_len x 2), initialised to all False
+        logits_to_keep = torch.zeros(
+                    bsz, 2 * seq_len, dtype=torch.bool, device=inputs_ids.device)
+        # concatenated position_ids: (bsz, seq_len x 2)
+        concat_position_ids = torch.zeros(
+                    bsz, 2 * seq_len, dtype=position_ids.dtype, device=position_ids.device)
+        for i in range(bsz):
+            concat_inputs_ids[i][router_noisy_part[i]] = noisy_inputs_ids[i]  # True slots receive the noised token ids
+            concat_inputs_ids[i][~router_noisy_part[i]] = inputs_ids[i]  # False slots receive the original token ids
+            logits_to_keep[i][router_noisy_part[i]] = logits_to_keep_half[i]  # only True (noisy) slots can be kept; False slots stay False
+            concat_position_ids[i][router_noisy_part[i]] = position_ids[i]  # noisy and clean copies share the same position ids
+            concat_position_ids[i][~router_noisy_part[i]] = position_ids[i]
+        # build the flex_attention block mask from the dense 3-D mask returned by block_attn_mask (helper defined elsewhere)
+        attention_mask = block_attn_mask(num_tokens, self.config.block_size, inputs_ids.device)
+        flex_attention_mask_3d = create_block_mask(
+                            lambda b, h, q_idx, kv_idx: attention_mask[b, q_idx, kv_idx],  # mask_mod ignores the head index h
+                            B=attention_mask.size(0), H=None,
+                            Q_LEN=attention_mask.size(1), KV_LEN=attention_mask.size(2),
+        )
+        return concat_inputs_ids, concat_position_ids, flex_attention_mask_3d, logits_to_keep_half, logits_to_keep, p_mask  # 6-tuple; callers depend on this order
     @can_return_tuple
     @auto_docstring
     def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_dict=True,
             cache_position=cache_position,
+            **kwargs
+            )
         hidden_states = outputs.last_hidden_state
+        if logits_to_keep is not None:
+            B, _, H = hidden_states.shape
+            num_keep = logits_to_keep.sum(dim=1)
+            assert torch.all(num_keep == num_keep[0])
+            N = int(num_keep[0].item())
+            hidden_states = hidden_states[logits_to_keep].view(B, N, H).contiguous()   # [B, N, H]
+        logits = self.lm_head(hidden_states)
         loss = None
+        # if labels is not None:
+        #     loss_fct = nn.CrossEntropyLoss(reduction="none", ignore_index=-100)
+        #     loss = loss_fct(
+        #         logits.view(-1, self.config.vocab_size),
+        #         labels.view(-1)
+        #         ).view(labels.size())
         return CausalLMOutputWithPast(
             loss=loss,
     "SDARForCausalLM",
     "SDARModel",
     "SDARPreTrainedModel",
+]