using apex rmsnorm (#57)
* using apex rmsnorm
* added message for missing apex
* black
* missed a print
---------
Co-authored-by: Srini Iyer <[email protected]>
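
For readers skimming the diffs below: each touched module now prefers Apex's fused RMSNorm kernel and falls back to PyTorch's built-in nn.RMSNorm when Apex is not installed. A minimal, self-contained sketch of that pattern (the per-file hunks assume torch.nn is already imported as nn in each module; the dimension 512 below is just an illustrative value):

import torch.nn as nn

try:
    # Apex ships a fused CUDA kernel for RMSNorm.
    from apex.normalization.fused_layer_norm import FusedRMSNorm

    RMSNorm = FusedRMSNorm
except (ImportError, ModuleNotFoundError):
    # Pure-PyTorch fallback (nn.RMSNorm is available in torch >= 2.4).
    print("Apex not found. Using nn.RMSNorm")
    RMSNorm = nn.RMSNorm

# Call sites construct the norm the same way under either implementation:
norm = RMSNorm(512, eps=1e-6)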
bytelatent/base_transformer.py
CHANGED
@@ -17,6 +17,14 @@ from xformers.ops import AttentionBias, fmha
 from bytelatent import probe
 from bytelatent.tokenizers.constants import EOS_ID
 
+try:
+    from apex.normalization.fused_layer_norm import FusedRMSNorm
+
+    RMSNorm = FusedRMSNorm
+except (ImportError, ModuleNotFoundError):
+    print("Apex not found. Using nn.RMSNorm")
+    RMSNorm = nn.RMSNorm
+
 if int(os.environ.get("BLT_ALLOW_MISSING_FLEX_ATTENTION", False)) == 0:
     flex_attention_comp = torch.compile(flex_attention)
 else:
@@ -294,37 +302,6 @@ class RotaryEmbedding(torch.nn.Module):
         return self.freqs_cis[0:seqlen]
 
 
-class RMSNorm(nn.Module):
-    """
-    Initialize the RMSNorm normalization layer.
-
-    Args:
-        dim (int): The dimension of the input tensor.
-        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
-
-    Attributes:
-        eps (float): A small value added to the denominator for numerical stability.
-        weight (nn.Parameter): Learnable scaling parameter.
-
-    """
-
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def _norm(self, x: torch.Tensor):
-        return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps)
-
-    def forward(self, x: torch.Tensor):
-        x = probe.log_stats(x, "resid")
-        output = self._norm(x.float())
-        return (output * self.weight.float()).type_as(x)
-
-    def reset_parameters(self):
-        torch.nn.init.ones_(self.weight)  # type: ignore
-
-
 def _reshape_for_attn_bias(
     attn_bias: AttentionBias | None,
     *tensors: torch.Tensor,
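
A quick numerical sanity check on the class removed above (my sketch, not part of this PR; it assumes PyTorch >= 2.4 for nn.RMSNorm and ignores the probe.log_stats call the old forward performed) showing that the nn.RMSNorm fallback reproduces the deleted implementation:

import torch
import torch.nn as nn


def old_rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Mirrors the deleted class: normalize in float32, scale by the learned weight, cast back.
    xf = x.float()
    out = xf * torch.rsqrt((xf * xf).mean(-1, keepdim=True) + eps)
    return (out * weight.float()).type_as(x)


dim = 512
x = torch.randn(4, 16, dim)
new_norm = nn.RMSNorm(dim, eps=1e-6)  # weight starts at ones, like the deleted class

assert torch.allclose(new_norm(x), old_rmsnorm(x, new_norm.weight), atol=1e-5)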
bytelatent/model/latent_transformer.py
CHANGED
@@ -12,12 +12,19 @@ from xformers.ops import AttentionBias
 from bytelatent.base_transformer import (
     BaseTransformer,
     BaseTransformerArgs,
-    RMSNorm,
     flex_attention_comp,
     repeat_kv,
 )
 from bytelatent.model.utils import create_causal_mask
 
+try:
+    from apex.normalization.fused_layer_norm import FusedRMSNorm
+
+    RMSNorm = FusedRMSNorm
+except (ImportError, ModuleNotFoundError):
+    print("Apex not found. Using nn.RMSNorm")
+    RMSNorm = nn.RMSNorm
+
 logger = logging.getLogger()
 
 
@@ -44,7 +51,7 @@ class CrossAttention(nn.Module):
         self.n_kv_heads = n_kv_heads
         self.heads_per_group = self.n_heads // self.n_kv_heads
 
-        self.cross_attn_norm_q = RMSNorm(dim, eps=norm_eps)
+        self.cross_attn_norm_q = nn.RMSNorm(dim, eps=norm_eps)
         self.cross_attn_norm_kv = RMSNorm(dim, eps=norm_eps)
 
         self.wq = nn.Linear(
bytelatent/model/local_models.py
CHANGED
@@ -14,7 +14,6 @@ from xformers.ops import AttentionBias
 from bytelatent.base_transformer import (
     BaseTransformerArgs,
     InitStdFactor,
-    RMSNorm,
     RotaryEmbedding,
     TransformerBlock,
 )
@@ -22,6 +21,14 @@ from bytelatent.model.latent_transformer import CrossAttention
 from bytelatent.model.utils import create_causal_mask, downsample
 from bytelatent.tokenizers.blt_tokenizer import BOE_ID
 
+try:
+    from apex.normalization.fused_layer_norm import FusedRMSNorm
+
+    RMSNorm = FusedRMSNorm
+except (ImportError, ModuleNotFoundError):
+    print("Apex not found. Using nn.RMSNorm")
+    RMSNorm = nn.RMSNorm
+
 logger = logging.getLogger()
 
 
bytelatent/transformer.py
CHANGED
@@ -19,11 +19,18 @@ from xformers.ops import AttentionBias, fmha
 from bytelatent.base_transformer import (
     BaseTransformer,
     BaseTransformerArgs,
-    RMSNorm,
     cross_entropy,
 )
 from bytelatent.model.utils import create_causal_mask
 
+try:
+    from apex.normalization.fused_layer_norm import FusedRMSNorm
+
+    RMSNorm = FusedRMSNorm
+except (ImportError, ModuleNotFoundError):
+    print("Apex not found. Using nn.RMSNorm")
+    RMSNorm = nn.RMSNorm
+
 
 def attention_flops_per_token(n_layers, seq_len, dim, causal):
     # Formula from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py#L27-L30