skip some flash attn patches unless explicitly enabled (#643)
* skip some flash attn patches if explicitly disabled
* make the other patches optional
- README.md +2 -0
- src/axolotl/monkeypatch/llama_attn_hijack_flash.py +31 -23
- src/axolotl/utils/models.py +5 -1
README.md

@@ -636,6 +636,8 @@ flash_optimum:
 xformers_attention:
 # whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
 flash_attention:
+flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
 # whether to use scaled-dot-product attention
 # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
 sdp_attention:
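Both new keys are opt-in: leaving them unset (or null) keeps the stock implementations. Below is a minimal sketch of that default-off behavior, assuming PyYAML is available; the literal config values are illustrative and independent of how axolotl itself loads its config.

```python
# Minimal sketch: unset/null YAML keys load as None, which is falsy, so the
# `if cross_entropy:` / `if rms_norm:` guards in the patch simply skip them.
import yaml  # assumes PyYAML is installed

raw = yaml.safe_load(
    """
    flash_attention: true
    flash_attn_cross_entropy:   # left unset -> None -> patch skipped
    flash_attn_rms_norm: true   # explicit opt-in -> patch attempted
    """
)

print(bool(raw.get("flash_attn_cross_entropy")))  # False
print(bool(raw.get("flash_attn_rms_norm")))       # True
```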
src/axolotl/monkeypatch/llama_attn_hijack_flash.py

@@ -38,7 +38,11 @@ except ImportError:
 LOG = logging.getLogger("axolotl")
 
 
-def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
+def replace_llama_attn_with_flash_attn(
+    packed: Optional[bool] = False,
+    cross_entropy: Optional[bool] = False,
+    rms_norm: Optional[bool] = False,
+):
     transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
         _prepare_decoder_attention_mask
     )
@@ -49,33 +53,37 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
             llama_model_forward
         )
 
-    try:
-        from flash_attn.losses.cross_entropy import CrossEntropyLoss
+    # skip only if explicitly disabled
+    if cross_entropy:
+        try:
+            from flash_attn.losses.cross_entropy import CrossEntropyLoss
 
-        LOG.info("patching with flash_attn.losses.cross_entropy")
-        transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
-            CrossEntropyLoss, inplace_backward=True
-        )
-    except ImportError:
-        LOG.info(
-            "optimized flash-attention CrossEntropyLoss not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy'`)"
-        )
+            LOG.info("patching with flash_attn.losses.cross_entropy")
+            transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
+                CrossEntropyLoss, inplace_backward=True
+            )
+        except ImportError:
+            LOG.info(
+                "optimized flash-attention CrossEntropyLoss not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy'`)"
+            )
 
-    try:
-        from flash_attn.ops.rms_norm import RMSNorm
+    # skip only if explicitly disabled
+    if rms_norm:
+        try:
+            from flash_attn.ops.rms_norm import RMSNorm
 
-        class LlamaRMSNorm(RMSNorm):
-            """Patched LLamaRMSNorm"""
+            class LlamaRMSNorm(RMSNorm):
+                """Patched LLamaRMSNorm"""
 
-            def __init__(self, hidden_size, eps=1e-6):
-                super().__init__(hidden_size, eps=eps)
+                def __init__(self, hidden_size, eps=1e-6):
+                    super().__init__(hidden_size, eps=eps)
 
-        LOG.info("patching with flash_attn.ops.rms_norm")
-        transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
-    except ImportError:
-        LOG.info(
-            "optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
-        )
+            LOG.info("patching with flash_attn.ops.rms_norm")
+            transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
+        except ImportError:
+            LOG.info(
+                "optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
+            )
 
 
 # Disable the transformation of the attention mask in LlamaModel as the flash attention
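Both guarded blocks follow the same shape: attempt the optional import only when the flag is truthy, and fall back with a log message rather than an error when the CUDA extension is missing. The standalone sketch below shows that shape in isolation; the `patch_cross_entropy` name and the `target` argument are illustrative and not part of axolotl's API.

```python
import logging
from functools import partial

LOG = logging.getLogger("example")


def patch_cross_entropy(target, enabled: bool = False) -> bool:
    """Swap target.CrossEntropyLoss for the fused kernel, but only when asked to."""
    if not enabled:
        return False  # default: leave the stock implementation alone
    try:
        from flash_attn.losses.cross_entropy import CrossEntropyLoss
    except ImportError:
        # A missing optional dependency degrades to a log line, not a crash.
        LOG.info("optimized CrossEntropyLoss not found; keeping the stock class")
        return False
    target.CrossEntropyLoss = partial(CrossEntropyLoss, inplace_backward=True)
    return True
```

Called as `patch_cross_entropy(transformers.models.llama.modeling_llama, enabled=True)`, this mirrors what the guarded block in the patched function does.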
src/axolotl/utils/models.py

@@ -121,7 +121,11 @@ def load_model(
         )
 
         LOG.info("patching with flash attention for sample packing")
-        replace_llama_attn_with_flash_attn(packed=cfg.sample_packing)
+        replace_llama_attn_with_flash_attn(
+            packed=cfg.sample_packing,
+            cross_entropy=cfg.flash_attn_cross_entropy,
+            rms_norm=cfg.flash_attn_rms_norm,
+        )
     elif cfg.is_llama_derived_model and cfg.xformers_attention:
         from axolotl.monkeypatch.llama_attn_hijack_xformers import (
             hijack_llama_attention,
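For reference, a hedged sketch of calling the updated patch function directly, outside of `load_model`. It assumes `axolotl` and `transformers` are installed (and optionally `flash-attn`); the flag values shown are arbitrary examples, not recommended settings.

```python
import transformers.models.llama.modeling_llama as modeling_llama

from axolotl.monkeypatch.llama_attn_hijack_flash import (
    replace_llama_attn_with_flash_attn,
)

# Only the RMSNorm patch is opted into here; cross entropy stays stock.
replace_llama_attn_with_flash_attn(
    packed=False,
    cross_entropy=False,
    rms_norm=True,
)

# If flash-attn's fused RMSNorm imported successfully, this attribute now points
# at the patched subclass; otherwise it is still the stock LlamaRMSNorm.
print(modeling_llama.LlamaRMSNorm)
```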