kernel
drbh committed on
Commit c743a32 · 1 Parent(s): 4762963

fix: align kernel source with latest reference source

.gitignore CHANGED
@@ -1 +1,8 @@
-.bak
+.bak
+__pycache__
+build-ext
+cmake
+result
+CMakeLists.txt
+setup.py
+pyproject.toml
build.toml CHANGED
@@ -1,10 +1,12 @@
 [general]
 name = "flash_attn"
+universal=false
 
 [torch]
 src = ["torch-ext/torch_binding.cpp", "torch-ext/torch_binding.h"]
 
 [kernel.flash_attn]
+backend = "cuda"
 cuda-capabilities = [
   "8.0",
   "9.0",
@@ -13,6 +15,7 @@ cuda-capabilities = [
 ]
 src = [
   "flash_attn/flash_api.cpp",
+
   "flash_attn/src/philox_unpack.cuh",
   "flash_attn/src/namespace_config.h",
   "flash_attn/src/hardware_info.h",
@@ -21,29 +24,18 @@ src = [
   "flash_attn/src/alibi.h",
   "flash_attn/src/block_info.h",
   "flash_attn/src/dropout.h",
-  "flash_attn/src/flash.h",
-  "flash_attn/src/generate_kernels.py",
-  "flash_attn/src/hardware_info.h",
   "flash_attn/src/kernel_traits.h",
   "flash_attn/src/mask.h",
-  "flash_attn/src/namespace_config.h",
   "flash_attn/src/philox.cuh",
-  "flash_attn/src/philox_unpack.cuh",
   "flash_attn/src/rotary.h",
   "flash_attn/src/softmax.h",
-  "flash_attn/src/static_switch.h",
   "flash_attn/src/utils.h",
 
-  ## bwd kernels
-
+  # bwd kernels
   "flash_attn/src/flash_bwd_hdim128_bf16_causal_sm80.cu",
   "flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu",
   "flash_attn/src/flash_bwd_hdim128_fp16_causal_sm80.cu",
   "flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu",
-  "flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu",
-  "flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu",
-  "flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu",
-  "flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu",
   "flash_attn/src/flash_bwd_hdim192_bf16_causal_sm80.cu",
   "flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu",
   "flash_attn/src/flash_bwd_hdim192_fp16_causal_sm80.cu",
@@ -73,10 +65,6 @@ src = [
   "flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu",
   "flash_attn/src/flash_fwd_hdim128_fp16_causal_sm80.cu",
   "flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu",
-  "flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu",
-  "flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu",
-  "flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu",
-  "flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu",
   "flash_attn/src/flash_fwd_hdim192_bf16_causal_sm80.cu",
   "flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu",
   "flash_attn/src/flash_fwd_hdim192_fp16_causal_sm80.cu",
@@ -99,14 +87,12 @@ src = [
   "flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu",
   "flash_attn/src/flash_fwd_kernel.h",
   "flash_attn/src/flash_fwd_launch_template.h",
+
+  # split kernels
   "flash_attn/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu",
   "flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu",
   "flash_attn/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu",
   "flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu",
-  "flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu",
-  "flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu",
-  "flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu",
-  "flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu",
   "flash_attn/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu",
   "flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu",
   "flash_attn/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu",
flash_attn/flash_api.cpp CHANGED
@@ -432,7 +432,7 @@ mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x round_mult
     }
 
     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-    const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+    const int head_size_rounded = round_multiple(head_size, head_size <= 128 ? 32 : 64);
     const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
@@ -644,7 +644,7 @@ mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \s
     }
 
     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-    const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+    const int head_size_rounded = round_multiple(head_size, head_size <= 128 ? 32 : 64);
     const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
     const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
 
@@ -831,7 +831,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x multipl
     TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query");
 
     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-    const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+    const int head_size_rounded = round_multiple(head_size, head_size <= 128 ? 32 : 64);
     const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
@@ -1048,7 +1048,7 @@ mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size
     if (softcap > 0.f) { TORCH_CHECK(p_dropout == 0.f, "Softcapping does not support dropout for now"); }
 
     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-    const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+    const int head_size_rounded = round_multiple(head_size, head_size <= 128 ? 32 : 64);
     const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128);
     const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128);
 
@@ -1321,7 +1321,7 @@ mha_fwd_kvcache(at::Tensor &q, // batch_size x seqlen_q x num_he
 
     auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
     const int head_size = round_multiple(head_size_og, 8);
-    const int head_size_rounded = head_size <= 192 ? round_multiple(head_size, 32) : 256;
+    const int head_size_rounded = round_multiple(head_size, head_size <= 128 ? 32 : 64);
     const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
     const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
 
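Note on the head_size_rounded change above: the old expression rounded the head dim up to a multiple of 32 and clamped anything above 192 to 256, while the new expression rounds up to a multiple of 32 for head dims up to 128 and to a multiple of 64 beyond that. In particular, a head dim of 160 now rounds to 192, which lines up with the removal of the dedicated hdim160 kernels below. A small self-contained C++ check of the two formulas (illustrative only, not part of the commit):

// Compare the pre- and post-commit head_size rounding (formulas copied from the diff).
#include <cstdio>
#include <initializer_list>

int round_multiple(int x, int m) { return (x + m - 1) / m * m; }

int old_rounding(int head_size) {            // before this commit
    return head_size <= 192 ? round_multiple(head_size, 32) : 256;
}

int new_rounding(int head_size) {            // after this commit
    return round_multiple(head_size, head_size <= 128 ? 32 : 64);
}

int main() {
    for (int head_size : {40, 96, 128, 160, 200, 224}) {
        std::printf("head_size=%3d  old=%3d  new=%3d\n",
                    head_size, old_rounding(head_size), new_rounding(head_size));
    }
    // e.g. 160 -> 160 before vs 192 after; 40, 96, 128, 200, 224 are unchanged.
    return 0;
}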
flash_attn/src/flash_bwd_hdim160_bf16_causal_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_bwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_bwd_<cutlass::bfloat16_t, 160, true>(Flash_bwd_params &params, cudaStream_t stream) {
-    run_mha_bwd_hdim160<cutlass::bfloat16_t, true>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_bwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_bwd_<cutlass::bfloat16_t, 160, false>(Flash_bwd_params &params, cudaStream_t stream) {
-    run_mha_bwd_hdim160<cutlass::bfloat16_t, false>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_bwd_hdim160_fp16_causal_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_bwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_bwd_<cutlass::half_t, 160, true>(Flash_bwd_params &params, cudaStream_t stream) {
-    run_mha_bwd_hdim160<cutlass::half_t, true>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_bwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_bwd_<cutlass::half_t, 160, false>(Flash_bwd_params &params, cudaStream_t stream) {
-    run_mha_bwd_hdim160<cutlass::half_t, false>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_bwd_launch_template.h CHANGED
@@ -102,7 +102,7 @@ void run_flash_bwd_seqk_parallel(Flash_bwd_params &params, cudaStream_t stream)
         // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
         // If head dim > 128, set IsEvenMNConst to false to reduce number of templates
         // If Is_local, set Is_causal to false
-        auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap>;
+        auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && !Has_alibi && Kernel_traits::kHeadDim <= 128, IsEvenKConst && !Has_alibi, Is_softcap>;
         // auto kernel = &flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel<Kernel_traits, false, Is_causal, false, false, true, true>;
         if (smem_size_dq_dk_dv >= 48 * 1024) {
             C10_CUDA_CHECK(cudaFuncSetAttribute(
@@ -261,26 +261,6 @@ void run_mha_bwd_hdim128(Flash_bwd_params &params, cudaStream_t stream) {
     });
 }
 
-template<typename T, bool Is_causal>
-void run_mha_bwd_hdim160(Flash_bwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 160;
-    int device;
-    cudaGetDevice(&device);
-    int max_smem_per_block;
-    cudaError status_ = cudaDeviceGetAttribute(
-        &max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
-    if (status_ != cudaSuccess) {
-        C10_CUDA_CHECK(status_);
-    }
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        if (max_smem_per_block >= 116 * 1024) {
-            run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 4, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-        } else {
-            run_flash_bwd<Flash_bwd_kernel_traits<Headdim, 64, 64, 8, 4, 4, 4, false, true, T>, Is_dropout, Is_causal>(params, stream);
-        }
-    });
-}
-
 template<typename T, bool Is_causal>
 void run_mha_bwd_hdim192(Flash_bwd_params &params, cudaStream_t stream) {
     constexpr static int Headdim = 192;
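Besides deleting run_mha_bwd_hdim160, the only change to this launch template is the extra `!Has_alibi` terms in the kernel template arguments (the same edit appears in flash_fwd_launch_template.h further down). This follows the rationale already stated in the surrounding comments: constraining the even-MN / even-K flags forces the alibi path onto the generic (uneven) specialization, so the launcher references fewer distinct kernel instantiations. A toy, self-contained sketch of that effect, with made-up names rather than the real kernels:

// Count how many distinct template instantiations a BOOL_SWITCH-style
// dispatcher touches with and without the "&& !Has_alibi" constraint.
#include <cstdio>
#include <set>

template <bool HasAlibi, bool IsEvenMN, bool IsEvenK>
void toy_kernel() {}                     // stand-in for the real kernel template

using KernelPtr = void (*)();

template <bool HasAlibi>
KernelPtr pick(bool is_even_mn, bool is_even_k, bool constrain_by_alibi) {
    // With the constraint, enabling alibi forces the "uneven" code path,
    // mirroring the "&& !Has_alibi" terms added in this commit.
    const bool mn = is_even_mn && !(constrain_by_alibi && HasAlibi);
    const bool k  = is_even_k  && !(constrain_by_alibi && HasAlibi);
    if (mn) { return k ? &toy_kernel<HasAlibi, true, true>
                       : &toy_kernel<HasAlibi, true, false>; }
    return k ? &toy_kernel<HasAlibi, false, true>
             : &toy_kernel<HasAlibi, false, false>;
}

int main() {
    std::set<KernelPtr> before, after;
    for (int alibi = 0; alibi < 2; ++alibi)
        for (int mn = 0; mn < 2; ++mn)
            for (int k = 0; k < 2; ++k) {
                auto dispatch = alibi ? &pick<true> : &pick<false>;
                before.insert(dispatch(mn != 0, k != 0, /*constrain_by_alibi=*/false));
                after.insert(dispatch(mn != 0, k != 0, /*constrain_by_alibi=*/true));
            }
    // Unconstrained dispatch touches all 8 instantiations; constrained only 5.
    std::printf("distinct kernels referenced: before=%zu after=%zu\n",
                before.size(), after.size());
    return 0;
}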
flash_attn/src/flash_fwd_hdim160_bf16_causal_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_fwd_<cutlass::bfloat16_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream) {
-    run_mha_fwd_hdim160<cutlass::bfloat16_t, true>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_fwd_<cutlass::bfloat16_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream) {
-    run_mha_fwd_hdim160<cutlass::bfloat16_t, false>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_hdim160_fp16_causal_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_fwd_<cutlass::half_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream) {
-    run_mha_fwd_hdim160<cutlass::half_t, true>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu DELETED
@@ -1,14 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template<>
-void run_mha_fwd_<cutlass::half_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream) {
-    run_mha_fwd_hdim160<cutlass::half_t, false>(params, stream);
-}
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_launch_template.h CHANGED
@@ -76,7 +76,7 @@ void run_flash_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         // If return_softmax, set IsEvenMNConst to false to reduce number of templates
         // If head dim > 128, set IsEvenMNConst to false to reduce number of templates
         // If Is_local, set Is_causal to false
-        auto kernel = &flash_fwd_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && !ReturnSoftmaxConst && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap, ReturnSoftmaxConst && Is_dropout && !Is_softcap>;
+        auto kernel = &flash_fwd_kernel<Kernel_traits, Is_dropout && !Is_softcap, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && !Has_alibi && !ReturnSoftmaxConst && Kernel_traits::kHeadDim <= 128, IsEvenKConst && !ReturnSoftmaxConst && !Has_alibi, Is_softcap, ReturnSoftmaxConst && Is_dropout && !Is_softcap>;
         // auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, false, true, true, false>;
         // printf("IsEvenMNConst = %d, IsEvenKConst = %d, Is_local = %d, Is_causal = %d, ReturnSoftmaxConst = %d, Is_dropout = %d\n", int(IsEvenMNConst), int(IsEvenKConst), int(Is_local), int(Is_causal), int(ReturnSoftmaxConst), int(Is_dropout));
         // auto kernel = &flash_fwd_kernel<Kernel_traits, false, Is_causal, false, true, true, false>;
@@ -117,7 +117,7 @@ void run_flash_splitkv_fwd(Flash_fwd_params &params, cudaStream_t stream) {
         // If Append_KV, then we must have seqlen_offsets, which means cu_seqlens_k != nullptr.
         // If not IsEvenKConst, we also set IsEvenMNConst to false to reduce number of templates.
         // If Is_local, set Is_causal to false
-        auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && !Append_KV && IsEvenKConst && !Is_local && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap, Split, Append_KV>;
+        auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && !Append_KV && IsEvenKConst && !Is_local && !Has_alibi && Kernel_traits::kHeadDim <= 128, IsEvenKConst && !Has_alibi, Is_softcap, Split, Append_KV>;
         // auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, false, true, Split, Append_KV>;
         // auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, false, IsEvenKConst>;
         if (smem_size >= 48 * 1024) {
@@ -165,7 +165,6 @@ void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream)
     constexpr static int kBlockM = 64;  // Fixed for all head dimensions
     // TD [2023-08-28]: nvcc segfaults for headdim 96 with block size 64 x 256,
     // and for headdim 192 with block size 64 x 128.
-    // Also for headdim 160 with block size 64 x 128 after the rotary addition.
     constexpr static int kBlockN = Headdim <= 64 ? 256 : (Headdim <= 128 ? 128 : 64);
     run_flash_splitkv_fwd<Flash_fwd_kernel_traits<Headdim, kBlockM, kBlockN, 4, false, false, T>, Is_causal>(params, stream);
 }
@@ -257,34 +256,6 @@ void run_mha_fwd_hdim128(Flash_fwd_params &params, cudaStream_t stream) {
     });
 }
 
-template<typename T, bool Is_causal>
-void run_mha_fwd_hdim160(Flash_fwd_params &params, cudaStream_t stream) {
-    constexpr static int Headdim = 160;
-    auto [cc_major, cc_minor] = get_compute_capability(get_current_device());
-    bool is_sm8x = cc_major == 8 && cc_minor > 0;
-    DROPOUT_SWITCH(params.p_dropout < 1.f, Is_dropout, [&] {
-        // For A100, H100, 128 x 32 is the fastest.
-        // For sm86 or sm89, 64 x 64 is the fastest for causal (because it's square),
-        // and 128 x 64 with 8 warps is the fastest for non-causal.
-        if (is_sm8x) {
-            if constexpr(!Is_causal) {
-                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, false, T>, Is_dropout, Is_causal>(params, stream);
-            } else {
-                run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-            }
-        } else {
-            run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-        }
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 32, 4, false, true, T>, Is_dropout, Is_causal>(params, stream);
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, false, T>, Is_dropout, Is_causal>(params, stream);
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 4, false, T>>(params, stream);
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 128, 4, false, T>>(params, stream);
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 64, 64, 4, false, T>>(params, stream);
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 64, 8, false, T>>(params, stream);
-        // run_flash_fwd<Flash_fwd_kernel_traits<Headdim, 128, 128, 8, false, T>>(params, stream);
-    });
-}
-
 template<typename T, bool Is_causal>
 void run_mha_fwd_hdim192(Flash_fwd_params &params, cudaStream_t stream) {
     constexpr static int Headdim = 192;
flash_attn/src/flash_fwd_split_hdim160_bf16_causal_sm80.cu DELETED
@@ -1,11 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream);
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu DELETED
@@ -1,11 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream);
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_split_hdim160_fp16_causal_sm80.cu DELETED
@@ -1,11 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 160, true>(Flash_fwd_params &params, cudaStream_t stream);
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu DELETED
@@ -1,11 +0,0 @@
-// Copyright (c) 2024, Tri Dao.
-// Splitting the different head dimensions to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"
-#include "namespace_config.h"
-#include "flash_fwd_launch_template.h"
-
-namespace FLASH_NAMESPACE {
-
-template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 160, false>(Flash_fwd_params &params, cudaStream_t stream);
-
-} // namespace FLASH_NAMESPACE
flash_attn/src/generate_kernels.py CHANGED
@@ -10,7 +10,7 @@ DTYPE_MAP = {
 }
 
 SM = [80] # Sm80 kernels support up to
-HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256]
+HEAD_DIMENSIONS = [32, 64, 96, 128, 192, 256]
 IS_CAUSAL = ["false", "true"]
 NAMESPACE_INCLUDE = '#include "namespace_config.h"\n'
 
flash_attn/src/static_switch.h CHANGED
@@ -101,9 +101,6 @@
   } else if (HEADDIM <= 128) {            \
     constexpr static int kHeadDim = 128;  \
     return __VA_ARGS__();                 \
-  } else if (HEADDIM <= 160) {            \
-    constexpr static int kHeadDim = 160;  \
-    return __VA_ARGS__();                 \
   } else if (HEADDIM <= 192) {            \
     constexpr static int kHeadDim = 192;  \
     return __VA_ARGS__();                 \
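With the 160 case gone from HEADDIM_SWITCH, any head dim in (128, 192] now falls through to the 192 branch, matching the updated HEAD_DIMENSIONS list in generate_kernels.py and the new head_size rounding in flash_api.cpp. A minimal standalone C++ sketch of the resulting dispatch, assuming the branches outside this hunk cover 32/64/96 and 256 as HEAD_DIMENSIONS suggests:

// Function-based sketch of the post-change head-dim dispatch: the callback
// receives the selected head dim as a compile-time constant.
#include <cstdio>
#include <type_traits>

template <typename F>
void headdim_switch(int headdim, F&& f) {
    if (headdim <= 32)       { f(std::integral_constant<int, 32>{});  }
    else if (headdim <= 64)  { f(std::integral_constant<int, 64>{});  }
    else if (headdim <= 96)  { f(std::integral_constant<int, 96>{});  }
    else if (headdim <= 128) { f(std::integral_constant<int, 128>{}); }
    else if (headdim <= 192) { f(std::integral_constant<int, 192>{}); }  // 160 lands here now
    else                     { f(std::integral_constant<int, 256>{}); }
}

int main() {
    headdim_switch(160, [](auto kHeadDim) {
        std::printf("head dim 160 dispatches to the %d kernels\n", int(kHeadDim));
    });
    return 0;
}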