Various small fixes
flake.lock CHANGED

@@ -98,11 +98,11 @@
       ]
     },
     "locked": {
-      "lastModified":
-      "narHash": "sha256-
+      "lastModified": 1751014803,
+      "narHash": "sha256-9Xfq2k3uPfB602NwQF+zAY2GQZiKUN1G7Q6XiDCUR8Y=",
       "owner": "huggingface",
       "repo": "kernel-builder",
-      "rev": "
+      "rev": "bbc4e712ff2046e217818e97de2201e2b996756e",
       "type": "github"
     },
     "original": {
flake.nix CHANGED

@@ -13,5 +13,37 @@
     kernel-builder.lib.genFlakeOutputs {
       path = ./.;
       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+      # Building with CUDA later than 12.4 fails with:
+      #
+      # error: 'ptxas' died due to signal 11 (Invalid memory reference)
+      #
+      # So, build for 12.4 only and copy to all the other build variants
+      # by hand (which works fine thanks to backward compat).
+      torchVersions = [
+        {
+          torchVersion = "2.6";
+          cudaVersion = "12.4";
+          cxx11Abi = false;
+          systems = [ "x86_64-linux" ];
+          upstreamVariant = true;
+        }
+        {
+          torchVersion = "2.6";
+          cudaVersion = "12.4";
+          cxx11Abi = true;
+          systems = [ "x86_64-linux" ];
+          upstreamVariant = true;
+        }
+        {
+          torchVersion = "2.7";
+          cudaVersion = "12.4";
+          cxx11Abi = true;
+          systems = [
+            "x86_64-linux"
+            "aarch64-linux"
+          ];
+          upstreamVariant = true;
+        }
+      ];
     };
 }
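The "copy to all the other build variants by hand" step mentioned in the comment can be scripted. Below is a minimal sketch under stated assumptions: the variant directory names are hypothetical examples of kernel-builder's `build/<variant>` layout, and the real target list depends on which variants the repository actually publishes.

```python
import shutil
from pathlib import Path

# Hypothetical variant names; the actual build/<variant> directories are
# produced by kernel-builder and may differ.
source = Path("build/torch27-cxx11-cu124-x86_64-linux")
targets = [
    Path("build/torch27-cxx11-cu126-x86_64-linux"),
    Path("build/torch27-cxx11-cu128-x86_64-linux"),
]

# Binaries built against CUDA 12.4 keep working under later toolkits
# (backward compat), so a plain copy substitutes for a native build.
for target in targets:
    shutil.copytree(source, target, dirs_exist_ok=True)
```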
torch-ext/{flash_attn → flash_attn3}/__init__.py RENAMED
File without changes

torch-ext/{flash_attn → flash_attn3}/flash_attn_interface.py RENAMED
File without changes
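Only the module path changes; callers update their imports accordingly. A sketch, assuming `flash_attn_func` is among the names the renamed interface module exposes:

```python
# Before the rename (hypothetical caller):
# from flash_attn.flash_attn_interface import flash_attn_func
# After the rename:
from flash_attn3.flash_attn_interface import flash_attn_func
```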
torch-ext/torch_binding.cpp CHANGED

@@ -5,7 +5,7 @@
 #include "torch_binding.h"
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-
+  ops.def("fwd("
           "Tensor q,"
           "Tensor k,"
           "Tensor v,"
@@ -40,7 +40,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
           "int num_splits = 0,"
           "bool? pack_gqa = None,"
           "int sm_margin = 0) -> (Tensor(out!), Tensor, Tensor, Tensor)");
-
+  ops.def("bwd("
           "Tensor dout,"
           "Tensor q,"
           "Tensor k,"
@@ -63,12 +63,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
           "float softcap = 0.0,"
           "bool deterministic = False,"
           "int sm_margin = 0) -> (Tensor(dq!), Tensor(dk!), Tensor(dv!), Tensor, Tensor, Tensor, Tensor, Tensor)");
-
+  ops.def("fwd_combine("
           "Tensor out_partial,"
           "Tensor lse_partial,"
           "Tensor(out!)? out = None,"
           "ScalarType? out_dtype = None) -> (Tensor(out!), Tensor)");
-
+  ops.def("get_scheduler_metadata("
           "int batch_size,"
           "int max_seqlen_q,"
           "int max_seqlen_k,"
@@ -94,10 +94,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
           "bool? pack_gqa = None,"
           "int sm_margin = 0) -> Tensor");
 
-
-
-
-
+  ops.impl("fwd", &mha_fwd);
+  ops.impl("bwd", &mha_bwd);
+  ops.impl("fwd_combine", &mha_combine);
+  ops.impl("get_scheduler_metadata", &mha_fwd_get_scheduler_metadata);
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
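The `ops.def`/`ops.impl` pairs register the kernels with the PyTorch dispatcher under whatever namespace `TORCH_EXTENSION_NAME` expands to at build time. A minimal sketch of inspecting the result, assuming the built library loads under a hypothetical `flash_attn3` namespace and path:

```python
import torch

# Hypothetical library path and namespace; both are fixed at build time
# (TORCH_EXTENSION_NAME), not by this snippet.
torch.ops.load_library(
    "build/torch27-cxx11-cu124-x86_64-linux/flash_attn3/_flash_attn3.so"
)

# Every ops.def(...) above becomes a schema the dispatcher can report back.
for name in ("fwd", "bwd", "fwd_combine", "get_scheduler_metadata"):
    op = getattr(torch.ops.flash_attn3, name)
    print(op.default._schema)  # e.g. flash_attn3::fwd(Tensor q, Tensor k, ...
```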