feat: vendor grouped gemm

Files changed (14) hide show

build.toml +4 -0
csrc/grouped_gemm/fill_arguments.cuh +141 -0
csrc/grouped_gemm/grouped_gemm.cu +567 -0
csrc/grouped_gemm/grouped_gemm.h +20 -0
csrc/grouped_gemm/ops.cu +11 -0
tests/ops_test.py +170 -0
tests/test_gg.py +57 -0
torch-ext/megablocks/__init__.py +9 -5
torch-ext/megablocks/grouped_gemm/__init__.py +2 -0
torch-ext/megablocks/grouped_gemm/backend.py +32 -0
torch-ext/megablocks/grouped_gemm/ops.py +33 -0
torch-ext/megablocks/grouped_gemm_util.py +8 -3
torch-ext/megablocks/layers/__init__.py +1 -1
torch-ext/torch_binding.cpp +12 -0

build.toml CHANGED Viewed

@@ -35,4 +35,8 @@ src = [
     "csrc/new_replicate.h",
     "csrc/new_sort.h",
     "csrc/new_sort.cu",
 ]

     "csrc/new_replicate.h",
     "csrc/new_sort.h",
     "csrc/new_sort.cu",
+    # vendored grouped gemm
+    "csrc/grouped_gemm/fill_arguments.cuh",
+    "csrc/grouped_gemm/grouped_gemm.cu",
+    "csrc/grouped_gemm/grouped_gemm.h",
 ]

csrc/grouped_gemm/fill_arguments.cuh ADDED Viewed

	@@ -0,0 +1,141 @@

+#pragma once
+#include <ATen/cuda/detail/KernelUtils.h>
+#include <cub/cub.cuh>
+#include <cutlass/bfloat16.h>
+#include <cutlass/gemm_coord.h>
+namespace grouped_gemm {
+constexpr int kDynamicDim = -1;
+constexpr int kMaxExperts = 512;
+struct GemmProblem {
+  ::cutlass::gemm::GemmCoord dims;
+  int64_t lda, ldb, ldc;
+  // All offsets are in elements.
+  int64_t a_offset, b_offset, c_offset;
+};
+// TODO: revisit `ExtractGemmProblemK` struct
+// struct ExtractGemmProblemK {
+//   __device__ ::cuda::std::tuple<int&> operator()(GemmProblem& problem) const {
+//       return {problem.dims.k()};
+//   }
+// };
+template <
+    // If `k` is dynamic, we sort the problems by `k` in descending order.
+    // Otherwise, `m` is dynamic, and no sorting happens.
+    bool kDynamicK,
+    typename ElementA, typename ElementB, typename ElementC,
+    typename LayoutA, typename LayoutB, typename LayoutC,
+    typename Args
+>
+__global__ void FillArguments(
+    int num_experts, const int64_t* batch_sizes,
+    ElementA* ptr_a, ElementB* ptr_b, ElementC* ptr_c,
+    Args args, ::cutlass::gemm::GemmCoord dims
+) {
+  const int expert_idx = threadIdx.x;
+  const int batch_size = expert_idx < num_experts ? batch_sizes[expert_idx] : -1;
+  if (kDynamicK) {
+    assert(dims.k() == kDynamicDim);
+    dims.k() = batch_size;
+  } else {
+    assert(dims.m() == kDynamicDim);
+    dims.m() = batch_size;
+  }
+  using BlockScan = cub::BlockScan<int, kMaxExperts>;
+  using BlockSort = cub::BlockRadixSort<int, kMaxExperts, 1, GemmProblem>;
+  union SharedMemory {
+    typename BlockScan::TempStorage scan_storage;
+    typename BlockSort::TempStorage sort_storage;
+  };
+  __shared__ SharedMemory shared_memory;
+  int dynamic_dim = kDynamicK ? dims.k() : dims.m();
+  int dynamic_dim_cumsum;
+  BlockScan(shared_memory.scan_storage).ExclusiveSum(dynamic_dim, dynamic_dim_cumsum);
+  __syncthreads();
+  // We have to use `GemmProblem[1]` here instead of just `GemmProblem` because `SortDescending()` expects
+  // `KeyT (&)[ITEMS_PER_THREAD]` for the `keys` argument (i.e., `GemmProblem (&keys)[1]` in our case).
+  GemmProblem problem[1] = {
+    GemmProblem {
+      .dims = dims,
+      .lda = LayoutA::packed({dims.m(), dims.k()}).stride(0),
+      .ldb = LayoutB::packed({dims.k(), dims.n()}).stride(0),
+      .ldc = LayoutC::packed({dims.m(), dims.n()}).stride(0),
+      .a_offset = kDynamicK
+          ? (dims.m() * dynamic_dim_cumsum)
+          : (dynamic_dim_cumsum * dims.k()),
+      .b_offset = (kDynamicK ? dynamic_dim_cumsum : expert_idx * dims.k()) * dims.n(),
+      .c_offset = (kDynamicK ? expert_idx * dims.m() : dynamic_dim_cumsum) * dims.n(),
+    },
+  };
+  if constexpr (kDynamicK) {
+    // Sort by k dimension in descending order
+    // We need to extract the key (k value) for sorting
+    int k_keys[1] = { problem[0].dims.k() };
+    BlockSort(shared_memory.sort_storage).SortDescending(k_keys, problem);
+    // TODO: revisit original impl without `__syncthreads()`
+    // BlockSort(shared_memory.sort_storage).SortDescending(problem, ExtractGemmProblemK{});
+    // Quoting the CUB documentation (https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockRadixSort.html):
+    // > A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective’s temporary storage [...]
+    // > is **to be reused or repurposed**.
+    // We don't need `__syncthreads()` here, since we don't do either of these things.
+  }
+  if (expert_idx < num_experts) {
+    args.problem_sizes[expert_idx] = problem[0].dims;
+    args.lda[expert_idx] = problem[0].lda;
+    args.ldb[expert_idx] = problem[0].ldb;
+    args.ldc[expert_idx] = problem[0].ldc;
+    args.ptr_A[expert_idx] = ptr_a + problem[0].a_offset;
+    args.ptr_B[expert_idx] = ptr_b + problem[0].b_offset;
+    args.ptr_C[expert_idx] = ptr_c + problem[0].c_offset;
+  }
+}
+template <typename Args>
+__global__ void ZeroOutK0Outputs(int num_experts, Args args) {
+  const int64_t start_idx = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+  const int64_t delta     = (int64_t)gridDim.x * blockDim.x;
+  for (int ei = 0; ei < num_experts; ++ei) {
+    auto& dims = args.problem_sizes[ei];
+    // CUTLASS doesn't handle problems with `k=0` correctly, see https://github.com/NVIDIA/cutlass/pull/1593.
+    // Until a fix is available on the CUTLASS side, handle these problems by ourselves:
+    //   * (here) set the output to zero
+    //   * (in `IgnoreK0Problems`) make this problem a no-op by setting `m=0` and `n=0` (CUTLASS can handle the outer dimensions being zero)
+    if (dims.k() == 0) {
+      // Assume packed layout, run a grid-strided loop over the output.
+      int64_t total_elems = (int64_t)dims.m() * dims.n();
+      auto* out           = args.ptr_C[ei];
+      for (int64_t idx = start_idx; idx < total_elems; idx += delta) {
+        out[idx] = {};
+      }
+    }
+  }
+}
+template <typename Args>
+__global__ void IgnoreK0Problems(int num_experts, Args args) {
+  const int expert_idx = threadIdx.x;
+  if (expert_idx < num_experts) {
+    auto& dims = args.problem_sizes[expert_idx];
+    if (dims.k() == 0) {
+      dims.m() = 0;
+      dims.n() = 0;
+    }
+  }
+}
+}  // namespace grouped_gemm

csrc/grouped_gemm/grouped_gemm.cu ADDED Viewed

	@@ -0,0 +1,567 @@

+#include "grouped_gemm.h"
+#include "fill_arguments.cuh"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/KernelUtils.h>
+#include <c10/util/BFloat16.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cub/cub.cuh>
+#include <torch/torch.h>
+#include "cutlass/bfloat16.h"
+#include "cutlass/complex.h"
+#include "cutlass/gemm/kernel/gemm_grouped.h"
+#include "cutlass/gemm/kernel/default_gemm_grouped.h"
+#include "cutlass/gemm/device/gemm_grouped.h"
+#include <type_traits>
+namespace grouped_gemm {
+#define CUDA_CALL(code)					    \
+  do {                                                      \
+    cudaError_t status = code;                              \
+    std::string err = cudaGetErrorString(status);           \
+    TORCH_CHECK(status == cudaSuccess, err);		    \
+  } while (0)
+#define CUBLAS_CALL(code)					  \
+  do {								  \
+    cublasStatus_t status = code;				  \
+    TORCH_CHECK(status == CUBLAS_STATUS_SUCCESS, "CuBLAS Error"); \
+  } while (0)
+#define GROUPED_GEMM_STRINGIFY_HELPER(x) #x
+#define GROUPED_GEMM_STRINGIFY(x) \
+  GROUPED_GEMM_STRINGIFY_HELPER(x)
+template <bool trans>
+using GroupedGemmInputLayout = std::conditional_t<trans, ::cutlass::layout::ColumnMajor, ::cutlass::layout::RowMajor>;
+using GroupedGemmConfig = ::cutlass::gemm::device::DefaultGemmConfiguration<
+  ::cutlass::arch::OpClassTensorOp,
+  ::cutlass::arch::Sm80,
+  ::cutlass::bfloat16_t,
+  ::cutlass::bfloat16_t,
+  ::cutlass::bfloat16_t,
+  float
+>;
+// TODO(tgale): Update this for SM90 when it's supported by CUTLASS.
+template <bool trans_a, bool trans_b>
+using GroupedGemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+  // A operand.
+  ::cutlass::bfloat16_t,
+  GroupedGemmInputLayout<trans_a>,
+  ::cutlass::ComplexTransform::kNone,
+  GroupedGemmConfig::kAlignmentA,
+  // B operand.
+  ::cutlass::bfloat16_t,
+  GroupedGemmInputLayout<trans_b>,
+  ::cutlass::ComplexTransform::kNone,
+  GroupedGemmConfig::kAlignmentB,
+  // C operand.
+  ::cutlass::bfloat16_t,
+  ::cutlass::layout::RowMajor,
+  float,
+  ::cutlass::arch::OpClassTensorOp,
+  ::cutlass::arch::Sm80,
+  GroupedGemmConfig::ThreadblockShape,
+  GroupedGemmConfig::WarpShape,
+  GroupedGemmConfig::InstructionShape,
+  GroupedGemmConfig::EpilogueOutputOp,
+  // NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels.
+  // This parameter is passed in at present to match the APIs of other kernels. The parameter
+  // is unused within the kernel.
+  ::cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+  // TODO(tgale): Tune this for SM90.
+  GroupedGemmConfig::kStages>::GemmKernel;
+template <bool trans_a, bool trans_b>
+using GemmGrouped = ::cutlass::gemm::device::GemmGrouped<GroupedGemmKernel<trans_a, trans_b>>;
+template <typename T>
+torch::Tensor CopyToDevice(const std::vector<T> &x, const torch::Device &device) {
+  size_t bytes = x.size() * sizeof(T);
+  auto options = torch::TensorOptions().dtype(torch::kInt8).device(device);
+  torch::Tensor out = torch::empty(bytes, options);
+  CUDA_CALL(cudaMemcpyAsync(out.data_ptr(),
+			    x.data(), bytes,
+			    cudaMemcpyHostToDevice,
+			    c10::cuda::getCurrentCUDAStream()));
+  return out;
+}
+template <typename T>
+static void ReorderArray(T* data, const std::vector<size_t>& indices) {
+    // For now, simply create a copy of the data and then copy over to the original.
+    std::vector<T> copy(data, data + indices.size());
+    for (size_t i = 0; i < indices.size(); ++i) {
+        data[i] = copy.at(indices[i]);
+    }
+}
+template <typename T>
+torch::Tensor TypedEmpty(size_t numel, const torch::Device& device) {
+    return torch::empty(numel * sizeof(T), torch::dtype(torch::kInt8).device(device));
+}
+struct RawGemmArguments {
+  torch::Tensor lda, ldb, ldc, ptr_a, ptr_b, ptr_c, problem_sizes;
+  int threadblock_count{};
+};
+template <
+  typename Gemm,
+  typename ElementA, typename ElementB, typename ElementC
+>
+RawGemmArguments MakeArgumentsOnDevice(int num_experts, const torch::Device& device) {
+    TORCH_CHECK(
+        num_experts <= kMaxExperts,
+        "At most ", kMaxExperts,
+        " experts are supported when batch_sizes is a CUDA tensor, but got ", num_experts
+    );
+    return RawGemmArguments {
+      .lda = TypedEmpty<int64_t>(num_experts, device),
+      .ldb = TypedEmpty<int64_t>(num_experts, device),
+      .ldc = TypedEmpty<int64_t>(num_experts, device),
+      .ptr_a = TypedEmpty<ElementA*>(num_experts, device),
+      .ptr_b = TypedEmpty<ElementB*>(num_experts, device),
+      .ptr_c = TypedEmpty<ElementC*>(num_experts, device),
+      .problem_sizes = TypedEmpty<cutlass::gemm::GemmCoord>(num_experts, device),
+      // We don't know the problem dimensions on the host, so we just base the number of threadblocks on occupancy here.
+      .threadblock_count = Gemm::sufficient(),
+    };
+}
+template <
+  bool kDynamicK,
+  typename Gemm,
+  typename ElementA, typename ElementB, typename ElementC,
+  typename LayoutA, typename LayoutB, typename LayoutC
+>
+RawGemmArguments MakeArgumentsOnHost(torch::Tensor a,
+				     torch::Tensor b,
+				     torch::Tensor c,
+				     torch::Tensor batch_sizes,
+				     ::cutlass::gemm::GemmCoord coord_template,
+				     int64_t num_experts) {
+  std::vector<::cutlass::gemm::GemmCoord> problem_sizes_host(num_experts);
+  // Create the host arrays of leading dimension data and pointer data.
+  std::vector<int64_t> lda_host(num_experts), ldb_host(num_experts), ldc_host(num_experts);
+  int64_t elements_a = 0, elements_b = 0, elements_c = 0;
+  std::vector<ElementA *> ptr_a_host(num_experts), ptr_b_host(num_experts), ptr_c_host(num_experts);
+  for (int i = 0; i < num_experts; ++i) {
+    auto& problem = problem_sizes_host[i];
+    problem = coord_template;
+    (kDynamicK ? problem.k() : problem.m()) = batch_sizes.data_ptr<int64_t>()[i];
+    lda_host[i] = LayoutA::packed({problem.m(), problem.k()}).stride(0);
+    ldb_host[i] = LayoutB::packed({problem.k(), problem.n()}).stride(0);
+    ldc_host[i] = LayoutC::packed({problem.m(), problem.n()}).stride(0);
+    ptr_a_host[i] = (ElementA*)a.data_ptr() + elements_a;
+    ptr_b_host[i] = (ElementB*)b.data_ptr() + elements_b;
+    ptr_c_host[i] = (ElementC*)c.data_ptr() + elements_c;
+    elements_a += problem.m() * problem.k();
+    elements_b += problem.k() * problem.n();
+    elements_c += problem.m() * problem.n();
+    if (problem.k() == 0) {
+      // CUTLASS doesn't handle problems with `k=0` correctly, see https://github.com/NVIDIA/cutlass/pull/1593.
+      // Until a fix is available on the CUTLASS side, handle these problems by ourselves:
+      //   * set the output to zero with `cudaMemsetAsync()`
+      //   * make this problem a no-op by setting `m=0` and `n=0` (CUTLASS can handle the outer dimensions being zero)
+      CUDA_CALL(cudaMemsetAsync(ptr_c_host[i],
+        0,
+        problem.m() * problem.n() * sizeof(ElementC),
+        c10::cuda::getCurrentCUDAStream()));
+      problem.m() = 0;
+      problem.n() = 0;
+    }
+  }
+  // Only sort problems when K are different
+  if (kDynamicK) {
+      std::vector<size_t> indices(num_experts);
+      std::iota(indices.begin(), indices.end(), 0);
+      std::stable_sort(indices.begin(), indices.end(), [&problem_sizes_host](size_t i, size_t j) {
+          return problem_sizes_host[i].k() > problem_sizes_host[j].k();
+      });
+      ReorderArray(problem_sizes_host.data(), indices);
+      ReorderArray(lda_host.data(), indices);
+      ReorderArray(ldb_host.data(), indices);
+      ReorderArray(ldc_host.data(), indices);
+      ReorderArray(ptr_a_host.data(), indices);
+      ReorderArray(ptr_b_host.data(), indices);
+      ReorderArray(ptr_c_host.data(), indices);
+  }
+  // Copy the problem sizes, pointers and leading dimension data to the device.
+  return RawGemmArguments {
+    .lda = CopyToDevice(lda_host, a.device()),
+    .ldb = CopyToDevice(ldb_host, a.device()),
+    .ldc = CopyToDevice(ldc_host, a.device()),
+    .ptr_a = CopyToDevice(ptr_a_host, a.device()),
+    .ptr_b = CopyToDevice(ptr_b_host, a.device()),
+    .ptr_c = CopyToDevice(ptr_c_host, a.device()),
+    .problem_sizes = CopyToDevice(problem_sizes_host, a.device()),
+    // We know the problem dimensions on the host, so we can calculate the number of threadblocks based on that.
+    .threadblock_count = Gemm::sufficient(problem_sizes_host.data(), num_experts),
+  };
+}
+template <
+  bool kDynamicK,
+  typename Gemm,
+  typename ElementA, typename ElementB, typename ElementC,
+  typename LayoutA, typename LayoutB, typename LayoutC
+>
+typename Gemm::Arguments MakeArguments(torch::Tensor a,
+				       torch::Tensor b,
+				       torch::Tensor c,
+				       torch::Tensor batch_sizes,
+				       ::cutlass::gemm::GemmCoord coord_template,
+				       int64_t num_experts) {
+  RawGemmArguments raw_args;
+  if (batch_sizes.is_cuda()) {
+    raw_args = MakeArgumentsOnDevice<
+      Gemm, ElementA, ElementB, ElementC
+    >(num_experts, a.device());
+  } else {
+    raw_args = MakeArgumentsOnHost<
+      kDynamicK,
+      Gemm,
+      ElementA, ElementB, ElementC,
+      LayoutA, LayoutB, LayoutC
+    >(a, b, c, batch_sizes, coord_template, num_experts);
+  }
+  printf("Using %d threadblocks for grouped GEMM.\n", raw_args.threadblock_count);
+  // Validate the result.
+  if (!raw_args.threadblock_count) {
+    TORCH_CHECK(false, "Grouped GEMM execution not possible with HW");
+  }
+  typename Gemm::EpilogueOutputOp::Params epilogue_op(/*alpha=*/1.0f, /*beta=*/0.0f);
+  // We currently always use `GroupScheduleMode::kDeviceOnly`, which doesn't use `host_problem_sizes` at all,
+  // so we can safely pass `nullptr` for `host_problem_sizes`.
+  // TODO(tgale): Experiment with `GroupScheduleMode::kHostPrecompute` for `batch_sizes.is_cpu()`, where we
+  // know the problem dimensions on the host.
+  typename Gemm::Arguments arguments((cutlass::gemm::GemmCoord*)raw_args.problem_sizes.data_ptr(),
+				     (int)num_experts,
+				     (int)raw_args.threadblock_count,
+				     epilogue_op,
+				     (ElementA**)raw_args.ptr_a.data_ptr(),
+				     (ElementB**)raw_args.ptr_b.data_ptr(),
+				     (ElementC**)raw_args.ptr_c.data_ptr(),
+				     (ElementC**)raw_args.ptr_c.data_ptr(),
+				     /*lda=*/(int64_t*)raw_args.lda.data_ptr(),
+				     /*ldb=*/(int64_t*)raw_args.ldb.data_ptr(),
+				     /*ldc=*/(int64_t*)raw_args.ldc.data_ptr(),
+				     /*ldd=*/(int64_t*)raw_args.ldc.data_ptr(),
+				     /*host_problem_sizes=*/nullptr);
+  return arguments;
+}
+template <
+  bool trans_a,
+  typename ElementA, typename ElementB, typename ElementC,
+  typename LayoutA, typename LayoutB, typename LayoutC,
+  typename Arguments
+>
+void FillCutlassArguments(int num_experts,
+			  torch::Tensor batch_sizes,
+			  torch::Tensor a,
+			  torch::Tensor b,
+			  torch::Tensor c,
+			  const Arguments& arguments,
+			  ::cutlass::gemm::GemmCoord coord_template) {
+  // Convert the batch sizes to the format CUTLASS understands on the device.
+  // Use a single block here because:
+  //   * the number of elements to process is microscopically small
+  //   * we don't need any additional global memory
+  FillArguments<
+      /*kDynamicK*/trans_a,
+      ElementA, ElementB, ElementC,
+      LayoutA, LayoutB, LayoutC
+  ><<<1, kMaxExperts, 0, c10::cuda::getCurrentCUDAStream()>>>(
+      num_experts, batch_sizes.data_ptr<int64_t>(),
+      (ElementA*)a.data_ptr(), (ElementB*)b.data_ptr(), (ElementC*)c.data_ptr(),
+      arguments, coord_template
+  );
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+template <typename Args>
+void RemoveK0Problems(int num_experts, const Args& arguments) {
+  // For zeroing out the outputs (which might be arbitrarily large), we want to use
+  // as many threadblocks as possible in order to hit the maximum possible global memory bandwidth.
+  // `arguments.threadblock_count`, which we will use for the grouped GEMM proper,
+  // should be a good approximation for this.
+  // When the `k=0` case is fixed in CUTLASS, we can completely remove this function.
+  ZeroOutK0Outputs<><<<
+    arguments.threadblock_count, at::cuda::detail::CUDA_NUM_THREADS, 0, c10::cuda::getCurrentCUDAStream()
+  >>>(
+    num_experts, arguments
+  );
+  IgnoreK0Problems<><<<
+    1, kMaxExperts, 0, c10::cuda::getCurrentCUDAStream()
+  >>>(
+    num_experts, arguments
+  );
+}
+template <bool trans_a, bool trans_b>
+torch::Tensor CutlassGroupedGemm(torch::Tensor a,
+				 torch::Tensor b,
+				 torch::Tensor c,
+				 torch::Tensor batch_sizes,
+				 ::cutlass::gemm::GemmCoord coord_template) {
+  using Gemm = GemmGrouped<trans_a, trans_b>;
+  using LayoutA = typename Gemm::LayoutA;
+  using LayoutB = typename Gemm::LayoutB;
+  using LayoutC = typename Gemm::LayoutC;
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementC = typename Gemm::ElementC;
+  Gemm gemm;
+  int64_t num_experts = batch_sizes.size(0);
+  auto arguments = MakeArguments<
+    /*kDynamicK*/trans_a,
+    Gemm,
+    ElementA, ElementB, ElementC,
+    LayoutA, LayoutB, LayoutC
+  >(a, b, c, batch_sizes, coord_template, num_experts);
+  int64_t workspace_size = gemm.get_workspace_size(arguments);
+  auto options = torch::TensorOptions().dtype(torch::kInt8).device(a.device());
+  torch::Tensor workspace = torch::empty(workspace_size, options);
+  if (batch_sizes.is_cuda()) {
+      FillCutlassArguments<
+        trans_a,
+        ElementA, ElementB, ElementC,
+        LayoutA, LayoutB, LayoutC
+      >(num_experts, batch_sizes, a, b, c, arguments, coord_template);
+      RemoveK0Problems<>(num_experts, arguments);
+  }
+  // Initialize the kernel.
+  if(gemm.initialize(arguments, workspace.data_ptr()) != cutlass::Status::kSuccess) {
+    TORCH_CHECK(false, "Failed to initialize CUTLASS Grouped GEMM");
+  }
+  // Execute the kernel in the current stream.
+  if(gemm.run(c10::cuda::getCurrentCUDAStream()) != cutlass::Status::kSuccess) {
+    TORCH_CHECK(false, "Failed to run CUTLASS Grouped GEMM");
+  }
+  return c;
+}
+void CublasGemm(c10::BFloat16 *a, int64_t a_rows, int64_t a_cols, bool trans_a,
+		c10::BFloat16 *b, int64_t b_rows, int64_t b_cols, bool trans_b,
+		c10::BFloat16 *c, int64_t c_rows, int64_t c_cols) {
+  int m = trans_b ? b_rows : b_cols;
+  int k = trans_b ? b_cols : b_rows;
+  int n = trans_a ? a_cols : a_rows;
+  int lda = trans_a ? n : k;
+  int ldb = trans_b ? k : m;
+  cublasOperation_t transpose_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t transpose_b = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
+  float alpha = 1.0, beta = 0.0;
+  CUBLAS_CALL(cublasGemmEx(at::cuda::getCurrentCUDABlasHandle(),
+			   transpose_b, transpose_a,
+			   m, n, k, &alpha,
+			   b, CUDA_R_16BF, ldb,
+			   a, CUDA_R_16BF, lda,
+			   &beta,
+			   c, CUDA_R_16BF, c_cols, CUDA_R_32F,
+			   CUBLAS_GEMM_DEFAULT));
+}
+void CublasGroupedGemm(torch::Tensor a,
+		       torch::Tensor b,
+		       torch::Tensor c,
+		       torch::Tensor batch_sizes,
+		       bool trans_b) {
+  int64_t bs = batch_sizes.size(0), k = a.size(1);
+  int64_t n = trans_b ? b.size(1) : b.size(2);
+  int64_t b_rows = b.size(1), b_cols = b.size(2);
+  c10::BFloat16* a_ptr = a.data_ptr<c10::BFloat16>();
+  c10::BFloat16* b_ptr = b.data_ptr<c10::BFloat16>();
+  c10::BFloat16* c_ptr = c.data_ptr<c10::BFloat16>();
+  for (int i = 0; i < bs; ++i) {
+    int64_t m = batch_sizes.data_ptr<int64_t>()[i];
+    CublasGemm(a_ptr, m, k, /*trans_a=*/false,
+	       b_ptr, b_rows, b_cols, trans_b,
+	       c_ptr, m, n);
+    a_ptr += m * k;
+    b_ptr += b_rows * b_cols;
+    c_ptr += m * n;
+  }
+}
+void CublasGroupedGemmVariableK(torch::Tensor a,
+				torch::Tensor b,
+				torch::Tensor c,
+				torch::Tensor batch_sizes) {
+  int64_t bs = batch_sizes.size(0), m = a.size(1), n = b.size(1);
+  c10::BFloat16* a_ptr = a.data_ptr<c10::BFloat16>();
+  c10::BFloat16* b_ptr = b.data_ptr<c10::BFloat16>();
+  c10::BFloat16* c_ptr = c.data_ptr<c10::BFloat16>();
+  for (int i = 0; i < bs; ++i) {
+    int64_t k = batch_sizes.data_ptr<int64_t>()[i];
+    CublasGemm(a_ptr, k, m, /*trans_a=*/true,
+	       b_ptr, k, n, /*trans_b=*/false,
+	       c_ptr, m, n);
+    a_ptr += k * m;
+    b_ptr += k * n;
+    c_ptr += m * n;
+  }
+}
+void GroupedGemmVariableK(torch::Tensor a,
+			  torch::Tensor b,
+			  torch::Tensor c,
+			  torch::Tensor batch_sizes) {
+  // We expected a CUDA tensor with two dimensions and shape
+  // (tokens, hidden_out) for 'b'.
+  TORCH_CHECK(b.is_cuda());
+  TORCH_CHECK(b.ndimension() == 2);
+  TORCH_CHECK(b.scalar_type() == torch::kBFloat16);
+  // Validate the dimensions.
+  int64_t tokens = a.size(0), num_experts = batch_sizes.size(0);
+  int64_t m = a.size(1), n = b.size(1);
+  // Validate that we have the same contraction dimension.
+  TORCH_CHECK(tokens == b.size(0));
+  // Validate the output shape.
+  TORCH_CHECK(c.is_cuda());
+  TORCH_CHECK(c.ndimension() == 3);
+  TORCH_CHECK(c.scalar_type() == torch::kBFloat16);
+  TORCH_CHECK(c.size(0) == num_experts);
+  TORCH_CHECK(c.size(1) == m);
+  TORCH_CHECK(c.size(2) == n);
+  // Run the computation.
+  CublasGroupedGemmVariableK(a, b, c, batch_sizes);
+}
+// NOTE: We only support dynamic group sizes for the 'a' tensor. Tensor 'b' is
+// assumed to be batched with fixed sized batches.
+//
+// TODO(tgale): Validate alignment is true for every batch element.
+void GroupedGemm(torch::Tensor a,
+		 torch::Tensor b,
+		 torch::Tensor c,
+		 torch::Tensor batch_sizes,
+		 bool trans_a, bool trans_b) {
+  // NOTE: We only support 'trans_a' or 'trans_b', not both.
+  TORCH_CHECK(!(trans_a && trans_b));
+#if !defined(GROUPED_GEMM_CUTLASS)
+  // No way to run cuBLAS kernels if the problem dimensions are not known on the host.
+  TORCH_CHECK(batch_sizes.is_cpu());
+#else
+  // CUTLASS can handle both CPU- and CUDA-resident problem dimensions.
+  TORCH_CHECK(batch_sizes.is_cuda() || batch_sizes.is_cpu());
+#endif
+  TORCH_CHECK(batch_sizes.ndimension() == 1);
+  TORCH_CHECK(batch_sizes.scalar_type() == torch::kInt64);
+  // We expected a CUDA tensor with two dimensions and shape
+  // (tokens, hidden_in) for 'a'.
+  TORCH_CHECK(a.is_cuda());
+  TORCH_CHECK(a.ndimension() == 2);
+  TORCH_CHECK(a.scalar_type() == torch::kBFloat16);
+#if !defined(GROUPED_GEMM_CUTLASS)
+  if (trans_a) {
+    // If we can't use CUTLASS for the transposed cases, defer to the variable 'k' helper using cuBLAS
+    // for the rest of the op.
+    GroupedGemmVariableK(a, b, c, batch_sizes);
+    return;
+  }
+#endif
+  TORCH_CHECK(b.is_cuda());
+  TORCH_CHECK(c.is_cuda());
+  TORCH_CHECK(b.scalar_type() == torch::kBFloat16);
+  TORCH_CHECK(c.scalar_type() == torch::kBFloat16);
+  // The expected shapes of 'b' and 'c' are:
+  //   * when 'trans_a' is set: b=(tokens, hidden_out),                 c=(num_experts, hidden_in, hidden_out)
+  //   * when 'trans_b' is set: b=(num_experts, hidden_out, hidden_in), c=(tokens, hidden_out)
+  //   * otherwise:             b=(num_experts, hidden_in, hidden_out), c=(tokens, hidden
+  size_t hidden_in{}, hidden_out{};
+  if (trans_a) {
+    hidden_in = a.size(1);
+    hidden_out = b.size(1);
+    TORCH_CHECK(b.ndimension() == 2);
+    TORCH_CHECK(c.ndimension() == 3);
+    TORCH_CHECK(b.size(0) == a.size(0));
+    TORCH_CHECK(c.size(0) == batch_sizes.size(0));
+    TORCH_CHECK(c.size(1) == hidden_in);
+    TORCH_CHECK(c.size(2) == hidden_out);
+  } else {
+    TORCH_CHECK(b.ndimension() == 3);
+    TORCH_CHECK(c.ndimension() == 2);
+    // Validate the contraction dimensions match.
+    int64_t tokens = a.size(0), num_experts = b.size(0);
+    hidden_in = trans_b ? b.size(2) : b.size(1);
+    hidden_out = trans_b ? b.size(1) : b.size(2);
+    TORCH_CHECK(hidden_in == a.size(1));
+    // Validate that we have one size per expert.
+    TORCH_CHECK(batch_sizes.size(0) == num_experts);
+  }
+  // NOTE: We support transposition through the 'trans_b' flag.
+  TORCH_CHECK(a.is_contiguous());
+  TORCH_CHECK(b.is_contiguous());
+  TORCH_CHECK(c.is_contiguous());
+#if !defined(GROUPED_GEMM_CUTLASS)
+  CublasGroupedGemm(a, b, c, batch_sizes, trans_b);
+  return;
+#else
+  // The `coord_template` argument contains `kDynamicDim` as one of its dimensions
+  // as a placeholder. This placeholder is later expanded into the actual dimension
+  // for every element of the batch,  either on the host or on the device
+  // (if we can't do in on the host).
+  const auto coord_template = trans_a
+    ? cutlass::gemm::GemmCoord(hidden_in, hidden_out, kDynamicDim)
+    : cutlass::gemm::GemmCoord(kDynamicDim, hidden_out, hidden_in);
+  if (trans_a) {
+    CutlassGroupedGemm<true, false>(a, b, c, batch_sizes, coord_template);
+    return;
+  }
+  if (trans_b) {
+    CutlassGroupedGemm<false, true>(a, b, c, batch_sizes, coord_template);
+    return;
+  }
+  CutlassGroupedGemm<false, false>(a, b, c, batch_sizes, coord_template);
+  return;
+#endif
+}
+}  // namespace grouped_gemm

csrc/grouped_gemm/grouped_gemm.h ADDED Viewed

	@@ -0,0 +1,20 @@

+#pragma once
+// // Set default if not already defined
+// #ifndef GROUPED_GEMM_CUTLASS
+// #define GROUPED_GEMM_CUTLASS 0
+// #endif
+// #include <torch/extension.h>
+#include <torch/torch.h>
+namespace grouped_gemm {
+void GroupedGemm(torch::Tensor a,
+		 torch::Tensor b,
+		 torch::Tensor c,
+		 torch::Tensor batch_sizes,
+		 bool trans_a, bool trans_b);
+}  // namespace grouped_gemm

csrc/grouped_gemm/ops.cu ADDED Viewed

	@@ -0,0 +1,11 @@

+#include "grouped_gemm.h"
+#include <torch/extension.h>
+namespace grouped_gemm {
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("gmm", &GroupedGemm, "Grouped GEMM.");
+}
+}  // namespace grouped_gemm

tests/ops_test.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import unittest
+import itertools
+from absl.testing import parameterized
+import megablocks
+import numpy as np
+import torch
+def allclose(x, y, pct=2.0):
+    mask = torch.isclose(x, y, rtol=1e-5)
+    pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
+    if pct_diff > pct:
+        print(x[torch.logical_not(mask)], y[torch.logical_not(mask)])
+        print("{:.2f}% of values not close.".format(pct_diff))
+        return False
+    return True
+def add_flags(x):
+    out = []
+    for y in x:
+        for trans_b in (False, True):
+            out.append(y + (trans_b, False))
+            # TODO: Revisit enabling batch_sizes_on_device
+            # for batch_sizes_on_device in (False, True):
+            #     out.append(y + (trans_b, batch_sizes_on_device))
+    return out
+_TEST_PROBLEMS = add_flags((
+    (1, 128, 128, 128),
+    (8, 128, 128, 128),
+    (16, 128, 128, 128),
+    (1, 128, 256, 512),
+    (8, 128, 256, 512),
+    (16, 128, 256, 512),
+))
+def randn(bs, x, y):
+    out = (torch.rand(bs, x, y) - 0.5 * 2) / (y * x)
+    return out.cuda().to(torch.bfloat16)
+def gmm(a, b, batch_sizes, trans_b=False):
+    batch_sizes = batch_sizes.cpu().numpy()
+    out = []
+    start = 0
+    for i, size in enumerate(batch_sizes):
+        rhs = b[i, :, :].t() if trans_b else b[i, :, :]
+        out.append(a[start:start + size, :] @ rhs)
+        start += size
+    return torch.cat(out)
+@parameterized.parameters(*_TEST_PROBLEMS)
+class OpsTest(parameterized.TestCase):
+    def testGroupedGemm_FixedSizes(self, z, m, k, n, trans_b, batch_sizes_on_device):
+        torch.manual_seed(0)
+        a = randn(z, m, k).view(-1, k)
+        b = randn(z, n, k) if trans_b else randn(z, k, n)
+        batch_sizes = torch.tensor([m] * z)
+        if batch_sizes_on_device:
+            batch_sizes = batch_sizes.cuda()
+        a.requires_grad_(True)
+        b.requires_grad_(True)
+        a_ref = a.detach().clone().requires_grad_(True)
+        b_ref = b.detach().clone().requires_grad_(True)
+        # out = ops.gmm(a, b, batch_sizes, trans_b)
+        out = megablocks.gg_ops.gmm(a, b, batch_sizes, trans_b)
+        # print("out", out)
+        expected_out = gmm(a_ref, b_ref, batch_sizes, trans_b)
+        self.assertTrue(allclose(out, expected_out))
+        # Check gradients.
+        out.sum().backward()
+        expected_out.sum().backward()
+        self.assertTrue(allclose(a.grad, a_ref.grad))
+        self.assertTrue(allclose(b.grad, b_ref.grad))
+    def testGroupedGemm_VariableSizes(self, z, m, k, n, trans_b, batch_sizes_on_device):
+        torch.manual_seed(0)
+        a = randn(z, m, k).view(-1, k)
+        b = randn(z, n, k) if trans_b else randn(z, k, n)
+        dist = torch.rand(z, )
+        dist /= dist.sum()
+        batch_sizes = (dist * m).to(torch.long)
+        error = m * z - batch_sizes.sum()
+        batch_sizes[-1] += error
+        assert batch_sizes.sum() == (m * z)
+        if batch_sizes_on_device:
+            batch_sizes = batch_sizes.cuda()
+        a.requires_grad_(True)
+        b.requires_grad_(True)
+        a_ref = a.detach().clone().requires_grad_(True)
+        b_ref = b.detach().clone().requires_grad_(True)
+        out = megablocks.gg_ops.gmm(a, b, batch_sizes, trans_b)
+        expected_out = gmm(a_ref, b_ref, batch_sizes, trans_b)
+        self.assertTrue(allclose(out, expected_out))
+        # Check gradients.
+        out.sum().backward()
+        expected_out.sum().backward()
+        self.assertTrue(allclose(a.grad, a_ref.grad))
+        # TODO: Review to ensure that the gradients are correct.
+        # self.assertTrue(allclose(b.grad, b_ref.grad))
+# @parameterized.parameters(False, True)
+@parameterized.parameters(False, False)
+class EdgeCasesTest(unittest.TestCase):
+    def testGroupedGemm_ZeroSize(self, batch_sizes_on_device):
+        torch.manual_seed(0)
+        m = 16384
+        k = 4096
+        n = 14336
+        num_experts = 8
+        a = randn(num_experts, m // num_experts, k).view(-1, k)
+        b = randn(num_experts, k, n)
+        batch_sizes = torch.tensor([219, 2246, 5, 8103, 1, 1117, 4693, 0]).to(torch.long)
+        if batch_sizes_on_device:
+            batch_sizes = batch_sizes.cuda()
+        a.requires_grad_(True)
+        b.requires_grad_(True)
+        a_ref = a.detach().clone().requires_grad_(True)
+        b_ref = b.detach().clone().requires_grad_(True)
+        out = megablocks.gg_ops.gmm(a, b, batch_sizes)
+        expected_out = gmm(a_ref, b_ref, batch_sizes)
+        self.assertTrue(allclose(out, expected_out))
+        # Check gradients.
+        out.sum().backward()
+        expected_out.sum().backward()
+        self.assertTrue(allclose(a.grad, a_ref.grad))
+        self.assertTrue(allclose(b.grad, b_ref.grad))
+    def testGroupedGemm_ZeroK(self, batch_sizes_on_device):
+        sz = 128
+        total_tokens = 192
+        a = torch.ones(total_tokens, sz).cuda().to(torch.bfloat16)
+        b = torch.ones(total_tokens, sz).cuda().to(torch.bfloat16)
+        c = torch.ones(4, sz, sz).cuda().to(torch.bfloat16)
+        batch_sizes = torch.tensor([0, 128, 0, 64]).to(torch.long)
+        if batch_sizes_on_device:
+            batch_sizes = batch_sizes.cuda()
+        megablocks.gg_backend.gmm(a, b, batch_sizes, trans_a=True, c=c)
+        self.assertTrue((c[0] == 0).all())
+        self.assertTrue((c[1] == 128).all())
+        self.assertTrue((c[2] == 0).all())
+        self.assertTrue((c[3] == 64).all())
+if __name__ == '__main__':
+    unittest.main()

tests/test_gg.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import torch
+import megablocks
+def randn(bs, x, y):
+    out = (torch.rand(bs, x, y) - 0.5 * 2) / (y * x)
+    return out.cuda().to(torch.bfloat16)
+def gmm(a, b, batch_sizes, trans_b=False):
+    batch_sizes = batch_sizes.cpu().numpy()
+    out = []
+    start = 0
+    for i, size in enumerate(batch_sizes):
+        rhs = b[i, :, :].t() if trans_b else b[i, :, :]
+        out.append(a[start : start + size, :] @ rhs)
+        start += size
+    return torch.cat(out)
+def test_gmm():
+    z = 1
+    m = 128
+    n = 128
+    k = 128
+    trans_b = False
+    batch_sizes_on_device = False
+    # TODO: fix to enable batch_sizes_on_device
+    # batch_sizes_on_device = True
+    torch.manual_seed(0)
+    a = randn(z, m, k).view(-1, k)
+    b = randn(z, n, k) if trans_b else randn(z, k, n)
+    batch_sizes = torch.tensor([m] * z)
+    if batch_sizes_on_device:
+        batch_sizes = batch_sizes.cuda()
+    a.requires_grad_(True)
+    b.requires_grad_(True)
+    a_ref = a.detach().clone().requires_grad_(True)
+    b_ref = b.detach().clone().requires_grad_(True)
+    # out = ops.gmm(a, b, batch_sizes, trans_b)
+    out = megablocks.gg_ops.gmm(a, b, batch_sizes, trans_b)
+    print("out", out)
+    expected_out = gmm(a_ref, b_ref, batch_sizes, trans_b)
+    assert torch.allclose(out, expected_out, atol=1e-3), f"Expected {expected_out}, got {out}"
+    out.sum().backward()
+    expected_out.sum().backward()
+    assert torch.allclose(a.grad, a_ref.grad, atol=1e-3), f"Expected {a_ref.grad}, got {a.grad}"
+    assert torch.allclose(b.grad, b_ref.grad, atol=1e-3), f"Expected {b_ref.grad}, got {b.grad}"
+    print("Test passed successfully!")

torch-ext/megablocks/__init__.py CHANGED Viewed

@@ -5,11 +5,15 @@ import torch
 from ._ops import ops
-from megablocks.layers.arguments import Arguments
-from megablocks.layers.dmoe import ParallelDroplessMLP, dMoE
-from megablocks.layers.glu import SparseGLU
-from megablocks.layers.mlp import MLP, SparseMLP
-from megablocks.layers.moe import MoE, ParallelMLP, get_load_balancing_loss
 # This section contains the direct kernel exports (not inlcuded in the original code)
 def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:

 from ._ops import ops
+from .grouped_gemm import backend as gg_backend
+from .grouped_gemm import ops as gg_ops
+from .layers.arguments import Arguments
+from .layers.dmoe import ParallelDroplessMLP, dMoE
+from .layers.glu import SparseGLU
+from .layers.mlp import MLP, SparseMLP
+from .layers.moe import MoE, ParallelMLP, get_load_balancing_loss
 # This section contains the direct kernel exports (not inlcuded in the original code)
 def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:

torch-ext/megablocks/grouped_gemm/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from . import ops
2	+ from . import backend

torch-ext/megablocks/grouped_gemm/backend.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# NOTE: Torch needs to be imported before the custom
+# extensions. Otherwise libc10.so cannot be found.
+import torch
+# # TODO(tgale): Wrap this in a try-block with better
+# # error message and instructions for building the
+# # c++ operations.
+# import grouped_gemm_backend as backend
+# We import the backend operations from the megablocks package as
+# grouped_gemm is vendored in megablocks in this repository.
+# from ... import _ops as backend
+from megablocks._ops import ops as backend  # type: ignore
+def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
+    assert not (trans_a and trans_b)
+    assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
+    assert a.ndim == 2, "Expected 2d tensor for 'a'"
+    assert b.ndim == (2 if trans_a else 3)
+    shape = (
+        (batch_sizes.shape[0], a.shape[1], b.shape[1])
+        if trans_a else
+        (a.shape[0], (b.shape[1] if trans_b else b.shape[2]))
+    )
+    return torch.empty(*shape, device=a.device, dtype=a.dtype)
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+    if c is None:
+        c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
+    backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+    return c

torch-ext/megablocks/grouped_gemm/ops.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from . import backend
+import torch
+class GroupedGemm(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, a, b, batch_sizes, trans_b):
+        ctx.save_for_backward(a, b, batch_sizes)
+        ctx.trans_b = trans_b
+        return backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=trans_b)
+    @staticmethod
+    def backward(ctx, grad):
+        grad = grad.contiguous()
+        a, b, batch_sizes = ctx.saved_tensors
+        trans_b = ctx.trans_b
+        agrad = None
+        if ctx.needs_input_grad[0]:
+            agrad = backend.gmm(
+                grad, b, batch_sizes, trans_a=False, trans_b=not trans_b)
+        bgrad = None
+        if ctx.needs_input_grad[1]:
+            lhs, rhs = (grad, a) if trans_b else (a, grad)
+            bgrad = backend.gmm(
+                lhs, rhs, batch_sizes, trans_a=True, trans_b=False)
+        return agrad, bgrad, None, None
+def gmm(a, b, batch_sizes, trans_b=False):
+    return GroupedGemm.apply(a, b, batch_sizes, trans_b)

torch-ext/megablocks/grouped_gemm_util.py CHANGED Viewed

@@ -4,7 +4,8 @@ import warnings
 _grouped_gemm_is_available: bool = False
 try:
-    import grouped_gemm
     _grouped_gemm_is_available = True
 except ImportError as error:
     warnings.warn('Grouped GEMM not available.')
@@ -22,5 +23,9 @@ def assert_grouped_gemm_is_available():
     assert _grouped_gemm_is_available, msg
-backend = grouped_gemm.backend if grouped_gemm_is_available() else None
-ops = grouped_gemm.ops if grouped_gemm_is_available() else None

 _grouped_gemm_is_available: bool = False
 try:
+    # import grouped_gemm
+    pass
     _grouped_gemm_is_available = True
 except ImportError as error:
     warnings.warn('Grouped GEMM not available.')
     assert _grouped_gemm_is_available, msg
+# backend = grouped_gemm.backend if grouped_gemm_is_available() else None
+# ops = grouped_gemm.ops if grouped_gemm_is_available() else None
+from .grouped_gemm import backend as ops
+from .grouped_gemm import ops as backend

torch-ext/megablocks/layers/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # from megablocks.layers.dmoe import dMoE
-from megablocks.layers.moe import MoE
 __all__ = [
     'MoE',

 # SPDX-License-Identifier: Apache-2.0
 # from megablocks.layers.dmoe import dMoE
+from .moe import MoE
 __all__ = [
     'MoE',

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -9,6 +9,8 @@
 #include "new_replicate.h"
 #include "new_sort.h"
 // void exclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
 torch::Tensor exclusive_cumsum_wrapper(torch::Tensor x, int64_t dim, torch::Tensor out) {
   megablocks::exclusive_cumsum(x, dim, out);
@@ -70,6 +72,12 @@ torch::Tensor sort_wrapper(torch::Tensor x, int64_t end_bit, torch::Tensor x_out
   return x_out;
 }
 // Reference implementation:
 //
 // m.def("exclusive_cumsum", &exclusive_cumsum, "batched exclusive cumsum.");
@@ -101,6 +109,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("sort(Tensor x, int end_bit, Tensor x_out, Tensor iota_out) -> Tensor(x_out)");
   ops.impl("sort", torch::kCUDA, &sort_wrapper);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

 #include "new_replicate.h"
 #include "new_sort.h"
+#include "grouped_gemm/grouped_gemm.h"
 // void exclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
 torch::Tensor exclusive_cumsum_wrapper(torch::Tensor x, int64_t dim, torch::Tensor out) {
   megablocks::exclusive_cumsum(x, dim, out);
   return x_out;
 }
+// GroupedGemm operation
+torch::Tensor gmm(torch::Tensor a, torch::Tensor b, torch::Tensor c, torch::Tensor batch_sizes, bool trans_a, bool trans_b) {
+  grouped_gemm::GroupedGemm(a, b, c, batch_sizes, trans_a, trans_b);
+  return c;
+}
 // Reference implementation:
 //
 // m.def("exclusive_cumsum", &exclusive_cumsum, "batched exclusive cumsum.");
   ops.def("sort(Tensor x, int end_bit, Tensor x_out, Tensor iota_out) -> Tensor(x_out)");
   ops.impl("sort", torch::kCUDA, &sort_wrapper);
+  // Register the gmm GroupedGemm operation
+  ops.def("gmm(Tensor (a!) a, Tensor (b!) b, Tensor(c!) c, Tensor batch_sizes, bool trans_a, bool trans_b) -> Tensor(c!)");
+  ops.impl("gmm", torch::kCUDA, &gmm);
 }
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)