kernel
drbh committed
Commit b2bfc37 · 1 parent: 484fde0

fix: bump build and imports

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. build/torch26-cxx11-cu118-x86_64-linux/megablocks/{_megablocks_a585153_dirty.abi3.so → _megablocks_6756875_dirty.abi3.so} +1 -1
  2. build/torch26-cxx11-cu118-x86_64-linux/megablocks/_ops.py +3 -3
  3. build/torch26-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/backend.py +2 -1
  4. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/arguments.py +2 -1
  5. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/common.py +1 -1
  6. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/dmlp_registry.py +2 -2
  7. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/dmoe.py +8 -5
  8. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/glu.py +16 -5
  9. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/memory_test.py +2 -1
  10. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/mlp.py +9 -5
  11. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/moe.py +50 -18
  12. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/mpu.py +2 -1
  13. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/router.py +4 -2
  14. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/sharedexpert_registry.py +4 -2
  15. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/__init__.py +14 -14
  16. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +5 -2
  17. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/binned_gather.py +1 -1
  18. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/binned_scatter.py +1 -1
  19. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/cumsum.py +1 -1
  20. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/gather.py +1 -1
  21. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/histogram.py +1 -1
  22. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/histogram_benchmark.py +1 -1
  23. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/matmul_benchmark.py +1 -1
  24. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/padded_gather.py +1 -1
  25. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter.py +1 -1
  26. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py +1 -1
  27. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/permute_benchmark.py +1 -1
  28. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/replicate.py +1 -1
  29. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/scatter.py +1 -1
  30. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/sort.py +1 -1
  31. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/sort_benchmark.py +1 -1
  32. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/topology.py +1 -1
  33. build/torch26-cxx11-cu124-x86_64-linux/megablocks/{_megablocks_a585153_dirty.abi3.so → _megablocks_6756875_dirty.abi3.so} +1 -1
  34. build/torch26-cxx11-cu124-x86_64-linux/megablocks/_ops.py +3 -3
  35. build/torch26-cxx11-cu124-x86_64-linux/megablocks/grouped_gemm/backend.py +2 -1
  36. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/arguments.py +2 -1
  37. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/common.py +1 -1
  38. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/dmlp_registry.py +2 -2
  39. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/dmoe.py +8 -5
  40. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/glu.py +16 -5
  41. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/memory_test.py +2 -1
  42. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/mlp.py +9 -5
  43. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/moe.py +50 -18
  44. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/mpu.py +2 -1
  45. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/router.py +4 -2
  46. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/sharedexpert_registry.py +4 -2
  47. build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/__init__.py +14 -14
  48. build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +5 -2
  49. build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/binned_gather.py +1 -1
  50. build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/binned_scatter.py +1 -1
build/torch26-cxx11-cu118-x86_64-linux/megablocks/{_megablocks_a585153_dirty.abi3.so → _megablocks_6756875_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:44462d45f75616c369c2421fe41d53cd1d1dc365f1d2545d870e2db999e67e38
+oid sha256:ad46e9f244afa886c8a104d75e37f93afd2a0ecf83bfc7a414680fa16d8b78f9
 size 10517608
build/torch26-cxx11-cu118-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_a585153_dirty
-ops = torch.ops._megablocks_a585153_dirty
+from . import _megablocks_6756875_dirty
+ops = torch.ops._megablocks_6756875_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_a585153_dirty::{op_name}"
+    return f"_megablocks_6756875_dirty::{op_name}"
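Note: the only change in `_ops.py` is the build hash baked into the native extension's module name. A minimal sketch of how this module is consumed downstream; the op name `exclusive_cumsum` and the commented `torch.library` usage are illustrative assumptions, not part of this diff:

from megablocks._ops import ops, add_op_namespace_prefix

# `ops` is torch.ops._megablocks_6756875_dirty, the dispatcher namespace the
# compiled .so registers its kernels under; bumping the build hash renames that
# namespace, which is why both lines above had to change together.
qualified = add_op_namespace_prefix("exclusive_cumsum")
print(qualified)  # "_megablocks_6756875_dirty::exclusive_cumsum"

# The qualified name is what torch.library utilities expect, e.g. (illustrative):
# torch.library.register_fake(qualified)(lambda *args: ...)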
build/torch26-cxx11-cu118-x86_64-linux/megablocks/grouped_gemm/backend.py CHANGED
@@ -10,7 +10,8 @@ import torch
 # We import the backend operations from the megablocks package as
 # grouped_gemm is vendored in megablocks in this repository.
 # from ... import _ops as backend
-from megablocks._ops import ops as backend  # type: ignore
+# from megablocks._ops import ops as backend  # type: ignore
+from .._ops import ops as backend  # type: ignore
 
 def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
     assert not (trans_a and trans_b)
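The comment in the hunk states the motivation: grouped_gemm is vendored inside megablocks, so the backend must be reached relative to whatever top-level name the package is installed under. A rough sketch of why the relative form is more robust; the helper and the second package name are hypothetical, for illustration only:

import importlib

def load_backend(top_level: str):
    # Illustrative helper: import the vendored backend under any top-level name.
    return importlib.import_module(f"{top_level}.grouped_gemm.backend").backend

# load_backend("megablocks") and load_backend("some_hash_suffixed_build") would
# both resolve <top_level>._ops.ops, because backend.py now imports it relative
# to its own package instead of hard-coding the name "megablocks".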
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/arguments.py CHANGED
@@ -9,7 +9,8 @@ import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
-import megablocks.grouped_gemm_util as grouped_gemm
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
 
 # Type annotation for in-place Tensor initialization function.
 InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/common.py CHANGED
@@ -3,7 +3,7 @@
 
 import torch
 
-from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
 
 
 def dtype(args: Arguments):
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/dmlp_registry.py CHANGED
@@ -3,8 +3,8 @@
 
 from typing import Union
 
-from megablocks.layers import glu, mlp
-from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
 
 MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/dmoe.py CHANGED
@@ -6,11 +6,14 @@ import stk.ops
 import torch
 from stk import Matrix
 
-import megablocks.ops as ops
-# from megablocks.ops import ops
-from megablocks.layers import common, dmlp_registry, moe, mpu
-from megablocks.layers.arguments import Arguments
-
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
 
 def promote_scalar(x):
     return x.view(1) if not len(x.size()) else x
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/glu.py CHANGED
@@ -4,11 +4,22 @@
 import stk.ops
 import torch
 
-from megablocks import grouped_gemm_util as gg
-from megablocks.layers import common, mpu
-from megablocks.layers.activation_fn import act_fn
-from megablocks.layers.arguments import Arguments
-from megablocks.layers.mlp import (
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+#     SharedMLP,
+#     SparseMLP,
+#     create_dmoe_expert_weights,
+#     resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
     SharedMLP,
     SparseMLP,
     create_dmoe_expert_weights,
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/memory_test.py CHANGED
@@ -6,7 +6,8 @@ import gc
 import torch
 import torch.distributed as dist
 
-from megablocks.layers import arguments, dmoe
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
 
 _TESTS = ((8, 2048, 4096, 4096, 32, 4),)
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/mlp.py CHANGED
@@ -9,11 +9,15 @@ import stk.ops
 import torch
 from packaging import version
 
-from megablocks import grouped_gemm_util as gg
-from megablocks.layers import common, gelu, mpu
-from megablocks.layers.activation_fn import act_fn
-from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
-
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
 
 class ScaleGradient(torch.autograd.Function):
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/moe.py CHANGED
@@ -6,10 +6,27 @@ import numpy as np
 import torch
 import torch.distributed as dist
 
-import megablocks.ops as ops
-from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
-from megablocks.layers.all_to_all import all_to_all
-from megablocks.layers.arguments import Arguments
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+    sort,
+    histogram,
+    inclusive_cumsum,
+    exclusive_cumsum,
+    binned_gather,
+    binned_scatter,
+    gather,
+    scatter,
+    repeat,
+    replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
 
 _LOAD_BALANCING_LOSS = []
 
@@ -158,7 +175,8 @@ class ParallelMLP(torch.nn.Module):
         # prior? Could we place the `torch.max` operation to return
         # 32-bit expert indices?
         top_expert = top_expert.int()
-        output = ops.sort(top_expert, self.sort_end_bit)
+        # output = ops.sort(top_expert, self.sort_end_bit)
+        output = sort(top_expert, self.sort_end_bit)
         assert output is not None
         bin_ids, indices = output
 
@@ -168,10 +186,12 @@ class ParallelMLP(torch.nn.Module):
         # TODO(tgale): Does the sorted data produce a more favorable
         # data distribution for histogram? Or is the op parallelism
         # worth more?
-        tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+        # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+        tokens_per_expert = histogram(top_expert, self.num_experts)
 
         # Calculate the bin bounds for the sorted tokens.
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        bins = inclusive_cumsum(tokens_per_expert, 0)
         assert bins is not None
         bins = bins.view(1) if not len(bins.size()) else bins
 
@@ -195,7 +215,8 @@ class ParallelMLP(torch.nn.Module):
     ):
         # Route the tokens for MoE computation.
         x = x.view(-1, x.shape[-1])
-        output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+        # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+        output = binned_gather(x, indices, bins, expert_capacity, top_k)
         assert output is not None
         x = output
 
@@ -204,7 +225,9 @@ class ParallelMLP(torch.nn.Module):
         x = self.mlp(x)
 
         # Un-route the data for the MoE output.
-        return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+        # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+        return binned_scatter(x, indices, expert_weights, bins, top_k)
+
 
     def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
         # x: [sl, bs, hs]
@@ -264,7 +287,8 @@ class ParallelMLP(torch.nn.Module):
         # If we're sharding the experts along the hidden dimension
         # multiple devices own parts of the same sets of experts.
         # Replicate the token counts so every device gets the counts.
-        repeated_tokens_per_expert = ops.repeat(
+        # repeated_tokens_per_expert = ops.repeat(
+        repeated_tokens_per_expert = repeat(
             tokens_per_expert,
             (mpu.hidden_sharding_degree(self.args),),
         )
@@ -285,7 +309,8 @@ class ParallelMLP(torch.nn.Module):
         # This view updates the shape of the tensor from [sl, bs, hs] to
         # [sl * bs, hs] prior to the permutation.
         x = x.view(-1, x.shape[-1])
-        output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+        # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+        output = gather(x, indices, bin_ids, bins, self.top_k)
         assert output is not None
         x = output
 
@@ -317,7 +342,8 @@ class ParallelMLP(torch.nn.Module):
        # get all of the tokens assigned to them.
        #
        # TODO(tgale): Fuse this into the prior, local permutation.
-        x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+        # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+        x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
 
         # Start the cross-device permutation asynchronously so we can
         # overlap communication with computation.
@@ -336,7 +362,8 @@ class ParallelMLP(torch.nn.Module):
         # for expert computation we'll do one more local permutation. The
         # rest of this torch.no_grad() scope sets up the indices and bins
         # for this permutation.
-        replicate_bins = ops.inclusive_cumsum(
+        # replicate_bins = ops.inclusive_cumsum(
+        replicate_bins = inclusive_cumsum(
             parallel_tokens_per_expert.flatten(),
             0,
         )
@@ -351,14 +378,16 @@ class ParallelMLP(torch.nn.Module):
             ),
             mpu.experts_per_rank(self.args),
         )
-        parallel_top_expert = ops.replicate(
+        # parallel_top_expert = ops.replicate(
+        parallel_top_expert = replicate(
             parallel_top_expert.unsqueeze(dim=0),
             replicate_bins,
             tokens_received,
         ).flatten()
 
         # TODO(tgale): The sort_end_bit here can be reduced.
-        parallel_bin_ids, parallel_indices = ops.sort(
+        # parallel_bin_ids, parallel_indices = ops.sort(
+        parallel_bin_ids, parallel_indices = sort(
             parallel_top_expert,
             self.sort_end_bit,
         )
@@ -368,7 +397,8 @@ class ParallelMLP(torch.nn.Module):
             dim=0,
             dtype=torch.int,
         )
-        parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+        # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+        parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
         parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
 
         # If expert_capacity is set to zero, set the number of tokens
@@ -416,10 +446,12 @@ class ParallelMLP(torch.nn.Module):
             -1,
             self.args.hidden_size,
         )
-        x = ops.sum(x.view(shape), dim=0)
+        # x = ops.sum(x.view(shape), dim=0)
+        x = x.view(shape).sum(dim=0)
 
         # Un-permute locally to setup for the next series of operations.
-        x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+        # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+        x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
         return x, tokens_per_expert.flatten()
 
     def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
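Beyond the import rewrite, one call in moe.py changes form but not behavior: `ops.sum(x.view(shape), dim=0)` becomes the built-in `x.view(shape).sum(dim=0)`. A quick sanity check of that equivalence; the shape values below are made up for illustration, while in moe.py `shape` is `(mpu.hidden_sharding_degree(args), -1, hidden_size)`:

import torch

hidden_sharding_degree, tokens, hidden_size = 2, 8, 16
x = torch.randn(hidden_sharding_degree * tokens, hidden_size)
shape = (hidden_sharding_degree, -1, hidden_size)

summed = x.view(shape).sum(dim=0)              # new code path
manual = x.view(shape)[0] + x.view(shape)[1]   # the same reduction written out
assert summed.shape == (tokens, hidden_size)
assert torch.allclose(summed, manual)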
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/mpu.py CHANGED
@@ -6,7 +6,8 @@ from typing import Optional
 import torch
 import torch.distributed as dist
 
-from megablocks.layers.arguments import Arguments
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
 
 
 class MoeParam(torch.Tensor):
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/router.py CHANGED
@@ -4,8 +4,10 @@ from typing import Any
 
 import torch
 
-from megablocks.layers import common
-from megablocks.layers.arguments import Arguments
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
 
 _ROUTER_LOGITS = []
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers/sharedexpert_registry.py CHANGED
@@ -3,8 +3,10 @@
 
 from typing import Union
 
-from megablocks.layers import glu, mlp
-from megablocks.layers.arguments import Arguments
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
 
 _REGISTRY = {
     'mlp': mlp.SharedMLP,
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/__init__.py CHANGED
@@ -1,20 +1,20 @@
 # Copyright 2024 Databricks
 # SPDX-License-Identifier: Apache-2.0
 
-from megablocks.ops.binned_gather import binned_gather
-from megablocks.ops.binned_scatter import binned_scatter
-from megablocks.ops.cumsum import exclusive_cumsum, inclusive_cumsum
-from megablocks.ops.gather import gather
-from megablocks.ops.histogram import histogram
-from megablocks.ops.padded_gather import padded_gather
-from megablocks.ops.padded_scatter import padded_scatter
-from megablocks.ops.repeat import repeat
-from megablocks.ops.replicate import replicate
-from megablocks.ops.round_up import round_up
-from megablocks.ops.scatter import scatter
-from megablocks.ops.sort import sort
-from megablocks.ops.sum import sum
-from megablocks.ops.topology import topology
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
 
 __all__ = [
     'binned_gather',
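The public surface of `megablocks.ops` is unchanged here; only the import style inside the package is. Assuming the package imports cleanly, the two spellings used elsewhere in this commit resolve to the same objects:

from megablocks import ops          # style kept by dmoe.py ("from .. import ops")
from megablocks.ops import sort     # style adopted by moe.py ("from ..ops import sort")

# Same function object either way, so call sites can use whichever reads better.
assert ops.sort is sort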
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -4,8 +4,11 @@
 import torch
 import torch.distributed as dist
 
-from megablocks import benchmark_util
-from megablocks.layers.all_to_all import all_to_all
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from ..layers.all_to_all import all_to_all
 
 _ALL_TO_ALL_BENCHMARK = (
     (8, 1024),
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/binned_gather.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for binned_gather kernel.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/binned_scatter.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for binned_scatter kernel.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/cumsum.py CHANGED
@@ -11,7 +11,7 @@ import torch
 # instructions for building the c++ operations.
 try:
     # import megablocks_ops as ops  # type: ignore
-    from megablocks._ops import ops  # type: ignore
+    from .._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
 
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/gather.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for gather kernel.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/histogram.py CHANGED
@@ -10,7 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    from megablocks._ops import ops  # type: ignore
+    from .._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
 
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/histogram_benchmark.py CHANGED
@@ -7,7 +7,7 @@ import numpy as np
 import torch
 from absl.testing import parameterized
 
-from megablocks import ops
+from .. import ops
 
 _HISTOGRAM_TESTS = (
     (16384, torch.int32, 2),
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/matmul_benchmark.py CHANGED
@@ -7,7 +7,7 @@ import stk
 import torch
 from absl.testing import parameterized
 
-from megablocks import benchmark_util, ops
+from .. import benchmark_util, ops
 
 
 # Calling tensor.t() calls tensor.transpose(0, 1) which calls
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/padded_gather.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for padded_gather kernel.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for padded_scatter kernel.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/padded_scatter_benchmark.py CHANGED
@@ -6,7 +6,7 @@ import unittest
 import torch
 from absl.testing import parameterized
 
-from megablocks import benchmark_util, ops
+from .. import benchmark_util, ops
 
 _PADDED_SCATTER_BENCHMARK = (
     # dMoE-Medium, 8-way EMP.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/permute_benchmark.py CHANGED
@@ -6,7 +6,7 @@ import unittest
 import torch
 from absl.testing import parameterized
 
-from megablocks import benchmark_util, ops
+from .. import benchmark_util, ops
 
 _PERMUTE_TESTS = (
     (16384, 768, 2),
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/replicate.py CHANGED
@@ -10,7 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    from megablocks._ops import ops  # type: ignore
+    from .._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
 
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/scatter.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, Optional
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for scatter kernel.
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/sort.py CHANGED
@@ -10,7 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    from megablocks._ops import ops  # type: ignore
+    from .._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
 
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/sort_benchmark.py CHANGED
@@ -7,7 +7,7 @@ import numpy as np
 import torch
 from absl.testing import parameterized
 
-from megablocks import ops
+from .. import ops
 
 _SORT_TESTS = (
     (16384, torch.int32, None),
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/topology.py CHANGED
@@ -10,7 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    from megablocks._ops import ops  # type: ignore
+    from .._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
 
build/torch26-cxx11-cu124-x86_64-linux/megablocks/{_megablocks_a585153_dirty.abi3.so → _megablocks_6756875_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e734576700345e035790357ea19730e84e90c176747076ce845995bc3a0e0d50
+oid sha256:1419672a07ed370d7107ca54a6b694f234efa8e696644ee4e96c1bf396aff6af
 size 11869424
build/torch26-cxx11-cu124-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_a585153_dirty
-ops = torch.ops._megablocks_a585153_dirty
+from . import _megablocks_6756875_dirty
+ops = torch.ops._megablocks_6756875_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_a585153_dirty::{op_name}"
+    return f"_megablocks_6756875_dirty::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/megablocks/grouped_gemm/backend.py CHANGED
@@ -10,7 +10,8 @@ import torch
 # We import the backend operations from the megablocks package as
 # grouped_gemm is vendored in megablocks in this repository.
 # from ... import _ops as backend
-from megablocks._ops import ops as backend  # type: ignore
+# from megablocks._ops import ops as backend  # type: ignore
+from .._ops import ops as backend  # type: ignore
 
 def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
     assert not (trans_a and trans_b)
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/arguments.py CHANGED
@@ -9,7 +9,8 @@ import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
-import megablocks.grouped_gemm_util as grouped_gemm
+# import megablocks.grouped_gemm_util as grouped_gemm
+from .. import grouped_gemm_util as grouped_gemm
 
 # Type annotation for in-place Tensor initialization function.
 InitFn = Union[Callable[[torch.Tensor], None], partial[torch.Tensor]]
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/common.py CHANGED
@@ -3,7 +3,7 @@
 
 import torch
 
-from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
 
 
 def dtype(args: Arguments):
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/dmlp_registry.py CHANGED
@@ -3,8 +3,8 @@
 
 from typing import Union
 
-from megablocks.layers import glu, mlp
-from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
 
 MlpType = Union[mlp.SparseMLP, glu.SparseGLU]
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/dmoe.py CHANGED
@@ -6,11 +6,14 @@ import stk.ops
 import torch
 from stk import Matrix
 
-import megablocks.ops as ops
-# from megablocks.ops import ops
-from megablocks.layers import common, dmlp_registry, moe, mpu
-from megablocks.layers.arguments import Arguments
-
+# import megablocks.ops as ops
+# # from megablocks.ops import ops
+# from megablocks.layers import common, dmlp_registry, moe, mpu
+# from megablocks.layers.arguments import Arguments
+
+from .. import ops
+from . import common, dmlp_registry, moe, mpu
+from .arguments import Arguments
 
 def promote_scalar(x):
     return x.view(1) if not len(x.size()) else x
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/glu.py CHANGED
@@ -4,11 +4,22 @@
 import stk.ops
 import torch
 
-from megablocks import grouped_gemm_util as gg
-from megablocks.layers import common, mpu
-from megablocks.layers.activation_fn import act_fn
-from megablocks.layers.arguments import Arguments
-from megablocks.layers.mlp import (
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import Arguments
+# from megablocks.layers.mlp import (
+#     SharedMLP,
+#     SparseMLP,
+#     create_dmoe_expert_weights,
+#     resolve_dtensor,
+# )
+
+from .. import grouped_gemm_util as gg
+from . import common, mpu
+from .activation_fn import act_fn
+from .arguments import Arguments
+from .mlp import (
     SharedMLP,
     SparseMLP,
     create_dmoe_expert_weights,
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/memory_test.py CHANGED
@@ -6,7 +6,8 @@ import gc
 import torch
 import torch.distributed as dist
 
-from megablocks.layers import arguments, dmoe
+# from megablocks.layers import arguments, dmoe
+from . import arguments, dmoe
 
 _TESTS = ((8, 2048, 4096, 4096, 32, 4),)
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/mlp.py CHANGED
@@ -9,11 +9,15 @@ import stk.ops
 import torch
 from packaging import version
 
-from megablocks import grouped_gemm_util as gg
-from megablocks.layers import common, gelu, mpu
-from megablocks.layers.activation_fn import act_fn
-from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
-
+# from megablocks import grouped_gemm_util as gg
+# from megablocks.layers import common, gelu, mpu
+# from megablocks.layers.activation_fn import act_fn
+# from megablocks.layers.arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
+
+from .. import grouped_gemm_util as gg
+from . import common, gelu, mpu
+from .activation_fn import act_fn
+from .arguments import DEFAULT_ACTIVATION_FN, Arguments, InitFn
 
 class ScaleGradient(torch.autograd.Function):
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/moe.py CHANGED
@@ -6,10 +6,27 @@ import numpy as np
 import torch
 import torch.distributed as dist
 
-import megablocks.ops as ops
-from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
-from megablocks.layers.all_to_all import all_to_all
-from megablocks.layers.arguments import Arguments
+# import megablocks.ops as ops
+# from megablocks.layers import common, mlp, mpu, router, sharedexpert_registry
+# from megablocks.layers.all_to_all import all_to_all
+# from megablocks.layers.arguments import Arguments
+
+from ..ops import (
+    sort,
+    histogram,
+    inclusive_cumsum,
+    exclusive_cumsum,
+    binned_gather,
+    binned_scatter,
+    gather,
+    scatter,
+    repeat,
+    replicate,
+)
+
+from . import common, mlp, mpu, router, sharedexpert_registry
+from .arguments import Arguments
+from .all_to_all import all_to_all
 
 _LOAD_BALANCING_LOSS = []
 
@@ -158,7 +175,8 @@ class ParallelMLP(torch.nn.Module):
         # prior? Could we place the `torch.max` operation to return
         # 32-bit expert indices?
         top_expert = top_expert.int()
-        output = ops.sort(top_expert, self.sort_end_bit)
+        # output = ops.sort(top_expert, self.sort_end_bit)
+        output = sort(top_expert, self.sort_end_bit)
         assert output is not None
         bin_ids, indices = output
 
@@ -168,10 +186,12 @@ class ParallelMLP(torch.nn.Module):
         # TODO(tgale): Does the sorted data produce a more favorable
         # data distribution for histogram? Or is the op parallelism
         # worth more?
-        tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+        # tokens_per_expert = ops.histogram(top_expert, self.num_experts)
+        tokens_per_expert = histogram(top_expert, self.num_experts)
 
         # Calculate the bin bounds for the sorted tokens.
-        bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        # bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+        bins = inclusive_cumsum(tokens_per_expert, 0)
         assert bins is not None
         bins = bins.view(1) if not len(bins.size()) else bins
 
@@ -195,7 +215,8 @@ class ParallelMLP(torch.nn.Module):
     ):
         # Route the tokens for MoE computation.
         x = x.view(-1, x.shape[-1])
-        output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+        # output = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
+        output = binned_gather(x, indices, bins, expert_capacity, top_k)
         assert output is not None
         x = output
 
@@ -204,7 +225,9 @@ class ParallelMLP(torch.nn.Module):
         x = self.mlp(x)
 
         # Un-route the data for the MoE output.
-        return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+        # return ops.binned_scatter(x, indices, expert_weights, bins, top_k)
+        return binned_scatter(x, indices, expert_weights, bins, top_k)
+
 
     def forward_once(self, x: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
         # x: [sl, bs, hs]
@@ -264,7 +287,8 @@ class ParallelMLP(torch.nn.Module):
         # If we're sharding the experts along the hidden dimension
         # multiple devices own parts of the same sets of experts.
         # Replicate the token counts so every device gets the counts.
-        repeated_tokens_per_expert = ops.repeat(
+        # repeated_tokens_per_expert = ops.repeat(
+        repeated_tokens_per_expert = repeat(
             tokens_per_expert,
             (mpu.hidden_sharding_degree(self.args),),
         )
@@ -285,7 +309,8 @@ class ParallelMLP(torch.nn.Module):
         # This view updates the shape of the tensor from [sl, bs, hs] to
         # [sl * bs, hs] prior to the permutation.
         x = x.view(-1, x.shape[-1])
-        output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+        # output = ops.gather(x, indices, bin_ids, bins, self.top_k)
+        output = gather(x, indices, bin_ids, bins, self.top_k)
         assert output is not None
         x = output
 
@@ -317,7 +342,8 @@ class ParallelMLP(torch.nn.Module):
        # get all of the tokens assigned to them.
        #
        # TODO(tgale): Fuse this into the prior, local permutation.
-        x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+        # x = ops.repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
+        x = repeat(x, (mpu.hidden_sharding_degree(self.args), 1))
 
         # Start the cross-device permutation asynchronously so we can
         # overlap communication with computation.
@@ -336,7 +362,8 @@ class ParallelMLP(torch.nn.Module):
         # for expert computation we'll do one more local permutation. The
         # rest of this torch.no_grad() scope sets up the indices and bins
         # for this permutation.
-        replicate_bins = ops.inclusive_cumsum(
+        # replicate_bins = ops.inclusive_cumsum(
+        replicate_bins = inclusive_cumsum(
             parallel_tokens_per_expert.flatten(),
             0,
         )
@@ -351,14 +378,16 @@ class ParallelMLP(torch.nn.Module):
             ),
             mpu.experts_per_rank(self.args),
         )
-        parallel_top_expert = ops.replicate(
+        # parallel_top_expert = ops.replicate(
+        parallel_top_expert = replicate(
             parallel_top_expert.unsqueeze(dim=0),
             replicate_bins,
             tokens_received,
         ).flatten()
 
         # TODO(tgale): The sort_end_bit here can be reduced.
-        parallel_bin_ids, parallel_indices = ops.sort(
+        # parallel_bin_ids, parallel_indices = ops.sort(
+        parallel_bin_ids, parallel_indices = sort(
             parallel_top_expert,
             self.sort_end_bit,
         )
@@ -368,7 +397,8 @@ class ParallelMLP(torch.nn.Module):
             dim=0,
             dtype=torch.int,
         )
-        parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+        # parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
+        parallel_bins = inclusive_cumsum(parallel_tokens_per_expert, 0)
         parallel_bins = (parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins)
 
         # If expert_capacity is set to zero, set the number of tokens
@@ -416,10 +446,12 @@ class ParallelMLP(torch.nn.Module):
             -1,
             self.args.hidden_size,
         )
-        x = ops.sum(x.view(shape), dim=0)
+        # x = ops.sum(x.view(shape), dim=0)
+        x = x.view(shape).sum(dim=0)
 
         # Un-permute locally to setup for the next series of operations.
-        x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+        # x = ops.scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
+        x = scatter(x, indices, bin_ids, expert_weights, bins, self.top_k)
         return x, tokens_per_expert.flatten()
 
     def forward(self, x: torch.Tensor, scores: torch.Tensor, expert_weights: torch.Tensor, top_experts: torch.Tensor):
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/mpu.py CHANGED
@@ -6,7 +6,8 @@ from typing import Optional
 import torch
 import torch.distributed as dist
 
-from megablocks.layers.arguments import Arguments
+# from megablocks.layers.arguments import Arguments
+from .arguments import Arguments
 
 
 class MoeParam(torch.Tensor):
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/router.py CHANGED
@@ -4,8 +4,10 @@ from typing import Any
 
 import torch
 
-from megablocks.layers import common
-from megablocks.layers.arguments import Arguments
+# from megablocks.layers import common
+# from megablocks.layers.arguments import Arguments
+from . import common
+from .arguments import Arguments
 
 _ROUTER_LOGITS = []
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers/sharedexpert_registry.py CHANGED
@@ -3,8 +3,10 @@
 
 from typing import Union
 
-from megablocks.layers import glu, mlp
-from megablocks.layers.arguments import Arguments
+# from megablocks.layers import glu, mlp
+# from megablocks.layers.arguments import Arguments
+from . import glu, mlp
+from .arguments import Arguments
 
 _REGISTRY = {
     'mlp': mlp.SharedMLP,
build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/__init__.py CHANGED
@@ -1,20 +1,20 @@
 # Copyright 2024 Databricks
 # SPDX-License-Identifier: Apache-2.0
 
-from megablocks.ops.binned_gather import binned_gather
-from megablocks.ops.binned_scatter import binned_scatter
-from megablocks.ops.cumsum import exclusive_cumsum, inclusive_cumsum
-from megablocks.ops.gather import gather
-from megablocks.ops.histogram import histogram
-from megablocks.ops.padded_gather import padded_gather
-from megablocks.ops.padded_scatter import padded_scatter
-from megablocks.ops.repeat import repeat
-from megablocks.ops.replicate import replicate
-from megablocks.ops.round_up import round_up
-from megablocks.ops.scatter import scatter
-from megablocks.ops.sort import sort
-from megablocks.ops.sum import sum
-from megablocks.ops.topology import topology
+from .binned_gather import binned_gather
+from .binned_scatter import binned_scatter
+from .cumsum import exclusive_cumsum, inclusive_cumsum
+from .gather import gather
+from .histogram import histogram
+from .padded_gather import padded_gather
+from .padded_scatter import padded_scatter
+from .repeat import repeat
+from .replicate import replicate
+from .round_up import round_up
+from .scatter import scatter
+from .sort import sort
+from .sum import sum
+from .topology import topology
 
 __all__ = [
     'binned_gather',
build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -4,8 +4,11 @@
 import torch
 import torch.distributed as dist
 
-from megablocks import benchmark_util
-from megablocks.layers.all_to_all import all_to_all
+# from megablocks import benchmark_util
+# from megablocks.layers.all_to_all import all_to_all
+
+from .. import benchmark_util
+from ..layers.all_to_all import all_to_all
 
 _ALL_TO_ALL_BENCHMARK = (
     (8, 1024),
build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/binned_gather.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for binned_gather kernel.
build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/binned_scatter.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
 import torch
 from stk.backend.autocast import custom_bwd, custom_fwd
 
-from megablocks.backend import kernels
+from ..backend import kernels
 
 
 # Autograd wrapper for binned_scatter kernel.