kernel
drbh committed on
Commit 5268e56 · 1 Parent(s): e47036a

feat: bump build

Files changed (41)
  1. build/torch26-cxx11-cu118-x86_64-linux/megablocks/{_megablocks_13afbbe_dirty.abi3.so → _megablocks_e47036a.abi3.so} +2 -2
  2. build/torch26-cxx11-cu118-x86_64-linux/megablocks/_ops.py +3 -3
  3. build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers.py +129 -142
  4. build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  5. build/torch26-cxx11-cu124-x86_64-linux/megablocks/{_megablocks_13afbbe_dirty.abi3.so → _megablocks_e47036a.abi3.so} +2 -2
  6. build/torch26-cxx11-cu124-x86_64-linux/megablocks/_ops.py +3 -3
  7. build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers.py +129 -142
  8. build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  9. build/{torch26-cxx98-cu118-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so} +2 -2
  10. build/torch26-cxx11-cu126-x86_64-linux/megablocks/_ops.py +3 -3
  11. build/torch26-cxx11-cu126-x86_64-linux/megablocks/layers.py +129 -142
  12. build/torch26-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  13. build/torch26-cxx98-cu118-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so +3 -0
  14. build/torch26-cxx98-cu118-x86_64-linux/megablocks/_ops.py +3 -3
  15. build/torch26-cxx98-cu118-x86_64-linux/megablocks/layers.py +129 -142
  16. build/torch26-cxx98-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  17. build/torch26-cxx98-cu124-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so +0 -3
  18. build/torch26-cxx98-cu124-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so +3 -0
  19. build/torch26-cxx98-cu124-x86_64-linux/megablocks/_ops.py +3 -3
  20. build/torch26-cxx98-cu124-x86_64-linux/megablocks/layers.py +129 -142
  21. build/torch26-cxx98-cu124-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  22. build/torch26-cxx98-cu126-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so +0 -3
  23. build/torch26-cxx98-cu126-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so +3 -0
  24. build/torch26-cxx98-cu126-x86_64-linux/megablocks/_ops.py +3 -3
  25. build/torch26-cxx98-cu126-x86_64-linux/megablocks/layers.py +129 -142
  26. build/torch26-cxx98-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  27. build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so +0 -3
  28. build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so +3 -0
  29. build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py +3 -3
  30. build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py +129 -142
  31. build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  32. build/torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so +0 -3
  33. build/{torch26-cxx11-cu126-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so} +1 -1
  34. build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py +3 -3
  35. build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py +129 -142
  36. build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
  37. build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so +0 -3
  38. build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so +3 -0
  39. build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py +3 -3
  40. build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py +129 -142
  41. build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/all_to_all_benchmark.py +21 -121
build/torch26-cxx11-cu118-x86_64-linux/megablocks/{_megablocks_13afbbe_dirty.abi3.so → _megablocks_e47036a.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5683ac8b3e98fc8b8ab19f964b0dbfb9a980b6135220b0a0c1b50180665ce341
-size 10517608
+oid sha256:9b5370545c29afcc1d1d0cab8d5fce563647e26aec84eb17a66f66e10ddc92c9
+size 10517576
build/torch26-cxx11-cu118-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_13afbbe_dirty
-ops = torch.ops._megablocks_13afbbe_dirty
+from . import _megablocks_e47036a
+ops = torch.ops._megablocks_e47036a
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_13afbbe_dirty::{op_name}"
+    return f"_megablocks_e47036a::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
     gradient_scale,
     alpha,
 ):
-    """Permute tokens and compute expert outputs."""
     # Route tokens to experts
     x = x.view(-1, x.shape[-1])
 
@@ -367,6 +366,7 @@ def forward_once(
     expert_parallel_group: int = None,
     moe_capacity_factor: float = 1.0,
     moe_expert_model_parallelism: bool = False,
+    mlp_impl: Optional[str] = None,
 ):
     # x: [sl, bs, hs]
     # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
     moe_capacity_factor: float = 1.0,
     moe_expert_model_parallelism: bool = True,
     hidden_size: int = 1152,
+    mlp_impl: Optional[str] = "grouped",
 ):
     # Flatten inputs
     expert_weights = expert_weights.flatten()
     top_experts = top_experts.flatten()
 
+    # TODO: remove debugging var
+    # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
+
     with torch.no_grad():
         # Step 1: Local permutation setup
         indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
 
         # Exchange token counts across devices
         parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
-        # print("world_size:", world_size)
-        # print("experts_per_rank_val:", experts_per_rank_val)
-
+
         # Ensure CUB knows which device to use
         tpe_handle = dist.all_to_all_single(
             parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
     x = ops.repeat(x, (hidden_sharding_deg, 1))
 
     # Cross-device token exchange
-    parallel_x, parallel_x_handle = ops.all_to_all(
-        x,
-        recv_counts,
-        send_counts,
-        expert_parallel_group,
-        async_op=True
+    parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
+        x, recv_counts, send_counts, expert_parallel_group, async_op=True
     )
 
     with torch.no_grad():
         # Step 4: Setup for local expert computation
-        replicate_bins = ops.inclusive_cumsum(
-            parallel_tokens_per_expert.flatten(),
-            0
-        )
+        replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
         replicate_bins = (
             replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
         )
@@ -528,7 +523,7 @@ def parallel_forward_once(
 
         # Sort tokens by expert assignment
         parallel_bin_ids, parallel_indices = ops.sort(
-            parallel_top_expert,
+            parallel_top_expert,
             sort_end_bit,
         )
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
         parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
            dim=0, dtype=torch.int
        )
-        parallel_bins = ops.inclusive_cumsum(
-            parallel_tokens_per_expert,
-            0
-        )
+        parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
        parallel_bins = (
            parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
        )
@@ -558,10 +550,7 @@ def parallel_forward_once(
 
     # Locally permute the tokens and perform the expert computation.
     # Block to make sure that the cross-device permutation is complete.
-    # if self.args.mlp_impl == 'grouped':
-
-    # TODO: dont always assume grouped MLP
-    if True:
+    if mlp_impl == "grouped":
         # GroupedMLP requires counts on CPU. We can use the tensor already
         # moved to CPU for the prior all_to_all, which avoids an extra
         # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
     )
 
     # Step 6: Reverse communication - send results back
-    x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
+    x, _ = _layers.all_to_all.all_to_all(
+        parallel_x, send_counts, recv_counts, expert_parallel_group
+    )
 
     # Step 7: Reduce across hidden sharding dimension
     shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
     return x, tokens_per_expert.flatten()
 
 
-class MyReplacementLayer(torch.nn.Module):
-    def forward(
-        x: torch.Tensor,
-        router_weight: torch.Tensor,
-        moe_top_k: int,
-        moe_num_experts: int,
-        moe_jitter_eps: float = None,
-        moe_normalize_expert_weights: int = None,
-        uniform_expert_assignment: bool = False,
-        training: bool = False,
-        w1: torch.Tensor = None,
-        w2: torch.Tensor = None,
-        w1_bias: torch.Tensor = None,
-        w2_bias: torch.Tensor = None,
-        gradient_scale: Optional[float] = None,
-        alpha: float = 1.702,
-        sort_end_bit: int = 0,
-        expert_parallel_group: torch.distributed.ProcessGroup = None,
-        moe_capacity_factor: float = 1.0,
-        moe_expert_model_parallelism: bool = False,
-        forward_fn: Any = None,
-        hidden_size: int = None,  # Required for parallel forward
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-
-        # Route tokens to experts
-        logits, expert_weights, expert_indices = route_tokens(
-            x,
-            router_weight,
-            moe_top_k,
-            moe_num_experts,
-            moe_jitter_eps,
-            moe_normalize_expert_weights,
-            uniform_expert_assignment,
-            training,
-        )
-
-        # Create router scores for output
-        router_scores = (
-            torch.zeros_like(logits)
-            .scatter_(1, expert_indices, expert_weights)
-            .transpose(0, 1)
-        )
-
-        in_shape = x.size()
-
-        # Prepare forward function arguments
-        forward_args = {
-            "x": x,
-            "expert_weights": expert_weights,
-            "top_experts": expert_indices,
-            "w1": w1,
-            "w2": w2,
-            "w1_bias": w1_bias,
-            "w2_bias": w2_bias,
-            "gradient_scale": gradient_scale,
-            "alpha": alpha,
-            "sort_end_bit": sort_end_bit,
-            "top_k": moe_top_k,
-            "num_experts": moe_num_experts,
-            "expert_parallel_group": expert_parallel_group,
-            "moe_capacity_factor": moe_capacity_factor,
-            "moe_expert_model_parallelism": moe_expert_model_parallelism,
-        }
-
-        # Add hidden_size for parallel forward
-        if moe_expert_model_parallelism and hidden_size is not None:
-            forward_args["hidden_size"] = hidden_size
-        elif moe_expert_model_parallelism and hidden_size is None:
-            # Infer hidden_size from input shape
-            forward_args["hidden_size"] = x.shape[-1]
-
-        # Compute expert outputs
-        x, tokens_per_expert = forward_fn(**forward_args)
-
-        # Save load balancing loss if needed
-        moe_loss_weight = 0.0  # Can be made configurable
-        if training and moe_loss_weight > 0:
-            save_load_balancing_loss((tokens_per_expert, logits))
-
-        # Restore original shape
-        x = x.view(in_shape)
-
-        return x, expert_weights, router_scores
+def moe_forward(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    moe_top_k: int,
+    moe_num_experts: int,
+    moe_jitter_eps: float = None,
+    moe_normalize_expert_weights: int = None,
+    uniform_expert_assignment: bool = False,
+    training: bool = False,
+    w1: torch.Tensor = None,
+    w2: torch.Tensor = None,
+    w1_bias: torch.Tensor = None,
+    w2_bias: torch.Tensor = None,
+    gradient_scale: Optional[float] = None,
+    alpha: float = 1.702,
+    sort_end_bit: int = 0,
+    expert_parallel_group: torch.distributed.ProcessGroup = None,
+    moe_capacity_factor: float = 1.0,
+    moe_expert_model_parallelism: bool = False,
+    forward_fn: Any = None,
+    hidden_size: int = None,
+    mlp_impl: str = "grouped",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+    # Route tokens to experts
+    logits, expert_weights, expert_indices = route_tokens(
+        x,
+        router_weight,
+        moe_top_k,
+        moe_num_experts,
+        moe_jitter_eps,
+        moe_normalize_expert_weights,
+        uniform_expert_assignment,
+        training,
+    )
+
+    # Create router scores for output
+    router_scores = (
+        torch.zeros_like(logits)
+        .scatter_(1, expert_indices, expert_weights)
+        .transpose(0, 1)
+    )
+
+    in_shape = x.size()
+
+    # Prepare forward function arguments
+    forward_args = {
+        "x": x,
+        "expert_weights": expert_weights,
+        "top_experts": expert_indices,
+        "w1": w1,
+        "w2": w2,
+        "w1_bias": w1_bias,
+        "w2_bias": w2_bias,
+        "gradient_scale": gradient_scale,
+        "alpha": alpha,
+        "sort_end_bit": sort_end_bit,
+        "top_k": moe_top_k,
+        "num_experts": moe_num_experts,
+        "expert_parallel_group": expert_parallel_group,
+        "moe_capacity_factor": moe_capacity_factor,
+        "moe_expert_model_parallelism": moe_expert_model_parallelism,
+        "mlp_impl": mlp_impl,
+    }
+
+    # Add hidden_size for parallel forward
+    if moe_expert_model_parallelism and hidden_size is not None:
+        forward_args["hidden_size"] = hidden_size
+    elif moe_expert_model_parallelism and hidden_size is None:
+        # Infer hidden_size from input shape
+        forward_args["hidden_size"] = x.shape[-1]
+
+    # Compute expert outputs
+    x, tokens_per_expert = forward_fn(**forward_args)
+
+    # Save load balancing loss if needed
+    moe_loss_weight = 0.0  # Can be made configurable
+    if training and moe_loss_weight > 0:
+        save_load_balancing_loss((tokens_per_expert, logits))
+
+    # Restore original shape
+    x = x.view(in_shape)
+
+    return x, expert_weights, router_scores
 
 
 class MegaBlocksMoeMLP(torch.nn.Module):
 
-    def forward(
-        self,
-        x: torch.Tensor,
-    ) -> torch.Tensor:
-        router_weight = self.router.weight
-        moe_top_k = 4
-        moe_num_experts = 128
-        w1 = self.experts.gate_up_proj.data
-        w2 = self.experts.down_proj.data
-        w1_bias = self.experts.gate_up_proj_bias.data
-        w2_bias = self.experts.down_proj_bias.data
-
-        # check if the expert_parallel_group attribute is set
-        if hasattr(self, "expert_parallel_group"):
-            expert_parallel_group = self.expert_parallel_group
-            moe_expert_model_parallelism = True
-            forward_fn = parallel_forward_once
-        else:
-            expert_parallel_group = None
-            moe_expert_model_parallelism = False
-            forward_fn = forward_once
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        moe_top_k = getattr(self, "moe_top_k", 4)
+        moe_num_experts = getattr(self, "moe_num_experts", 128)
+        gradient_scale = getattr(self, "gradient_scale", None)
+        alpha = getattr(self, "alpha", 1.702)
+        moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
+        moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
+        moe_normalize_expert_weights = getattr(
+            self, "moe_normalize_expert_weights", None
+        )
+        uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
 
+        has_parallel = hasattr(self, "expert_parallel_group")
+        expert_parallel_group = getattr(self, "expert_parallel_group", None)
+        forward_fn = parallel_forward_once if has_parallel else forward_once
         sort_end_bit = max(
             int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
         )
-        hidden_size = self.experts.hidden_size
-        output, expert_weights_out, router_scores = MyReplacementLayer.forward(
+        mlp_impl = getattr(self, "mlp_impl", "grouped")  # or sparse
+
+        output, expert_weights_out, _ = moe_forward(
             x=x,
-            router_weight=router_weight,
+            router_weight=self.router.weight,
             moe_top_k=moe_top_k,
             moe_num_experts=moe_num_experts,
-            moe_jitter_eps=None,
-            moe_normalize_expert_weights=None,
-            uniform_expert_assignment=False,
-            training=False,
-            w1=w1,
-            w2=w2,
-            w1_bias=w1_bias,
-            w2_bias=w2_bias,
-            gradient_scale=None,
-            alpha=1.702,
+            moe_jitter_eps=moe_jitter_eps,
+            moe_normalize_expert_weights=moe_normalize_expert_weights,
+            uniform_expert_assignment=uniform_expert_assignment,
+            training=self.training,
+            w1=self.experts.gate_up_proj,
+            w2=self.experts.down_proj,
+            w1_bias=self.experts.gate_up_proj_bias,
+            w2_bias=self.experts.down_proj_bias,
+            gradient_scale=gradient_scale,
+            alpha=alpha,
             sort_end_bit=sort_end_bit,
             expert_parallel_group=expert_parallel_group,
-            moe_capacity_factor=1.0,
-            moe_expert_model_parallelism=moe_expert_model_parallelism,
+            moe_capacity_factor=moe_capacity_factor,
+            moe_expert_model_parallelism=has_parallel,
             forward_fn=forward_fn,
-            hidden_size=hidden_size,
+            hidden_size=self.experts.hidden_size,
+            mlp_impl=mlp_impl,
         )
         return output, expert_weights_out
build/torch26-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
 # from megablocks import benchmark_util
 # from megablocks.layers.all_to_all import all_to_all
 
-# from .. import benchmark_util
-
-# Copyright 2024 Databricks
-# SPDX-License-Identifier: Apache-2.0
-
-import numpy as np
-import torch
-
-
-def log_benchmark(name, arguments, time, std):
-    print("=" * 60)
-    print(f"{name} Benchmark")
-    print("Benchmark Parameters:")
-    for key, value in arguments.items():
-        print(f"{key} = {value}")
-    print("Results:")
-    print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
-    print("=" * 60)
-
-
-def benchmark_function(fn, iterations=100, warmup=10):
-    print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
-    # Warmup iterations.
-    for _ in range(warmup):
-        fn()
-
-    times = []
-    print(f"Running {iterations} iterations...")
-    for i in range(iterations):
-        start = torch.cuda.Event(enable_timing=True)
-        end = torch.cuda.Event(enable_timing=True)
-
-        start.record()
-        fn()
-        end.record()
-
-        torch.cuda.synchronize()
-        times.append(start.elapsed_time(end))
-    return np.mean(times), np.std(times)
-
-
-# from .._layers.all_to_all import all_to_all
-
-# Copyright 2024 Databricks
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-import torch.distributed as dist
-
-
-class AllToAllOp(torch.autograd.Function):
-
-    @staticmethod
-    def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
-        out = torch.empty(
-            (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
-        )
-
-        ctx.input_shape = x.shape
-        ctx.output_split_sizes = output_split_sizes
-        ctx.input_split_sizes = input_split_sizes
-        ctx.group = group
-        handle = dist.all_to_all_single(
-            out,
-            x,
-            output_split_sizes=output_split_sizes,
-            input_split_sizes=input_split_sizes,
-            group=group,
-            async_op=async_op,
-        )
-        return out, handle
-
-    @staticmethod
-    def backward(ctx, grad, _):
-        if ctx.needs_input_grad[0]:
-            out = torch.empty(
-                ctx.input_shape,
-                device=grad.device,
-                dtype=grad.dtype,
-            )
-            dist.all_to_all_single(
-                out,
-                grad,
-                output_split_sizes=ctx.input_split_sizes,
-                input_split_sizes=ctx.output_split_sizes,
-                group=ctx.group,
-            )
-            return out, None, None, None, None
-        return None, None, None, None, None
-
-
-def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
-    return AllToAllOp.apply(
-        x,
-        output_split_sizes,
-        input_split_sizes,
-        group,
-        async_op,
-    )
-
+from .. import benchmark_util
+from .._layers.all_to_all import all_to_all
 
 _ALL_TO_ALL_BENCHMARK = (
     (8, 1024),
-    # (16, 1024),
-    # (32, 1024),
-    # (64, 1024),
-    # (128, 1024),
-    # (256, 1024),
-    # (512, 1024),
-    # (1024, 1024),
-    # (2 * 1024, 1024),
-    # (4 * 1024, 1024),
-    # (8 * 1024, 1024),
-    # (16 * 1024, 1024),
-    # (32 * 1024, 1024),
-    # (64 * 1024, 1024),
-    # (128 * 1024, 1024),
-    # (256 * 1024, 1024),
-    # (512 * 1024, 1024),
-    # (1024 * 1024, 1024),
+    (16, 1024),
+    (32, 1024),
+    (64, 1024),
+    (128, 1024),
+    (256, 1024),
+    (512, 1024),
+    (1024, 1024),
+    (2 * 1024, 1024),
+    (4 * 1024, 1024),
+    (8 * 1024, 1024),
+    (16 * 1024, 1024),
+    (32 * 1024, 1024),
+    (64 * 1024, 1024),
+    (128 * 1024, 1024),
+    (256 * 1024, 1024),
+    (512 * 1024, 1024),
+    (1024 * 1024, 1024),
 )
 
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
     def benchmark():
         return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
 
-    # time, std = benchmark_util.benchmark_function(benchmark)
-    time, std = benchmark_function(benchmark)
+    time, std = benchmark_util.benchmark_function(benchmark)
 
     if dist.get_rank(group) == 0:
-        log_benchmark('All-To-All', details, time, std)
-        # benchmark_util.log_benchmark('All-To-All', details, time, std)
+        benchmark_util.log_benchmark('All-To-All', details, time, std)
 
 
 if __name__ == '__main__':
build/torch26-cxx11-cu124-x86_64-linux/megablocks/{_megablocks_13afbbe_dirty.abi3.so → _megablocks_e47036a.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b55d6ee3d41404603fdb75ad9a2949aa92e0224f7056fdbeb4c66934035ebd4b
-size 11869424
+oid sha256:f98b46218f277c881a492ffaf78791cd3af6b2c7707ea5883538008366b18569
+size 11869392
build/torch26-cxx11-cu124-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_13afbbe_dirty
-ops = torch.ops._megablocks_13afbbe_dirty
+from . import _megablocks_e47036a
+ops = torch.ops._megablocks_e47036a
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_13afbbe_dirty::{op_name}"
+    return f"_megablocks_e47036a::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
 
 
 
 
 
 
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
 
 
 
 
 
 
 
 
 
 
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
build/torch26-cxx11-cu124-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
build/{torch26-cxx98-cu118-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a5c8c1b700d297741dd86e8c388e03913a30769ceb51b7c12a01245fbdf30128
-size 10510072
+oid sha256:a8dc9a6a46b46860607a5e7f10374e80dee9e8a65782119e085c6700e199b85a
+size 11931048
build/torch26-cxx11-cu126-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _megablocks_13afbbe_dirty
-ops = torch.ops._megablocks_13afbbe_dirty
+from . import _megablocks_e47036a
+ops = torch.ops._megablocks_e47036a
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_megablocks_13afbbe_dirty::{op_name}"
+    return f"_megablocks_e47036a::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
 
 
 
 
 
 
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
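Note: the router-score construction in `moe_forward` above turns the sparse top-k routing decision into a dense [num_experts, num_tokens] matrix via `scatter_` plus a transpose. A minimal standalone sketch with made-up values (illustrative only, not part of this commit):

import torch

# Toy routing: 3 tokens, 4 experts, top-2; indices and weights are hypothetical.
logits = torch.zeros(3, 4)
expert_indices = torch.tensor([[0, 2], [1, 3], [2, 0]])
expert_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.9, 0.1]])

# Same construction as in moe_forward: dense per-expert scores for each token.
router_scores = (
    torch.zeros_like(logits)
    .scatter_(1, expert_indices, expert_weights)
    .transpose(0, 1)
)
print(router_scores.shape)  # torch.Size([4, 3])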
build/torch26-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
 
12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
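The helper removed from this file timed the collective with CUDA events; the call sites now go back to `benchmark_util`. For reference, the same timing pattern as a self-contained sketch (assumes a CUDA device and that `fn` enqueues work on the current stream):

import numpy as np
import torch

def time_cuda(fn, iterations=100, warmup=10):
    # Warm up so allocator and launch overheads do not skew the measurement.
    for _ in range(warmup):
        fn()
    times = []
    for _ in range(iterations):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))  # milliseconds
    return float(np.mean(times)), float(np.std(times))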
build/torch26-cxx98-cu118-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b7431f955f32750e9006f0c06c4ec0f99ca4a7ecd51585d8626d119db69084
3
+ size 10510040
build/torch26-cxx98-cu118-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_13afbbe_dirty
3
- ops = torch.ops._megablocks_13afbbe_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_13afbbe_dirty::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_e47036a
3
+ ops = torch.ops._megablocks_e47036a
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_e47036a::{op_name}"
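With the rename, every registered kernel now lives under the `_megablocks_e47036a` namespace. A short usage sketch, assuming the wheel built from this commit is importable as `megablocks`:

from megablocks import _ops  # loads the _megablocks_e47036a extension

print(_ops.add_op_namespace_prefix("sort"))
# -> "_megablocks_e47036a::sort"; the kernel itself is reachable as
# torch.ops._megablocks_e47036a.sort, which layers.py calls via `ops.sort`.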
build/torch26-cxx98-cu118-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
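Step 7 in `parallel_forward_once` above folds the hidden-sharded replicas back into one set of tokens by reshaping and summing over the sharding dimension. A toy-shape sketch of that reduce (sizes are illustrative, not the real configuration):

import torch

hidden_sharding_deg, num_tokens, hidden_size = 2, 4, 8  # hypothetical sizes
x = torch.randn(hidden_sharding_deg * num_tokens, hidden_size)

# Same reshape-and-reduce as Step 7: collapse the replicas by summing them.
x = x.view(hidden_sharding_deg, -1, hidden_size).sum(dim=0)
print(x.shape)  # torch.Size([4, 8])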
build/torch26-cxx98-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
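The benchmark exchanges an evenly split buffer between all ranks. A minimal sketch of that call pattern built directly on `torch.distributed.all_to_all_single`, assuming an initialized process group (for example launched with `torchrun`) and a first dimension divisible by the world size:

import torch
import torch.distributed as dist

def even_all_to_all(x: torch.Tensor, group=None) -> torch.Tensor:
    world_size = dist.get_world_size(group)
    split = x.shape[0] // world_size  # even split, as in the benchmark
    out = torch.empty_like(x)
    dist.all_to_all_single(
        out,
        x,
        output_split_sizes=[split] * world_size,
        input_split_sizes=[split] * world_size,
        group=group,
    )
    return out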
build/torch26-cxx98-cu124-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d915db521f8d37fb887ed8334db60165e5923f8dce817d69f6441c5ba2d210d6
3
- size 11857952
 
 
 
 
build/torch26-cxx98-cu124-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c5245e311ba8b23b577be785d6dbd902a75b467ca70274c256042dd21ed235c
3
+ size 11857920
build/torch26-cxx98-cu124-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_13afbbe_dirty
3
- ops = torch.ops._megablocks_13afbbe_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_13afbbe_dirty::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_e47036a
3
+ ops = torch.ops._megablocks_e47036a
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_e47036a::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
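The `ops.inclusive_cumsum` calls above convert per-expert token counts into bin end-offsets that bound each expert's slice after permutation. A small CPU sketch with `torch.cumsum` standing in for the custom op (counts are hypothetical):

import torch

tokens_per_expert = torch.tensor([3, 0, 5, 2])  # 4 experts, made-up counts
bins = torch.cumsum(tokens_per_expert, dim=0)
print(bins)  # tensor([ 3,  3,  8, 10]): end offset of each expert's token range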
build/torch26-cxx98-cu124-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
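For a sense of scale, each `(sequence_length, hidden_size)` entry in `_ALL_TO_ALL_BENCHMARK` moves roughly `sl * hs * bytes_per_element` per rank. A quick back-of-the-envelope sketch, assuming fp16 activations (2 bytes per element):

for sl, hs in ((8, 1024), (1024, 1024), (1024 * 1024, 1024)):
    mib = sl * hs * 2 / 2**20  # fp16 payload in MiB
    print(f"sl={sl:>8}, hs={hs}: ~{mib:.2f} MiB per all_to_all")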
build/torch26-cxx98-cu126-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:94a9a3bb426adceab66b39fe9d179b73e4524167aeb63bed5a67cd7734d31b24
3
- size 11923704
 
 
 
 
build/torch26-cxx98-cu126-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4816243ebcbc505f3e9229dd55ef52c7e0d804a4f4ef67f9fe3e70932cf08027
3
+ size 11923672
build/torch26-cxx98-cu126-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_13afbbe_dirty
3
- ops = torch.ops._megablocks_13afbbe_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_13afbbe_dirty::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_e47036a
3
+ ops = torch.ops._megablocks_e47036a
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_e47036a::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
 
 
 
 
 
 
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
 
 
 
 
 
 
 
 
 
 
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
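Note: the sort_end_bit expression above is the number of radix bits needed to sort expert ids. A small worked example of the same expression (the helper name is illustrative, not from the diff):

import torch

def compute_sort_end_bit(moe_num_experts: int) -> int:
    # Bits required to cover expert ids in [0, moe_num_experts), clamped to >= 1.
    return max(int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1)

# 128 experts -> log2(128) = 7 bits; 100 experts -> ceil(6.64) = 7 bits; 1 expert -> clamped to 1.
assert compute_sort_end_bit(128) == 7
assert compute_sort_end_bit(100) == 7
assert compute_sort_end_bit(1) == 1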
build/torch26-cxx98-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all

12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
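Note: the benchmark now delegates timing to benchmark_util.benchmark_function / log_benchmark instead of the inlined copies deleted above. A minimal sketch of that CUDA-event timing pattern, mirroring the removed inline version (the exact benchmark_util internals are an assumption, as they are not shown in this diff):

import numpy as np
import torch

def benchmark_function(fn, iterations=100, warmup=10):
    # Warm up so allocator and kernel-launch overheads settle before timing.
    for _ in range(warmup):
        fn()
    times = []
    for _ in range(iterations):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()  # wait so elapsed_time reflects the full call
        times.append(start.elapsed_time(end))  # milliseconds
    return np.mean(times), np.std(times)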
build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa9d1964e47ec6ff3c4ec77947f6a2a19868b03cec3618daf0555e011f69924d
3
- size 10517848
 
 
 
 
build/torch27-cxx11-cu118-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c5a31570d353a8c392d72f018198c32c6522af0f5a1345426fd7cff5965c1cd
3
+ size 10517816
build/torch27-cxx11-cu118-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_13afbbe_dirty
3
- ops = torch.ops._megablocks_13afbbe_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_13afbbe_dirty::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_e47036a
3
+ ops = torch.ops._megablocks_e47036a
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_e47036a::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
 
 
 
 
 
 
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
 
 
 
 
 
 
 
 
 
 
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
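Note: in moe_forward above, router_scores is built by scattering the top-k weights back into a dense [num_experts, num_tokens] map. A toy example with made-up shapes:

import torch

# 3 tokens, 4 experts, top-2 routing (toy values).
logits = torch.zeros(3, 4)
expert_indices = torch.tensor([[0, 2], [1, 3], [2, 0]])
expert_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.9, 0.1]])

router_scores = (
    torch.zeros_like(logits)
    .scatter_(1, expert_indices, expert_weights)
    .transpose(0, 1)
)
# Each column holds one token's routing weights at its selected experts, zeros elsewhere.
print(router_scores.shape)  # torch.Size([4, 3])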
build/torch27-cxx11-cu118-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all

12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
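Note: parallel_forward_once above launches the token exchange with async_op=True and only blocks once the result is needed. A usage sketch of that pattern (the handle.wait() placement is an assumption about the collapsed region of the diff):

from megablocks._layers.all_to_all import all_to_all  # path as used in this build

def exchange_tokens(x, send_counts, recv_counts, group):
    # Kick off the cross-device permutation without blocking the host.
    parallel_x, handle = all_to_all(x, recv_counts, send_counts, group, async_op=True)
    # ... independent local work (e.g. computing bins) can overlap here ...
    handle.wait()  # block before touching parallel_x
    return parallel_x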
build/torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b204da58db0f8be45dda62abd98b74a8e60f1f983bfc6a128c74ff66f67cf502
3
- size 11931112
 
 
 
 
build/{torch26-cxx11-cu126-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:516c5026180d4a8d013c500ed284a60ecbed4bc6c9dc084b838913f40327d1a6
3
  size 11931080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8122201f56a5575ec964e0450cac06affb44498cbf5b6b32870676a436821c15
3
  size 11931080
build/torch27-cxx11-cu126-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_13afbbe_dirty
3
- ops = torch.ops._megablocks_13afbbe_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_13afbbe_dirty::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_e47036a
3
+ ops = torch.ops._megablocks_e47036a
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_e47036a::{op_name}"
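Note: the ops handle resolved here is what layers.py calls into (e.g. ops.inclusive_cumsum, ops.sort, ops.repeat). A stand-in sketch of the binning step, using torch.cumsum in place of the custom kernel (the equivalence is an assumption for illustration only):

import torch

# Token counts per expert -> inclusive prefix sums used as bin boundaries.
tokens_per_expert = torch.tensor([3, 0, 5, 2], dtype=torch.int)
bins = torch.cumsum(tokens_per_expert, dim=0)          # stand-in for ops.inclusive_cumsum(..., 0)
bins = bins.view(1) if not len(bins.size()) else bins  # same 0-dim guard as in layers.py
print(bins.tolist())  # [3, 3, 8, 10]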
build/torch27-cxx11-cu126-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
 
 
 
 
 
 
 
 
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
 
 
 
 
 
 
 
 
 
 
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
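A minimal, self-contained sketch of the configuration pattern used by the new MegaBlocksMoeMLP.forward above: every MoE hyperparameter is read with getattr and a default, so it can be attached to the module after construction; the sort_end_bit derivation is reproduced as-is. MoEConfigHolder below is a hypothetical stand-in for illustration, not a megablocks class.

```python
import torch

class MoEConfigHolder(torch.nn.Module):
    # Hypothetical stand-in; only demonstrates the getattr-with-default
    # pattern and the sort_end_bit derivation used in the forward above.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        moe_top_k = getattr(self, "moe_top_k", 4)
        moe_num_experts = getattr(self, "moe_num_experts", 128)
        # Number of bits needed to radix-sort expert ids (at least 1).
        sort_end_bit = max(
            int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
        )
        print(moe_top_k, moe_num_experts, sort_end_bit)
        return x

m = MoEConfigHolder()
m.moe_num_experts = 64  # attributes may be set after construction
m.moe_top_k = 2
_ = m(torch.randn(4, 8))  # prints: 2 64 6
```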
build/torch27-cxx11-cu126-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
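The hunk above drops the inlined CUDA-event timing helpers in favor of megablocks' own benchmark_util. For reference, a self-contained sketch of that event-based timing technique; the names below are local to this sketch, not the benchmark_util API.

```python
import numpy as np
import torch

def time_cuda(fn, iterations=100, warmup=10):
    # Warmup so allocator/JIT effects do not skew the measurement.
    for _ in range(warmup):
        fn()
    times = []
    for _ in range(iterations):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()
        end.record()
        torch.cuda.synchronize()               # wait so elapsed_time() is valid
        times.append(start.elapsed_time(end))  # milliseconds
    return float(np.mean(times)), float(np.std(times))

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")
    mean_ms, std_ms = time_cuda(lambda: x @ x)
    print(f"mean = {mean_ms:.3f} ms, std = {std_ms:.3f} ms")
```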
build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_13afbbe_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f861a8bffedbbf14341d39355f3f43a7c24fee2b99bb9ea7b3a2b9ad21c7ee28
3
- size 17892656
build/torch27-cxx11-cu128-x86_64-linux/megablocks/_megablocks_e47036a.abi3.so ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cb62a4fdccfdac00069689f7a08d2fba56393adb0ca9e8cb7c085f6db919d55
3
+ size 17892624
build/torch27-cxx11-cu128-x86_64-linux/megablocks/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _megablocks_13afbbe_dirty
3
- ops = torch.ops._megablocks_13afbbe_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_megablocks_13afbbe_dirty::{op_name}"
 
1
  import torch
2
+ from . import _megablocks_e47036a
3
+ ops = torch.ops._megablocks_e47036a
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_megablocks_e47036a::{op_name}"
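The _ops.py change only re-points the op namespace at the rebuilt extension (_megablocks_e47036a). A hedged illustration of why ops are addressed through a namespace prefix, using a hypothetical namespace registered from Python rather than the compiled .so:

```python
import torch

# Hypothetical namespace for illustration; the real ops live inside the
# compiled _megablocks_e47036a extension and are loaded from the .so.
lib = torch.library.Library("demo_megablocks_demo0", "DEF")
lib.define("scale(Tensor x, float alpha) -> Tensor")

def _scale_impl(x, alpha):
    return x * alpha

lib.impl("scale", _scale_impl, "CompositeExplicitAutograd")

def add_op_namespace_prefix(op_name: str) -> str:
    # Mirrors the helper in _ops.py, but for the demo namespace above.
    return f"demo_megablocks_demo0::{op_name}"

print(add_op_namespace_prefix("scale"))                       # demo_megablocks_demo0::scale
print(torch.ops.demo_megablocks_demo0.scale(torch.ones(3), 2.0))
```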
build/torch27-cxx11-cu128-x86_64-linux/megablocks/layers.py CHANGED
@@ -333,7 +333,6 @@ def permute_and_compute(
333
  gradient_scale,
334
  alpha,
335
  ):
336
- """Permute tokens and compute expert outputs."""
337
  # Route tokens to experts
338
  x = x.view(-1, x.shape[-1])
339
 
@@ -367,6 +366,7 @@ def forward_once(
367
  expert_parallel_group: int = None,
368
  moe_capacity_factor: float = 1.0,
369
  moe_expert_model_parallelism: bool = False,
 
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
@@ -430,11 +430,15 @@ def parallel_forward_once(
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
 
433
  ):
434
  # Flatten inputs
435
  expert_weights = expert_weights.flatten()
436
  top_experts = top_experts.flatten()
437
 
 
 
 
438
  with torch.no_grad():
439
  # Step 1: Local permutation setup
440
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
@@ -455,9 +459,7 @@ def parallel_forward_once(
455
 
456
  # Exchange token counts across devices
457
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
458
- # print("world_size:", world_size)
459
- # print("experts_per_rank_val:", experts_per_rank_val)
460
-
461
  # Ensure CUB knows which device to use
462
  tpe_handle = dist.all_to_all_single(
463
  parallel_tokens_per_expert,
@@ -493,20 +495,13 @@ def parallel_forward_once(
493
  x = ops.repeat(x, (hidden_sharding_deg, 1))
494
 
495
  # Cross-device token exchange
496
- parallel_x, parallel_x_handle = ops.all_to_all(
497
- x,
498
- recv_counts,
499
- send_counts,
500
- expert_parallel_group,
501
- async_op=True
502
  )
503
 
504
  with torch.no_grad():
505
  # Step 4: Setup for local expert computation
506
- replicate_bins = ops.inclusive_cumsum(
507
- parallel_tokens_per_expert.flatten(),
508
- 0
509
- )
510
  replicate_bins = (
511
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
512
  )
@@ -528,7 +523,7 @@ def parallel_forward_once(
528
 
529
  # Sort tokens by expert assignment
530
  parallel_bin_ids, parallel_indices = ops.sort(
531
- parallel_top_expert,
532
  sort_end_bit,
533
  )
534
 
@@ -536,10 +531,7 @@ def parallel_forward_once(
536
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
537
  dim=0, dtype=torch.int
538
  )
539
- parallel_bins = ops.inclusive_cumsum(
540
- parallel_tokens_per_expert,
541
- 0
542
- )
543
  parallel_bins = (
544
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
545
  )
@@ -558,10 +550,7 @@ def parallel_forward_once(
558
 
559
  # Locally permute the tokens and perform the expert computation.
560
  # Block to make sure that the cross-device permutation is complete.
561
- # if self.args.mlp_impl == 'grouped':
562
-
563
- # TODO: dont always assume grouped MLP
564
- if True:
565
  # GroupedMLP requires counts on CPU. We can use the tensor already
566
  # moved to CPU for the prior all_to_all, which avoids an extra
567
  # device synchronization.
@@ -591,7 +580,9 @@ def parallel_forward_once(
591
  )
592
 
593
  # Step 6: Reverse communication - send results back
594
- x, _ = ops.all_to_all(parallel_x, send_counts, recv_counts, expert_parallel_group)
 
 
595
 
596
  # Step 7: Reduce across hidden sharding dimension
597
  shape = (hidden_sharding_deg, -1, hidden_size)
@@ -603,139 +594,135 @@ def parallel_forward_once(
603
  return x, tokens_per_expert.flatten()
604
 
605
 
606
- class MyReplacementLayer(torch.nn.Module):
607
- def forward(
608
- x: torch.Tensor,
609
- router_weight: torch.Tensor,
610
- moe_top_k: int,
611
- moe_num_experts: int,
612
- moe_jitter_eps: float = None,
613
- moe_normalize_expert_weights: int = None,
614
- uniform_expert_assignment: bool = False,
615
- training: bool = False,
616
- w1: torch.Tensor = None,
617
- w2: torch.Tensor = None,
618
- w1_bias: torch.Tensor = None,
619
- w2_bias: torch.Tensor = None,
620
- gradient_scale: Optional[float] = None,
621
- alpha: float = 1.702,
622
- sort_end_bit: int = 0,
623
- expert_parallel_group: torch.distributed.ProcessGroup = None,
624
- moe_capacity_factor: float = 1.0,
625
- moe_expert_model_parallelism: bool = False,
626
- forward_fn: Any = None,
627
- hidden_size: int = None, # Required for parallel forward
628
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
629
-
630
- # Route tokens to experts
631
- logits, expert_weights, expert_indices = route_tokens(
632
- x,
633
- router_weight,
634
- moe_top_k,
635
- moe_num_experts,
636
- moe_jitter_eps,
637
- moe_normalize_expert_weights,
638
- uniform_expert_assignment,
639
- training,
640
- )
641
 
642
- # Create router scores for output
643
- router_scores = (
644
- torch.zeros_like(logits)
645
- .scatter_(1, expert_indices, expert_weights)
646
- .transpose(0, 1)
647
- )
 
 
 
 
 
648
 
649
- in_shape = x.size()
650
-
651
- # Prepare forward function arguments
652
- forward_args = {
653
- "x": x,
654
- "expert_weights": expert_weights,
655
- "top_experts": expert_indices,
656
- "w1": w1,
657
- "w2": w2,
658
- "w1_bias": w1_bias,
659
- "w2_bias": w2_bias,
660
- "gradient_scale": gradient_scale,
661
- "alpha": alpha,
662
- "sort_end_bit": sort_end_bit,
663
- "top_k": moe_top_k,
664
- "num_experts": moe_num_experts,
665
- "expert_parallel_group": expert_parallel_group,
666
- "moe_capacity_factor": moe_capacity_factor,
667
- "moe_expert_model_parallelism": moe_expert_model_parallelism,
668
- }
669
-
670
- # Add hidden_size for parallel forward
671
- if moe_expert_model_parallelism and hidden_size is not None:
672
- forward_args["hidden_size"] = hidden_size
673
- elif moe_expert_model_parallelism and hidden_size is None:
674
- # Infer hidden_size from input shape
675
- forward_args["hidden_size"] = x.shape[-1]
676
-
677
- # Compute expert outputs
678
- x, tokens_per_expert = forward_fn(**forward_args)
679
-
680
- # Save load balancing loss if needed
681
- moe_loss_weight = 0.0 # Can be made configurable
682
- if training and moe_loss_weight > 0:
683
- save_load_balancing_loss((tokens_per_expert, logits))
684
-
685
- # Restore original shape
686
- x = x.view(in_shape)
687
-
688
- return x, expert_weights, router_scores
689
 
690
 
691
  class MegaBlocksMoeMLP(torch.nn.Module):
692
 
693
- def forward(
694
- self,
695
- x: torch.Tensor,
696
- ) -> torch.Tensor:
697
- router_weight = self.router.weight
698
- moe_top_k = 4
699
- moe_num_experts = 128
700
- w1 = self.experts.gate_up_proj.data
701
- w2 = self.experts.down_proj.data
702
- w1_bias = self.experts.gate_up_proj_bias.data
703
- w2_bias = self.experts.down_proj_bias.data
704
-
705
- # check if the expert_parallel_group attribute is set
706
- if hasattr(self, "expert_parallel_group"):
707
- expert_parallel_group = self.expert_parallel_group
708
- moe_expert_model_parallelism = True
709
- forward_fn = parallel_forward_once
710
- else:
711
- expert_parallel_group = None
712
- moe_expert_model_parallelism = False
713
- forward_fn = forward_once
714
 
 
 
 
715
  sort_end_bit = max(
716
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
717
  )
718
- hidden_size = self.experts.hidden_size
719
- output, expert_weights_out, router_scores = MyReplacementLayer.forward(
 
720
  x=x,
721
- router_weight=router_weight,
722
  moe_top_k=moe_top_k,
723
  moe_num_experts=moe_num_experts,
724
- moe_jitter_eps=None,
725
- moe_normalize_expert_weights=None,
726
- uniform_expert_assignment=False,
727
- training=False,
728
- w1=w1,
729
- w2=w2,
730
- w1_bias=w1_bias,
731
- w2_bias=w2_bias,
732
- gradient_scale=None,
733
- alpha=1.702,
734
  sort_end_bit=sort_end_bit,
735
  expert_parallel_group=expert_parallel_group,
736
- moe_capacity_factor=1.0,
737
- moe_expert_model_parallelism=moe_expert_model_parallelism,
738
  forward_fn=forward_fn,
739
- hidden_size=hidden_size,
 
740
  )
741
  return output, expert_weights_out
 
333
  gradient_scale,
334
  alpha,
335
  ):
 
336
  # Route tokens to experts
337
  x = x.view(-1, x.shape[-1])
338
 
 
366
  expert_parallel_group: int = None,
367
  moe_capacity_factor: float = 1.0,
368
  moe_expert_model_parallelism: bool = False,
369
+ mlp_impl: Optional[str] = None,
370
  ):
371
  # x: [sl, bs, hs]
372
  # expert_weights: [sl * bs, top-k]
 
430
  moe_capacity_factor: float = 1.0,
431
  moe_expert_model_parallelism: bool = True,
432
  hidden_size: int = 1152,
433
+ mlp_impl: Optional[str] = "grouped",
434
  ):
435
  # Flatten inputs
436
  expert_weights = expert_weights.flatten()
437
  top_experts = top_experts.flatten()
438
 
439
+ # TODO: remove debugging var
440
+ # my_rank = dist.get_rank(expert_parallel_group) if expert_parallel_group else 0
441
+
442
  with torch.no_grad():
443
  # Step 1: Local permutation setup
444
  indices, bin_ids, bins, tokens_per_expert = indices_and_bins(
 
459
 
460
  # Exchange token counts across devices
461
  parallel_tokens_per_expert = torch.empty_like(repeated_tokens_per_expert)
462
+
 
 
463
  # Ensure CUB knows which device to use
464
  tpe_handle = dist.all_to_all_single(
465
  parallel_tokens_per_expert,
 
495
  x = ops.repeat(x, (hidden_sharding_deg, 1))
496
 
497
  # Cross-device token exchange
498
+ parallel_x, parallel_x_handle = _layers.all_to_all.all_to_all(
499
+ x, recv_counts, send_counts, expert_parallel_group, async_op=True
 
 
 
 
500
  )
501
 
502
  with torch.no_grad():
503
  # Step 4: Setup for local expert computation
504
+ replicate_bins = ops.inclusive_cumsum(parallel_tokens_per_expert.flatten(), 0)
 
 
 
505
  replicate_bins = (
506
  replicate_bins.view(1) if not len(replicate_bins.size()) else replicate_bins
507
  )
 
523
 
524
  # Sort tokens by expert assignment
525
  parallel_bin_ids, parallel_indices = ops.sort(
526
+ parallel_top_expert,
527
  sort_end_bit,
528
  )
529
 
 
531
  parallel_tokens_per_expert = parallel_tokens_per_expert.sum(
532
  dim=0, dtype=torch.int
533
  )
534
+ parallel_bins = ops.inclusive_cumsum(parallel_tokens_per_expert, 0)
 
 
 
535
  parallel_bins = (
536
  parallel_bins.view(1) if not len(parallel_bins.size()) else parallel_bins
537
  )
 
550
 
551
  # Locally permute the tokens and perform the expert computation.
552
  # Block to make sure that the cross-device permutation is complete.
553
+ if mlp_impl == "grouped":
 
 
 
554
  # GroupedMLP requires counts on CPU. We can use the tensor already
555
  # moved to CPU for the prior all_to_all, which avoids an extra
556
  # device synchronization.
 
580
  )
581
 
582
  # Step 6: Reverse communication - send results back
583
+ x, _ = _layers.all_to_all.all_to_all(
584
+ parallel_x, send_counts, recv_counts, expert_parallel_group
585
+ )
586
 
587
  # Step 7: Reduce across hidden sharding dimension
588
  shape = (hidden_sharding_deg, -1, hidden_size)
 
594
  return x, tokens_per_expert.flatten()
595
 
596
 
597
+ def moe_forward(
598
+ x: torch.Tensor,
599
+ router_weight: torch.Tensor,
600
+ moe_top_k: int,
601
+ moe_num_experts: int,
602
+ moe_jitter_eps: float = None,
603
+ moe_normalize_expert_weights: int = None,
604
+ uniform_expert_assignment: bool = False,
605
+ training: bool = False,
606
+ w1: torch.Tensor = None,
607
+ w2: torch.Tensor = None,
608
+ w1_bias: torch.Tensor = None,
609
+ w2_bias: torch.Tensor = None,
610
+ gradient_scale: Optional[float] = None,
611
+ alpha: float = 1.702,
612
+ sort_end_bit: int = 0,
613
+ expert_parallel_group: torch.distributed.ProcessGroup = None,
614
+ moe_capacity_factor: float = 1.0,
615
+ moe_expert_model_parallelism: bool = False,
616
+ forward_fn: Any = None,
617
+ hidden_size: int = None,
618
+ mlp_impl: str = "grouped",
619
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
620
 
621
+ # Route tokens to experts
622
+ logits, expert_weights, expert_indices = route_tokens(
623
+ x,
624
+ router_weight,
625
+ moe_top_k,
626
+ moe_num_experts,
627
+ moe_jitter_eps,
628
+ moe_normalize_expert_weights,
629
+ uniform_expert_assignment,
630
+ training,
631
+ )
632
 
633
+ # Create router scores for output
634
+ router_scores = (
635
+ torch.zeros_like(logits)
636
+ .scatter_(1, expert_indices, expert_weights)
637
+ .transpose(0, 1)
638
+ )
639
+
640
+ in_shape = x.size()
641
+
642
+ # Prepare forward function arguments
643
+ forward_args = {
644
+ "x": x,
645
+ "expert_weights": expert_weights,
646
+ "top_experts": expert_indices,
647
+ "w1": w1,
648
+ "w2": w2,
649
+ "w1_bias": w1_bias,
650
+ "w2_bias": w2_bias,
651
+ "gradient_scale": gradient_scale,
652
+ "alpha": alpha,
653
+ "sort_end_bit": sort_end_bit,
654
+ "top_k": moe_top_k,
655
+ "num_experts": moe_num_experts,
656
+ "expert_parallel_group": expert_parallel_group,
657
+ "moe_capacity_factor": moe_capacity_factor,
658
+ "moe_expert_model_parallelism": moe_expert_model_parallelism,
659
+ "mlp_impl": mlp_impl,
660
+ }
661
+
662
+ # Add hidden_size for parallel forward
663
+ if moe_expert_model_parallelism and hidden_size is not None:
664
+ forward_args["hidden_size"] = hidden_size
665
+ elif moe_expert_model_parallelism and hidden_size is None:
666
+ # Infer hidden_size from input shape
667
+ forward_args["hidden_size"] = x.shape[-1]
668
+
669
+ # Compute expert outputs
670
+ x, tokens_per_expert = forward_fn(**forward_args)
671
+
672
+ # Save load balancing loss if needed
673
+ moe_loss_weight = 0.0 # Can be made configurable
674
+ if training and moe_loss_weight > 0:
675
+ save_load_balancing_loss((tokens_per_expert, logits))
676
+
677
+ # Restore original shape
678
+ x = x.view(in_shape)
679
+
680
+ return x, expert_weights, router_scores
681
 
682
 
683
  class MegaBlocksMoeMLP(torch.nn.Module):
684
 
685
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
686
+ moe_top_k = getattr(self, "moe_top_k", 4)
687
+ moe_num_experts = getattr(self, "moe_num_experts", 128)
688
+ gradient_scale = getattr(self, "gradient_scale", None)
689
+ alpha = getattr(self, "alpha", 1.702)
690
+ moe_capacity_factor = getattr(self, "moe_capacity_factor", 1.0)
691
+ moe_jitter_eps = getattr(self, "moe_jitter_eps", None)
692
+ moe_normalize_expert_weights = getattr(
693
+ self, "moe_normalize_expert_weights", None
694
+ )
695
+ uniform_expert_assignment = getattr(self, "uniform_expert_assignment", False)
696
 
697
+ has_parallel = hasattr(self, "expert_parallel_group")
698
+ expert_parallel_group = getattr(self, "expert_parallel_group", None)
699
+ forward_fn = parallel_forward_once if has_parallel else forward_once
700
  sort_end_bit = max(
701
  int(torch.ceil(torch.log2(torch.tensor(moe_num_experts)))), 1
702
  )
703
+ mlp_impl = getattr(self, "mlp_impl", "grouped") # or sparse
704
+
705
+ output, expert_weights_out, _ = moe_forward(
706
  x=x,
707
+ router_weight=self.router.weight,
708
  moe_top_k=moe_top_k,
709
  moe_num_experts=moe_num_experts,
710
+ moe_jitter_eps=moe_jitter_eps,
711
+ moe_normalize_expert_weights=moe_normalize_expert_weights,
712
+ uniform_expert_assignment=uniform_expert_assignment,
713
+ training=self.training,
714
+ w1=self.experts.gate_up_proj,
715
+ w2=self.experts.down_proj,
716
+ w1_bias=self.experts.gate_up_proj_bias,
717
+ w2_bias=self.experts.down_proj_bias,
718
+ gradient_scale=gradient_scale,
719
+ alpha=alpha,
720
  sort_end_bit=sort_end_bit,
721
  expert_parallel_group=expert_parallel_group,
722
+ moe_capacity_factor=moe_capacity_factor,
723
+ moe_expert_model_parallelism=has_parallel,
724
  forward_fn=forward_fn,
725
+ hidden_size=self.experts.hidden_size,
726
+ mlp_impl=mlp_impl,
727
  )
728
  return output, expert_weights_out
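A self-contained sketch of the routing bookkeeping that moe_forward performs around route_tokens: top-k expert selection followed by scattering the chosen weights back into a dense router_scores matrix. Jitter, weight normalization and uniform assignment are omitted; shapes are illustrative.

```python
import torch

num_tokens, hidden, num_experts, top_k = 6, 8, 4, 2
x = torch.randn(num_tokens, hidden)
router_weight = torch.randn(num_experts, hidden)

# Router logits and top-k expert selection per token.
logits = x @ router_weight.t()                            # [tokens, experts]
expert_weights, expert_indices = logits.softmax(dim=-1).topk(top_k, dim=-1)

# Dense score matrix: zeros everywhere except the selected experts,
# then transposed to [experts, tokens] as in the code above.
router_scores = (
    torch.zeros_like(logits)
    .scatter_(1, expert_indices, expert_weights)
    .transpose(0, 1)
)
print(router_scores.shape)  # torch.Size([4, 6])
```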
build/torch27-cxx11-cu128-x86_64-linux/megablocks/ops/all_to_all_benchmark.py CHANGED
@@ -7,126 +7,28 @@ import torch.distributed as dist
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
- # from .. import benchmark_util
11
-
12
- # Copyright 2024 Databricks
13
- # SPDX-License-Identifier: Apache-2.0
14
-
15
- import numpy as np
16
- import torch
17
-
18
-
19
- def log_benchmark(name, arguments, time, std):
20
- print("=" * 60)
21
- print(f"{name} Benchmark")
22
- print("Benchmark Parameters:")
23
- for key, value in arguments.items():
24
- print(f"{key} = {value}")
25
- print("Results:")
26
- print("mean time = {:.3f}ms, std time = {:.3f}ms".format(time, std))
27
- print("=" * 60)
28
-
29
-
30
- def benchmark_function(fn, iterations=100, warmup=10):
31
- print(f"Benchmarking {fn.__name__} with {iterations} iterations and {warmup} warmup iterations")
32
- # Warmup iterations.
33
- for _ in range(warmup):
34
- fn()
35
-
36
- times = []
37
- print(f"Running {iterations} iterations...")
38
- for i in range(iterations):
39
- start = torch.cuda.Event(enable_timing=True)
40
- end = torch.cuda.Event(enable_timing=True)
41
-
42
- start.record()
43
- fn()
44
- end.record()
45
-
46
- torch.cuda.synchronize()
47
- times.append(start.elapsed_time(end))
48
- return np.mean(times), np.std(times)
49
-
50
-
51
- # from .._layers.all_to_all import all_to_all
52
-
53
- # Copyright 2024 Databricks
54
- # SPDX-License-Identifier: Apache-2.0
55
-
56
- import torch
57
- import torch.distributed as dist
58
-
59
-
60
- class AllToAllOp(torch.autograd.Function):
61
-
62
- @staticmethod
63
- def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
64
- out = torch.empty(
65
- (sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype
66
- )
67
-
68
- ctx.input_shape = x.shape
69
- ctx.output_split_sizes = output_split_sizes
70
- ctx.input_split_sizes = input_split_sizes
71
- ctx.group = group
72
- handle = dist.all_to_all_single(
73
- out,
74
- x,
75
- output_split_sizes=output_split_sizes,
76
- input_split_sizes=input_split_sizes,
77
- group=group,
78
- async_op=async_op,
79
- )
80
- return out, handle
81
-
82
- @staticmethod
83
- def backward(ctx, grad, _):
84
- if ctx.needs_input_grad[0]:
85
- out = torch.empty(
86
- ctx.input_shape,
87
- device=grad.device,
88
- dtype=grad.dtype,
89
- )
90
- dist.all_to_all_single(
91
- out,
92
- grad,
93
- output_split_sizes=ctx.input_split_sizes,
94
- input_split_sizes=ctx.output_split_sizes,
95
- group=ctx.group,
96
- )
97
- return out, None, None, None, None
98
- return None, None, None, None, None
99
-
100
-
101
- def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
102
- return AllToAllOp.apply(
103
- x,
104
- output_split_sizes,
105
- input_split_sizes,
106
- group,
107
- async_op,
108
- )
109
-
110
 
111
  _ALL_TO_ALL_BENCHMARK = (
112
  (8, 1024),
113
- # (16, 1024),
114
- # (32, 1024),
115
- # (64, 1024),
116
- # (128, 1024),
117
- # (256, 1024),
118
- # (512, 1024),
119
- # (1024, 1024),
120
- # (2 * 1024, 1024),
121
- # (4 * 1024, 1024),
122
- # (8 * 1024, 1024),
123
- # (16 * 1024, 1024),
124
- # (32 * 1024, 1024),
125
- # (64 * 1024, 1024),
126
- # (128 * 1024, 1024),
127
- # (256 * 1024, 1024),
128
- # (512 * 1024, 1024),
129
- # (1024 * 1024, 1024),
130
  )
131
 
132
 
@@ -145,12 +47,10 @@ def benchmark_all_to_all(group, sl, hs):
145
  def benchmark():
146
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
147
 
148
- # time, std = benchmark_util.benchmark_function(benchmark)
149
- time, std = benchmark_function(benchmark)
150
 
151
  if dist.get_rank(group) == 0:
152
- log_benchmark('All-To-All', details, time, std)
153
- # benchmark_util.log_benchmark('All-To-All', details, time, std)
154
 
155
 
156
  if __name__ == '__main__':
 
7
  # from megablocks import benchmark_util
8
  # from megablocks.layers.all_to_all import all_to_all
9
 
10
+ from .. import benchmark_util
11
+ from .._layers.all_to_all import all_to_all
12
 
13
  _ALL_TO_ALL_BENCHMARK = (
14
  (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (256, 1024),
20
+ (512, 1024),
21
+ (1024, 1024),
22
+ (2 * 1024, 1024),
23
+ (4 * 1024, 1024),
24
+ (8 * 1024, 1024),
25
+ (16 * 1024, 1024),
26
+ (32 * 1024, 1024),
27
+ (64 * 1024, 1024),
28
+ (128 * 1024, 1024),
29
+ (256 * 1024, 1024),
30
+ (512 * 1024, 1024),
31
+ (1024 * 1024, 1024),
32
  )
33
 
34
 
 
47
  def benchmark():
48
  return all_to_all(x, send_recv_sizes, send_recv_sizes, group)
49
 
50
+ time, std = benchmark_util.benchmark_function(benchmark)
 
51
 
52
  if dist.get_rank(group) == 0:
53
+ benchmark_util.log_benchmark('All-To-All', details, time, std)
 
54
 
55
 
56
  if __name__ == '__main__':
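The benchmark now imports all_to_all from .._layers.all_to_all instead of carrying its own copy. Below is a hedged sketch of the kind of symmetric all_to_all_single exchange it times; the even-split helper is an assumption for illustration, not the script's actual code, and the distributed branch only runs under an initialized process group (e.g. via torchrun on a multi-GPU host).

```python
import torch
import torch.distributed as dist

def even_split_sizes(sl: int, world_size: int) -> list[int]:
    # Assumption: each rank sends/receives an equal slice of the sequence.
    assert sl % world_size == 0, "sketch assumes sl divides evenly"
    return [sl // world_size] * world_size

if dist.is_available() and dist.is_initialized():
    group = dist.group.WORLD
    world_size = dist.get_world_size(group)
    sl, hs = 8, 1024
    sizes = even_split_sizes(sl, world_size)
    x = torch.randn(sl, hs, device="cuda")
    out = torch.empty_like(x)
    dist.all_to_all_single(
        out, x,
        output_split_sizes=sizes,
        input_split_sizes=sizes,
        group=group,
    )
else:
    print(even_split_sizes(8, 2))  # [4, 4] — rows exchanged with each peer
```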