drbh committed on Jun 12

Commit

9c4ca75

1 Parent(s): a4f6452

feat: validate build with original test suite

Files changed (24) hide show

README.md +36 -0
build.toml +24 -2
tests/conftest.py +110 -0
tests/fixtures/autouse.py +107 -0
tests/fixtures/fixtures.py +13 -0
tests/layers/architectures.py +53 -0
tests/layers/moe_test.py +199 -0
tests/ops/binned_gather_test.py +71 -0
tests/ops/binned_scatter_test.py +87 -0
tests/ops/cumsum_test.py +44 -0
tests/ops/histogram_test.py +82 -0
tests/ops/padded_gather_test.py +94 -0
tests/ops/padded_scatter_test.py +155 -0
tests/ops/replicate_test.py +108 -0
tests/ops/sort_test.py +65 -0
tests/ops/topology_test.py +81 -0
tests/test_mb_moe.py +42 -0
torch-ext/megablocks/__init__.py +6 -2
torch-ext/megablocks/ops/cumsum.py +1 -1
torch-ext/megablocks/ops/histogram.py +1 -1
torch-ext/megablocks/ops/replicate.py +1 -2
torch-ext/megablocks/ops/sort.py +1 -2
torch-ext/megablocks/ops/topology.py +1 -2
torch-ext/torch_binding.cpp +13 -13

README.md CHANGED Viewed

@@ -4,3 +4,39 @@ tags:
   - kernel
 ---

   - kernel
 ---
+```bash
+nix develop --show-trace -i -L .#test --command python -m pytest -s tests
+```
+expected output:
+```
+============== test session starts ===============
+platform linux -- Python 3.12.10, pytest-8.3.5, pluggy-1.5.0
+rootdir: /home/ubuntu/Projects/megablocks-moe
+plugins: hypothesis-6.130.12
+collecting 43 items                              world_size=1
+collected 387 items
+tests/layers/moe_test.py ...........................................
+tests/ops/binned_gather_test.py .....................
+tests/ops/binned_scatter_test.py .....................
+tests/ops/cumsum_test.py ................................
+tests/ops/histogram_test.py ......................................................
+tests/ops/padded_gather_test.py ......................................
+tests/ops/padded_scatter_test.py ......................................................
+tests/ops/replicate_test.py ..................................................................................
+tests/ops/sort_test.py ..................
+tests/ops/topology_test.py ....................
+tests/test_mb_moe.py megablocks_moe module imported successfully.
+Available functions: ['Arguments', 'MLP', 'MoE', 'ParallelDroplessMLP', 'ParallelMLP', 'SparseGLU', 'SparseMLP', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_megablocks_a4f6452_dirty', '_ops', 'argsort', 'backend', 'cumsum', 'dMoE', 'exclusive_cumsum', 'get_load_balancing_loss', 'grouped_gemm_util', 'histogram', 'inclusive_cumsum', 'indices', 'layers', 'ops', 'replicate_backward', 'replicate_forward', 'sort', 'torch']
+.cumsum output: tensor([0, 1, 3, 6], device='cuda:0', dtype=torch.int16)
+...
+================ warnings summary ================
+...
+-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
+======= 387 passed, 18 warnings in 54.63s ========
+```

build.toml CHANGED Viewed

@@ -10,6 +10,20 @@ src = [
 [kernel.megablocks]
 backend = "cuda"
 src = [
     "csrc/new_cumsum.h",
     "csrc/new_cumsum.cu",
@@ -22,9 +36,17 @@ src = [
     "csrc/new_sort.h",
     "csrc/new_sort.cu",
 ]
-depends = [ "torch", "cutlass_3_8" ]
 [test]
 python-git-packages = [
-    { url = "https://github.com/stanford-futuredata/stk.git", rev = "7363137", sha256 = "0m6g5l9nlwaiwybg5j8dhnz159wdpabdnkzapnn3dsifxrsb59vz" }
 ]

 [kernel.megablocks]
 backend = "cuda"
+cuda-capabilities = [
+    "7.0",
+    "7.2",
+    "7.5",
+    "8.0",
+    "8.6",
+    "8.7",
+    "8.9",
+    "9.0",
+    "10.0",
+    "10.1",
+    "12.0",
+]
+depends = ["torch", "cutlass_3_8"]
 src = [
     "csrc/new_cumsum.h",
     "csrc/new_cumsum.cu",
     "csrc/new_sort.h",
     "csrc/new_sort.cu",
 ]
 [test]
 python-git-packages = [
+    { url = "https://github.com/stanford-futuredata/stk.git", rev = "7363137", sha256 = "0m6g5l9nlwaiwybg5j8dhnz159wdpabdnkzapnn3dsifxrsb59vz" },
+    { url = "https://github.com/mosaicml/composer.git", rev = "v0.9.0", sha256 = "ekJ5nE6JwYY6Ld9kIk72R/a3iI943Gd5yvAkBHQs5aI=" },
+    # { url = "https://github.com/tgale96/grouped_gemm.git", rev = "v0.3.0", sha256 = "sha256-fS6MuDj6yQ00CSzFrmAmM20/ccvtLJ1MFjfeqdwuPl8=" }
+]
+python-packages = [
+    "tqdm",
+    "py-cpuinfo",
+    "importlib-metadata",
+    "torchmetrics",
+    # "yahp"
 ]

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,110 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import List, Optional
+import pytest
+# from composer.utils import reproducibility
+# Allowed options for pytest.mark.world_size()
+WORLD_SIZE_OPTIONS = (1, 2)
+# Enforce deterministic mode before any tests start.
+# reproducibility.configure_deterministic_mode()
+# TODO: allow plugins when deps resolved
+# Add the path of any pytest fixture files you want to make global
+pytest_plugins = [
+    # 'tests.fixtures.autouse',
+    'tests.fixtures.fixtures',
+]
+def _get_world_size(item: pytest.Item):
+    """Returns the world_size of a test, defaults to 1."""
+    _default = pytest.mark.world_size(1).mark
+    return item.get_closest_marker('world_size', default=_default).args[0]
+def _get_option(
+    config: pytest.Config,
+    name: str,
+    default: Optional[str] = None,
+) -> str:  # type: ignore
+    val = config.getoption(name)
+    if val is not None:
+        assert isinstance(val, str)
+        return val
+    val = config.getini(name)
+    if val == []:
+        val = None
+    if val is None:
+        if default is None:
+            pytest.fail(f'Config option {name} is not specified but is required',)
+        val = default
+    assert isinstance(val, str)
+    return val
+def _add_option(
+    parser: pytest.Parser,
+    name: str,
+    help: str,
+    choices: Optional[list[str]] = None,
+):
+    parser.addoption(
+        f'--{name}',
+        default=None,
+        type=str,
+        choices=choices,
+        help=help,
+    )
+    parser.addini(
+        name=name,
+        help=help,
+        type='string',
+        default=None,
+    )
+def pytest_collection_modifyitems(
+    config: pytest.Config,
+    items: List[pytest.Item],
+) -> None:
+    """Filter tests by world_size (for multi-GPU tests)"""
+    world_size = int(os.environ.get('WORLD_SIZE', '1'))
+    print(f'world_size={world_size}')
+    conditions = [
+        lambda item: _get_world_size(item) == world_size,
+    ]
+    # keep items that satisfy all conditions
+    remaining = []
+    deselected = []
+    for item in items:
+        if all(condition(item) for condition in conditions):
+            remaining.append(item)
+        else:
+            deselected.append(item)
+    if deselected:
+        config.hook.pytest_deselected(items=deselected)
+        items[:] = remaining
+def pytest_addoption(parser: pytest.Parser) -> None:
+    _add_option(
+        parser,
+        'seed',
+        help="""\
+        Rank zero seed to use. `reproducibility.seed_all(seed + dist.get_global_rank())` will be invoked
+        before each test.""",
+    )
+def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
+    if exitstatus == 5:
+        session.exitstatus = 0  # Ignore no-test-ran errors

tests/fixtures/autouse.py ADDED Viewed

	@@ -0,0 +1,107 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import gc
+import logging
+import os
+import composer
+import pytest
+import torch
+from composer.devices import DeviceCPU, DeviceGPU
+from composer.utils import dist, reproducibility
+@pytest.fixture(autouse=True)
+def clear_cuda_cache(request: pytest.FixtureRequest):
+    """Clear memory between GPU tests."""
+    marker = request.node.get_closest_marker('gpu')
+    if marker is not None and torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()  # Only gc on GPU tests as it 2x slows down CPU tests
+@pytest.fixture(autouse=True)
+def reset_mlflow_tracking_dir():
+    """Reset the MLFlow tracking URI to ``None`` so it doesn't persist across tests.
+    Autouse, so it applies to every test; it is a no-op when ``mlflow`` cannot be imported.
+    """
+    try:
+        # Imported lazily so the test suite does not require mlflow.
+        import mlflow
+        mlflow.set_tracking_uri(None)  # type: ignore
+    except ModuleNotFoundError:
+        # MLFlow not installed
+        pass
+@pytest.fixture(scope='session')
+def cleanup_dist():
+    """Ensure all dist tests clean up resources properly.
+    Session-scoped and not autouse: the barrier below only runs at teardown when
+    something requests this fixture.
+    """
+    yield
+    # Avoid race condition where a test is still writing to a file on one rank
+    # while the file system is being torn down on another rank.
+    dist.barrier()
+@pytest.fixture(autouse=True, scope='session')
+def configure_dist(request: pytest.FixtureRequest):
+    # Configure dist globally when the world size is greater than 1,
+    # so individual tests that do not use the trainer
+    # do not need to worry about manually configuring dist.
+    if dist.get_world_size() == 1:
+        return
+    device = None
+    for item in request.session.items:
+        device = DeviceCPU() if item.get_closest_marker('gpu') is None else DeviceGPU()
+        break
+    assert device is not None
+    if not dist.is_initialized():
+        dist.initialize_dist(device, timeout=300.0)
+    # Hold PyTest until all ranks have reached this barrier. Ensure that no rank starts
+    # any test before other ranks are ready to start it, which could be a cause of random timeouts
+    # (e.g. rank 1 starts the next test while rank 0 is finishing up the previous test).
+    dist.barrier()
+@pytest.fixture(autouse=True)
+def set_log_levels():
+    """Ensures all log levels are set to DEBUG."""
+    logging.basicConfig()
+    logging.getLogger(composer.__name__).setLevel(logging.DEBUG)
+@pytest.fixture(autouse=True)
+def seed_all(rank_zero_seed: int, monkeypatch: pytest.MonkeyPatch):
+    """Monkeypatch reproducibility.
+    Make get_random_seed to always return the rank zero seed, and set the random seed before each test to the rank local
+    seed.
+    """
+    monkeypatch.setattr(
+        reproducibility,
+        'get_random_seed',
+        lambda: rank_zero_seed,
+    )
+    reproducibility.seed_all(rank_zero_seed + dist.get_global_rank())
+@pytest.fixture(autouse=True)
+def remove_run_name_env_var():
+    # Remove environment variables for run names in unit tests
+    composer_run_name = os.environ.get('COMPOSER_RUN_NAME')
+    run_name = os.environ.get('RUN_NAME')
+    if 'COMPOSER_RUN_NAME' in os.environ:
+        del os.environ['COMPOSER_RUN_NAME']
+    if 'RUN_NAME' in os.environ:
+        del os.environ['RUN_NAME']
+    yield
+    if composer_run_name is not None:
+        os.environ['COMPOSER_RUN_NAME'] = composer_run_name
+    if run_name is not None:
+        os.environ['RUN_NAME'] = run_name

tests/fixtures/fixtures.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+from tests.conftest import _get_option
+@pytest.fixture
+def rank_zero_seed(pytestconfig: pytest.Config) -> int:
+    """Read the rank_zero_seed from the CLI option."""
+    seed = _get_option(pytestconfig, 'seed', default='0')
+    return int(seed)

tests/layers/architectures.py ADDED Viewed

	@@ -0,0 +1,53 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import torch
+import torch.nn.functional as F
+from megablocks.layers.arguments import Arguments
+class FFN(torch.nn.Module):
+    def __init__(self, args: Arguments):
+        super().__init__()
+        self.w1 = torch.nn.Parameter(
+            torch.empty(
+                args.hidden_size,
+                args.ffn_hidden_size,
+                device=args.device,
+                dtype=torch.float16 if args.fp16 else torch.float32,
+            ),
+        )
+        self.w2 = torch.nn.Parameter(
+            torch.empty(
+                args.ffn_hidden_size,
+                args.hidden_size,
+                device=args.device,
+                dtype=torch.float16 if args.fp16 else torch.float32,
+            ),
+        )
+    def forward(self, x):
+        return torch.matmul(
+            F.gelu(torch.matmul(x, self.w1), approximate='tanh'),
+            self.w2,
+        )
+class GLU(FFN):
+    def __init__(self, args: Arguments):
+        super().__init__(args)
+        self.v1 = torch.nn.Parameter(
+            torch.empty(
+                args.hidden_size,
+                args.ffn_hidden_size,
+                device=args.device,
+                dtype=torch.float16 if args.fp16 else torch.float32,
+            ),
+        )
+    def forward(self, x):
+        x1 = F.gelu(torch.matmul(x, self.w1), approximate='tanh') * torch.matmul(x, self.v1)
+        return torch.matmul(x1, self.w2)

tests/layers/moe_test.py ADDED Viewed

	@@ -0,0 +1,199 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from functools import partial
+import pytest
+import torch
+from megablocks.layers.arguments import Arguments
+from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
+from megablocks.layers.router import batched_router_zloss, clear_router_zloss
+from tests.layers.architectures import FFN
+_FORWARD_TESTS = (
+    (16, 1024, 512, 1, 1),
+    (16, 1024, 512, 2, 1),
+    (16, 1024, 512, 4, 1),
+    (16, 1024, 512, 8, 1),
+    (8, 2048, 512, 1, 1),
+    (8, 2048, 512, 2, 1),
+    (8, 2048, 512, 4, 1),
+    (16, 1024, 512, 2, 2),
+    (16, 1024, 512, 4, 2),
+    (16, 1024, 512, 4, 4),
+    (16, 1024, 512, 8, 2),
+    (16, 1024, 512, 8, 4),
+    (16, 1024, 512, 8, 8),
+)
+_DENSE_TESTS = (
+    (16, 1024, 512),
+    (8, 2048, 512),
+)
+def construct_moe(
+    hidden_size: int,
+    ffn_hidden_size: int,
+    moe_num_experts: int = 1,
+    moe_capacity_factor: int = 1,
+    moe_top_k: int = 1,
+    moe_zloss_weight: float = 0,
+):
+    # All tests are skipped if triton >=3.2.0 is installed since sparse is not supported
+    # TODO: Remove this once sparse is supported with triton >=3.2.0
+    try:
+        import triton
+        if triton.__version__ >= '3.2.0':
+            pytest.skip('Sparse MLP is not supported with triton >=3.2.0')
+    except ImportError:
+        pass
+    init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
+    args = Arguments(
+        hidden_size=hidden_size,
+        ffn_hidden_size=ffn_hidden_size,
+        moe_num_experts=moe_num_experts,
+        moe_capacity_factor=moe_capacity_factor,
+        moe_top_k=moe_top_k,
+        init_method=init_method,
+        moe_zloss_weight=moe_zloss_weight,
+    )
+    mlp = FFN(args)
+    moe_mlp = MoE(args)
+    mlp.cuda(torch.cuda.current_device()).half()
+    moe_mlp.cuda(torch.cuda.current_device()).half()
+    # Set the baseline parameters to match exactly.
+    if moe_num_experts == 1:
+        with torch.no_grad():
+            mlp.w1.copy_(moe_mlp.experts.mlp.w1.squeeze())
+            mlp.w2.copy_(moe_mlp.experts.mlp.w2.squeeze())
+    return args, mlp, moe_mlp
+@pytest.mark.gpu
+@pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
+def test_moe_forward(bs: int, sl: int, hs: int, num_experts: int, top_k: int):
+    x = torch.randn(sl, bs, hs).half().cuda()
+    _, _, layer = construct_moe(
+        hidden_size=hs,
+        ffn_hidden_size=hs * 2,
+        moe_num_experts=num_experts,
+        moe_top_k=top_k,
+    )
+    out, _ = layer(x)
+    assert out.shape == x.shape
+    clear_load_balancing_loss()
+@pytest.mark.gpu
+@pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
+def test_moe_forward_backward(
+    bs: int,
+    sl: int,
+    hs: int,
+    num_experts: int,
+    top_k: int,
+):
+    x = torch.randn(sl, bs, hs).half().cuda()
+    x.requires_grad_(True)
+    args, _, layer = construct_moe(
+        hidden_size=hs,
+        ffn_hidden_size=hs * 2,
+        moe_num_experts=num_experts,
+        moe_top_k=top_k,
+    )
+    out, _ = layer(x)
+    assert out.shape == x.shape
+    loss = out.sum() + batched_load_balancing_loss(args)
+    loss.backward()
+    layer.zero_grad(set_to_none=True)
+    x.grad = None
+    clear_load_balancing_loss()
+@pytest.mark.gpu
+@pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
+def test_moe_forward_backward_with_zloss(
+    bs: int,
+    sl: int,
+    hs: int,
+    num_experts: int,
+    top_k: int,
+):
+    x = torch.randn(sl, bs, hs).half().cuda()
+    x.requires_grad_(True)
+    args, _, layer = construct_moe(
+        hidden_size=hs,
+        ffn_hidden_size=hs * 2,
+        moe_num_experts=num_experts,
+        moe_top_k=top_k,
+        moe_zloss_weight=1e-3,
+    )
+    out, _ = layer(x)
+    assert out.shape == x.shape
+    loss = out.sum() + batched_load_balancing_loss(args)
+    loss.backward()
+    layer.zero_grad(set_to_none=True)
+    x.grad = None
+    clear_load_balancing_loss()
+    clear_router_zloss()
+@pytest.mark.gpu
+@pytest.mark.parametrize(('bs', 'sl', 'hs'), _DENSE_TESTS)
+def test_moe_forward_vs_dense(bs: int, sl: int, hs: int):
+    x = torch.randn(sl, bs, hs).half().cuda()
+    _, mlp, moe_mlp = construct_moe(hidden_size=hs, ffn_hidden_size=hs * 2)
+    expected_out = mlp(x)
+    out, _ = moe_mlp(x)
+    assert out.shape == x.shape == expected_out.shape
+    assert torch.allclose(out, expected_out)
+    clear_load_balancing_loss()
+@pytest.mark.gpu
+@pytest.mark.parametrize(('bs', 'sl', 'hs'), _DENSE_TESTS)
+def test_moe_forward_backward_vs_dense(bs: int, sl: int, hs: int):
+    x = torch.randn(sl, bs, hs).half().cuda()
+    x.requires_grad_(True)
+    _, mlp, moe_mlp = construct_moe(hidden_size=hs, ffn_hidden_size=hs * 2)
+    out, _ = moe_mlp(x)
+    loss = out.sum()
+    loss.backward()
+    w1_grad = moe_mlp.experts.mlp.w1.grad.detach().squeeze()
+    w2_grad = moe_mlp.experts.mlp.w2.grad.detach().squeeze()
+    moe_mlp.zero_grad(set_to_none=True)
+    x.grad = None
+    clear_load_balancing_loss()
+    expected_out = mlp(x)
+    expected_loss = expected_out.sum()
+    expected_loss.backward()
+    expected_w1_grad = mlp.w1.grad.detach()
+    expected_w2_grad = mlp.w2.grad.detach()
+    mlp.zero_grad(set_to_none=True)
+    x.grad = None
+    # Verify the gradients match.
+    assert w1_grad.shape == expected_w1_grad.shape
+    assert w2_grad.shape == expected_w2_grad.shape
+    assert torch.allclose(w1_grad, expected_w1_grad)
+    assert torch.allclose(w2_grad, expected_w2_grad)
+    clear_load_balancing_loss()

tests/ops/binned_gather_test.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import pytest
+import torch
+from megablocks import ops
+BINNED_GATHER_TESTS = (
+    (4, 2, 2, 1),
+    (4, 2, 2, 2),
+    (4, 2, 2, 4),
+    (1024, 1536, 4, 1),
+    (1024, 1536, 4, 2),
+    (1024, 1536, 4, 4),
+    (1024, 1536, 64, 1),
+    (1024, 1536, 64, 2),
+    (1024, 1536, 64, 4),
+    (1024, 1536, 128, 1),
+    (1024, 1536, 128, 2),
+    (1024, 1536, 128, 4),
+    (16384, 768, 4, 1),
+    (16384, 768, 4, 2),
+    (16384, 768, 4, 4),
+    (16384, 768, 64, 1),
+    (16384, 768, 64, 2),
+    (16384, 768, 64, 4),
+    (16384, 768, 128, 1),
+    (16384, 768, 128, 2),
+    (16384, 768, 128, 4),
+)
+@pytest.mark.gpu
+@pytest.mark.parametrize(('sl', 'hs', 'ne', 'top_k'), BINNED_GATHER_TESTS)
+def test_binned_gather(sl: int, hs: int, ne: int, top_k: int):
+    # NOTE: Capacity factor == 1.
+    ec = (sl * top_k) // ne
+    # Create the data and indices.
+    x = torch.randn((sl, hs)).cuda().half()
+    # Randomly assign tokens to experts.
+    top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+    _, indices = ops.sort(top_expert)
+    bins = ops.inclusive_cumsum(ops.histogram(top_expert, ne), 0)
+    def binned_gather(
+        x: torch.Tensor,
+        indices: torch.Tensor,
+        bins: torch.Tensor,
+        ec: int,
+        top_k: int,
+    ):
+        x = x.cpu().numpy()
+        indices = indices.cpu().numpy()
+        bins = bins.cpu().numpy()
+        start = 0
+        out = np.zeros((ne, ec, hs))
+        for i in range(ne):
+            end = bins[i]
+            for j in range(min(ec, end - start)):
+                index = indices[start + j] // top_k
+                out[i, j, :] = x[index, :]
+            start = end
+        return torch.from_numpy(out).cuda().half()
+    out = ops.binned_gather(x, indices, bins, ec, top_k)
+    expected_out = binned_gather(x, indices, bins, ec, top_k)
+    assert torch.all(torch.eq(out, expected_out))

tests/ops/binned_scatter_test.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import pytest
+import torch
+from megablocks import ops
+_BINNED_SCATTER_TESTS = (
+    (4, 2, 2, 1),
+    (4, 2, 2, 2),
+    (4, 2, 2, 4),
+    (1024, 1536, 4, 1),
+    (1024, 1536, 4, 2),
+    (1024, 1536, 4, 4),
+    (1024, 1536, 64, 1),
+    (1024, 1536, 64, 2),
+    (1024, 1536, 64, 4),
+    (1024, 1536, 128, 1),
+    (1024, 1536, 128, 2),
+    (1024, 1536, 128, 4),
+    (16384, 768, 4, 1),
+    (16384, 768, 4, 2),
+    (16384, 768, 4, 4),
+    (16384, 768, 64, 1),
+    (16384, 768, 64, 2),
+    (16384, 768, 64, 4),
+    (16384, 768, 128, 1),
+    (16384, 768, 128, 2),
+    (16384, 768, 128, 4),
+)
+@pytest.mark.gpu
+@pytest.mark.parametrize(('sl', 'hs', 'ne', 'top_k'), _BINNED_SCATTER_TESTS)
+def testBinnedScatter(sl: int, hs: int, ne: int, top_k: int):
+    # NOTE: Capacity factor == 1.
+    ec = (sl * top_k) // ne
+    # Create the data and indices.
+    x = torch.randn((sl, hs)).cuda().half()
+    # Randomly assign tokens to experts.
+    top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+    _, indices = ops.sort(top_expert)
+    bins = ops.inclusive_cumsum(ops.histogram(top_expert, ne), 0)
+    # Sample weights for the scatter reduce.
+    weights = torch.rand((sl * top_k,)).cuda().half()
+    x = ops.binned_gather(x, indices, bins, ec, top_k)
+    def binned_scatter(
+        x: torch.Tensor,
+        indices: torch.Tensor,
+        weights: torch.Tensor,
+        bins: torch.Tensor,
+        top_k: int,
+    ):
+        x = x.cpu().numpy()
+        indices = indices.cpu().numpy()
+        weights = weights.cpu().numpy()
+        bins = bins.cpu().numpy()
+        start = 0
+        out = np.zeros((sl, hs))
+        for i in range(ne):
+            end = bins[i]
+            for j in range(min(ec, end - start)):
+                index = indices[start + j]
+                scale = weights[index]
+                index //= top_k
+                out[index, :] += scale * x[i, j, :]
+            start = end
+        return torch.from_numpy(out).cuda().half()
+    out = ops.binned_scatter(x, indices, weights, bins, top_k)
+    expected_out = binned_scatter(x, indices, weights, bins, top_k)
+    # NOTE: We need to check approximate equality because the
+    # scatter reduce uses atomics.
+    assert np.testing.assert_allclose(
+        out.cpu(),
+        expected_out.cpu(),
+        rtol=5e-3,
+    ) is None

tests/ops/cumsum_test.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from megablocks import ops
+CUMSUM_TESTS = (
+    (1, 32),
+    (2, 32),
+    (2, 1024),
+    (4, 1024),
+    (8, 1024),
+    (16, 1024),
+    (32, 1024),
+    (64, 1024),
+    (128, 1024),
+    (2, 16384),
+    (4, 16384),
+    (8, 16384),
+    (16, 16384),
+    (32, 16384),
+    (64, 16384),
+    (128, 16384),
+)
+@pytest.mark.gpu
+@pytest.mark.parametrize(('n', 'm'), CUMSUM_TESTS)
+def test_exclusive_cumsum(n: int, m: int):
+    x = torch.randint(0, 2, (n, m)).long().cuda()
+    out = ops.exclusive_cumsum(x, 1) * x
+    expected_out = (torch.cumsum(x, dim=1) - 1) * x
+    assert torch.all(torch.eq(out, expected_out))
+@pytest.mark.gpu
+@pytest.mark.parametrize(('n', 'm'), CUMSUM_TESTS)
+def test_inclusive_cumsum(n: int, m: int):
+    x = torch.randint(0, 2, (n, m)).long().cuda()
+    out = ops.inclusive_cumsum(x, 1)
+    expected_out = torch.cumsum(x, dim=1)
+    assert torch.all(torch.eq(out, expected_out))

tests/ops/histogram_test.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from megablocks import ops
+_HISTOGRAM_TESTS = (
+    (1, 32, torch.int16, 128),
+    (1, 1024, torch.int16, 128),
+    (1, 16384, torch.int16, 128),
+    (1, 32, torch.int32, 128),
+    (1, 1024, torch.int32, 128),
+    (1, 16384, torch.int32, 128),
+    (1, 32, torch.int64, 128),
+    (1, 1024, torch.int64, 128),
+    (1, 16384, torch.int64, 128),
+    (1, 32, torch.int16, 1024),
+    (1, 1024, torch.int16, 1024),
+    (1, 16384, torch.int16, 1024),
+    (1, 32, torch.int32, 1024),
+    (1, 1024, torch.int32, 1024),
+    (1, 16384, torch.int32, 1024),
+    (1, 32, torch.int64, 1024),
+    (1, 1024, torch.int64, 1024),
+    (1, 16384, torch.int64, 1024),
+    (2, 32, torch.int16, 128),
+    (2, 1024, torch.int16, 128),
+    (2, 16384, torch.int16, 128),
+    (2, 32, torch.int32, 128),
+    (2, 1024, torch.int32, 128),
+    (2, 16384, torch.int32, 128),
+    (2, 32, torch.int64, 128),
+    (2, 1024, torch.int64, 128),
+    (2, 16384, torch.int64, 128),
+    (2, 32, torch.int16, 1024),
+    (2, 1024, torch.int16, 1024),
+    (2, 16384, torch.int16, 1024),
+    (2, 32, torch.int32, 1024),
+    (2, 1024, torch.int32, 1024),
+    (2, 16384, torch.int32, 1024),
+    (2, 32, torch.int64, 1024),
+    (2, 1024, torch.int64, 1024),
+    (2, 16384, torch.int64, 1024),
+    (8, 32, torch.int16, 128),
+    (8, 1024, torch.int16, 128),
+    (8, 16384, torch.int16, 128),
+    (8, 32, torch.int32, 128),
+    (8, 1024, torch.int32, 128),
+    (8, 16384, torch.int32, 128),
+    (8, 32, torch.int64, 128),
+    (8, 1024, torch.int64, 128),
+    (8, 16384, torch.int64, 128),
+    (8, 32, torch.int16, 1024),
+    (8, 1024, torch.int16, 1024),
+    (8, 16384, torch.int16, 1024),
+    (8, 32, torch.int32, 1024),
+    (8, 1024, torch.int32, 1024),
+    (8, 16384, torch.int32, 1024),
+    (8, 32, torch.int64, 1024),
+    (8, 1024, torch.int64, 1024),
+    (8, 16384, torch.int64, 1024),
+)
+# Override the seed_all fixture in autouse.py because
+# _histc_cuda does not have a deterministic implementation
+@pytest.fixture()
+def seed_all():
+    torch.use_deterministic_algorithms(False)
+    return
+@pytest.mark.gpu
+@pytest.mark.parametrize(('m', 'n', 'dtype', 'max_val'), _HISTOGRAM_TESTS)
+def test_histogram(m: int, n: int, dtype: torch.dtype, max_val: int):
+    x = torch.randint(0, max_val, (m, n)).cuda().to(dtype)
+    out = ops.histogram(x, max_val)
+    expected_out = torch.stack([torch.histc(y, max_val, 0, max_val - 1) for y in torch.split(x, 1)])
+    assert torch.all(torch.eq(out, expected_out))

tests/ops/padded_gather_test.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import pytest
+import torch
+from megablocks import ops
+PADDED_GATHER_TESTS = (
+    (4, 2, 2, 1),
+    (4, 2, 2, 2),
+    (1024, 1, 4, 1),
+    (1024, 1, 4, 2),
+    (1024, 1, 4, 4),
+    (1024, 1, 64, 1),
+    (1024, 1, 64, 2),
+    (1024, 1, 64, 4),
+    (1024, 1, 128, 1),
+    (1024, 1, 128, 2),
+    (1024, 1, 128, 4),
+    (1024, 1536, 4, 1),
+    (1024, 1536, 4, 2),
+    (1024, 1536, 4, 4),
+    (1024, 1536, 64, 1),
+    (1024, 1536, 64, 2),
+    (1024, 1536, 64, 4),
+    (1024, 1536, 128, 1),
+    (1024, 1536, 128, 2),
+    (1024, 1536, 128, 4),
+    (16384, 768, 4, 1),
+    (16384, 768, 4, 2),
+    (16384, 768, 4, 4),
+    (16384, 768, 64, 1),
+    (16384, 768, 64, 2),
+    (16384, 768, 64, 4),
+    (16384, 768, 128, 1),
+    (16384, 768, 128, 2),
+    (16384, 768, 128, 4),
+    (16384, 1, 4, 1),
+    (16384, 1, 4, 2),
+    (16384, 1, 4, 4),
+    (16384, 1, 64, 1),
+    (16384, 1, 64, 2),
+    (16384, 1, 64, 4),
+    (16384, 1, 128, 1),
+    (16384, 1, 128, 2),
+    (16384, 1, 128, 4),
+)
+@pytest.mark.gpu
+@pytest.mark.parametrize(('sl', 'hs', 'ne', 'top_k'), PADDED_GATHER_TESTS)
+def testPaddedGather(sl: int, hs: int, ne: int, top_k: int):
+    # Create the data and indices.
+    x = torch.randn((sl, hs)).cuda().half()
+    # Randomly assign tokens to experts.
+    top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+    bin_ids, indices = ops.sort(top_expert)
+    tokens_per_expert = ops.histogram(top_expert, ne)
+    padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+    padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+    bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+    def padded_gather(
+        x: torch.Tensor,
+        indices: torch.Tensor,
+        bin_ids: torch.Tensor,
+        bins: torch.Tensor,
+        padded_bins: torch.Tensor,
+        top_k: int,
+    ):
+        x = x.cpu().numpy()
+        indices = indices.cpu().numpy()
+        bin_ids = bin_ids.cpu().numpy()
+        bins = bins.cpu().numpy()
+        padded_bins = padded_bins.cpu().numpy()
+        out = np.zeros((padded_bins[-1], hs))
+        in_idx = 0
+        for i, end in enumerate(bins):
+            out_idx = 0 if i == 0 else padded_bins[i - 1]
+            end = bins[i]
+            while in_idx < end:
+                load_idx = indices[in_idx] // top_k
+                out[out_idx, :] = x[load_idx, :]
+                in_idx += 1
+                out_idx += 1
+        return torch.from_numpy(out).cuda().half()
+    out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+    expected_out = padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+    assert torch.all(torch.eq(out, expected_out))

tests/ops/padded_scatter_test.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import pytest
+import torch
+from megablocks import ops
+PADDED_SCATTER_TESTS = [
+    (4, 2, 2, 2),
+    (4, 2, 2, 1),
+    (4, 2, 2, 1),
+    (4, 2, 2, 1),
+    (4, 2, 2, 2),
+    (4, 2, 2, 2),
+    (1024, 1, 4, 1),
+    (1024, 1, 4, 2),
+    (1024, 1, 4, 4),
+    (1024, 1, 4, 1),
+    (1024, 1, 4, 2),
+    (1024, 1, 4, 4),
+    (1024, 1, 4, 1),
+    (1024, 1, 4, 2),
+    (1024, 1, 4, 4),
+    (1024, 1, 64, 1),
+    (1024, 1, 64, 2),
+    (1024, 1, 64, 4),
+    (1024, 1, 128, 1),
+    (1024, 1, 128, 2),
+    (1024, 1, 128, 4),
+    (1024, 1536, 4, 1),
+    (1024, 1536, 4, 2),
+    (1024, 1536, 4, 4),
+    (1024, 1536, 4, 4),
+    (1024, 1536, 4, 4),
+    (1024, 1536, 64, 1),
+    (1024, 1536, 64, 2),
+    (1024, 1536, 64, 4),
+    (1024, 1536, 128, 1),
+    (1024, 1536, 128, 2),
+    (1024, 1536, 128, 4),
+    (1024, 1536, 128, 1),
+    (1024, 1536, 128, 1),
+    (16384, 768, 4, 1),
+    (16384, 768, 4, 2),
+    (16384, 768, 4, 4),
+    (16384, 768, 64, 1),
+    (16384, 768, 64, 2),
+    (16384, 768, 64, 4),
+    (16384, 768, 128, 1),
+    (16384, 768, 128, 2),
+    (16384, 768, 128, 4),
+    (16384, 1, 4, 1),
+    (16384, 1, 4, 2),
+    (16384, 1, 4, 4),
+    (16384, 1, 64, 1),
+    (16384, 1, 64, 2),
+    (16384, 1, 64, 4),
+    (16384, 1, 128, 1),
+    (16384, 1, 128, 2),
+    (16384, 1, 128, 4),
+    (16384, 1, 128, 2),
+    (16384, 1, 128, 2),
+]
+def _to_numpy(x: torch.Tensor) -> np.ndarray:
+    return x.detach().cpu().numpy()
+@pytest.mark.gpu
+@pytest.mark.parametrize((
+    'sl',
+    'hs',
+    'ne',
+    'top_k',
+), PADDED_SCATTER_TESTS)
+def testPaddedScatter(sl: int, hs: int, ne: int, top_k: int):
+    """Compare ops.padded_scatter with a NumPy reference loop and run one backward pass.
+
+    Args:
+        sl: Number of rows in the random input activations.
+        hs: Number of columns in the random input activations.
+        ne: Number of experts that assignments are drawn from.
+        top_k: Number of expert assignments drawn per input row.
+    """
+    # Create the data and indices.
+    x = torch.randn((sl, hs), requires_grad=True).cuda().half()
+    # Randomly assign tokens to experts.
+    top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
+    bin_ids, indices = ops.sort(top_expert)
+    tokens_per_expert = ops.histogram(top_expert, ne)
+    # Per-expert counts rounded up to a multiple of 128, then turned into
+    # cumulative end offsets (padded and unpadded).
+    padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
+    padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+    bins = ops.inclusive_cumsum(tokens_per_expert, 0)
+    # Sample weights for the scatter reduce.
+    weights = torch.rand((sl * top_k,), requires_grad=True).cuda().half()
+    # Gather the data to prepare for backwards.
+    # From here on `x` is the gathered (padded) tensor, not the raw input.
+    x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
+    def padded_scatter(
+        x: torch.Tensor,
+        indices: torch.Tensor,
+        bin_ids: torch.Tensor,
+        weights: torch.Tensor,
+        bins: torch.Tensor,
+        padded_bins: torch.Tensor,
+        top_k: int,
+    ):
+        """NumPy reference: weighted scatter-add of the gathered rows back to one row per token."""
+        x = x.detach().cpu().numpy()
+        indices: np.ndarray = _to_numpy(indices)
+        # NOTE: `bin_ids` is converted here but never read by the loop below.
+        bin_ids: np.ndarray = _to_numpy(bin_ids)
+        weights: np.ndarray = _to_numpy(weights)
+        bins: np.ndarray = _to_numpy(bins)
+        padded_bins: np.ndarray = _to_numpy(padded_bins)
+        # `hs` is taken from the enclosing test; top_k entries of `indices` map to one output row.
+        out = np.zeros((indices.shape[0] // top_k, hs))
+        # `out_idx` walks the unpadded positions and carries over from one bin to the next.
+        out_idx = 0
+        for i in range(len(bins)):
+            # Reading restarts at the padded end of the previous bin (0 for the first bin).
+            in_idx = 0 if i == 0 else padded_bins[i - 1]
+            end = bins[i]
+            while out_idx < end:
+                store_idx = indices[out_idx]
+                # The weight is looked up with the index before it is divided by top_k.
+                scale = weights[store_idx]
+                # Integer-divide so the top_k copies of a token accumulate into the same row.
+                store_idx //= top_k
+                out[store_idx, :] += scale * x[in_idx, :]
+                out_idx += 1
+                in_idx += 1
+        return torch.from_numpy(out).cuda().half()
+    out = ops.padded_scatter(
+        x,
+        indices,
+        bin_ids,
+        weights,
+        bins,
+        padded_bins,
+        top_k,
+    )
+    expected_out = padded_scatter(
+        x,
+        indices,
+        bin_ids,
+        weights,
+        bins,
+        padded_bins,
+        top_k,
+    )
+    out.backward(torch.randn_like(out))  # sanity check backward pass
+    # NOTE: We need to check approximate equality because the scatter reduce uses atomics.
+    # np.testing.assert_allclose returns `None` if no error and raises an AssertionError if an error exists
+    assert np.testing.assert_allclose(
+        _to_numpy(out),
+        _to_numpy(expected_out),
+        rtol=5e-3,
+    ) is None

tests/ops/replicate_test.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import pytest
+import torch
+try:
+    from megablocks._ops import ops as backend  # type: ignore
+except ModuleNotFoundError as e:
+    raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
+from megablocks import ops
+def promote_scalar(x: torch.Tensor) -> torch.Tensor:
+    """Return a zero-dimensional tensor viewed as shape (1,); return any other tensor unchanged."""
+    return x.view(1) if not len(x.size()) else x
+# Each case is (tokens, num_centers, top_k), in the order the replicate tests
+# name them in their parametrize decorators.
+REPLICATE_TESTS = [
+    (8, 1, 1),
+    (8, 2, 1),
+    (8, 4, 1),
+    (8, 8, 1),
+    (8, 2, 2),
+    (8, 4, 2),
+    (8, 8, 2),
+    (8, 2, 4),
+    (8, 4, 4),
+    (8, 8, 4),
+    (8, 2, 8),
+    (8, 4, 8),
+    (8, 8, 8),
+    (16384, 2, 1),
+    (16384, 4, 1),
+    (16384, 8, 1),
+    (16384, 16, 1),
+    (16384, 32, 1),
+    (16384, 64, 1),
+    (16384, 128, 1),
+    (16384, 2, 2),
+    (16384, 4, 2),
+    (16384, 8, 2),
+    (16384, 16, 2),
+    (16384, 32, 2),
+    (16384, 64, 2),
+    (16384, 128, 2),
+    (16384, 2, 4),
+    (16384, 4, 4),
+    (16384, 8, 4),
+    (16384, 16, 4),
+    (16384, 32, 4),
+    (16384, 64, 4),
+    (16384, 128, 4),
+    (16384, 2, 8),
+    (16384, 4, 8),
+    (16384, 8, 8),
+    (16384, 16, 8),
+    (16384, 32, 8),
+    (16384, 64, 8),
+    (16384, 128, 8),
+]
+@pytest.mark.gpu
+@pytest.mark.parametrize(("tokens", "num_centers", "top_k"), REPLICATE_TESTS)
+def test_replicate(tokens: int, num_centers: int, top_k: int):
+    """Check that ops.replicate matches a NumPy reference exactly.
+
+    Args:
+        tokens: Number of random center assignments; also the output width requested.
+        num_centers: Number of centers (columns of the weight matrix).
+        top_k: Number of rows in the weight matrix.
+    """
+    tokens_to_centers = torch.randint(0, num_centers, (tokens,)).cuda().int()
+    tokens_per_center = ops.histogram(tokens_to_centers, num_centers)
+    bins = ops.inclusive_cumsum(tokens_per_center, 0)
+    # Guard against a zero-dimensional result so `bins` can be iterated below.
+    bins = promote_scalar(bins)
+    center_weights = torch.randn(top_k, num_centers).cuda().half()
+    def replicate(x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
+        """NumPy reference: copy x[row, i] into every output column that falls in bin i."""
+        x = x.cpu().numpy()
+        bins = bins.cpu().numpy()
+        out = np.zeros((x.shape[0], num_outputs))
+        for batch_idx in range(x.shape[0]):
+            # `start` carries across bins, so bin i covers columns [bins[i - 1], bins[i]).
+            start = 0
+            for i, end in enumerate(bins):
+                value = x[batch_idx, i]
+                while start < end:
+                    out[batch_idx, start] = value
+                    start += 1
+        return torch.from_numpy(out).cuda().half()
+    out = ops.replicate(center_weights, bins, tokens)
+    expected_out = replicate(center_weights, bins, tokens)
+    # Exact comparison: the reference only copies values, it performs no arithmetic.
+    assert torch.all(torch.eq(out, expected_out))
+@pytest.mark.gpu
+@pytest.mark.parametrize(("tokens", "num_centers", "top_k"), REPLICATE_TESTS)
+def test_replicate_backward(tokens: int, num_centers: int, top_k: int):
+    """Check backend.replicate_backward against weights scaled by per-center token counts.
+
+    Args:
+        tokens: Number of random center assignments; also the width of the replicated tensor.
+        num_centers: Number of centers (columns of the weight matrix).
+        top_k: Number of rows in the weight matrix.
+    """
+    tokens_to_centers = torch.randint(0, num_centers, (tokens,)).cuda().int()
+    tokens_per_center = ops.histogram(tokens_to_centers, num_centers)
+    bins = ops.inclusive_cumsum(tokens_per_center, 0)
+    bins = promote_scalar(bins)
+    center_weights = torch.randn(top_k, num_centers).cuda().half()
+    # The replicated weights are used as the incoming gradient.
+    grad = ops.replicate(center_weights, bins, tokens)
+    # `out` is handed to the backend call as its third argument and compared afterwards.
+    out = torch.empty_like(center_weights)
+    backend.replicate_backward(grad, bins, out)
+    # Expected value: each weight multiplied by its center's token count
+    # (presumably because the backward pass sums `grad` over each bin — confirm against the kernel).
+    expected_out = center_weights * tokens_per_center.view([1, num_centers])
+    # NOTE: This floating-point reduction could be a problem for training stability and accuracy.
+    assert torch.allclose(out, expected_out, rtol=1e-2)

tests/ops/sort_test.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+from typing import Dict, Optional, Union
+import numpy as np
+import pytest
+import torch
+from megablocks import ops
+# Each case is (n, dtype, max_val). A max_val of None makes test_sort use the
+# largest value representable in `dtype`.
+SORT_TESTS = [
+    (32, torch.int16, None),
+    (1024, torch.int16, None),
+    (16384, torch.int16, None),
+    (32, torch.int32, None),
+    (1024, torch.int32, None),
+    (16384, torch.int32, None),
+    (32, torch.int64, None),
+    (1024, torch.int64, None),
+    (16384, torch.int64, None),
+    (32, torch.int16, 128),
+    (1024, torch.int16, 128),
+    (16384, torch.int16, 128),
+    (32, torch.int32, 128),
+    (1024, torch.int32, 128),
+    (16384, torch.int32, 128),
+    (32, torch.int64, 128),
+    (1024, torch.int64, 128),
+    (16384, torch.int64, 128),
+]
+def torch_to_numpy_dtype(dtype: torch.dtype,) -> Union[np.int16, np.int32, np.int64]:
+    """Map torch.int16/int32/int64 to the matching NumPy integer type.
+
+    Raises:
+        KeyError: If ``dtype`` is not one of the three supported torch dtypes.
+    """
+    # NOTE(review): the annotations name NumPy scalar types, but the mapping
+    # stores (and the function returns) the classes themselves.
+    types: Dict[torch.dtype, Union[np.int16, np.int32, np.int64]] = {
+        torch.int16: np.int16,
+        torch.int32: np.int32,
+        torch.int64: np.int64,
+    }
+    return types[dtype]
+@pytest.mark.gpu
+@pytest.mark.parametrize(
+    ('n', 'dtype', 'max_val'),
+    SORT_TESTS,
+)
+def test_sort(n: int, dtype: torch.dtype, max_val: Optional[int]):
+    """Check ops.sort against torch.sort on random integers.
+
+    Args:
+        n: Number of elements to sort.
+        dtype: Integer dtype of the input tensor.
+        max_val: Exclusive upper bound for the random values; None selects the dtype's maximum.
+    """
+    if max_val is None:
+        max_val = np.iinfo(torch_to_numpy_dtype(dtype)).max
+    # ceil(log2(max_val)), passed to ops.sort as its second argument
+    # (presumably the number of key bits to sort on — confirm against ops.sort).
+    end_bit = int(np.ceil(np.log2(max_val)))
+    x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
+    out, indices = ops.sort(x, end_bit)
+    expected_out, expected_indices = torch.sort(x)
+    assert torch.all(torch.eq(out, expected_out))
+    # NOTE: The indices can be in different order depending
+    # on sort stability if multiple values in the array are
+    # equal.
+    # Instead of comparing indices directly, scatter each sorted value back to
+    # the position its index names and compare the reconstructed tensors.
+    data = torch.empty_like(x)
+    data.scatter_(0, indices.long(), out)
+    expected_data = torch.empty_like(x)
+    expected_data.scatter_(0, expected_indices, expected_out)
+    assert torch.all(torch.eq(data, expected_data))

tests/ops/topology_test.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# Copyright 2024 Databricks
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import pytest
+import torch
+from megablocks import ops
+# Each case is (sl, hs, ne), in the order test_topology names them in its
+# parametrize decorator.
+TOPOLOGY_TESTS = (
+    (1024, 1536, 2),
+    (1024, 1536, 4),
+    (1024, 1536, 8),
+    (1024, 1536, 16),
+    (1024, 1536, 32),
+    (1024, 1536, 64),
+    (1024, 1536, 128),
+    (1024, 1536, 256),
+    (1024, 1536, 512),
+    (16384, 768, 2),
+    (16384, 768, 4),
+    (16384, 768, 8),
+    (16384, 768, 16),
+    (16384, 768, 32),
+    (16384, 768, 64),
+    (16384, 768, 128),
+    (16384, 768, 256),
+    (16384, 768, 512),
+    (16384, 768, 1024),
+    (8, 14336, 8),
+)
+@pytest.mark.gpu
+@pytest.mark.parametrize(('sl', 'hs', 'ne'), TOPOLOGY_TESTS)
+def test_topology(sl: int, hs: int, ne: int):
+    """Check ops.topology against a NumPy reference for exact equality.
+
+    Args:
+        sl: Number of random expert assignments.
+        hs: Hidden size; must be a multiple of the block size (128).
+        ne: Number of experts that assignments are drawn from.
+    """
+    # Create the data and indices.
+    blocking = 128
+    assert hs % blocking == 0
+    # Randomly assign tokens to experts.
+    top_expert = torch.randint(0, ne, (sl,)).cuda().int()
+    tokens_per_expert = ops.histogram(top_expert, ne)
+    padded_tokens_per_expert = ops.round_up(tokens_per_expert, blocking)
+    padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
+    # Dimensions for the output indices.
+    # The last cumulative entry is the total padded count, so this is the number of block rows.
+    output_block_rows = int(padded_bins[-1]) // blocking
+    output_block_columns = hs // blocking
+    # NOTE(review): `blocking` is annotated as torch.Tensor below, but the
+    # caller passes the Python int defined above.
+    def topology(
+        padded_bins: torch.Tensor,
+        blocking: torch.Tensor,
+        rows: int,
+        columns: int,
+    ):
+        """NumPy reference: for every block row of expert i, emit the values i * columns + j."""
+        padded_bins = padded_bins.cpu().numpy()
+        out = np.zeros([rows * columns])
+        # `start` is the current block row and carries over from one expert to the next.
+        start = 0
+        for i in range(padded_bins.shape[0]):
+            # Cumulative padded count converted to a block-row end index.
+            end = padded_bins[i] // blocking
+            while start < end:
+                for j in range(columns):
+                    out[start * columns + j] = j + i * columns
+                start += 1
+        # Cast to int16 before comparing with the op's output.
+        return torch.from_numpy(out).cuda().short()
+    out = ops.topology(
+        padded_bins,
+        blocking,
+        output_block_rows,
+        output_block_columns,
+    )
+    expected_out = topology(
+        padded_bins,
+        blocking,
+        output_block_rows,
+        output_block_columns,
+    )
+    assert torch.all(torch.eq(out, expected_out))

tests/test_mb_moe.py CHANGED Viewed

@@ -1,6 +1,48 @@
 import megablocks
 def test_import():
     """Simple test to check if the module can be imported."""
     print("megablocks_moe module imported successfully.")
     print("Available functions:", dir(megablocks))

+import torch
 import megablocks
 def test_import():
     """Simple test to check if the module can be imported."""
     print("megablocks_moe module imported successfully.")
     print("Available functions:", dir(megablocks))
+    # Attribute names that must be present in dir(megablocks).
+    expected_functions = [
+        "Arguments", "MLP", "MoE", "ParallelDroplessMLP", "ParallelMLP",
+        "SparseGLU", "SparseMLP", "argsort",
+        "backend", "cumsum", "dMoE", "exclusive_cumsum",
+        "get_load_balancing_loss", "grouped_gemm_util", "histogram",
+        "inclusive_cumsum", "indices", "layers", "ops", "replicate_backward",
+        "replicate_forward", "sort", "torch"
+    ]
+    # Check if all expected functions are available
+    for func in expected_functions:
+        assert func in dir(megablocks), f"Missing function: {func}"
+# exclusive_cumsum
+def test_exclusive_cumsum():
+    """Test exclusive cumulative sum."""
+    x = torch.tensor([1, 2, 3, 4], dtype=torch.int16).cuda()
+    # `out` takes x's dtype (int16) and is passed to the wrapper as the output tensor.
+    out = torch.empty_like(x)
+    megablocks.exclusive_cumsum(x, 0, out)
+    # NOTE(review): `expected` is float32 while `out` is int16, so this check
+    # relies on torch.equal accepting mismatched dtypes here — confirm, or
+    # build `expected` with dtype=torch.int16.
+    expected = torch.tensor([0, 1, 3, 6], dtype=torch.float32).cuda()
+    assert torch.equal(out, expected), f"Expected {expected}, got {out}"
+    print("cumsum output:", out)
+# inclusive_cumsum
+def test_inclusive_cumsum():
+    """Test inclusive cumulative sum."""
+    x = torch.tensor([1, 2, 3, 4], dtype=torch.int16).cuda()
+    # `out` takes x's dtype (int16) and is passed to the wrapper as the output tensor.
+    out = torch.empty_like(x)
+    megablocks.inclusive_cumsum(x, dim=0, out=out)
+    # NOTE(review): `expected` is float32 while `out` is int16, so this check
+    # relies on torch.equal accepting mismatched dtypes here — confirm, or
+    # build `expected` with dtype=torch.int16.
+    expected = torch.tensor([1, 3, 6, 10], dtype=torch.float32).cuda()
+    assert torch.equal(out, expected), f"Expected {expected}, got {out}"
+# histogram
+def test_histogram():
+    """Test histogram operation."""
+    # One 0, two 1s and three 2s, so the expected per-bin counts are [1, 2, 3].
+    x = torch.tensor([0, 1, 1, 2, 2, 2], dtype=torch.int16).cuda()
+    num_bins = 3
+    hist = megablocks.histogram(x, num_bins)
+    # The input is int16 while the expected counts are built as int32.
+    expected_hist = torch.tensor([1, 2, 3], dtype=torch.int32).cuda()
+    assert torch.equal(hist, expected_hist), f"Expected {expected_hist}, got {hist}"

torch-ext/megablocks/__init__.py CHANGED Viewed

@@ -24,7 +24,9 @@ def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tens
     Returns:
         The output tensor
     """
-    return ops.exclusive_cumsum(x, dim, out)
 def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
@@ -39,7 +41,9 @@ def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tens
     Returns:
         The output tensor
     """
-    return ops.inclusive_cumsum(x, dim, out)
 def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:

     Returns:
         The output tensor
     """
+    result = ops.exclusive_cumsum(x, dim)
+    out.copy_(result)
+    return out
 def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
     Returns:
         The output tensor
     """
+    result = ops.inclusive_cumsum(x, dim)
+    out.copy_(result)
+    return out
 def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:

torch-ext/megablocks/ops/cumsum.py CHANGED Viewed

@@ -11,7 +11,7 @@ import torch
 # instructions for building the c++ operations.
 try:
     # import megablocks_ops as ops  # type: ignore
-    import megablocks._ops as ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

 # instructions for building the c++ operations.
 try:
     # import megablocks_ops as ops  # type: ignore
+    from megablocks._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

torch-ext/megablocks/ops/histogram.py CHANGED Viewed

@@ -10,7 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    import megablocks._ops as ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
+    from megablocks._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

torch-ext/megablocks/ops/replicate.py CHANGED Viewed

@@ -10,8 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    # import megablocks_ops as ops  # type: ignore
-    import megablocks._ops as ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
+    from megablocks._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

torch-ext/megablocks/ops/sort.py CHANGED Viewed

@@ -10,8 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    # import megablocks_ops as ops  # type: ignore
-    import megablocks._ops as ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
+    from megablocks._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

torch-ext/megablocks/ops/topology.py CHANGED Viewed

@@ -10,8 +10,7 @@ import torch
 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
-    # import megablocks_ops as ops  # type: ignore
-    import megablocks._ops as ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

 # Wrap this in a try-block with better error message and
 # instructions for building the c++ operations.
 try:
+    from megablocks._ops import ops  # type: ignore
 except ModuleNotFoundError as e:
     raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e

torch-ext/torch_binding.cpp CHANGED Viewed

@@ -34,22 +34,22 @@ torch::Tensor histogram_wrapper(torch::Tensor x, int64_t num_bins) {
 torch::Tensor indices_wrapper(torch::Tensor padded_bins,
                                int64_t block_size,
                                int64_t output_block_rows,
-                               int64_t output_block_columns) {
-  torch::Tensor out = torch::empty({output_block_rows * output_block_columns}, torch::kInt16);
   megablocks::indices(padded_bins, block_size, output_block_rows, output_block_columns, out);
   return out;
 }
-// // // Forward pass: replicate values from x according to bin sizes
-// // void replicate_forward(torch::Tensor x,
-// //   torch::Tensor bins,
-// //   torch::Tensor out);
-// tensor::Tensor replicate_forward_wrapper(torch::Tensor x, torch::Tensor bins, torch::Tensor out) {
-//   megablocks::replicate_forward(x, bins, out);
-//   return out;
-// }
 // // Backward pass: reduce gradients back to bins using segmented reduction
 // void replicate_backward(torch::Tensor grad,
@@ -90,11 +90,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("histogram(Tensor x, int num_bins) -> Tensor");
   ops.impl("histogram", torch::kCUDA, &histogram_wrapper);
-  ops.def("indices(Tensor padded_bins, int block_size, int output_block_rows, int output_block_columns) -> Tensor");
   ops.impl("indices", torch::kCUDA, &indices_wrapper);
-  // ops.def("replicate_forward(Tensor x, Tensor bins, Tensor(a!) out) -> Tensor(a!)");
-  // ops.impl("replicate_forward", torch::kCUDA, &replicate_forward_wrapper);
   ops.def("replicate_backward(Tensor grad, Tensor bins, Tensor(a!) out) -> Tensor(a!)");
   ops.impl("replicate_backward", torch::kCUDA, &replicate_backward_wrapper);

 torch::Tensor indices_wrapper(torch::Tensor padded_bins,
                                int64_t block_size,
                                int64_t output_block_rows,
+                               int64_t output_block_columns,
+                               torch::Tensor out) {
   megablocks::indices(padded_bins, block_size, output_block_rows, output_block_columns, out);
   return out;
 }
+// Forward pass: replicate values from x according to bin sizes
+// void replicate_forward(torch::Tensor x,
+//   torch::Tensor bins,
+//   torch::Tensor out);
+torch::Tensor replicate_forward_wrapper(torch::Tensor x, torch::Tensor bins, torch::Tensor out) {
+  megablocks::replicate_forward(x, bins, out);
+  return out;
+}
 // // Backward pass: reduce gradients back to bins using segmented reduction
 // void replicate_backward(torch::Tensor grad,
   ops.def("histogram(Tensor x, int num_bins) -> Tensor");
   ops.impl("histogram", torch::kCUDA, &histogram_wrapper);
+  ops.def("indices(Tensor padded_bins, int block_size, int output_block_rows, int output_block_columns, Tensor(a!) out) -> Tensor(a!)");
   ops.impl("indices", torch::kCUDA, &indices_wrapper);
+  ops.def("replicate_forward(Tensor x, Tensor bins, Tensor(a!) out) -> Tensor(a!)");
+  ops.impl("replicate_forward", torch::kCUDA, &replicate_forward_wrapper);
   ops.def("replicate_backward(Tensor grad, Tensor bins, Tensor(a!) out) -> Tensor(a!)");
   ops.impl("replicate_backward", torch::kCUDA, &replicate_backward_wrapper);