iamwyldecat committed on Jun 15

Commit

8535e80

1 Parent(s): cf531ba

chore: initial commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +3 -0
README.md +33 -0
build.toml +23 -0
build/torch26-cxx11-cu118-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +458 -0
build/torch26-cxx11-cu124-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +458 -0
build/torch26-cxx11-cu126-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +458 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so +3 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so +3 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +458 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +458 -0
build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +458 -0
build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py +5 -0
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +9 -0
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +458 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py +5 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +9 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +458 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py +5 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +9 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +3 -0
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +458 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,33 @@

+---
+tags:
+- kernel
+---
+# Optimizer
+Optimizer is a Python package that provides:
+- PyTorch implementations of recent optimizer algorithms
+- support for parallelism techniques for efficient large-scale training.
+### Currently implemented
+- [Parallel Muon with FSDP2](./docs/muon/parallel_muon.pdf)
+## Usage
+```python
+import torch
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from kernels import get_kernel
+optimizer = get_kernel("motif-technologies/optimizer")
+model = None # your model here
+fsdp_model = FSDP(model)
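+# torch.distributed must be initialized before Muon is constructed.
+optim = optimizer.Muon(
+    fsdp_model,
+    # Called as is_muon_func(param, name): return True for the 2D weights that Muon
+    # should update; every other parameter is updated by the built-in AdamW.
+    is_muon_func=lambda p, name: p.ndim == 2,
+    lr=0.01,
+    momentum=0.9,
+    adamw_wd=1e-4,
+)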
+```

build.toml ADDED Viewed

	@@ -0,0 +1,23 @@

+[general]
+name = "optimizer"
+universal = false
+[torch]
+src = [
+  "torch-ext/torch_binding.cpp",
+  "torch-ext/torch_binding.h",
+]
+[kernel.activation]
+backend = "rocm"
+src = [
+  "optimizer/dummy.cu",
+]
+depends = [ "torch" ]
+[kernel.activation_cuda]
+backend = "cuda"
+src = [
+  "optimizer/dummy.cu",
+]
+depends = [ "torch" ]

build/torch26-cxx11-cu118-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with the namespace of this build's compiled ops.
+    """
+    qualified_name = f"_optimizer_b4b3752_dirty::{op_name}"
+    return qualified_name

build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66ca698639fff584999fe65f8f10cc4436c197829e936be2741bf53db685caa0
+size 1787272

build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8325d12959ef4f31b6c6340eca29176f5077abeaa10f3a6651db55ccf3c634f
+size 1787272

build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2D matrix G with `steps` quintic Newton-Schulz iterations.
+    The coefficients are selected to maximize the slope at zero rather than to converge all the
+    way to one, so the result is not UV^T (where USV^T = G is the SVD) but something like US'V^T
+    with S' diagonal and S_{ii}' ~ Uniform(0.5, 1.5), which empirically does not hurt model
+    performance relative to UV^T.
+    The iteration runs in bfloat16 and the result is cast back to G's dtype. A tall input is
+    transposed first, so the Gram matrix X @ X.T is built over the smaller dimension, and
+    transposed back before returning.
+    """
+    assert G.ndim == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    is_tall = G.size(0) > G.size(1)
+    # No manual typecast before the normalization.
+    X = G.T if is_tall else G
+    # Dividing by the norm ensures the spectral norm is at most 1.
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=c, beta=b)
+        # X <- a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=a)
+    if is_tall:
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    """Per-parameter scratch state for one gather -> compute -> scatter round of parallel Muon."""
+    # TODO: use Optional
+    # Rank that receives this parameter's gathered gradient and computes its update.
+    worker_rank: int | None = None
+    # Concatenation of the gathered gradient shards; set on the worker rank, None elsewhere.
+    gathered_grad: torch.Tensor | None = None
+    # Newton-Schulz output for `gathered_grad`; set on the worker rank, None elsewhere.
+    computed_u: torch.Tensor | None = None
+    # This rank's shard of the update after the scatter, wrapped as a DTensor.
+    scattered_u: torch.Tensor | None = None
+    # CUDA event recorded after the gather; the compute step waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # CUDA event recorded after the compute; the scatter step waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Gather the local gradient shards of `p` onto the worker rank stored in `state`.
+    Every rank of the gradient's device-mesh group takes part in the collective, which is issued
+    under `comm_stream`. On the worker rank the received shards are concatenated along dim 0 into
+    `state.gathered_grad` and `state.gather_event` is recorded so later work can wait for it; on
+    every other rank both fields are reset to None.
+    Arguments:
+        p: Parameter whose `.grad` is a DTensor.
+        state: The `_muon_state` of `p`; `state.worker_rank` selects the destination rank.
+        rank: Rank of the calling process.
+        comm_stream: CUDA stream used for the communication.
+    Raises:
+        RuntimeError: On the worker rank, if `state.gathered_grad` is already set.
+    """
+    grad = p.grad
+    mesh = grad.device_mesh
+    local_grad = grad.to_local()
+    is_worker = rank == state.worker_rank
+    # Only the destination rank needs receive buffers, one per rank of the mesh.
+    gather_list = (
+        [torch.empty_like(local_grad) for _ in range(mesh.mesh.numel())]
+        if is_worker
+        else None
+    )
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            local_grad,
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if is_worker:
+            # TODO: Consider ,,,
+            if state.gathered_grad is not None:
+                # The check is on the gathered gradient, so the message names it (it used to
+                # talk about the gather event).
+                raise RuntimeError(
+                    "Gathered grad already exists, which should not happen."
+                )
+            # NOTE(review): concatenating on dim 0 assumes the gradient is sharded along dim 0
+            # with equally sized shards -- confirm against the FSDP placements in use.
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Orthogonalize the gathered gradient on the worker rank.
+    Under `compute_stream`, the worker rank waits for `state.gather_event`, runs `steps`
+    Newton-Schulz iterations on `state.gathered_grad`, stores the result in `state.computed_u`
+    and records `state.compute_event`. Every other rank just resets both fields to None.
+    Raises:
+        RuntimeError: On the worker rank, if `state.gather_event` is not set.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            # Nothing to compute on ranks that do not own this parameter.
+            state.computed_u = None
+            state.compute_event = None
+            return
+        if state.gather_event is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        # Do not start before the gather that produced `gathered_grad` has finished.
+        compute_stream.wait_event(state.gather_event)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        finished = torch.cuda.Event()
+        finished.record()
+        state.compute_event = finished
+def _scatter(p, state, rank, comm_stream):
+    """
+    Send the worker rank's computed update back to every rank, one shard each.
+    Under `comm_stream`, the worker rank waits for `state.compute_event` and splits
+    `state.computed_u` along dim 0 into pieces of `p.size(0) // mesh size` rows. Every rank then
+    receives its piece into a buffer shaped like its local shard of `p`, and the buffer is
+    wrapped as a DTensor with `p`'s placements and stored in `state.scattered_u`.
+    Raises:
+        RuntimeError: On the worker rank, if `state.compute_event` is not set.
+    """
+    mesh = p.device_mesh
+    full_u = state.computed_u
+    with torch.cuda.stream(comm_stream):
+        scatter_list = None
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            # The update must be fully computed before it is sent out.
+            comm_stream.wait_event(state.compute_event)
+            rows_per_rank = p.size(0) // mesh.mesh.numel()
+            scatter_list = list(torch.split(full_u, rows_per_rank, dim=0))
+        local_u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            local_u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        state.scattered_u = DTensor.from_local(
+            local_u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Parameters selected by `is_muon_func` are updated by Muon; every other parameter of `model`
+    is updated by an internal AdamW. When the Muon parameters are DTensors, the gather /
+    Newton-Schulz / scatter work for each parameter is assigned round-robin to the ranks of their
+    1D device mesh.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module to optimize. Its `parameters()` become the optimizer's parameters and
+            its `named_parameters()` are passed to `is_muon_func`.
+        is_muon_func: Callable `(param, name) -> bool`. Parameters for which it returns True must
+            be 2D and are updated by Muon; all others are updated by the internal AdamW.
+        lr: The learning rate, used by both the Muon and the AdamW update. The Muon update is
+            additionally scaled by `0.2 * sqrt(max(rows, cols))` of the parameter.
+        momentum: The momentum used by the internal SGD.
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD.
+        ns_steps: The number of Newton-Schulz iterations to run.
+        adamw_wd: The weight decay. It is stored as `wd` in the param group and applied to both
+            the Muon and the AdamW parameters.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If True, print the estimated Newton-Schulz TFLOPs on every parallel step.
+    Raises:
+        RuntimeError: If `torch.distributed` is not initialized.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        # `Optimizer.__init__` does not call `__setstate__`, so the parameters have to be tagged
+        # here as well; otherwise `step()` on a freshly built optimizer fails on the missing
+        # "use_muon" entry.
+        self._init_state()
+    def __setstate__(self, state):
+        """Restore the optimizer state and re-tag every parameter as Muon or AdamW."""
+        super().__setstate__(state)
+        self._init_state()
+    def _init_state(self):
+        """Sort parameters into those for which we will use Muon, and those for which we will not."""
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Muon only handles matrices; the caller's predicate must select 2D parameters.
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Everything else is updated by the internal AdamW.
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        """Estimate the FLOPs of `steps` Newton-Schulz iterations on the 2D tensor `G`."""
+        assert len(G.shape) == 2
+        M, N = G.shape
+        # The iteration works on the orientation whose first dimension is the smaller one.
+        if M > N:
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        """Return `lr` scaled by `0.2 * sqrt(max(A, B))` for a parameter of shape `(A, B, ...)`."""
+        A, B = param_shape[:2]
+        # We adjust the learning rate and weight decay based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        """
+        Create the per-step state of every parameter that has a gradient and pick its worker rank.
+        Parameters are ordered by decreasing estimated Newton-Schulz FLOPs and assigned to the
+        ranks of their device mesh round-robin. Parameters whose gradient is None are left out.
+        Returns:
+            `(param_to_state, ordered_params)`: a dict mapping `id(p)` to its `_muon_state`, and
+            the list of parameters in processing order.
+        Raises:
+            NotImplementedError: If the device mesh is not 1D.
+            ValueError: If the parameters do not all share the same device mesh.
+        """
+        param_to_state = {}
+        param_to_flops = {}
+        params_with_grad = []
+        total_flops = 0
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+            params_with_grad.append(p)
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        # Sort only the parameters that have a FLOPs entry: sorting all of `params` raised a
+        # KeyError as soon as one of them had no gradient.
+        ordered_params = sorted(
+            params_with_grad, key=lambda p: param_to_flops[id(p)], reverse=True
+        )
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """Perform the Muon update locally, for parameters that are not DTensors."""
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Same momentum and update rules as the parallel path, without any communication.
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """
+        Fold `g` into the momentum buffer of `p` and return the tensor to orthogonalize.
+        The buffer is updated in place to `momentum * buf + g`. With Nesterov momentum the result
+        is `g + momentum * buf`, otherwise it is the buffer itself.
+        """
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"]
+        buf.mul_(momentum).add_(g)
+        if group["nesterov"]:
+            g = g.add(buf, alpha=momentum)
+        else:
+            g = buf
+        return g
+    def _update_p(self, p, u, lr, wd):
+        """Apply weight decay and then the update `u` to `p` in place."""
+        # scale update
+        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+        # apply weight decay (uses the unadjusted learning rate)
+        p.data.mul_(1 - lr * wd)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        Each rank first applies momentum to its local gradient shard and stores the result back
+        into `p.grad`. The parameters are then processed in chunks of one parameter per mesh rank:
+        gradients are gathered onto their worker rank, orthogonalized there, and the result is
+        scattered back, with the gathers of the next chunk enqueued before the scatters of the
+        current one. Finally every rank applies its shard of the update.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            g = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+            p.grad = g
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        # One iteration per chunk; slices past the end of `ordered_params` are simply empty.
+        for i in range(0, len(ordered_params), chunk_size):
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def _adamw(self, params, group):
+        """Apply the internal AdamW update to `params`, the parameters not handled by Muon."""
+        lr = group["lr"]
+        beta1, beta2 = group["adamw_betas"]
+        eps = group["adamw_eps"]
+        weight_decay = group["wd"]
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            state = self.state[p]
+            if "step" not in state:
+                state["step"] = 0
+                state["moment1"] = torch.zeros_like(g)
+                state["moment2"] = torch.zeros_like(g)
+            state["step"] += 1
+            step = state["step"]
+            buf1 = state["moment1"]
+            buf2 = state["moment2"]
+            buf1.lerp_(g, 1 - beta1)
+            buf2.lerp_(g.square(), 1 - beta2)
+            g = buf1 / (eps + buf2.sqrt())
+            bias_correction1 = 1 - beta1**step
+            bias_correction2 = 1 - beta2**step
+            scale = bias_correction1 / bias_correction2**0.5
+            p.data.mul_(1 - lr * weight_decay)
+            p.data.add_(g, alpha=-lr / scale)
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The value returned by `closure`, or None when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            muon_params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            # A group may hold no Muon parameters at all; indexing [0] would then fail.
+            if muon_params:
+                if isinstance(muon_params[0].data, DTensor):
+                    update = self.parallel
+                else:
+                    update = self.base
+                update(
+                    muon_params,
+                    group,
+                    lr=group["lr"],
+                    wd=group["wd"],
+                    momentum=group["momentum"],
+                )
+            ############################
+            #       AdamW backup       #
+            ############################
+            adamw_params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            self._adamw(adamw_params, group)
+        return loss

build/torch26-cxx11-cu124-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with the namespace of this build's compiled ops.
+    """
+    qualified_name = f"_optimizer_b4b3752_dirty::{op_name}"
+    return qualified_name

build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e89cd7d514bfe92598684ae3cfc2d35ac2d021340846e09c0b6c880c3d55bfa0
+size 1820136

build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9cbffc2cf8039069831a57afb8e2f64fa684f1a44bec79bb4b72dbb57d9ac607
+size 1824224

build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2D matrix G with `steps` quintic Newton-Schulz iterations.
+    The coefficients are selected to maximize the slope at zero rather than to converge all the
+    way to one, so the result is not UV^T (where USV^T = G is the SVD) but something like US'V^T
+    with S' diagonal and S_{ii}' ~ Uniform(0.5, 1.5), which empirically does not hurt model
+    performance relative to UV^T.
+    The iteration runs in bfloat16 and the result is cast back to G's dtype. A tall input is
+    transposed first, so the Gram matrix X @ X.T is built over the smaller dimension, and
+    transposed back before returning.
+    """
+    assert G.ndim == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    is_tall = G.size(0) > G.size(1)
+    # No manual typecast before the normalization.
+    X = G.T if is_tall else G
+    # Dividing by the norm ensures the spectral norm is at most 1.
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=c, beta=b)
+        # X <- a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=a)
+    if is_tall:
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    """Per-parameter scratch state for one gather -> compute -> scatter round of parallel Muon."""
+    # TODO: use Optional
+    # Rank that receives this parameter's gathered gradient and computes its update.
+    worker_rank: int | None = None
+    # Concatenation of the gathered gradient shards; set on the worker rank, None elsewhere.
+    gathered_grad: torch.Tensor | None = None
+    # Newton-Schulz output for `gathered_grad`; set on the worker rank, None elsewhere.
+    computed_u: torch.Tensor | None = None
+    # This rank's shard of the update after the scatter, wrapped as a DTensor.
+    scattered_u: torch.Tensor | None = None
+    # CUDA event recorded after the gather; the compute step waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # CUDA event recorded after the compute; the scatter step waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Gather the local gradient shards of `p` onto the worker rank stored in `state`.
+    Every rank of the gradient's device-mesh group takes part in the collective, which is issued
+    under `comm_stream`. On the worker rank the received shards are concatenated along dim 0 into
+    `state.gathered_grad` and `state.gather_event` is recorded so later work can wait for it; on
+    every other rank both fields are reset to None.
+    Arguments:
+        p: Parameter whose `.grad` is a DTensor.
+        state: The `_muon_state` of `p`; `state.worker_rank` selects the destination rank.
+        rank: Rank of the calling process.
+        comm_stream: CUDA stream used for the communication.
+    Raises:
+        RuntimeError: On the worker rank, if `state.gathered_grad` is already set.
+    """
+    grad = p.grad
+    mesh = grad.device_mesh
+    local_grad = grad.to_local()
+    is_worker = rank == state.worker_rank
+    # Only the destination rank needs receive buffers, one per rank of the mesh.
+    gather_list = (
+        [torch.empty_like(local_grad) for _ in range(mesh.mesh.numel())]
+        if is_worker
+        else None
+    )
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            local_grad,
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if is_worker:
+            # TODO: Consider ,,,
+            if state.gathered_grad is not None:
+                # The check is on the gathered gradient, so the message names it (it used to
+                # talk about the gather event).
+                raise RuntimeError(
+                    "Gathered grad already exists, which should not happen."
+                )
+            # NOTE(review): concatenating on dim 0 assumes the gradient is sharded along dim 0
+            # with equally sized shards -- confirm against the FSDP placements in use.
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Orthogonalize the gathered gradient on the worker rank.
+    Under `compute_stream`, the worker rank waits for `state.gather_event`, runs `steps`
+    Newton-Schulz iterations on `state.gathered_grad`, stores the result in `state.computed_u`
+    and records `state.compute_event`. Every other rank just resets both fields to None.
+    Raises:
+        RuntimeError: On the worker rank, if `state.gather_event` is not set.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            # Nothing to compute on ranks that do not own this parameter.
+            state.computed_u = None
+            state.compute_event = None
+            return
+        if state.gather_event is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        # Do not start before the gather that produced `gathered_grad` has finished.
+        compute_stream.wait_event(state.gather_event)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        finished = torch.cuda.Event()
+        finished.record()
+        state.compute_event = finished
+def _scatter(p, state, rank, comm_stream):
+    """
+    Send the worker rank's computed update back to every rank, one shard each.
+    Under `comm_stream`, the worker rank waits for `state.compute_event` and splits
+    `state.computed_u` along dim 0 into pieces of `p.size(0) // mesh size` rows. Every rank then
+    receives its piece into a buffer shaped like its local shard of `p`, and the buffer is
+    wrapped as a DTensor with `p`'s placements and stored in `state.scattered_u`.
+    Raises:
+        RuntimeError: On the worker rank, if `state.compute_event` is not set.
+    """
+    mesh = p.device_mesh
+    full_u = state.computed_u
+    with torch.cuda.stream(comm_stream):
+        scatter_list = None
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            # The update must be fully computed before it is sent out.
+            comm_stream.wait_event(state.compute_event)
+            rows_per_rank = p.size(0) // mesh.mesh.numel()
+            scatter_list = list(torch.split(full_u, rows_per_rank, dim=0))
+        local_u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            local_u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        state.scattered_u = DTensor.from_local(
+            local_u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module to optimize. Its `parameters()` become the optimizer's parameters and
+            its `named_parameters()` are passed to `is_muon_func`.
+        is_muon_func: Callable `(param, name) -> bool`. Parameters for which it returns True must
+            be 2D and are updated by Muon; all others are updated by the internal AdamW.
+        lr: The learning rate, used by both the Muon and the AdamW update. The Muon update is
+            additionally scaled by `0.2 * sqrt(max(rows, cols))` of the parameter.
+        momentum: The momentum used by the internal SGD.
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD.
+        ns_steps: The number of Newton-Schulz iterations to run.
+        adamw_wd: The weight decay, stored as `wd` in the param group.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If True, print the estimated Newton-Schulz TFLOPs of the Muon parameters.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+    def __setstate__(self, state):
+        # Sort parameters into those for which we will use Muon, and those for which we will not
+        super().__setstate__(state)
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Do not use Muon for parameters in adamw_params
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        assert len(G.shape) == 2
+        M, N = G.shape
+        if M > N:
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        A, B = param_shape[:2]
+        # We adjust the learning rate and weight decay based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        """Create per-step Muon state and assign each parameter to a worker rank.
+        Parameters that currently have a gradient are ordered by their estimated
+        Newton-Schulz cost (largest first) and handed out round-robin over the ranks
+        of their 1-D device mesh. Parameters without a gradient are left out of both
+        return values.
+        Args:
+            params: Muon parameters of one group; each must expose ``grad`` and
+                ``device_mesh``.
+            group: the parameter group; ``group["ns_steps"]`` is read.
+        Returns:
+            ``(param_to_state, ordered_params)`` where ``param_to_state`` maps
+            ``id(p)`` to a fresh ``_muon_state`` with ``worker_rank`` set.
+        Raises:
+            AssertionError: if a gradient is not 2-D.
+            NotImplementedError: if the device mesh is not 1-D.
+            ValueError: if parameters live on different meshes.
+        """
+        ns_steps = group["ns_steps"]
+        param_to_flops = {}
+        for p in params:
+            g = p.grad
+            if g is None:
+                # Nothing to orthogonalize or communicate for this parameter.
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            param_to_flops[id(p)] = self._calc_flops(g, ns_steps)
+        total_flops = sum(param_to_flops.values())
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        # Only parameters with a gradient have a FLOP estimate. Sorting the full
+        # list used to raise KeyError as soon as one gradient was missing.
+        ordered_params = sorted(
+            (p for p in params if id(p) in param_to_flops),
+            key=lambda p: param_to_flops[id(p)],
+            reverse=True,
+        )
+        param_to_state = {}
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            # Most expensive parameters first, spread evenly over the mesh ranks.
+            state = _muon_state()
+            state.worker_rank = mesh.mesh[round_robin].item()
+            param_to_state[id(p)] = state
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """Apply the Muon update locally to every parameter that has a gradient.
+        This is the non-DTensor path: momentum, Newton-Schulz orthogonalization and
+        the parameter update all run on the local tensor. It shares ``_update_g``
+        and ``_update_p`` with ``parallel`` so both paths apply the same momentum
+        and update rule.
+        Args:
+            params: Muon parameters of one group.
+            group: the parameter group; ``nesterov`` and ``ns_steps`` are read.
+            lr: base learning rate.
+            wd: weight decay coefficient.
+            momentum: momentum coefficient.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Momentum (optionally Nesterov) on the raw gradient.
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            # Weight decay followed by the size-adjusted orthogonalized step.
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """Fold ``g`` into the momentum buffer of ``p`` and return the update direction.
+        The buffer lives in ``self.state[p]["momentum_buffer"]`` and is created as
+        zeros on first use. It is updated in place as ``buf = momentum * buf + g``.
+        Returns:
+            ``g + momentum * buf`` when ``group["nesterov"]`` is truthy, otherwise
+            the buffer itself (not a copy).
+        """
+        param_state = self.state[p]
+        if "momentum_buffer" not in param_state:
+            param_state["momentum_buffer"] = torch.zeros_like(g)
+        momentum_buf = param_state["momentum_buffer"]
+        momentum_buf.mul_(momentum).add_(g)
+        if not group["nesterov"]:
+            return momentum_buf
+        return g.add(momentum_buf, alpha=momentum)
+    def _update_p(self, p, u, lr, wd):
+        """Apply weight decay and the orthogonalized update ``u`` to ``p`` in place.
+        ``p`` is first scaled by ``1 - lr * wd`` (the unadjusted ``lr``), then ``u``
+        is subtracted with the step size returned by ``adjust_lr_for_muon``.
+        """
+        step_size = self.adjust_lr_for_muon(lr, p.shape)
+        weights = p.data
+        # Decay uses the base lr; only the update itself uses the adjusted lr.
+        weights.mul_(1 - lr * wd)
+        weights.add_(u, alpha=-step_size)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        For every parameter with a gradient, momentum is applied locally and the
+        result is written back to ``p.grad``. Each parameter is then assigned a
+        worker rank; its gradient is gathered there, orthogonalized, and the
+        result scattered back, working through the ordered parameters one chunk
+        (one mesh's worth) at a time. Finally every rank applies its scattered
+        update to the parameter.
+        Args:
+            params: Muon parameters of one group (indexed as ``params[0]``, so
+                the list must not be empty).
+            group: the parameter group; ``nesterov`` and ``ns_steps`` are read.
+            lr: base learning rate.
+            wd: weight decay coefficient.
+            momentum: momentum coefficient.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            g = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+            # NOTE: this overwrites the raw gradient with the momentum-adjusted one;
+            # _gather reads it back from p.grad.
+            p.grad = g
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        # The three helpers below each process one chunk of ordered_params;
+        # slices past the end of the list are empty, so they do nothing there.
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        # One chunk holds as many parameters as there are ranks in the mesh.
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        # Gathers for chunk i+1 are enqueued before the scatters of chunk i,
+        # presumably so communication can overlap with the compute of chunk i.
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(params) + chunk_size - 1, chunk_size):
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        # The parameter updates below run on the current stream and need the
+        # scattered results produced on the communication stream.
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Within each parameter group, parameters flagged ``use_muon`` in
+        ``self.state`` are updated with Muon (``parallel`` for DTensor parameters,
+        ``base`` otherwise) and the remaining ones with the built-in AdamW. Both
+        branches share the group's ``lr`` and ``wd``.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The loss returned by ``closure``, or ``None`` when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            lr = group["lr"]
+            wd = group["wd"]
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            momentum = group["momentum"]
+            # A group may hold only AdamW parameters; indexing params[0] on an
+            # empty list used to raise IndexError.
+            if params:
+                if isinstance(params[0].data, DTensor):
+                    self.parallel(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+                else:
+                    self.base(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                # Exponential moving averages of the gradient and its square.
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # Bias correction is folded into the step size via `scale`.
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                # Decoupled weight decay, then the Adam step.
+                p.data.mul_(1 - lr * wd)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch26-cxx11-cu126-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+    The result has the form ``"<namespace>::<op_name>"``.
+    """
+    namespace = "_optimizer_b4b3752_dirty"
+    return "{}::{}".format(namespace, op_name)

build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5dce62d3038e879e688fffa9bbc70f3e82db20b2e7ae3ba09040e0319acb71
+size 1820136

build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58162f994df84868dbf62ae70e39d3c14e3390fc827f152eece83dfae7f51503
+size 1824224

build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2-D tensor G with a quintic Newton-Schulz iteration.
+    The coefficients are chosen to maximize the slope at zero, trading exact convergence
+    for fewer steps: the result is not UV^T but something like US'V^T (with USV^T = G the
+    SVD), where the entries of the diagonal S' are spread around one rather than equal to it.
+    The iteration runs in bfloat16 and the result is cast back to G's dtype.
+    """
+    assert len(G.shape) == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    # Iterate on the wide orientation so that X @ X.T is the smaller square matrix.
+    is_tall = G.size(0) > G.size(1)
+    X = G.T if is_tall else G  # no manual typecast
+    # Ensure spectral norm is at most 1, then switch to bfloat16 for the iterations.
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=c, beta=b)
+        # X = a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=a)
+    # Undo the transpose applied to tall inputs.
+    return (X.T if is_tall else X).to(G.dtype)
+@dataclass
+class _muon_state:
+    """Per-parameter scratch state for one distributed Muon step."""
+    # TODO: use Optional
+    # Rank that orthogonalizes this parameter (set in init_state_and_assign_params).
+    worker_rank: int | None = None
+    # Full gradient assembled by _gather; stays None on non-worker ranks.
+    gathered_grad: torch.Tensor | None = None
+    # Newton-Schulz result from _compute_u; stays None on non-worker ranks.
+    computed_u: torch.Tensor | None = None
+    # This rank's piece of the update, wrapped as a DTensor by _scatter.
+    scattered_u: torch.Tensor | None = None
+    # Recorded after the gather; _compute_u waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # Recorded after the compute; _scatter waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Gather the local gradient shards of ``p`` onto ``state.worker_rank``.
+    Every rank in the gradient's mesh group takes part in the collective, which is
+    issued on ``comm_stream``. On the worker rank the shards are concatenated along
+    dim 0 into ``state.gathered_grad`` and ``state.gather_event`` is recorded; on all
+    other ranks both fields are set to ``None``.
+    Raises:
+        RuntimeError: on the worker rank, if ``state.gathered_grad`` is already set.
+    """
+    g = p.grad
+    mesh = g.device_mesh
+    local_grad = g.to_local()
+    is_worker = rank == state.worker_rank
+    if is_worker:
+        # One receive buffer per rank of the mesh.
+        gather_list = [torch.empty_like(local_grad) for _ in range(mesh.mesh.numel())]
+    else:
+        gather_list = None
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            local_grad,
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if is_worker:
+            # TODO: Consider ,,,
+            if state.gathered_grad is not None:
+                # The check is on the gathered gradient, so the message names it
+                # (it previously referred to the gather event).
+                raise RuntimeError(
+                    "Gathered gradient already exists, which should not happen."
+                )
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            # Recorded inside the stream context so _compute_u can wait on it.
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Orthogonalize the gathered gradient on the parameter's worker rank.
+    Runs inside ``compute_stream``. The worker rank waits for ``state.gather_event``,
+    stores the Newton-Schulz result in ``state.computed_u`` and records
+    ``state.compute_event``. Every other rank clears both fields.
+    Raises:
+        RuntimeError: on the worker rank, if no gather event has been recorded.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            # Non-worker ranks have nothing to compute for this parameter.
+            state.computed_u = None
+            state.compute_event = None
+            return
+        gather_done = state.gather_event
+        if gather_done is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        compute_stream.wait_event(gather_done)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        compute_done = torch.cuda.Event()
+        compute_done.record()
+        state.compute_event = compute_done
+def _scatter(p, state, rank, comm_stream):
+    """
+    Scatter the orthogonalized update from ``state.worker_rank`` back to all ranks.
+    Runs inside ``comm_stream``. The worker rank waits for ``state.compute_event`` and
+    splits ``state.computed_u`` along dim 0; every rank receives one piece shaped like
+    its local shard of ``p`` and stores it in ``state.scattered_u`` as a DTensor with
+    the placements of ``p``.
+    Raises:
+        RuntimeError: on the worker rank, if no compute event has been recorded.
+    """
+    u = state.computed_u
+    mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            comm_stream.wait_event(state.compute_event)
+            # NOTE(review): equal row chunks assume p is evenly sharded along dim 0
+            # across the mesh - confirm against how the model is sharded.
+            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
+        else:
+            scatter_list = None
+        # `u` is reused from here on as the receive buffer for this rank's piece.
+        u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        # Re-wrap the local piece so it can be applied directly to the DTensor p.
+        u = DTensor.from_local(
+            u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+        state.scattered_u = u
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Parameters accepted by `is_muon_func` are updated with Muon; every other parameter of
+    `model` is updated by the internal AdamW. Both share the group's `lr` and weight decay.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module whose parameters are optimized.
+        is_muon_func: Callable `(param, name) -> bool` selecting the parameters updated by Muon.
+        Selected parameters must be 2D.
+        lr: The learning rate, used by both Muon (after size adjustment) and the internal AdamW.
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run.
+        adamw_wd: The weight decay, stored as `wd` and applied to all parameters.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If true, print the estimated Newton-Schulz TFLOPs in the parallel path.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        # Separate streams for the collectives and for the Newton-Schulz compute.
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        # Optimizer.__init__ does not call __setstate__, so classify the parameters
+        # here as well; otherwise the first step() fails on the missing "use_muon".
+        self._init_state()
+    def __setstate__(self, state):
+        """Restore optimizer state and re-tag every parameter as Muon or AdamW."""
+        super().__setstate__(state)
+        self._init_state()
+    def _init_state(self):
+        """Sort parameters into those updated by Muon and those updated by AdamW."""
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Muon orthogonalizes matrices, so only 2D parameters are accepted.
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Everything else is handled by the AdamW branch of step().
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        """Estimate the FLOPs of `steps` Newton-Schulz iterations on the 2D tensor G."""
+        assert len(G.shape) == 2
+        M, N = G.shape
+        if M > N:
+            # The iteration works on the wide orientation; M is the smaller side.
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        """Return `lr` scaled by `0.2 * sqrt(max(rows, cols))` of the parameter matrix."""
+        A, B = param_shape[:2]
+        # We adjust the learning rate based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        """Create per-step Muon state and assign each parameter to a worker rank.
+        Parameters with a gradient are ordered by estimated Newton-Schulz cost (largest
+        first) and handed out round-robin over the ranks of their 1D device mesh.
+        Parameters without a gradient are left out of both return values.
+        Returns:
+            `(param_to_state, ordered_params)`; `param_to_state` maps `id(p)` to a fresh
+            `_muon_state` with `worker_rank` set.
+        """
+        param_to_state = {}
+        param_to_flops = {}
+        total_flops = 0
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        # Only parameters with a gradient have a FLOP estimate. Sorting the full
+        # list used to raise KeyError as soon as one gradient was missing.
+        ordered_params = sorted(
+            (p for p in params if id(p) in param_to_flops),
+            key=lambda p: param_to_flops[id(p)],
+            reverse=True,
+        )
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """Apply the Muon update locally (non-DTensor path) to every parameter with a gradient."""
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Same momentum and update helpers as the parallel path.
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """Fold `g` into the momentum buffer of `p` and return the update direction."""
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"]
+        buf.mul_(momentum).add_(g)
+        if group["nesterov"]:
+            g = g.add(buf, alpha=momentum)
+        else:
+            g = buf
+        return g
+    def _update_p(self, p, u, lr, wd):
+        """Apply weight decay and the orthogonalized update `u` to `p` in place."""
+        # scale update
+        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+        # apply weight decay (uses the unadjusted lr)
+        p.data.mul_(1 - lr * wd)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        Momentum is applied locally and written back to `p.grad`; each gradient is then
+        gathered on its worker rank, orthogonalized there, scattered back, and applied.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            g = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+            # _gather reads the momentum-adjusted gradient back from p.grad.
+            p.grad = g
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        # One chunk holds as many parameters as there are ranks in the mesh.
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        # Gathers for chunk i+1 are enqueued before the scatters of chunk i.
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(ordered_params), chunk_size):
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        # The updates below run on the current stream and need the scattered results.
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The loss returned by `closure`, or None when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            lr = group["lr"]
+            wd = group["wd"]
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            momentum = group["momentum"]
+            # A group may hold only AdamW parameters; indexing params[0] on an
+            # empty list used to raise IndexError.
+            if params:
+                if isinstance(params[0].data, DTensor):
+                    self.parallel(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+                else:
+                    self.base(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                # Exponential moving averages of the gradient and its square.
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # Bias correction is folded into the step size via `scale`.
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * wd)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+    The result has the form ``"<namespace>::<op_name>"``.
+    """
+    namespace = "_optimizer_b4b3752_dirty"
+    return "{}::{}".format(namespace, op_name)

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2f60369ba2bd0a0f84e053d857d37496137ff476dc21561f211b1fa39651990
+size 1749784

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4d790535f99b7b362a966e802a547654f31749f5f28a0207493870927f1d8d2
+size 1749784

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b440dd9a60711a498010068e91d0ad013cd0b8ac732c16b5d1d17e5d4ec0f9b4
+size 1749784

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f50ea9cab62a5bd06d886516d3917e4490e65aa9addd1cbb84fc81c6f9a9d5b1
+size 1749744

build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2-D tensor G with a quintic Newton-Schulz iteration.
+    The coefficients are chosen to maximize the slope at zero, trading exact convergence
+    for fewer steps: the result is not UV^T but something like US'V^T (with USV^T = G the
+    SVD), where the entries of the diagonal S' are spread around one rather than equal to it.
+    The iteration runs in bfloat16 and the result is cast back to G's dtype.
+    """
+    assert len(G.shape) == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    # Iterate on the wide orientation so that X @ X.T is the smaller square matrix.
+    is_tall = G.size(0) > G.size(1)
+    X = G.T if is_tall else G  # no manual typecast
+    # Ensure spectral norm is at most 1, then switch to bfloat16 for the iterations.
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=c, beta=b)
+        # X = a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=a)
+    # Undo the transpose applied to tall inputs.
+    return (X.T if is_tall else X).to(G.dtype)
+@dataclass
+class _muon_state:
+    """Per-parameter scratch state for one distributed Muon step."""
+    # TODO: use Optional
+    # Rank that orthogonalizes this parameter (set in init_state_and_assign_params).
+    worker_rank: int | None = None
+    # Full gradient assembled by _gather; stays None on non-worker ranks.
+    gathered_grad: torch.Tensor | None = None
+    # Newton-Schulz result from _compute_u; stays None on non-worker ranks.
+    computed_u: torch.Tensor | None = None
+    # This rank's piece of the update, wrapped as a DTensor by _scatter.
+    scattered_u: torch.Tensor | None = None
+    # Recorded after the gather; _compute_u waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # Recorded after the compute; _scatter waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Gather the local gradient shards of ``p`` onto ``state.worker_rank``.
+    Every rank in the gradient's mesh group takes part in the collective, which is
+    issued on ``comm_stream``. On the worker rank the shards are concatenated along
+    dim 0 into ``state.gathered_grad`` and ``state.gather_event`` is recorded; on all
+    other ranks both fields are set to ``None``.
+    Raises:
+        RuntimeError: on the worker rank, if ``state.gathered_grad`` is already set.
+    """
+    g = p.grad
+    mesh = g.device_mesh
+    local_grad = g.to_local()
+    is_worker = rank == state.worker_rank
+    if is_worker:
+        # One receive buffer per rank of the mesh.
+        gather_list = [torch.empty_like(local_grad) for _ in range(mesh.mesh.numel())]
+    else:
+        gather_list = None
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            local_grad,
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if is_worker:
+            # TODO: Consider ,,,
+            if state.gathered_grad is not None:
+                # The check is on the gathered gradient, so the message names it
+                # (it previously referred to the gather event).
+                raise RuntimeError(
+                    "Gathered gradient already exists, which should not happen."
+                )
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            # Recorded inside the stream context so _compute_u can wait on it.
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Orthogonalize the gathered gradient on the parameter's worker rank.
+    Runs inside ``compute_stream``. The worker rank waits for ``state.gather_event``,
+    stores the Newton-Schulz result in ``state.computed_u`` and records
+    ``state.compute_event``. Every other rank clears both fields.
+    Raises:
+        RuntimeError: on the worker rank, if no gather event has been recorded.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            # Non-worker ranks have nothing to compute for this parameter.
+            state.computed_u = None
+            state.compute_event = None
+            return
+        gather_done = state.gather_event
+        if gather_done is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        compute_stream.wait_event(gather_done)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        compute_done = torch.cuda.Event()
+        compute_done.record()
+        state.compute_event = compute_done
+def _scatter(p, state, rank, comm_stream):
+    """
+    Scatter the orthogonalized update from ``state.worker_rank`` back to all ranks.
+    Runs inside ``comm_stream``. The worker rank waits for ``state.compute_event`` and
+    splits ``state.computed_u`` along dim 0; every rank receives one piece shaped like
+    its local shard of ``p`` and stores it in ``state.scattered_u`` as a DTensor with
+    the placements of ``p``.
+    Raises:
+        RuntimeError: on the worker rank, if no compute event has been recorded.
+    """
+    u = state.computed_u
+    mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            comm_stream.wait_event(state.compute_event)
+            # NOTE(review): equal row chunks assume p is evenly sharded along dim 0
+            # across the mesh - confirm against how the model is sharded.
+            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
+        else:
+            scatter_list = None
+        # `u` is reused from here on as the receive buffer for this rank's piece.
+        u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        # Re-wrap the local piece so it can be applied directly to the DTensor p.
+        u = DTensor.from_local(
+            u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+        state.scattered_u = u
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        muon_params: The parameters to be optimized by Muon.
+        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
+        adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
+        {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
+        adamw_lr: The learning rate for the internal AdamW.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        adamw_wd: The weight decay for the internal AdamW.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+    def __setstate__(self, state):
+        # Sort parameters into those for which we will use Muon, and those for which we will not
+        super().__setstate__(state)
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Do not use Muon for parameters in adamw_params
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        assert len(G.shape) == 2
+        M, N = G.shape
+        if M > N:
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        A, B = param_shape[:2]
+        # We adjust the learning rate and weight decay based on the size of the parameter matrix
+        # as describted in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        param_to_state = {}
+        param_to_flops = {}
+        total_flops = 0
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        ordered_params = sorted(
+            params, key=lambda p: param_to_flops[id(p)], reverse=True
+        )
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        # generate weight updates in distributed fashion
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            assert g is not None
+            # calc update
+            state = self.state[p]
+            if "momentum_buffer" not in state:
+                state["momentum_buffer"] = torch.zeros_like(g)
+            buf = state["momentum_buffer"]
+            buf.mul_(momentum).add_(g)
+            if group["nesterov"]:
+                g = g.add(buf, alpha=momentum)
+            else:
+                g = buf
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            # scale update
+            adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+            # apply weight decay
+            p.data.mul_(1 - lr * wd)
+            # apply update
+            p.data.add_(u, alpha=-adjusted_lr)
+    def _update_g(self, p, g, group, momentum):
+        # calc update
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"]
+        buf.mul_(momentum).add_(g)
+        if group["nesterov"]:
+            g = g.add(buf, alpha=momentum)
+        else:
+            g = buf
+        return g
+    def _update_p(self, p, u, lr, wd):
+        # scale update
+        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+        # apply weight decay
+        p.data.mul_(1 - lr * wd)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            g = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+            p.grad = g
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(params) + chunk_size - 1, chunk_size):
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            if isinstance(params[0].data, DTensor):
+                self.parallel(
+                    params,
+                    group,
+                    lr=lr,
+                    wd=wd,
+                    momentum=momentum,
+                )
+            else:
+                self.base(
+                    params,
+                    group,
+                    lr=lr,
+                    wd=wd,
+                    momentum=momentum,
+                )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            lr = group["lr"]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch26-cxx98-cu118-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_optimizer_b4b3752_dirty::{op_name}"

build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8f8e7d78ed9a095b882cf764fd9c80a0b0810fb961ba9e8545656fc4cb0b0d7
+size 1787200

build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:002dab6441bcad54ab4e7c064b5806acfd45170eb33cfa059745ba6e0c349607
+size 1787192

build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+    zero even beyond the point where the iteration no longer converges all the way to one everywhere
+    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    """
+    assert len(G.shape) == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    X = G  # no manual typecast
+    if G.size(0) > G.size(1):
+        X = X.T
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm() + 1e-7)
+    X = X.bfloat16()
+    # Perform the NS iterations
+    for _ in range(steps):
+        A = X @ X.T
+        # B = (
+        #    b * A + c * A @ A
+        # )  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        B = torch.addmm(A, A, A, alpha=c, beta=b)
+        # X = a * X + B @ X
+        X = torch.addmm(X, B, X, alpha=1.0, beta=a)
+    if G.size(0) > G.size(1):
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    # Per-parameter scratch state for one distributed Muon step (gather -> compute -> scatter).
+    # TODO: use Optional
+    # Rank that gathers the full gradient and runs Newton-Schulz for this parameter.
+    worker_rank: int | None = None
+    # Gradient shards concatenated along dim 0 by _gather; None on non-worker ranks.
+    gathered_grad: torch.Tensor | None = None
+    # Newton-Schulz output set by _compute_u; None on non-worker ranks.
+    computed_u: torch.Tensor | None = None
+    # This rank's shard of the update, wrapped as a DTensor by _scatter.
+    scattered_u: torch.Tensor | None = None
+    # Recorded by _gather on the worker rank; _compute_u waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # Recorded by _compute_u on the worker rank; _scatter waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    g = p.grad
+    mesh = g.device_mesh
+    if rank == state.worker_rank:
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
+    else:
+        gather_list = None
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            g.to_local(),
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if rank == state.worker_rank:
+            # TODO: Consider ,,,
+            if state.gathered_grad is not None:
+                raise RuntimeError(
+                    "Gather event already exists, which should not happen."
+                )
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    with torch.cuda.stream(compute_stream):
+        if rank == state.worker_rank:
+            if state.gather_event is None:
+                raise RuntimeError("Gather event must be set before compute.")
+            compute_stream.wait_event(state.gather_event)
+            u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+            state.computed_u = u
+            state.compute_event = torch.cuda.Event()
+            state.compute_event.record()
+        else:
+            state.computed_u = None
+            state.compute_event = None
+def _scatter(p, state, rank, comm_stream):
+    u = state.computed_u
+    mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            comm_stream.wait_event(state.compute_event)
+            scatter_list = list(torch.split(u, p.size(0) // mesh.mesh.numel(), dim=0))
+        else:
+            scatter_list = None
+        u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        u = DTensor.from_local(
+            u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+        state.scattered_u = u
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        muon_params: The parameters to be optimized by Muon.
+        lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
+        adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
+        {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
+        adamw_lr: The learning rate for the internal AdamW.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        adamw_wd: The weight decay for the internal AdamW.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+    def __setstate__(self, state):
+        # Sort parameters into those for which we will use Muon, and those for which we will not
+        super().__setstate__(state)
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Do not use Muon for parameters in adamw_params
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        assert len(G.shape) == 2
+        M, N = G.shape
+        if M > N:
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        A, B = param_shape[:2]
+        # We adjust the learning rate and weight decay based on the size of the parameter matrix
+        # as describted in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        param_to_state = {}
+        param_to_flops = {}
+        total_flops = 0
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        ordered_params = sorted(
+            params, key=lambda p: param_to_flops[id(p)], reverse=True
+        )
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        # generate weight updates in distributed fashion
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            assert g is not None
+            # calc update
+            state = self.state[p]
+            if "momentum_buffer" not in state:
+                state["momentum_buffer"] = torch.zeros_like(g)
+            buf = state["momentum_buffer"]
+            buf.mul_(momentum).add_(g)
+            if group["nesterov"]:
+                g = g.add(buf, alpha=momentum)
+            else:
+                g = buf
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            # scale update
+            adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+            # apply weight decay
+            p.data.mul_(1 - lr * wd)
+            # apply update
+            p.data.add_(u, alpha=-adjusted_lr)
+    def _update_g(self, p, g, group, momentum):
+        # calc update
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"]
+        buf.mul_(momentum).add_(g)
+        if group["nesterov"]:
+            g = g.add(buf, alpha=momentum)
+        else:
+            g = buf
+        return g
+    def _update_p(self, p, u, lr, wd):
+        # scale update
+        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+        # apply weight decay
+        p.data.mul_(1 - lr * wd)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            g = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+            p.grad = g
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(params) + chunk_size - 1, chunk_size):
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            if isinstance(params[0].data, DTensor):
+                self.parallel(
+                    params,
+                    group,
+                    lr=lr,
+                    wd=wd,
+                    momentum=momentum,
+                )
+            else:
+                self.base(
+                    params,
+                    group,
+                    lr=lr,
+                    wd=wd,
+                    momentum=momentum,
+                )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            lr = group["lr"]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch26-cxx98-cu124-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_optimizer_b4b3752_dirty::{op_name}"

build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab2379d932e40d10bee55f032bd16d2e4d9c1920bc5500628006f8a0eb8abd39
+size 1824192

build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f499350bb19eca6c3da1bb72e46023834b8411ce00730854273b588b2cd9206
+size 1824184

build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
+    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
+    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
+    zero even beyond the point where the iteration no longer converges all the way to one everywhere
+    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
+    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
+    performance at all relative to UV^T, where USV^T = G is the SVD.
+    """
+    assert len(G.shape) == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    X = G  # no manual typecast
+    if G.size(0) > G.size(1):
+        X = X.T
+    # Ensure spectral norm is at most 1
+    X = X / (X.norm() + 1e-7)
+    X = X.bfloat16()
+    # Perform the NS iterations
+    for _ in range(steps):
+        A = X @ X.T
+        # B = (
+        #    b * A + c * A @ A
+        # )  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
+        B = torch.addmm(A, A, A, alpha=c, beta=b)
+        # X = a * X + B @ X
+        X = torch.addmm(X, B, X, alpha=1.0, beta=a)
+    if G.size(0) > G.size(1):
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    # Per-parameter scratch state for one distributed Muon step (gather -> compute -> scatter).
+    # TODO: use Optional
+    # Rank that gathers the full gradient and runs Newton-Schulz for this parameter.
+    worker_rank: int | None = None
+    # Gradient shards concatenated along dim 0 by _gather; None on non-worker ranks.
+    gathered_grad: torch.Tensor | None = None
+    # Newton-Schulz output set by _compute_u; None on non-worker ranks.
+    computed_u: torch.Tensor | None = None
+    # This rank's shard of the update, wrapped as a DTensor by _scatter.
+    scattered_u: torch.Tensor | None = None
+    # Recorded by _gather on the worker rank; _compute_u waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # Recorded by _compute_u on the worker rank; _scatter waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    g = p.grad
+    mesh = g.device_mesh
+    if rank == state.worker_rank:
+        gather_list = [torch.empty_like(g.to_local()) for _ in range(mesh.mesh.numel())]
+    else:
+        gather_list = None
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            g.to_local(),
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if rank == state.worker_rank:
+            # TODO: Consider ,,,
+            if state.gathered_grad is not None:
+                raise RuntimeError(
+                    "Gather event already exists, which should not happen."
+                )
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Orthogonalize the gathered gradient on the worker rank.
+    On the rank equal to `state.worker_rank`, `compute_stream` waits for
+    `state.gather_event`, `_zeropower_via_newtonschulz5` is run on
+    `state.gathered_grad`, the result is stored in `state.computed_u` and
+    `state.compute_event` is recorded. On every other rank both fields are set to None.
+    Raises:
+        RuntimeError: On the worker rank if no gather event has been recorded.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            # Nothing to compute for this parameter on this rank.
+            state.computed_u = None
+            state.compute_event = None
+            return
+        if state.gather_event is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        # Order the computation after the gather that produced its input.
+        compute_stream.wait_event(state.gather_event)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        finished = torch.cuda.Event()
+        state.compute_event = finished
+        finished.record()
+def _scatter(p, state, rank, comm_stream):
+    """
+    Scatter the orthogonalized update from the worker rank back to every shard.
+    On the worker rank, `comm_stream` waits for `state.compute_event` and
+    `state.computed_u` is split along dim 0 into one chunk per mesh rank. Every rank
+    receives its chunk into a buffer shaped like its local shard of `p`, and the
+    result is wrapped as a DTensor with the placements of `p` in `state.scattered_u`.
+    Args:
+        p: DTensor parameter (its `device_mesh`, `placements` and `to_local()` are used).
+        state: The `_muon_state` tracking this parameter.
+        rank: Rank of the calling process, compared against `state.worker_rank`.
+        comm_stream: CUDA stream on which the collective is issued.
+    Raises:
+        RuntimeError: On the worker rank if no compute event has been recorded.
+    """
+    mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            comm_stream.wait_event(state.compute_event)
+            # Assumes p.size(0) is evenly divisible by the mesh size.
+            rows_per_rank = p.size(0) // mesh.mesh.numel()
+            # For tall matrices the Newton-Schulz result is a transposed view, so its
+            # row chunks are not contiguous; copy them into dense buffers before
+            # handing them to the collective.
+            scatter_list = [
+                chunk.contiguous()
+                for chunk in torch.split(state.computed_u, rows_per_rank, dim=0)
+            ]
+        else:
+            scatter_list = None
+        local_u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            local_u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        state.scattered_u = DTensor.from_local(
+            local_u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module to optimize. All of `model.parameters()` are registered, and
+            `model.named_parameters()` is used to decide which of them use Muon.
+        is_muon_func: Callable `(param, name) -> bool`. Parameters for which it returns True
+            (they must be 2D) are optimized by Muon; all others are optimized by AdamW.
+        lr: The learning rate, shared by Muon and the internal AdamW. For Muon parameters it
+            is additionally scaled by `0.2 * sqrt(max(rows, cols))`.
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
+        adamw_wd: The weight decay, applied to both the Muon and the AdamW parameters.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If True, print the estimated Newton-Schulz TFLOPs of each distributed step.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        """
+        Register all parameters of `model` and classify them as Muon or AdamW.
+        Args:
+            model: Module whose parameters are optimized.
+            is_muon_func: Callable `(param, name) -> bool` selecting the Muon parameters.
+            lr: Learning rate (stored as the group default `lr`).
+            momentum: Momentum of the Muon update.
+            nesterov: Whether to use Nesterov-style momentum.
+            ns_steps: Number of Newton-Schulz iterations.
+            adamw_wd: Weight decay (stored as the group default `wd`).
+            adamw_betas: Betas for the AdamW fallback.
+            adamw_eps: Epsilon for the AdamW fallback.
+            debug: If True, print FLOP estimates during distributed steps.
+        Raises:
+            RuntimeError: If `torch.distributed` has not been initialized.
+        """
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        # Classify the parameters here as well: `__setstate__` only runs when state is
+        # restored, so without this a freshly built optimizer has no "use_muon" entries
+        # and the first `step()` fails with KeyError.
+        for name, p in model.named_parameters():
+            if is_muon_func(p, name):
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                self.state[p]["use_muon"] = False
+    def __setstate__(self, state):
+        """
+        Restore optimizer state, then re-tag every model parameter as Muon or AdamW.
+        Parameters accepted by `self.is_muon_func(p, name)` must be 2D; they get
+        `use_muon=True` and their shape stored as `orig_shape`. All other parameters
+        get `use_muon=False`.
+        """
+        super().__setstate__(state)
+        for name, p in self.model.named_parameters():
+            if not self.is_muon_func(p, name):
+                # Handled by the AdamW fallback in `step`.
+                self.state[p]["use_muon"] = False
+                continue
+            assert p.ndim == 2, p.ndim
+            self.state[p].update(use_muon=True, orig_shape=p.shape)
+    def _calc_flops(self, G, steps):
+        """
+        Estimate the cost of running `steps` Newton-Schulz iterations on `G`.
+        The estimate depends only on the shape of the 2D tensor `G`; the smaller
+        dimension plays the role of M and the larger one of N.
+        """
+        assert len(G.shape) == 2
+        rows, cols = G.shape
+        short_side = min(rows, cols)
+        long_side = max(rows, cols)
+        # Presumably two M x M x N matmuls, one M x M x M matmul and the elementwise
+        # terms of each iteration; kept exactly as the original polynomial.
+        per_step = (
+            (short_side**3) * 2
+            + (short_side**2 * long_side) * 4
+            + short_side * long_side * 2
+            + short_side**2 * 3
+        )
+        return steps * per_step
+    def adjust_lr_for_muon(self, lr, param_shape):
+        """
+        Scale `lr` by the size of the parameter matrix.
+        Returns `lr * 0.2 * sqrt(max(A, B))`, where `A` and `B` are the first two
+        entries of `param_shape` (the scaling described in the paper, per the
+        original comment).
+        """
+        rows, cols = param_shape[:2]
+        size_ratio = 0.2 * math.sqrt(max(rows, cols))
+        return lr * size_ratio
+    def init_state_and_assign_params(self, params, group):
+        """
+        Create the per-parameter pipeline state and assign a worker rank to each parameter.
+        Parameters whose gradient is None are ignored. The remaining ones are sorted by
+        estimated Newton-Schulz cost (most expensive first) and assigned to the ranks of
+        the 1D device mesh in round-robin order.
+        Args:
+            params: Muon parameters of one param group; each must expose `device_mesh`.
+            group: The param group; only `group["ns_steps"]` is read.
+        Returns:
+            `(param_to_state, ordered_params)`: a dict mapping `id(p)` to its
+            `_muon_state`, and the parameters in processing order.
+        Raises:
+            NotImplementedError: If the device mesh is not 1D.
+            ValueError: If the parameters do not all share one mesh.
+        """
+        param_to_flops = {}
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            param_to_flops[id(p)] = self._calc_flops(g, group["ns_steps"])
+        if self.debug:
+            total_flops = sum(param_to_flops.values())
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        # Only parameters with a FLOP estimate (i.e. with a gradient) take part; sorting
+        # the unfiltered list raised KeyError for parameters without a gradient.
+        ordered_params = sorted(
+            (p for p in params if id(p) in param_to_flops),
+            key=lambda p: param_to_flops[id(p)],
+            reverse=True,
+        )
+        param_to_state = {}
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            state = _muon_state()
+            # `mesh.mesh` holds the ranks of the mesh; take them in turn.
+            state.worker_rank = mesh.mesh[round_robin].item()
+            param_to_state[id(p)] = state
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """
+        Apply the Muon update locally, without any collective communication.
+        For every parameter with a gradient: fold the gradient into the momentum
+        buffer, orthogonalize the result with Newton-Schulz, then apply weight decay
+        and the scaled update in place.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                # Flatten trailing dimensions so the update is a matrix.
+                g = g.view(g.size(0), -1)
+            # Momentum step (same helper as the distributed path).
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            # Weight decay followed by the shape-scaled update.
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """
+        Fold `g` into the momentum buffer of `p` and return the Muon input.
+        The buffer is created as zeros on first use and updated in place as
+        `buf = momentum * buf + g`. With `group["nesterov"]` the return value is
+        `g + momentum * buf` (a new tensor); otherwise it is the buffer itself.
+        """
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"].mul_(momentum).add_(g)
+        return g.add(buf, alpha=momentum) if group["nesterov"] else buf
+    def _update_p(self, p, u, lr, wd):
+        """
+        Apply weight decay and the orthogonalized update `u` to `p` in place.
+        The parameter is first multiplied by `1 - lr * wd`, then `u` is subtracted
+        with the step size returned by `adjust_lr_for_muon(lr, p.shape)`.
+        """
+        step_size = self.adjust_lr_for_muon(lr, p.shape)
+        weights = p.data
+        weights.mul_(1 - lr * wd)
+        weights.add_(u, alpha=-step_size)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        Each parameter's momentum-updated gradient is gathered on one worker rank,
+        orthogonalized there, and scattered back so every rank updates its own shard.
+        Work is issued in chunks of one parameter per mesh rank; the gather of the next
+        chunk is enqueued before the scatter of the current one so communication can
+        overlap with the Newton-Schulz computation.
+        Note: `p.grad` is replaced by the momentum-updated gradient as a side effect.
+        Args:
+            params: Muon parameters of one param group (DTensors).
+            group: The param group (`nesterov` and `ns_steps` are read).
+            lr: Learning rate.
+            wd: Weight decay.
+            momentum: Momentum coefficient.
+        """
+        # Parameters without a gradient take no part in the step. Dropping them up
+        # front keeps the worker assignment and the collectives from touching a None
+        # gradient, and an empty list must not reach `params[0]` below.
+        params = [p for p in params if p.grad is not None]
+        if not params:
+            return
+        for p in params:
+            g = p.grad
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            p.grad = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        # One parameter per mesh rank in every chunk.
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(ordered_params), chunk_size):
+            enqueue_computes(i, chunk_size)
+            # Prefetch the next chunk while the current one is being computed.
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        # The parameter update below runs on the current stream and needs the scatters.
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            # Update p with sharded u
+            self._update_p(
+                p,
+                param_to_state[id(p)].scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Within every param group, parameters tagged `use_muon` are updated by Muon
+        (distributed when they are DTensors, local otherwise) and all remaining
+        parameters by the built-in AdamW. A group without Muon parameters only runs
+        the AdamW part.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The loss returned by `closure`, or None if no closure was given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            # Guard: a group may hold AdamW parameters only, and `params[0]` on an
+            # empty list raised IndexError.
+            if params:
+                if isinstance(params[0].data, DTensor):
+                    self.parallel(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+                else:
+                    self.base(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                # Exponential moving averages of the gradient and its square.
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # Both bias corrections are folded into a single step-size factor.
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch26-cxx98-cu126-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with the op namespace of this build.
+    The result has the form `_optimizer_b4b3752_dirty::<op_name>`.
+    """
+    return "_optimizer_b4b3752_dirty::{}".format(op_name)

build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c3282a321487a6faa532afe43bc1298731983c50e2a1acdff5480ff6e4df34e
+size 1824192

build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5b49ed642e1c320da3932377033ad90031124f4ec24b2d1c95fd976ff28346c
+size 1824184

build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2D tensor G with a quintic Newton-Schulz iteration.
+    The coefficients are chosen to maximize the slope at zero, which keeps the number of
+    steps small at the price of not converging exactly: instead of UV^T (with USV^T = G
+    the SVD) the result is something like US'V^T where S' is diagonal with
+    S_{ii}' ~ Uniform(0.5, 1.5). Empirically this does not hurt model performance.
+    The iteration runs in bfloat16 on the wide orientation of G (it is transposed when it
+    has more rows than columns); the result is returned in G's shape and dtype.
+    """
+    assert len(G.shape) == 2
+    coef_a, coef_b, coef_c = (3.4445, -4.7750, 2.0315)
+    is_tall = G.size(0) > G.size(1)
+    X = G.T if is_tall else G  # no manual typecast
+    # Ensure spectral norm is at most 1, then switch to bfloat16
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    # Perform the NS iterations
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=coef_c, beta=coef_b)
+        # X = a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=coef_a)
+    if is_tall:
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    """
+    Per-parameter scratch state for one distributed Muon step.
+    One instance follows a parameter through the gather -> compute -> scatter pipeline.
+    """
+    # Annotations are quoted so the module also imports on Python < 3.10, where
+    # evaluating `X | None` at class-creation time raises TypeError.
+    # Rank that gathers the full gradient and orthogonalizes it.
+    worker_rank: "int | None" = None
+    # Full gradient assembled by `_gather`; None on non-worker ranks.
+    gathered_grad: "torch.Tensor | None" = None
+    # Output of the Newton-Schulz iteration from `_compute_u`; None on non-worker ranks.
+    computed_u: "torch.Tensor | None" = None
+    # Local shard of the update produced by `_scatter`.
+    scattered_u: "torch.Tensor | None" = None
+    # Event recorded after the gather; `_compute_u` waits on it.
+    gather_event: "torch.cuda.Event | None" = None
+    # Event recorded after the computation; `_scatter` waits on it.
+    compute_event: "torch.cuda.Event | None" = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Gather the sharded gradient of `p` onto the rank assigned to it.
+    Every rank sends its local gradient shard to `state.worker_rank` on `comm_stream`.
+    The worker concatenates the shards along dim 0 into `state.gathered_grad` and
+    records `state.gather_event`; on all other ranks both fields are set to None.
+    Args:
+        p: Parameter whose `grad` exposes `device_mesh` and `to_local()`.
+        state: The `_muon_state` tracking this parameter.
+        rank: Rank of the calling process, compared against `state.worker_rank`.
+        comm_stream: CUDA stream on which the collective is issued.
+    Raises:
+        RuntimeError: On the worker rank if a gathered gradient is already stored.
+    """
+    g = p.grad
+    mesh = g.device_mesh
+    is_worker = rank == state.worker_rank
+    local_grad = g.to_local()
+    with torch.cuda.stream(comm_stream):
+        # Allocate the receive buffers under the comm stream, the same stream on which
+        # `torch.cat` reads them, instead of under whatever stream the caller is on.
+        if is_worker:
+            gather_list = [
+                torch.empty_like(local_grad) for _ in range(mesh.mesh.numel())
+            ]
+        else:
+            gather_list = None
+        torch.distributed.gather(
+            local_grad,
+            dst=state.worker_rank,
+            gather_list=gather_list,
+            group=mesh.get_group(),
+        )
+        if is_worker:
+            if state.gathered_grad is not None:
+                raise RuntimeError(
+                    "Gathered gradient already exists, which should not happen."
+                )
+            # Concatenation along dim 0 assumes the gradient is sharded on dim 0.
+            state.gathered_grad = torch.cat(gather_list, dim=0)
+            state.gather_event = torch.cuda.Event()
+            state.gather_event.record()
+        else:
+            state.gathered_grad = None
+            state.gather_event = None
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Orthogonalize the gathered gradient on the worker rank.
+    On the rank equal to `state.worker_rank`, `compute_stream` waits for
+    `state.gather_event`, `_zeropower_via_newtonschulz5` is run on
+    `state.gathered_grad`, the result is stored in `state.computed_u` and
+    `state.compute_event` is recorded. On every other rank both fields are set to None.
+    Raises:
+        RuntimeError: On the worker rank if no gather event has been recorded.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            # Nothing to compute for this parameter on this rank.
+            state.computed_u = None
+            state.compute_event = None
+            return
+        if state.gather_event is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        # Order the computation after the gather that produced its input.
+        compute_stream.wait_event(state.gather_event)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        finished = torch.cuda.Event()
+        state.compute_event = finished
+        finished.record()
+def _scatter(p, state, rank, comm_stream):
+    """
+    Scatter the orthogonalized update from the worker rank back to every shard.
+    On the worker rank, `comm_stream` waits for `state.compute_event` and
+    `state.computed_u` is split along dim 0 into one chunk per mesh rank. Every rank
+    receives its chunk into a buffer shaped like its local shard of `p`, and the
+    result is wrapped as a DTensor with the placements of `p` in `state.scattered_u`.
+    Args:
+        p: DTensor parameter (its `device_mesh`, `placements` and `to_local()` are used).
+        state: The `_muon_state` tracking this parameter.
+        rank: Rank of the calling process, compared against `state.worker_rank`.
+        comm_stream: CUDA stream on which the collective is issued.
+    Raises:
+        RuntimeError: On the worker rank if no compute event has been recorded.
+    """
+    mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            comm_stream.wait_event(state.compute_event)
+            # Assumes p.size(0) is evenly divisible by the mesh size.
+            rows_per_rank = p.size(0) // mesh.mesh.numel()
+            # For tall matrices the Newton-Schulz result is a transposed view, so its
+            # row chunks are not contiguous; copy them into dense buffers before
+            # handing them to the collective.
+            scatter_list = [
+                chunk.contiguous()
+                for chunk in torch.split(state.computed_u, rows_per_rank, dim=0)
+            ]
+        else:
+            scatter_list = None
+        local_u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            local_u,
+            scatter_list=scatter_list,
+            src=state.worker_rank,
+            group=mesh.get_group(),
+        )
+        state.scattered_u = DTensor.from_local(
+            local_u,
+            placements=p.placements,
+            device_mesh=mesh,
+        )
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module to optimize. All of `model.parameters()` are registered, and
+            `model.named_parameters()` is used to decide which of them use Muon.
+        is_muon_func: Callable `(param, name) -> bool`. Parameters for which it returns True
+            (they must be 2D) are optimized by Muon; all others are optimized by AdamW.
+        lr: The learning rate, shared by Muon and the internal AdamW. For Muon parameters it
+            is additionally scaled by `0.2 * sqrt(max(rows, cols))`.
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
+        adamw_wd: The weight decay, applied to both the Muon and the AdamW parameters.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If True, print the estimated Newton-Schulz TFLOPs of each distributed step.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        """
+        Register all parameters of `model` and classify them as Muon or AdamW.
+        Args:
+            model: Module whose parameters are optimized.
+            is_muon_func: Callable `(param, name) -> bool` selecting the Muon parameters.
+            lr: Learning rate (stored as the group default `lr`).
+            momentum: Momentum of the Muon update.
+            nesterov: Whether to use Nesterov-style momentum.
+            ns_steps: Number of Newton-Schulz iterations.
+            adamw_wd: Weight decay (stored as the group default `wd`).
+            adamw_betas: Betas for the AdamW fallback.
+            adamw_eps: Epsilon for the AdamW fallback.
+            debug: If True, print FLOP estimates during distributed steps.
+        Raises:
+            RuntimeError: If `torch.distributed` has not been initialized.
+        """
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        # Classify the parameters here as well: `__setstate__` only runs when state is
+        # restored, so without this a freshly built optimizer has no "use_muon" entries
+        # and the first `step()` fails with KeyError.
+        for name, p in model.named_parameters():
+            if is_muon_func(p, name):
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                self.state[p]["use_muon"] = False
+    def __setstate__(self, state):
+        """
+        Restore optimizer state, then re-tag every model parameter as Muon or AdamW.
+        Parameters accepted by `self.is_muon_func(p, name)` must be 2D; they get
+        `use_muon=True` and their shape stored as `orig_shape`. All other parameters
+        get `use_muon=False`.
+        """
+        super().__setstate__(state)
+        for name, p in self.model.named_parameters():
+            if not self.is_muon_func(p, name):
+                # Handled by the AdamW fallback in `step`.
+                self.state[p]["use_muon"] = False
+                continue
+            assert p.ndim == 2, p.ndim
+            self.state[p].update(use_muon=True, orig_shape=p.shape)
+    def _calc_flops(self, G, steps):
+        """
+        Estimate the cost of running `steps` Newton-Schulz iterations on `G`.
+        The estimate depends only on the shape of the 2D tensor `G`; the smaller
+        dimension plays the role of M and the larger one of N.
+        """
+        assert len(G.shape) == 2
+        rows, cols = G.shape
+        short_side = min(rows, cols)
+        long_side = max(rows, cols)
+        # Presumably two M x M x N matmuls, one M x M x M matmul and the elementwise
+        # terms of each iteration; kept exactly as the original polynomial.
+        per_step = (
+            (short_side**3) * 2
+            + (short_side**2 * long_side) * 4
+            + short_side * long_side * 2
+            + short_side**2 * 3
+        )
+        return steps * per_step
+    def adjust_lr_for_muon(self, lr, param_shape):
+        """
+        Scale `lr` by the size of the parameter matrix.
+        Returns `lr * 0.2 * sqrt(max(A, B))`, where `A` and `B` are the first two
+        entries of `param_shape` (the scaling described in the paper, per the
+        original comment).
+        """
+        rows, cols = param_shape[:2]
+        size_ratio = 0.2 * math.sqrt(max(rows, cols))
+        return lr * size_ratio
+    def init_state_and_assign_params(self, params, group):
+        """
+        Create the per-parameter pipeline state and assign a worker rank to each parameter.
+        Parameters whose gradient is None are ignored. The remaining ones are sorted by
+        estimated Newton-Schulz cost (most expensive first) and assigned to the ranks of
+        the 1D device mesh in round-robin order.
+        Args:
+            params: Muon parameters of one param group; each must expose `device_mesh`.
+            group: The param group; only `group["ns_steps"]` is read.
+        Returns:
+            `(param_to_state, ordered_params)`: a dict mapping `id(p)` to its
+            `_muon_state`, and the parameters in processing order.
+        Raises:
+            NotImplementedError: If the device mesh is not 1D.
+            ValueError: If the parameters do not all share one mesh.
+        """
+        param_to_flops = {}
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            param_to_flops[id(p)] = self._calc_flops(g, group["ns_steps"])
+        if self.debug:
+            total_flops = sum(param_to_flops.values())
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        # Only parameters with a FLOP estimate (i.e. with a gradient) take part; sorting
+        # the unfiltered list raised KeyError for parameters without a gradient.
+        ordered_params = sorted(
+            (p for p in params if id(p) in param_to_flops),
+            key=lambda p: param_to_flops[id(p)],
+            reverse=True,
+        )
+        param_to_state = {}
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            state = _muon_state()
+            # `mesh.mesh` holds the ranks of the mesh; take them in turn.
+            state.worker_rank = mesh.mesh[round_robin].item()
+            param_to_state[id(p)] = state
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """
+        Apply the Muon update locally, without any collective communication.
+        For every parameter with a gradient: fold the gradient into the momentum
+        buffer, orthogonalize the result with Newton-Schulz, then apply weight decay
+        and the scaled update in place.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                # Flatten trailing dimensions so the update is a matrix.
+                g = g.view(g.size(0), -1)
+            # Momentum step (same helper as the distributed path).
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            # Weight decay followed by the shape-scaled update.
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """
+        Fold `g` into the momentum buffer of `p` and return the Muon input.
+        The buffer is created as zeros on first use and updated in place as
+        `buf = momentum * buf + g`. With `group["nesterov"]` the return value is
+        `g + momentum * buf` (a new tensor); otherwise it is the buffer itself.
+        """
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"].mul_(momentum).add_(g)
+        return g.add(buf, alpha=momentum) if group["nesterov"] else buf
+    def _update_p(self, p, u, lr, wd):
+        """
+        Apply weight decay and the orthogonalized update `u` to `p` in place.
+        The parameter is first multiplied by `1 - lr * wd`, then `u` is subtracted
+        with the step size returned by `adjust_lr_for_muon(lr, p.shape)`.
+        """
+        step_size = self.adjust_lr_for_muon(lr, p.shape)
+        weights = p.data
+        weights.mul_(1 - lr * wd)
+        weights.add_(u, alpha=-step_size)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        Each parameter's momentum-updated gradient is gathered on one worker rank,
+        orthogonalized there, and scattered back so every rank updates its own shard.
+        Work is issued in chunks of one parameter per mesh rank; the gather of the next
+        chunk is enqueued before the scatter of the current one so communication can
+        overlap with the Newton-Schulz computation.
+        Note: `p.grad` is replaced by the momentum-updated gradient as a side effect.
+        Args:
+            params: Muon parameters of one param group (DTensors).
+            group: The param group (`nesterov` and `ns_steps` are read).
+            lr: Learning rate.
+            wd: Weight decay.
+            momentum: Momentum coefficient.
+        """
+        # Parameters without a gradient take no part in the step. Dropping them up
+        # front keeps the worker assignment and the collectives from touching a None
+        # gradient, and an empty list must not reach `params[0]` below.
+        params = [p for p in params if p.grad is not None]
+        if not params:
+            return
+        for p in params:
+            g = p.grad
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank
+            p.grad = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        # One parameter per mesh rank in every chunk.
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(ordered_params), chunk_size):
+            enqueue_computes(i, chunk_size)
+            # Prefetch the next chunk while the current one is being computed.
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        # The parameter update below runs on the current stream and needs the scatters.
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in params:
+            # Update p with sharded u
+            self._update_p(
+                p,
+                param_to_state[id(p)].scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Within every param group, parameters tagged `use_muon` are updated by Muon
+        (distributed when they are DTensors, local otherwise) and all remaining
+        parameters by the built-in AdamW. A group without Muon parameters only runs
+        the AdamW part.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The loss returned by `closure`, or None if no closure was given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            # Guard: a group may hold AdamW parameters only, and `params[0]` on an
+            # empty list raised IndexError.
+            if params:
+                if isinstance(params[0].data, DTensor):
+                    self.parallel(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+                else:
+                    self.base(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                # Exponential moving averages of the gradient and its square.
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # Both bias corrections are folded into a single step-size factor.
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch27-cxx11-cu118-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return `op_name` qualified with the op namespace of this build.
+    The result has the form `_optimizer_b4b3752_dirty::<op_name>`.
+    """
+    return "_optimizer_b4b3752_dirty::{}".format(op_name)

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de82486a39ded94bfe7eeaa862459944a93e284fd0d919329979bb67db3c367f
+size 1787376

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ac9027c4a93801e9f19f1e9e94a9ed33b27e92c72797053c3de55e2a6fbb41d
+size 1787368

build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2D tensor G with a quintic Newton-Schulz iteration.
+    The coefficients are chosen to maximize the slope at zero, which keeps the number of
+    steps small at the price of not converging exactly: instead of UV^T (with USV^T = G
+    the SVD) the result is something like US'V^T where S' is diagonal with
+    S_{ii}' ~ Uniform(0.5, 1.5). Empirically this does not hurt model performance.
+    The iteration runs in bfloat16 on the wide orientation of G (it is transposed when it
+    has more rows than columns); the result is returned in G's shape and dtype.
+    """
+    assert len(G.shape) == 2
+    coef_a, coef_b, coef_c = (3.4445, -4.7750, 2.0315)
+    is_tall = G.size(0) > G.size(1)
+    X = G.T if is_tall else G  # no manual typecast
+    # Ensure spectral norm is at most 1, then switch to bfloat16
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    # Perform the NS iterations
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=coef_c, beta=coef_b)
+        # X = a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=coef_a)
+    if is_tall:
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    # Per-parameter scratch record for one parallel Muon step; it is filled in stage by
+    # stage by `_gather`, `_compute_u` and `_scatter`.
+    # TODO: use Optional
+    # Rank assigned to run the Newton-Schulz computation for this parameter.
+    worker_rank: int | None = None
+    # Concatenation of the gathered gradient shards; set by `_gather` on the worker rank,
+    # None on every other rank.
+    gathered_grad: torch.Tensor | None = None
+    # Result of `_zeropower_via_newtonschulz5` on the worker rank; None on every other rank.
+    computed_u: torch.Tensor | None = None
+    # Update shard received by `_scatter`, wrapped as a DTensor.
+    scattered_u: torch.Tensor | None = None
+    # Event recorded by `_gather` on the worker rank; `_compute_u` waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # Event recorded by `_compute_u` on the worker rank; `_scatter` waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Collect the local shards of ``p.grad`` from every rank of its mesh onto the worker rank.
+    The collective is issued on ``comm_stream``. On the worker rank the shards are
+    concatenated along dim 0 into ``state.gathered_grad`` and ``state.gather_event`` is
+    recorded; on every other rank both fields are reset to None.
+    Args:
+        p: Parameter whose ``grad`` exposes ``device_mesh`` and ``to_local()``.
+        state: The ``_muon_state`` of ``p``; ``state.worker_rank`` selects the destination.
+        rank: Rank of the calling process.
+        comm_stream: CUDA stream on which the communication is enqueued.
+    Raises:
+        RuntimeError: if the worker already holds a gathered gradient for ``p``.
+    """
+    grad = p.grad
+    device_mesh = grad.device_mesh
+    is_worker = rank == state.worker_rank
+    # Receive buffers are only needed on the destination rank; they are allocated before
+    # entering the communication stream context, exactly one per mesh member.
+    if is_worker:
+        shards = [
+            torch.empty_like(grad.to_local()) for _ in range(device_mesh.mesh.numel())
+        ]
+    else:
+        shards = None
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            grad.to_local(),
+            dst=state.worker_rank,
+            gather_list=shards,
+            group=device_mesh.get_group(),
+        )
+        if not is_worker:
+            state.gathered_grad = None
+            state.gather_event = None
+            return
+        # TODO: Consider ,,,
+        if state.gathered_grad is not None:
+            raise RuntimeError(
+                "Gather event already exists, which should not happen."
+            )
+        state.gathered_grad = torch.cat(shards, dim=0)
+        # Recorded inside the stream context so that consumers can wait for the gather.
+        state.gather_event = torch.cuda.Event()
+        state.gather_event.record()
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Run the Newton-Schulz orthogonalization of the gathered gradient on the worker rank.
+    The work is enqueued on ``compute_stream`` after waiting for ``state.gather_event``.
+    The worker stores the result in ``state.computed_u`` and records
+    ``state.compute_event``; every other rank resets both fields to None.
+    Args:
+        state: The ``_muon_state`` populated by ``_gather``.
+        steps: Number of Newton-Schulz iterations.
+        rank: Rank of the calling process.
+        compute_stream: CUDA stream on which the computation is enqueued.
+    Raises:
+        RuntimeError: if the worker has no gather event to wait on.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            state.computed_u = None
+            state.compute_event = None
+            return
+        if state.gather_event is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        # The gathered gradient is produced on another stream; wait for it first.
+        compute_stream.wait_event(state.gather_event)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        state.compute_event = torch.cuda.Event()
+        state.compute_event.record()
+def _scatter(p, state, rank, comm_stream):
+    """
+    Send the rows of the computed update from the worker rank back to every mesh member.
+    The worker waits for ``state.compute_event``, splits ``state.computed_u`` along dim 0
+    into pieces of ``p.size(0) // mesh size`` rows and scatters them on ``comm_stream``.
+    Every rank wraps the piece it receives as a DTensor with the placements of ``p`` and
+    stores it in ``state.scattered_u``.
+    Args:
+        p: DTensor parameter the update belongs to.
+        state: The ``_muon_state`` populated by ``_compute_u``.
+        rank: Rank of the calling process.
+        comm_stream: CUDA stream on which the communication is enqueued.
+    Raises:
+        RuntimeError: if the worker has no compute event to wait on.
+    """
+    full_u = state.computed_u
+    device_mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            # The update is produced on another stream; wait for it before sending.
+            comm_stream.wait_event(state.compute_event)
+            rows_per_rank = p.size(0) // device_mesh.mesh.numel()
+            pieces = list(torch.split(full_u, rows_per_rank, dim=0))
+        else:
+            pieces = None
+        local_u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            local_u,
+            scatter_list=pieces,
+            src=state.worker_rank,
+            group=device_mesh.get_group(),
+        )
+        state.scattered_u = DTensor.from_local(
+            local_u,
+            placements=p.placements,
+            device_mesh=device_mesh,
+        )
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Parameters for which `is_muon_func(param, name)` is true are updated by Muon; every other
+    parameter of `model` is updated by an internal AdamW that shares `lr` and the weight decay.
+    DTensor parameters take the distributed path (`parallel`): each gradient is gathered onto
+    one rank of a 1D device mesh, orthogonalized there and scattered back. All other
+    parameters take the local path (`base`).
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module whose parameters are optimized.
+        is_muon_func: Callable `(param, name) -> bool` selecting the parameters optimized by
+            Muon. Selected parameters must be 2D.
+        lr: The learning rate. AdamW uses it as-is; Muon rescales it per matrix, see
+            `adjust_lr_for_muon`.
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run.
+        adamw_wd: The weight decay, applied to both the Muon and the AdamW parameters.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If true, print the estimated Newton-Schulz TFLOPs on every parallel step.
+    Raises:
+        RuntimeError: if `torch.distributed` has not been initialized.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        # Optimizer.__init__ never calls __setstate__, so the Muon/AdamW classification has
+        # to be done here as well; otherwise step() fails on the missing "use_muon" entry.
+        self._init_state()
+    def __setstate__(self, state):
+        """Restore the optimizer state, then re-tag every parameter as Muon or AdamW."""
+        super().__setstate__(state)
+        self._init_state()
+    def _init_state(self):
+        """Record in `self.state` whether each model parameter is updated by Muon or AdamW."""
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Only 2D parameters can be orthogonalized.
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Everything else is handled by the AdamW fallback.
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        """Estimate the cost of `steps` Newton-Schulz iterations on the 2D tensor `G`."""
+        assert len(G.shape) == 2
+        M, N = G.shape
+        # The iteration runs on the wide orientation, so M is the smaller dimension.
+        if M > N:
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        """Return `lr` scaled by 0.2 * sqrt(max(rows, cols)) of the parameter matrix."""
+        A, B = param_shape[:2]
+        # We adjust the learning rate based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        """
+        Build the per-step state of the parallel path and assign a worker rank to each parameter.
+        Parameters without a gradient are ignored. The remaining ones are ordered by
+        decreasing estimated Newton-Schulz cost and assigned to the ranks of the device
+        mesh in round-robin order.
+        Args:
+            params: The Muon parameters of one param group.
+            group: The param group; `group["ns_steps"]` is used for the cost estimate.
+        Returns:
+            `(param_to_state, ordered_params)`, where `param_to_state` maps `id(p)` to a
+            `_muon_state` and `ordered_params` lists the parameters that have a gradient,
+            most expensive first.
+        Raises:
+            NotImplementedError: if the device mesh is not 1D.
+            ValueError: if the parameters do not all live on the same mesh.
+        """
+        # Parameters that received no gradient take no part in this step; leaving them in
+        # would make the cost lookup below fail.
+        with_grad = [p for p in params if p.grad is not None]
+        param_to_flops = {}
+        total_flops = 0
+        for p in with_grad:
+            g = p.grad
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        ordered_params = sorted(
+            with_grad, key=lambda p: param_to_flops[id(p)], reverse=True
+        )
+        param_to_state = {}
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """
+        Perform the Muon update locally, one parameter at a time (non-DTensor path).
+        Args:
+            params: The Muon parameters of one param group.
+            group: The param group (`nesterov` and `ns_steps` are read from it).
+            lr: Learning rate before the per-matrix adjustment.
+            wd: Weight decay.
+            momentum: Momentum coefficient.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """Fold `g` into the momentum buffer of `p` and return the momentum-applied gradient."""
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"]
+        buf.mul_(momentum).add_(g)
+        if group["nesterov"]:
+            g = g.add(buf, alpha=momentum)
+        else:
+            g = buf
+        return g
+    def _update_p(self, p, u, lr, wd):
+        """Apply weight decay (with the unadjusted `lr`) and then the update `u` to `p` in place."""
+        # scale update
+        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+        # apply weight decay
+        p.data.mul_(1 - lr * wd)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        Each gradient is gathered onto its worker rank, orthogonalized there and scattered
+        back, with communication and computation enqueued on separate CUDA streams.
+        Args:
+            params: The Muon parameters (DTensors) of one param group.
+            group: The param group (`nesterov` and `ns_steps` are read from it).
+            lr: Learning rate before the per-matrix adjustment.
+            wd: Weight decay.
+            momentum: Momentum coefficient.
+        """
+        if not params:
+            return
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank. The result is written back to p.grad because
+            # _gather reads the gradient from there.
+            p.grad = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        # One chunk holds as many parameters as the mesh has ranks; with the round-robin
+        # assignment this gives every rank one matrix per chunk.
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(ordered_params), chunk_size):
+            # The gathers of the next chunk are enqueued before the scatters of this one.
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in ordered_params:
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The value returned by `closure`, or None when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            # A group may contain no Muon parameters at all; there is then nothing to
+            # dispatch and params[0] must not be touched.
+            if params:
+                if isinstance(params[0].data, DTensor):
+                    self.parallel(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+                else:
+                    self.base(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            lr = group["lr"]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # Both bias corrections are folded into a single step-size factor.
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss

build/torch27-cxx11-cu126-x86_64-linux/optimizer/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .muon import Muon
+__all__ = [
+    "Muon",
+]

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _optimizer_b4b3752_dirty
+ops = torch.ops._optimizer_b4b3752_dirty
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+    The result has the form ``"<namespace>::<op_name>"``, where the namespace is the
+    name of the compiled extension module that this file imports.
+    """
+    namespace = "_optimizer_b4b3752_dirty"
+    return f"{namespace}::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb02d3818a89c819a5a12d066ce56da0ebc4f3da491cb045ae380c5b9319e592
+size 1824256

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b425a7fd854402508da5af17fa88f305753a09474686d6ec7afe540b3c5c082e
+size 1824256

build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py ADDED Viewed

	@@ -0,0 +1,458 @@

+import math
+from dataclasses import dataclass
+import torch
+import torch.distributed as dist
+from torch.distributed._tensor import DTensor
+# TODO leave original url and consider LICENSE
+# This code snippet is a modified version adapted from the following GitHub repository:
+# https://github.com/KellerJordan/Muon/blob/master/muon.py
+def _zeropower_via_newtonschulz5(G, steps):
+    """
+    Approximately orthogonalize the 2D matrix ``G`` with a quintic Newton-Schulz iteration.
+    Every step applies X <- a*X + (b*A + c*A@A) @ X with A = X @ X.T. The coefficients are
+    chosen to maximize the slope at zero rather than to converge exactly, so the result is
+    not UV^T but something like US'V^T (USV^T = G being the SVD) where S' is diagonal with
+    entries roughly in [0.5, 1.5]; empirically this does not hurt model performance.
+    Args:
+        G: 2D tensor to orthogonalize.
+        steps: Number of Newton-Schulz iterations to run.
+    Returns:
+        A tensor with the same shape and dtype as ``G``.
+    """
+    assert len(G.shape) == 2
+    a, b, c = (3.4445, -4.7750, 2.0315)
+    # Work on the wide orientation so that X @ X.T is the smaller Gram matrix.
+    tall = G.size(0) > G.size(1)
+    X = G.T if tall else G
+    # Dividing by the (Frobenius) norm bounds the spectral norm by 1; the iterations
+    # themselves run in bfloat16.
+    X = (X / (X.norm() + 1e-7)).bfloat16()
+    for _ in range(steps):
+        gram = X @ X.T
+        # poly = b * gram + c * gram @ gram
+        # (adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng)
+        poly = torch.addmm(gram, gram, gram, alpha=c, beta=b)
+        # X = a * X + poly @ X
+        X = torch.addmm(X, poly, X, alpha=1.0, beta=a)
+    if tall:
+        X = X.T
+    return X.to(G.dtype)
+@dataclass
+class _muon_state:
+    # Per-parameter scratch record for one parallel Muon step; it is filled in stage by
+    # stage by `_gather`, `_compute_u` and `_scatter`.
+    # TODO: use Optional
+    # Rank assigned to run the Newton-Schulz computation for this parameter.
+    worker_rank: int | None = None
+    # Concatenation of the gathered gradient shards; set by `_gather` on the worker rank,
+    # None on every other rank.
+    gathered_grad: torch.Tensor | None = None
+    # Result of `_zeropower_via_newtonschulz5` on the worker rank; None on every other rank.
+    computed_u: torch.Tensor | None = None
+    # Update shard received by `_scatter`, wrapped as a DTensor.
+    scattered_u: torch.Tensor | None = None
+    # Event recorded by `_gather` on the worker rank; `_compute_u` waits on it.
+    gather_event: torch.cuda.Event | None = None
+    # Event recorded by `_compute_u` on the worker rank; `_scatter` waits on it.
+    compute_event: torch.cuda.Event | None = None
+def _gather(p, state, rank, comm_stream):
+    """
+    Collect the local shards of ``p.grad`` from every rank of its mesh onto the worker rank.
+    The collective is issued on ``comm_stream``. On the worker rank the shards are
+    concatenated along dim 0 into ``state.gathered_grad`` and ``state.gather_event`` is
+    recorded; on every other rank both fields are reset to None.
+    Args:
+        p: Parameter whose ``grad`` exposes ``device_mesh`` and ``to_local()``.
+        state: The ``_muon_state`` of ``p``; ``state.worker_rank`` selects the destination.
+        rank: Rank of the calling process.
+        comm_stream: CUDA stream on which the communication is enqueued.
+    Raises:
+        RuntimeError: if the worker already holds a gathered gradient for ``p``.
+    """
+    grad = p.grad
+    device_mesh = grad.device_mesh
+    is_worker = rank == state.worker_rank
+    # Receive buffers are only needed on the destination rank; they are allocated before
+    # entering the communication stream context, exactly one per mesh member.
+    if is_worker:
+        shards = [
+            torch.empty_like(grad.to_local()) for _ in range(device_mesh.mesh.numel())
+        ]
+    else:
+        shards = None
+    with torch.cuda.stream(comm_stream):
+        torch.distributed.gather(
+            grad.to_local(),
+            dst=state.worker_rank,
+            gather_list=shards,
+            group=device_mesh.get_group(),
+        )
+        if not is_worker:
+            state.gathered_grad = None
+            state.gather_event = None
+            return
+        # TODO: Consider ,,,
+        if state.gathered_grad is not None:
+            raise RuntimeError(
+                "Gather event already exists, which should not happen."
+            )
+        state.gathered_grad = torch.cat(shards, dim=0)
+        # Recorded inside the stream context so that consumers can wait for the gather.
+        state.gather_event = torch.cuda.Event()
+        state.gather_event.record()
+def _compute_u(state, steps, rank, compute_stream):
+    """
+    Run the Newton-Schulz orthogonalization of the gathered gradient on the worker rank.
+    The work is enqueued on ``compute_stream`` after waiting for ``state.gather_event``.
+    The worker stores the result in ``state.computed_u`` and records
+    ``state.compute_event``; every other rank resets both fields to None.
+    Args:
+        state: The ``_muon_state`` populated by ``_gather``.
+        steps: Number of Newton-Schulz iterations.
+        rank: Rank of the calling process.
+        compute_stream: CUDA stream on which the computation is enqueued.
+    Raises:
+        RuntimeError: if the worker has no gather event to wait on.
+    """
+    with torch.cuda.stream(compute_stream):
+        if rank != state.worker_rank:
+            state.computed_u = None
+            state.compute_event = None
+            return
+        if state.gather_event is None:
+            raise RuntimeError("Gather event must be set before compute.")
+        # The gathered gradient is produced on another stream; wait for it first.
+        compute_stream.wait_event(state.gather_event)
+        state.computed_u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+        state.compute_event = torch.cuda.Event()
+        state.compute_event.record()
+def _scatter(p, state, rank, comm_stream):
+    """
+    Send the rows of the computed update from the worker rank back to every mesh member.
+    The worker waits for ``state.compute_event``, splits ``state.computed_u`` along dim 0
+    into pieces of ``p.size(0) // mesh size`` rows and scatters them on ``comm_stream``.
+    Every rank wraps the piece it receives as a DTensor with the placements of ``p`` and
+    stores it in ``state.scattered_u``.
+    Args:
+        p: DTensor parameter the update belongs to.
+        state: The ``_muon_state`` populated by ``_compute_u``.
+        rank: Rank of the calling process.
+        comm_stream: CUDA stream on which the communication is enqueued.
+    Raises:
+        RuntimeError: if the worker has no compute event to wait on.
+    """
+    full_u = state.computed_u
+    device_mesh = p.device_mesh
+    with torch.cuda.stream(comm_stream):
+        if rank == state.worker_rank:
+            if state.compute_event is None:
+                raise RuntimeError("Compute event must be set before scatter.")
+            # The update is produced on another stream; wait for it before sending.
+            comm_stream.wait_event(state.compute_event)
+            rows_per_rank = p.size(0) // device_mesh.mesh.numel()
+            pieces = list(torch.split(full_u, rows_per_rank, dim=0))
+        else:
+            pieces = None
+        local_u = torch.empty_like(p.to_local())
+        torch.distributed.scatter(
+            local_u,
+            scatter_list=pieces,
+            src=state.worker_rank,
+            group=device_mesh.get_group(),
+        )
+        state.scattered_u = DTensor.from_local(
+            local_u,
+            placements=p.placements,
+            device_mesh=device_mesh,
+        )
+class Muon(torch.optim.Optimizer):
+    """
+    Muon - MomentUm Orthogonalized by Newton-schulz
+    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
+    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
+    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
+    the advantage that it can be stably run in bfloat16 on the GPU.
+    Parameters for which `is_muon_func(param, name)` is true are updated by Muon; every other
+    parameter of `model` is updated by an internal AdamW that shares `lr` and the weight decay.
+    DTensor parameters take the distributed path (`parallel`): each gradient is gathered onto
+    one rank of a 1D device mesh, orthogonalized there and scattered back. All other
+    parameters take the local path (`base`).
+    Some warnings:
+    - We believe this optimizer is unlikely to work well for training with small batch size.
+    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
+    Arguments:
+        model: The module whose parameters are optimized.
+        is_muon_func: Callable `(param, name) -> bool` selecting the parameters optimized by
+            Muon. Selected parameters must be 2D.
+        lr: The learning rate. AdamW uses it as-is; Muon rescales it per matrix, see
+            `adjust_lr_for_muon`.
+        momentum: The momentum used by the internal SGD. (0.95 is a good default)
+        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
+        ns_steps: The number of Newton-Schulz iterations to run.
+        adamw_wd: The weight decay, applied to both the Muon and the AdamW parameters.
+        adamw_betas: The betas for the internal AdamW.
+        adamw_eps: The epsilon for the internal AdamW.
+        debug: If true, print the estimated Newton-Schulz TFLOPs on every parallel step.
+    Raises:
+        RuntimeError: if `torch.distributed` has not been initialized.
+    """
+    def __init__(
+        self,
+        model,
+        is_muon_func,
+        lr=1e-3,
+        momentum=0.95,
+        nesterov=True,
+        ns_steps=5,
+        adamw_wd=0.1,
+        adamw_betas=(0.9, 0.95),
+        adamw_eps=1e-8,
+        debug=False,
+    ):
+        defaults = dict(
+            lr=lr,
+            wd=adamw_wd,
+            momentum=momentum,
+            nesterov=nesterov,
+            ns_steps=ns_steps,
+            adamw_betas=adamw_betas,
+            adamw_eps=adamw_eps,
+        )
+        super().__init__(model.parameters(), defaults)
+        self.is_muon_func = is_muon_func
+        self.model = model
+        if not dist.is_initialized():
+            raise RuntimeError(
+                "Muon optimizer requires distributed training to be initialized."
+            )
+        self.rank = dist.get_rank()
+        self.comm_stream = torch.cuda.Stream()
+        self.compute_stream = torch.cuda.Stream()
+        self.debug = debug
+        # Optimizer.__init__ never calls __setstate__, so the Muon/AdamW classification has
+        # to be done here as well; otherwise step() fails on the missing "use_muon" entry.
+        self._init_state()
+    def __setstate__(self, state):
+        """Restore the optimizer state, then re-tag every parameter as Muon or AdamW."""
+        super().__setstate__(state)
+        self._init_state()
+    def _init_state(self):
+        """Record in `self.state` whether each model parameter is updated by Muon or AdamW."""
+        for name, p in self.model.named_parameters():
+            if self.is_muon_func(p, name):
+                # Only 2D parameters can be orthogonalized.
+                assert p.ndim == 2, p.ndim
+                self.state[p]["use_muon"] = True
+                self.state[p]["orig_shape"] = p.shape
+            else:
+                # Everything else is handled by the AdamW fallback.
+                self.state[p]["use_muon"] = False
+    def _calc_flops(self, G, steps):
+        """Estimate the cost of `steps` Newton-Schulz iterations on the 2D tensor `G`."""
+        assert len(G.shape) == 2
+        M, N = G.shape
+        # The iteration runs on the wide orientation, so M is the smaller dimension.
+        if M > N:
+            M, N = N, M
+        return steps * ((M**3) * 2 + (M**2 * N) * 4 + M * N * 2 + M**2 * 3)
+    def adjust_lr_for_muon(self, lr, param_shape):
+        """Return `lr` scaled by 0.2 * sqrt(max(rows, cols)) of the parameter matrix."""
+        A, B = param_shape[:2]
+        # We adjust the learning rate based on the size of the parameter matrix
+        # as described in the paper
+        adjusted_ratio = 0.2 * math.sqrt(max(A, B))
+        adjusted_lr = lr * adjusted_ratio
+        return adjusted_lr
+    def init_state_and_assign_params(self, params, group):
+        """
+        Build the per-step state of the parallel path and assign a worker rank to each parameter.
+        Parameters without a gradient are ignored. The remaining ones are ordered by
+        decreasing estimated Newton-Schulz cost and assigned to the ranks of the device
+        mesh in round-robin order.
+        Args:
+            params: The Muon parameters of one param group.
+            group: The param group; `group["ns_steps"]` is used for the cost estimate.
+        Returns:
+            `(param_to_state, ordered_params)`, where `param_to_state` maps `id(p)` to a
+            `_muon_state` and `ordered_params` lists the parameters that have a gradient,
+            most expensive first.
+        Raises:
+            NotImplementedError: if the device mesh is not 1D.
+            ValueError: if the parameters do not all live on the same mesh.
+        """
+        # Parameters that received no gradient take no part in this step; leaving them in
+        # would make the cost lookup below fail.
+        with_grad = [p for p in params if p.grad is not None]
+        param_to_flops = {}
+        total_flops = 0
+        for p in with_grad:
+            g = p.grad
+            assert g.ndim == 2, "Muon only supports 2D parameters."
+            flops = self._calc_flops(g, group["ns_steps"])
+            param_to_flops[id(p)] = flops
+            total_flops += flops
+        if self.debug:
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+        ordered_params = sorted(
+            with_grad, key=lambda p: param_to_flops[id(p)], reverse=True
+        )
+        param_to_state = {}
+        round_robin = 0
+        mesh = None
+        for p in ordered_params:
+            if mesh is None:
+                mesh = p.device_mesh
+                if mesh.ndim != 1:
+                    raise NotImplementedError(
+                        "Muon requires a 1D mesh for distributed training yet."
+                    )
+            elif mesh != p.device_mesh:
+                raise ValueError("All parameters must be on the same mesh.")
+            param_to_state[id(p)] = _muon_state()
+            param_to_state[id(p)].worker_rank = mesh.mesh[round_robin].item()
+            round_robin = (round_robin + 1) % mesh.mesh.numel()
+        return param_to_state, ordered_params
+    def base(self, params, group, lr, wd, momentum):
+        """
+        Perform the Muon update locally, one parameter at a time (non-DTensor path).
+        Args:
+            params: The Muon parameters of one param group.
+            group: The param group (`nesterov` and `ns_steps` are read from it).
+            lr: Learning rate before the per-matrix adjustment.
+            wd: Weight decay.
+            momentum: Momentum coefficient.
+        """
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            g = self._update_g(p, g, group, momentum=momentum)
+            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            self._update_p(p, u, lr=lr, wd=wd)
+    def _update_g(self, p, g, group, momentum):
+        """Fold `g` into the momentum buffer of `p` and return the momentum-applied gradient."""
+        state = self.state[p]
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros_like(g)
+        buf = state["momentum_buffer"]
+        buf.mul_(momentum).add_(g)
+        if group["nesterov"]:
+            g = g.add(buf, alpha=momentum)
+        else:
+            g = buf
+        return g
+    def _update_p(self, p, u, lr, wd):
+        """Apply weight decay (with the unadjusted `lr`) and then the update `u` to `p` in place."""
+        # scale update
+        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+        # apply weight decay
+        p.data.mul_(1 - lr * wd)
+        # apply update
+        p.data.add_(u, alpha=-adjusted_lr)
+    def parallel(self, params, group, lr, wd, momentum):
+        """
+        Perform a parallel optimization step using Muon.
+        Each gradient is gathered onto its worker rank, orthogonalized there and scattered
+        back, with communication and computation enqueued on separate CUDA streams.
+        Args:
+            params: The Muon parameters (DTensors) of one param group.
+            group: The param group (`nesterov` and `ns_steps` are read from it).
+            lr: Learning rate before the per-matrix adjustment.
+            wd: Weight decay.
+            momentum: Momentum coefficient.
+        """
+        if not params:
+            return
+        for p in params:
+            g = p.grad
+            if g is None:
+                continue
+            if g.ndim > 2:
+                g = g.view(g.size(0), -1)
+            # Update g in the local rank. The result is written back to p.grad because
+            # _gather reads the gradient from there.
+            p.grad = self._update_g(
+                p,
+                g,
+                group,
+                momentum=momentum,
+            )
+        param_to_state, ordered_params = self.init_state_and_assign_params(
+            params, group
+        )
+        def enqueue_gathers(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _gather(p, state, self.rank, self.comm_stream)
+        def enqueue_computes(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+        def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx : start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+        # One chunk holds as many parameters as the mesh has ranks; with the round-robin
+        # assignment this gives every rank one matrix per chunk.
+        chunk_size = params[0].device_mesh.mesh.numel()
+        # Wait grad update
+        self.comm_stream.wait_stream(torch.cuda.current_stream())
+        enqueue_gathers(0, chunk_size)
+        for i in range(0, len(ordered_params), chunk_size):
+            # The gathers of the next chunk are enqueued before the scatters of this one.
+            enqueue_computes(i, chunk_size)
+            enqueue_gathers(i + chunk_size, chunk_size)
+            enqueue_scatters(i, chunk_size)
+        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        for p in ordered_params:
+            # Update p with sharded u
+            state = param_to_state[id(p)]
+            self._update_p(
+                p,
+                state.scattered_u,
+                lr=lr,
+                wd=wd,
+            )
+    def step(self, closure=None):
+        """Perform a single optimization step.
+        Args:
+            closure (Callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        Returns:
+            The value returned by `closure`, or None when no closure is given.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            ############################
+            #           Muon           #
+            ############################
+            params = [p for p in group["params"] if self.state[p]["use_muon"]]
+            lr = group["lr"]
+            wd = group["wd"]
+            momentum = group["momentum"]
+            # A group may contain no Muon parameters at all; there is then nothing to
+            # dispatch and params[0] must not be touched.
+            if params:
+                if isinstance(params[0].data, DTensor):
+                    self.parallel(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+                else:
+                    self.base(
+                        params,
+                        group,
+                        lr=lr,
+                        wd=wd,
+                        momentum=momentum,
+                    )
+            ############################
+            #       AdamW backup       #
+            ############################
+            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            lr = group["lr"]
+            beta1, beta2 = group["adamw_betas"]
+            eps = group["adamw_eps"]
+            weight_decay = group["wd"]
+            for p in params:
+                g = p.grad
+                if g is None:
+                    continue
+                state = self.state[p]
+                if "step" not in state:
+                    state["step"] = 0
+                    state["moment1"] = torch.zeros_like(g)
+                    state["moment2"] = torch.zeros_like(g)
+                state["step"] += 1
+                step = state["step"]
+                buf1 = state["moment1"]
+                buf2 = state["moment2"]
+                buf1.lerp_(g, 1 - beta1)
+                buf2.lerp_(g.square(), 1 - beta2)
+                g = buf1 / (eps + buf2.sqrt())
+                # Both bias corrections are folded into a single step-size factor.
+                bias_correction1 = 1 - beta1**step
+                bias_correction2 = 1 - beta2**step
+                scale = bias_correction1 / bias_correction2**0.5
+                p.data.mul_(1 - lr * weight_decay)
+                p.data.add_(g, alpha=-lr / scale)
+        return loss