iamwyldecat committed
Commit 1f13dae · 1 Parent(s): 02ac540

fix(muon): handle un-distributed env

Files changed (36)
  1. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  3. build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +15 -7
  4. build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  5. build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  6. build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +15 -7
  7. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  8. build/torch26-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  9. build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +15 -7
  10. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +3 -3
  11. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  12. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +15 -7
  13. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  15. build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +15 -7
  16. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  17. build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  18. build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +15 -7
  19. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  20. build/torch26-cxx98-cu126-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  21. build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +15 -7
  22. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  23. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  24. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +15 -7
  25. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  26. build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  27. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +15 -7
  28. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  29. build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  30. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +15 -7
  31. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc +0 -0
  32. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc +0 -0
  33. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  34. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} +1 -1
  35. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +15 -7
  36. torch-ext/optimizer/muon.py +15 -7
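
The substantive change lives in muon.py (torch-ext/optimizer/muon.py is the source file; the build/ directories appear to carry per-torch/CUDA vendored copies of it, so the identical patch repeats below). The constructor no longer refuses to run without an initialized process group, self.rank falls back to None, the per-parameter Muon flags are built by a new _init_state() helper that step() can re-run lazily, and only the parallel (DTensor) path still requires torch.distributed. The following is a minimal, hypothetical usage sketch of what this enables; the import path and the constructor signature (params plus model, is_muon_func, lr keywords) are assumptions inferred from the attributes set in __init__, not confirmed by this diff, and a CUDA device is assumed because the optimizer allocates CUDA streams.

    # Hypothetical sketch: running Muon in a plain single-process job,
    # without torch.distributed.init_process_group().
    # ASSUMPTIONS (not confirmed by this diff): the import path `optimizer`,
    # the constructor signature Muon(params, model=..., is_muon_func=..., lr=...),
    # and an available CUDA device.
    import torch
    import torch.nn as nn
    from optimizer import Muon  # assumed package/export name for this kernel

    def is_muon_func(param: torch.Tensor, name: str) -> bool:
        # Assumed routing rule: plain 2-D weight matrices go to Muon,
        # everything else (biases, norms, embeddings, heads) to the AdamW path.
        return param.ndim == 2 and "embed" not in name and "head" not in name

    model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64)).cuda()
    opt = Muon(model.parameters(), model=model, is_muon_func=is_muon_func, lr=2e-2)

    loss = model(torch.randn(8, 64, device="cuda")).sum()
    loss.backward()
    opt.step()   # with this commit: no RuntimeError; opt.rank is simply None
    opt.zero_grad()

If any Muon-managed parameter is a DTensor, step() still raises the new RuntimeError unless a process group has been initialized first; sharded multi-device runs are unaffected by this relaxation.
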
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f1f5df341112d93e43c0801e285abd66e79bfbe399d228f8be09ff26ece7421b
+oid sha256:829533f24bccb220101238fcbafa1343d2ec3ba3922a91a836b8a05813b44672
 size 1787272
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2921aa2aa2587e261dc9ca4e5f60303b0d1c9a305d1584918a8c56b6dc79ebfb
+oid sha256:880f65ca04a52278892cdcb40dac073f21552ac16b69903f2b8026894a81e35d
 size 1824224
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a93530e6981fdac23236dd7e3657c5b47513cda4accec78293234ce5f233400b
+oid sha256:c478b90b83052c5931cb3d872adad7811663e28bd3447f12ac412f15b1d0ffc5
 size 1824224
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:caa40905ac8f209fecccae42c6892c3766ad5c7069382e60d2339e73da6ee7d6
+oid sha256:ec46d147914be5998dfc62d4b87eb6730be7f012700d49543a318cadab3820db
 size 1749744
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6919551ed599e7e0dc1a750d1972bdb31605f57583b3617054cb70dd40d54d26
+oid sha256:9e09882858886be06e8ac48d184b320c57624d9c85165ce8b56640b022838e44
 size 1787192
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f07cc2637669130fc9e209cb2c4358caba1c4c2d5837a108043b073d7897c3a7
+oid sha256:6f63b2cd2c67b44f5e54837a0a4f26d94d3e6e8bfa4964bd99fc7e38494e2d52
 size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8b9ef8fa2dd4d80cb3c1c3c2a72b99e0d76b3e676acd551f3a9ff4cdd21773eb
+oid sha256:48795cb66a740b14266d757ac70a6b43fb11df6662970bb4040650d237e6cbc5
 size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8413f32011996384f13a985a99b4e2f863f8e4717acdb8439b63a10f77db6f15
+oid sha256:ec1f34fd4ead50eb51db63f51afc0751d6bf0c64a46c44c713ab245f150979cc
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c9d303b11a0a82e9c51c7b32c7555bd351ec375b1879bf46eb64ea4aff32100f
+oid sha256:bdb8ab38f72351ae88307560aca5e1af7b2dcb63a39627dbd4c806cad3f83442
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b4334fe8d7157c2a9c85217cb981692daf9eb4c6d3f205d0fd41d4b717daefa1
+oid sha256:0652d611e00b1bcbece47da13dffb28396ae0831dc4be43c7ae9be27ad9a10fe
 size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-312.pyc differ
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc CHANGED
Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc and b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-312.pyc differ
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_64757cb_dirty
-ops = torch.ops._optimizer_64757cb_dirty
+from . import _optimizer_02ac540_dirty
+ops = torch.ops._optimizer_02ac540_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_64757cb_dirty::{op_name}"
+    return f"_optimizer_02ac540_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_64757cb_dirty.abi3.so → _optimizer_02ac540_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:272fcc69e3774fa43e222efefceeaca97a8c84ee3f1fe528a7478a8e80a70976
+oid sha256:a96bfd1f461d7cd029dd39d142d2999dcc86dd7f56fb40f045e00f3fb2c400bd
 size 1749648
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
            )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
torch-ext/optimizer/muon.py CHANGED
@@ -195,12 +195,10 @@ class Muon(torch.optim.Optimizer):
         self.is_muon_func = is_muon_func
         self.model = model
 
-        if not dist.is_initialized():
-            raise RuntimeError(
-                "Muon optimizer requires distributed training to be initialized."
-            )
-
-        self.rank = dist.get_rank()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+        else:
+            self.rank = None
 
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
@@ -209,12 +207,14 @@ class Muon(torch.optim.Optimizer):
     def __setstate__(self, state):
         # Sort parameters into those for which we will use Muon, and those for which we will not
         super().__setstate__(state)
+        self._init_state()
+
+    def _init_state(self):
         for name, p in self.model.named_parameters():
             if self.is_muon_func(p, name):
                 # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
                 assert p.ndim == 2, p.ndim
                 self.state[p]["use_muon"] = True
-                self.state[p]["orig_shape"] = p.shape
             else:
                 # Do not use Muon for parameters in adamw_params
                 self.state[p]["use_muon"] = False
@@ -402,6 +402,9 @@ class Muon(torch.optim.Optimizer):
             # Muon #
             ############################
 
+            if "use_muon" not in self.state[group["params"][0]]:
+                self._init_state()
+
             params = [p for p in group["params"] if self.state[p]["use_muon"]]
             lr = group["lr"]
             weight_decay = group["weight_decay"]
@@ -432,6 +435,11 @@ class Muon(torch.optim.Optimizer):
             )
 
             if len(param_dtensors) > 0:
+                if not dist.is_initialized():
+                    raise RuntimeError(
+                        "Parallel Muon requires torch.distributed to be initialized."
+                    )
+
                 self.parallel(
                     param_dtensors,
                     group,
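
One detail worth calling out in the diff above: step() now rebuilds the per-parameter use_muon flags on demand when they are missing from optimizer state, rather than assuming __init__ or __setstate__ already populated them. Below is a generic, self-contained sketch of that lazy-reinitialization pattern, not the library's exact code; SGD is used only as a stand-in base class, and the ndim == 2 routing rule is an assumption standing in for the real is_muon_func.

    import torch
    import torch.nn as nn

    # Generic illustration of "rebuild missing per-parameter flags on demand",
    # the pattern this commit adds to Muon.step().
    class LazyFlagSGD(torch.optim.SGD):
        def __init__(self, model: nn.Module, **kwargs):
            super().__init__(model.parameters(), **kwargs)
            self.model = model
            self._init_state()

        def _init_state(self) -> None:
            # Assumed routing rule for illustration: 2-D weights get the flag.
            for name, p in self.model.named_parameters():
                self.state[p]["use_muon"] = p.ndim == 2

        @torch.no_grad()
        def step(self, closure=None):
            first = self.param_groups[0]["params"][0]
            if "use_muon" not in self.state[first]:  # flags missing, e.g. after a state reload
                self._init_state()
            return super().step(closure)

    model = nn.Linear(4, 4)
    opt = LazyFlagSGD(model, lr=0.1)
    model(torch.randn(2, 4)).sum().backward()
    opt.state.clear()  # simulate optimizer state that lost the flags
    opt.step()         # flags are rebuilt before the parameter update

In the patched optimizer the same check is done against the first parameter of each group, right before the params list is filtered by the use_muon flag.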