
    RЦi              &       p   S r SSKrSSKrSSKJrJrJrJrJrJ	r	  SSK
r
 SSKJr  SrSSKJr  SS	KJr  SS
KJr  \R&                  " \5      rSrSrS// SQ/ SQ/ SQS.r\	\\\\\4   \\\\\4      4   rS\S\\S4   S\4S jr\S\
R<                  S4S\
R>                  S\S\\\\\4      S\S\S\
R@                  S\!S\
R>                  4S jjr" SBS\
RF                  S \S\4S! jjr$ SBS\
RF                  S \S\\\!4   4S" jjr%   SCS#\
R>                  S$\S%\S&\!S\	\!\\!\4   4   4
S' jjr& SDS(\
R>                  S)\S\\
R>                  \
RF                  4   4S* jjr'S+\\
R>                     S,\\
R>                     S-\\
R>                     S.\S/\S0\S1\!S2\S3\S\S\S \\   S4\S5\!S\!SS4 S6 jr(S+\\
R>                     S,\\
R>                     S-\\
R>                     S7\\
R>                     S8\\
R>                     S.\S/\S0\S1\!S9\S2\S3\S\S\S \\   S4\S5\!S\!SS4&S: jr)S+\\
R>                     S,\\
R>                     S-\\
R>                     S.\S/\S0\S1\!S2\S3\S\S\S \\   S4\S5\!S\!SS4 S; jr*S+\\
R>                     S,\\
R>                     S-\\
R>                     S7\\
R>                     S8\\
R>                     S.\S/\S0\S1\!S9\S2\S3\S\S\S \\   S4\S5\!S\!SS4&S< jr+ " S= S>\
RX                  RZ                  5      r.S?\	\\\   \\\      4   S@\\\\\      4   S\\\\\4      4SA jr/g! \ a    Sr GNf = f)Ea  Muon Optimizer

Improved Muon optimizer implementation with flexible handling of high-dimensional tensors.

Combines PyTorch-style structure with options for:
- Batched spatial processing for convolutions in addition to flatten
- Optional spatial normalization
- Selectable coefficient presets
- Automatic fallback to AdamW for 1D / scalar parameters (biases, norms, etc.) and optional fallback via param groups
- AdaMuon (https://arxiv.org/abs/2507.11005)
- mUP eps damping factor (https://arxiv.org/abs/2512.05620v1)

TODO look into mUP LR scaling and independent weight-decay scale

Based on implementation by Keller Jordan, see
- https://github.com/KellerJordan/Muon/blob/master/muon.py
- https://github.com/KellerJordan/modded-nanogpt/blob/master/train_gpt.py
- https://github.com/KellerJordan/modded-nanogpt/blob/master/train_gpt_medium.py
- https://github.com/NoahAmsel/PolarExpress/blob/main/polar_express.py

Hacked together by Ross Wightman
    N)ListMappingOptionalSequenceTupleUnion)DTensorTF   )ParamsT)adamw)nadamwgHz>   )guV@ggn@ @))gͪV@gg"~j@)gv@ggj+6gF%u@)ga4@gH}]g\Cm@)g2%@g?$	g/L
F?)g6>W[@gQkgH}8?))g8y @gWr"b(7g=90@)g3kT@g)6$g}U"?)g")i@gV}gT"?)g߸%~b
@g_O"Dveg)E?)gT`@gU4N/g?)gͶ?g!̲gG߆?)g_?g]sig??N!?)g9+_?gWg-|?))gh('P @g־{6g]!/@)g=.+@gixgےl% ?)g""@gP+.gPYJخ1?)g.69I
@gG4gH5?)gSO|g@gVg@ę?)gΐX?gxDgPh?)gkei?gB?Ggd݈ؼ?)g1?gjgZ?)originalquinticpolar_expresspolar_express_saferepsshape.returnc                 (    US   US   p2XU-  S-  -  $ )u  Scale epsilon for Newton-Schulz based on matrix dimensions (μP-style).

For μP compatibility, epsilon should scale as eps * sqrt(din/dout) to maintain
consistent damping behavior across different model widths.

Reference: https://arxiv.org/abs/2512.05620

Args:
    eps: Base epsilon value
    shape: Shape of the matrix (out, in) or (batch, out, in)

Returns:
    Scaled epsilon value
      ? )r   r   doutdins       N/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/muon.pyscale_eps_for_nsr   X   s%    ( rE"I#*$$$          ?Gstepscoefficientssafety_factordtype	scale_epsc           
      <   U R                   S;   d   SU R                    S35       e[        U5      nUS:  a  [        US   5      S:X  d   eX::  a  USU OX"S   /X-
  -  -   nU(       a  [        X0R                  5      nU R	                  US	S
9n	U	R                  S5      U	R                  S5      :  n
U
(       a  U	R                  n	U(       a?  U	R                  U	R                  SSS	S9R                  U5      R                  U5      5        O<U	R                  U	R                  SSS	S9R                  U5      R                  US95        [        =(       a    [        U [        5      nU(       a1  U H*  u  pnXR                  -  nX-  XU-  -  -   nX-  UU	-  -   n	M,     OU	R                   S:  a  [        R                   O[        R"                  nU	R%                  5       n	[        R&                  " / U	R                  SS QU	R                  S5      P7U	R(                  U	R*                  S9n[        R,                  " U5      n[        R,                  " U	5      nU H1  u  pnU" XU	R                  SSUS9  U" XXUUS9  U" U	UXSUS9  UU	nn	M3     U
(       a  U	R                  n	U	$ )uT  Newton-Schulz quintic iteration to compute the zeroth power / orthogonalization of gradient.

Supports batched operation over leading dimensions.

See
- https://github.com/KellerJordan/Muon/blob/master/muon.py
- https://github.com/NoahAmsel/PolarExpress/blob/main/polar_express.py
- https://github.com/KellerJordan/modded-nanogpt/blob/master/train_gpt.py

Args:
    G: Input gradient tensor of shape (m, n) or (batch, m, n)
    steps: Number of Newton-Schulz iterations
    coefficients: Coefficients (a, b, c) for the iteration
    eps: Numerical stability epsilon for norm
    safety_factor: Multiplicative safety factor for norm (1.01 is common safety value in 'polar express' variants)
    dtype: Computation dtype
    scale_eps: If True, scale epsilon by sqrt(din/dout) for μP compatibility

Returns:
    Orthogonalized tensor of same shape as G
)      zInput must be 2D or 3D, got zD. Flatten batch dims first.r
   r   r)   Nr   T)r%   copyr   r(   )r   r   )dimkeepdim)min)devicer%           r    )betaalphaout)ndimlenr   r   tosizemTdiv_normmuladd_clamp_has_dtensor
isinstancer	   torchbaddbmmaddmm
contiguousemptyr.   r%   
empty_like)r!   r"   r#   r   r$   r%   r&   num_cscoeff_sequenceX
transposed
is_dtensorabcABmm_fnCs                      r   zeropower_via_newtonschulzrQ   p   s^   < 66V`;AFF8C_``FQ;3|A/1444-2_\&5)R()U^<<  sGG,	5t$A affRj(JDD 	qvvaXtv488GLLSQR	qvvaXtv488GNNSVNWX7Aw!7J%GA!DDAU#AQA & "#! LLNKK3!''#2,3r
3AHHAGGTQQ &GA!!3cq9!2!Q!4aqAq	 & DDHr   param_shapeadjust_lr_fnc                     [        U 5      S:  a
  U S   U S   4OSu  p#US:X  a  [        SX#-  5      S-  $ US:X  a  S[        X#5      S-  -  $ US	:X  a  X#-  S-  $  S
U S35       e)aZ  Adjust learning rate based on parameter shape for Muon.

Args:
    param_shape: Shape of the parameter tensor
    adjust_lr_fn: Scaling function name
        - "original": sqrt(max(1, out/in)) - Original Muon impl
        - "match_rms_adamw": 0.2 * sqrt(max(out, in)) - Kimi scaling
        - "rms_to_rms": sqrt(out/in) - Scion/Bernstein scaling
r
   r   r   r    r    r   r   match_rms_adamw皙?
rms_to_rmsInvalid scaling function "z
" for Muon)r4   maxrR   rS   out_chsin_chss       r   get_lr_scaler^      s     =@<Lq<P{2B8V^OGz!1g&'3..	*	*S)S000		%  S((K2<.
KKur   c                     [        U 5      S:  a
  U S   U S   4OSu  p#US:X  a  SX#-  S-  -  S4$ US	:X  a	  X#-  S-  S
4$ US:X  a  US-  S
4$  SU S35       e)zAdjust learning rate based on parameter shape for AdaMuon.

Args:
    param_shape: Shape of the parameter tensor
    adjust_lr_fn: Scaling function name

Returns:
    Tuple of (scale_factor, use_rms_norm)
r
   r   r   rU   rV   rW   r   TrX   Frsqrt_in      rY   z" for AdaMuon)r4   r[   s       r   get_adamuon_lr_scalerb      s     =@<Lq<P{2B8V^OG(( g&3..44		% S(%//		#~u$$N2<.NNur   parammin_dim_sizemax_aspect_ratioreturn_reasonc                    U R                   nU R                  S:  d  [        S U 5       5      S:  a  U(       a  S$ S$ US   S:X  d	  US   S:X  a  U(       a  S$ S$ U R                  S:  a  US   nSnUSS	  H  nXg-  nM	     XV4nOUn[        U5      n	X:  a  U(       a  SS
U	 34$ g[	        U5      n
X-  nX:  a  U(       a  SSUS 34$ gU(       a  S$ S$ )aB  Check if a parameter is suitable for Muon optimization.

Args:
    param: Parameter tensor
    min_dim_size: Minimum size for non-unit dimensions
    max_aspect_ratio: Maximum allowed aspect ratio
    return_reason: If True, return (bool, reason_string), else just bool (faster)

Returns:
    If return_reason=False: bool indicating suitability
    If return_reason=True: Tuple of (is_suitable, reason_string)

Examples:
    (64, 128) -> True (or (True, "ok") if return_reason=True)
    (96, 3, 4, 4) -> True - will be flattened to (96, 48)
    (4, 2048) -> False - extreme aspect ratio
    (64,) -> False - insufficient dims
    (1, 196, 768) -> False - leading unit dims

NOTE: these rules were created to balance complexity with covering common timm model cases
Please let me know if there are non-optimal cases that you run into.
r(   c              3   4   #    U  H  oS :  d  M
  S v   M     g7f)r
   Nr   ).0dim_sizes     r   	<genexpr>(_is_suitable_for_muon.<locals>.<genexpr>  s     Aq8qLQQqs   		)Finsufficient_dimsFr   r
   )Fleading_unit_dimsr)   Nzmin_dim_too_small:zextreme_aspect_ratio:z.1f)TokT)r   r3   sumr-   rZ   )rc   rd   re   rf   sout_chin_ch_with_spatiald
check_dimsmin_sizemax_sizeaspect_ratios               r   _is_suitable_for_muonry      s   : 	AzzA~AqAAAE/<+G%G 	tqyAaDAI/<+G%GzzQ 112A# 1
 
 :H.xj999 :H&L&1,s1CDDD(<2d2r   tensormodec                 ^   U R                   nU R                  S:X  a  X4$ U R                  S:  a  [        SU R                   35      eU R                   SS u  p4US:X  a  U R                  US5      U4$ US:X  a(  U R                  X4S5      nUR	                  SSS5      nXR4$ [        S	U 35      e)
a  Reshape high-dimensional tensor for Muon processing.

Args:
    tensor: Input tensor of shape (out, in, *spatial)
    mode: How to handle spatial dimensions
        - "flatten": Flatten spatial into output dimension (out, in*H*W)
        - "batched": Batch over spatial positions (spatial_prod, out, in) for per-position orthogonalization

Returns:
    Reshaped tensor and original shape for restoration
r(   z,Tensor must have at least 2 dimensions, got Nflattenr   batchedr   r
   zUnknown mode: )r   r3   
ValueErrorreshapepermute)rz   r{   original_shaperr   in_chreshapeds         r   reshape_for_muonr   G  s     \\N{{a%%{{QG}UVVLL!$MFy~~fb)>99		 >>&4##Aq!,''>$011r   paramsgradsmomentum_bufslrweight_decaymomentumnesterovns_stepsns_coefficients	conv_modenormalize_spatialc                2    [        U UUUUUUUUU	U
UUUUS9  g)z8Functional API that performs Muon algorithm computation.r   r   r   r   r   r   r   r$   rS   r   r   r&   N)_single_tensor_muon)r   r   r   r   r   r   r   r   r   r   r$   rS   r   r   r&   s                  r   muonr   j  s9    & !'#!+r   exp_avg_sqsstate_stepsbeta2c                <    [        U UUUU4UUUUU	U
UUUUUUUS.6  g)a(  Functional API that performs AdaMuon algorithm computation.

AdaMuon extends Muon with element-wise second moment estimation applied
to orthogonalized update directions, providing Adam-like adaptive scaling
while preserving Muon's geometric benefits.

Reference: https://arxiv.org/abs/2507.11005
r   r   r   r   r   r   r   r   r$   rS   r   r   r&   N)_single_tensor_adamuon)r   r   r   r   r   r   r   r   r   r   r   r   r   r$   rS   r   r   r&   s                     r   adamuonr     sI    :  !'#!+%r   c          
         [        U[        5      n[        U 5       GH'  u  nnX   nX/   nUR                  SX4-  -
  5        UR	                  USU-
  5        U(       a  UR	                  UU5      OUR                  5       nUR                  S:  a  [        UUS9u  nnOUnUR                  n[        UUUU	U
US9nU(       a  [        UR                  U5      nOSnUS:X  a?  UR                  S:  a/  U(       a  UUR                  S   S-  -  nUR                  SS	S5      nUR                  U5      nUR                  UU* U-  S
9  GM*     g)zSingle tensor Muon update.r
   r    r)   r{   r   r$   r&   r~   r   ra   r(   r1   N)resolve_ns_coefficients_COEFFICIENTS	enumeratemul_lerp_cloner3   r   r   rQ   r^   r   r   r;   )r   r   r   r   r   r   r   r   r   r   r$   rS   r   r   r&   irc   gradmomentum_bufupdateupdate_reshapedr   update_orthoscales                           r   r   r     sU   & .o}MOf%5x$' 	

1r(() 	4h/7?L(3\EWEWEY ;;!.>vI.V+O^$O#\\N 2'
  !3!3\BEE 	!l&7&71&< ++A.$66'//1a8L $++N; 	

<sU{
3[ &r   c          
      ^   [        U[        5      n[        U 5       GH  u  nnUU   nUU   nUU   nUU   nUS-  nUR                  5       nUR	                  SXV-  -
  5        UR                  USU-
  5        U(       a  UR                  UU5      OUR                  5       nUR                  S:  a  [        UUS9u  nnOUnUR                  n[        UU
UUUUS9nUS:X  a#  UR                  S:  a  UR                  SSS5      nUR                  U5      nUR	                  U	5      R                  UUSU	-
  S	9  U(       a  [        UR                  U5      u  nnOS
u  nnU(       a   UR                  5       R!                  U5      nO*SU	U-  -
  n UU -  R                  5       R!                  U5      nUU-  n!U(       a$  U!R#                  5       R!                  U5      n"U!U"-  n!US:X  a1  [%        U5      S:  a"  U(       a  Sn#USS  H  n$U#U$-  n#M
     UU#S-  -  nUR!                  U!U* U-  S9  GM     g)u  Single tensor AdaMuon update.

AdaMuon applies second-moment estimation to the orthogonalized directions,
then rescales using RMS-alignment to maintain stable step sizes.

Algorithm:
    1. Update momentum buffer: M = β₁·M + (1-β₁)·G
    2. Orthogonalize: O = Newton-Schulz(M) or Newton-Schulz(nesterov_update)
    3. Update second moment: v = β₂·v + (1-β₂)·O²
    4. Bias correct: v̂ = v/(1-β₂^t)
    5. Adaptive scaling: Ô = O / (√v̂ + ε)
    6. RMS-aligned rescaling and apply update
r
   r    r)   r   r   r~   r(   r   )value)r    FNra   r   )r   r   r   itemr   r   r   r3   r   r   rQ   r   r   addcmul_rb   sqrtr;   r9   r4   )%r   r   r   r   r   r   r   r   r   r   r   r   r   r$   rS   r   r   r&   r   rc   r   r   
exp_avg_sqstep_tstepr   r   r   r   r   use_rms_normdenombias_correction2update_adaptiveupdate_normspatial_prodrt   s%                                        r   r   r     sX   D .o}MOf%5Qx$Q' ^
Q 	!{{} 	

1r(() 	4h/7?L(3\EWEWEY ;;!.>vI.V+O^$O#\\N 2'
 	!l&7&71&<'//1a8L#++N; 	''l#PU+'V "6|7I7I<"XE<",E<OO%**3/E  #Ud]2"2288:??DE '. )..055c:K-;O 	!c.&9Q&>  '+A A%L ,-- 	

?2#+
6[ &r   c            #          ^  \ rS rSrSrSSSS\S\SS	S
SSSSSS4S\S\S\S\S\	S\
S\S\S\S\\   S\S\	S\\   S\\\4   S\S\	S\	4"U 4S  jjjrU 4S! jr\R$                  " 5       S$S" j5       rS#rU =r$ )%Muoni|  a  Muon - MomentUm Orthogonalized by Newton-schulz

Combines Muon for 2D+ parameters (weight matrices) with AdamW for 1D parameters (biases, norms) and
parameter groups with 'use_fallback=True' set (or 'use_muon=False' for compatibility).

Supports two algorithms:
- "muon": Standard Muon algorithm with momentum + orthogonalization
- "adamuon": AdaMuon algorithm that adds element-wise second moment estimation
             to orthogonalized directions for Adam-like adaptive scaling
g{Gz?r   ffffff?Fr   r    rV   r}   TN)g?r   r   r   r   r   r   r   r   r   r   r$   rS   r   r   adamw_lrbetasalgor&   verbosec                   > SU::  d  [        SU 35      eSU::  d  [        SU 35      eSUs=::  a  S:  d  O  [        SU 35      eSU::  d  [        SU 35      eUS;  a  [        SU 35      eUS	;  a  [        S
U S35      e[        S0 SU_SU_SU_SU_SU_SU_SU_SU	_SU
_SU_SU_SUb  UOU_SU_SU_SU_SU_6n[        TU ]  UU5        g)uf  Create Muon optimizer.
Args:
    params: Iterable of parameters or dicts defining parameter groups
    lr: Learning rate (default: 0.02 for Muon parameters)
    weight_decay: Weight decay coefficient
    momentum: Momentum factor for Muon
    nesterov: Whether to use Nesterov momentum
    ns_steps: Number of Newton-Schulz iterations
    ns_coefficients: Coefficients for NS iteration
    eps: Numerical stability epsilon
    safety_factor: Multiplicative safety factor for NS norm
    adjust_lr_fn: LR adjustment function - "original", "match_rms_adamw", or "rms_to_rms".
        For adamuon mode, can set to None to disable (RMS rescaling handles scaling).
    conv_mode: How to handle convolutions - "flatten" or "batched"
    normalize_spatial: Whether to normalize by sqrt(spatial_size) in batched mode
    adamw_lr: Learning rate for AdamW (1D params), defaults to lr if not specified
    betas: Beta coefficients - (beta1, beta2) where beta1 is used for AdamW fallback
        and beta2 is used for both AdamW fallback and AdaMuon second moment
    algo: Algorithm - "muon" for standard Muon, "adamuon" for AdaMuon with
        adaptive second moment estimation (https://arxiv.org/abs/2507.11005)
    scale_eps: If True, scale epsilon by sqrt(din/dout) in Newton-Schulz for μP
        compatibility (https://arxiv.org/abs/2512.05620)
    verbose: Log parameter routing decisions (Muon vs AdamW)

Example:
    ```python
    # Simple usage - automatically uses Muon for 2D+ params, AdamW for 1D
    optimizer = Muon(model.parameters(), lr=0.02)

    # Use AdaMuon algorithm for adaptive scaling
    optimizer = Muon(model.parameters(), lr=6e-4, algo="adamuon")

    # Manual control over parameter groups
    optimizer = Muon([
        {'params': weight_matrices, 'lr': 0.02},
        {'params': biases, 'use_fallback': True, 'lr': 3e-4}, # use AdamW if use_fallback=True
    ])
    ```
r/   zInvalid learning rate: zInvalid weight_decay value: r    zInvalid momentum value: zInvalid epsilon value: )r}   r~   zInvalid conv_mode: )r   r   zInvalid algo: z. Must be 'muon' or 'adamuon'r   r   r   r   r   r   r   r$   rS   r   r   r   Nr   r   r&   r   r   )r   dictsuper__init__)selfr   r   r   r   r   r   r   r   r$   rS   r   r   r   r   r   r&   r   defaults	__class__s                      r   r   Muon.__init__  sn   v by6rd;<<l";L>JKKh$$7zBCCcz6se<==2229+>??**~dV3PQRR 

%
 
 	

 
 ,
 
 (
 &
  
 0
 "*!5X2
 
 
  
  !
$ 	*r   c                    > [         TU ]  U5        U R                   H'  nUR                  SS5        UR                  SS5        M)     g )Nr   r   r&   F)r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r   Muon.__setstate__  s@    U#&&EVV,[%0 'r   c                 >   SnUb%  [         R                  " 5          U" 5       nSSS5        U R                  R                  SS5      nSnSnU(       a  0 OSnU R                   GH  nUR                  SS5      n/ n	/ n
/ n/ n/ n/ n/ n/ n/ n/ nUS    GH  nUR
                  c  M  UR
                  R                  (       a  [        S5      eU R                  U   nS	U;  a  SnUR                  S
S5      (       a  SUS	'   U(       a  SnO<S	U;   a  US	   US	'   U(       a  SnO$U(       a  [        USS9u  nnO
[        USS9nUUS	'   UbD  UbA  SR                  S UR                   5       5      nUU;  a  / UU'   UU   R                  U5        US	   nU(       a  U	R                  U5        U
R                  UR
                  5        US-  nSU;  a&  [         R                  " U[         R                  S9US'   UR                  US   5        US:X  ap  SU;  a?  [         R                  " S5      US'   [         R                  " U[         R                  S9US'   UR                  US   5        UR                  US   5        GM  GM  UR                  U5        UR                  UR
                  5        US-  nSU;  ae  [         R                  " S5      US'   [         R                  " U[         R                  S9US'   [         R                  " U[         R                  S9US'   UR                  US   5        UR                  US   5        UR                  US   5        GM     U	(       a  US:X  aI  US   u  nn[!        U	U
UUU4US   US   US   US   UUS   US   US    US!   US"   US#   US$   US%   S&.6  O;[#        U	U
UUS   US   US   US   US   US   US    US!   US"   US#   US$   US%   S'9  U(       d  GM  US   u  nnUS   (       a#  [%        UUUUUSUUUS(   US   US    SSSSS)9  GM  ['        UUUU/ USSUUUS(   US   US    SSSSS*9  GM     U(       Ga  [)        U5      S:  Ga  [*        R-                  S+U S,U S-35        0 n[/        UR1                  5       5       H.  u  nnU H"  nUU;  a  / UU'   UU   R                  U5        M$     M0     / n[/        UR1                  5       5       H%  u  nnUR                  U S.[)        U5       35        M'     [*        R-                  S/S0R                  U5       35        [*        R3                  [4        R6                  5      (       a  [/        UR1                  5       5       H  u  nnUS1:X  a  S2OS3n [*        R-                  S4U S5U  S635        USS7  H  n![*        R-                  S8U! 35        M     [)        U5      S7:  d  Ma  [*        R-                  S9[)        U5      S7-
   S:35        M     U$ ! , (       d  f       GN= f);z$Performs a single optimization step.Nr   Fr   r   r   r   z&Muon does not support sparse gradientsuse_muonuse_fallbackuse_fallback_flaguse_muon_flagT)rf   xc              3   8   #    U  H  n[        U5      v   M     g 7fN)str)ri   rq   s     r   rk   Muon.step.<locals>.<genexpr>+  s     ,EWSVVWs   r
   momentum_buffer)memory_formatr   r   r/   r   exp_avgr   r   r   r   r   r   r   r   r$   rS   r   r   r&   r   r   r   )
foreachbeta1r   r   r   r   cautionmaximize
capturablemax_lr)r   amsgradr   r   r   r   r   r   r   r   r   zMuon parameter routing: z Muon, z AdamW=z  Breakdown: , ro   r   AdamWz    z -> :
   z      z      ... and z more)r?   enable_gradr   getr   r   	is_sparseRuntimeErrorr   ry   joinr   append
zeros_likepreserve_formatrz   r   r   r   r   r4   _loggerinfosorteditemsisEnabledForloggingINFO)"r   closurelossr   
muon_countadamw_countrouting_reasonsr   r   muon_params
muon_gradsmuon_momentum_bufsmuon_exp_avg_sqsmuon_state_stepsadamw_paramsadamw_gradsadamw_exp_avgsadamw_exp_avg_sqsadamw_state_stepspr   reasonsuitable	shape_strr   _r   r   reason_groupsreasonsreason_summaryshapesoptimizer_namer   s"                                     r   r   	Muon.step  s    ""$y % --##Iu5 
 '"T&&E99VV,D KJ!#!!LKN " "8_66>66##&'OPP

1 U*!Fyy77,1j)"%8F#u,,1*,=j)"%4F #/DQVZ/[,Hf'<Qe'TH,4j) '2v7I$'HH,EQWW,E$E	$O;9;OI6'	299&A !,&&q)%%aff-!OJ )5383C3CAUZUjUj3k/0&--e4E.FG y(!.,1LL,<E&M272B2B1TYTiTi2jE,/(//l0CD(//f> ) !''*&&qvv.1$K U*(-R(8f+0+;+;AUMbMb+ci(.3.>.>qPUPePe.fl+"))%	*:;%,,U<-@A%,,U6];Q %V 9$$W~HAu#"*(( !;%*>%:!&z!2!&z!2#!&z!2(-.?(@!%L&+O&<%*>%:"'"4*/0C*D"'"4%* #"* ;%*>%:!&z!2!&z!2!&z!2(-.?(@!%L&+O&<%*>%:"'"4*/0C*D"'"4& |$W~u$$#&)) $## ,%*>%:!%L %!&#(#$ $#&)) $ %## ,%*>%:!%L %!&#(##{ 'd s?3a7LL3J<w{mSYZ[ M&,_-B-B-D&E"	7%F]202f-!&)00; & 'F  N"()<)<)>"?%%#f+&?@ #@LL=>)B(CDE ##GLL11&,]-@-@-B&CNFF/5~V7NLL4xtN3C1!EF!'veW%56 "-6{R'~c&kB6F5Gu%MN 'D q %$s   X
Xr   r   )__name__
__module____qualname____firstlineno____doc__DEFAULT_NS_STEPSMUON_EPSr   floatboolintNSCoeffr   r   r   r   r   r?   no_gradr   __static_attributes____classcell__)r   s   @r   r   r   |  s2   	 "#"",'0!#&*;&&*(,)4#!%Z+Z+ Z+  	Z+
 Z+ Z+ Z+ %Z+ Z+ !Z+ #3-Z+ Z+  $Z+ uoZ+ &Z+  !Z+" #Z+$ %Z+ Z+x1 ]]_\ \r   r   r   presetsc                    ^^	 S m	S mS[         [           S[        [        [        [        4   4UU	4S jjn[        U [        5      (       a  X;  a9  SR                  [        UR                  5       5      5      n[        SU  SU 35      eX   nT	" U5      (       a  [        U5      S	:X  a  [        S
U  S35      eU Vs/ s H
  oR" U5      PM     sn$ T	" U 5      (       d  [        S5      e[        U 5      S:X  a#  [        U4S jU  5       5      (       a	  U" U 5      /$ / n[        U 5       H;  u  puT	" U5      (       d  [        SU SU< 35      eUR                  U" U5      5        M=     U(       d  [        S5      eU$ s  snf )Nc                 f    [        U [        5      =(       a    [        U [        [        45      (       + $ r   )r>   r   r   bytesr   s    r   <lambda>)resolve_ns_coefficients.<locals>.<lambda>  s!    z!X.Rz!c5\7R3RRr   c                 n    [        U [        R                  5      =(       a    [        U [        5      (       + $ r   )r>   numbersRealr  r%  s    r   r&  r'    s!    
1gll3OJq$<O8OOr   r   r   c                    > T" U 5      (       a)  [        U 5      S:w  d  [        U4S jU  5       5      (       d  [        SU < 35      eU u  pn[        U5      [        U5      [        U5      4$ )Nr)   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r   ri   vis_reals     r   rk   <resolve_ns_coefficients.<locals>.as_coeff.<locals>.<genexpr>  s     2Iq!71::q   z3Coefficient must be length-3 of real numbers, got: )r4   allr   r  )r   rJ   rK   rL   r/  is_seqs       r   as_coeff)resolve_ns_coefficients.<locals>.as_coeff  s]    ayyCFaKs2Iq2I/I/IRSTRWXYYaQxq58++r   r   zUnknown coefficients preset 'z'. Valid options: r   zPreset 'z' is empty or invalidz]Coefficients must be a preset name (str), a 3-sequence (a,b,c), or a sequence of 3-sequences.r)   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r   r-  s     r   rk   *resolve_ns_coefficients.<locals>.<genexpr>  s     95awqzz5r1  zItem z is not a sequence: z Coefficient list cannot be empty)r   r  r   r>   r   r   r   keysr   r4   	TypeErrorr2  r   r   )
r   r!  r4  validseqr   r2   r   r/  r3  s
           @@r   r   r     st   
 SFOG,HUO ,eUE.A(B , , %IIfW\\^45E<UGCUV[U\]^^nc{{c#h!mxw.CDEE+./343//%==,
 	
 5zQ395999   CU#d||eA3&:4(CDD

8D>" $ ;<<J) 0s   =E;)rV   )   g      `@F)r}   )0r  r   r)  typingr   r   r   r   r   r   r?   torch.distributed.tensorr	   r=   ImportError_typesr   r   r   	getLoggerr  r   r  r  r   r   r  r  r  r   bfloat16Tensorr%   r  rQ   Sizer^   rb   ry   r   r   r   r   r   optim	Optimizerr   r   r   r   r   <module>rG     s_  ,   B B 0K   


H
%  
 	"5&R U5%./eE5%<O6P1QQ
R%%S#X% %8 ""^^T<<TT 5u!456T 	T
 T {{T T \\Tr .LZZLL L> .OZZOO 5$;O: "&#	E3||E3E3  E3 	E3
 4tSy!!"E3T  2 2 2 5<<#$ 2F#U\\"#ELL!# ELL)#
 # # # # # !# # # sm# #  #  !#" 
##L0U\\"0ELL!0 ELL)0 %,,'	0
 %,,'0 0 0 0 0 0 0 !0 0 0  sm!0" #0$  %0& '0( 
)0fB4U\\"B4ELL!B4 ELL)B4
 B4 B4 B4 B4 B4 !B4 B4 B4 smB4 B4  B4  !B4" 
#B4Jq7U\\"q7ELL!q7 ELL)q7 %,,'	q7
 %,,'q7 q7 q7 q7 q7 q7 q7 !q7 q7 q7  sm!q7" #q7$  %q7& 'q7( 
)q7hK5;;   K\
)S(5/8HUO+DDE)hx778) 
%ue#
$%)W  Ks   L) )L54L5