
    RЦi6;              !          S r SSKrSSKJrJrJr  SSKrSSKJr  SSKJ	r	   " S S\R                  R                  5      r  SS	\\   S
\\   S\\   S\\   S\\   S\\   S\S\S\S\S\S\S\S\S\\   SS4 S jjrS	\\   S
\\   S\\   S\\   S\\   S\S\S\S\S\S\S\S\S\\   4S jrS	\\   S
\\   S\\   S\\   S\\   S\S\S\S\S\S\S\S\S\\   4S jrg)ac  NAdamW Optimizer

Based on simplified algorithm in https://github.com/mlcommons/algorithmic-efficiency/tree/main/baselines/nadamw

Added multi-tensor (foreach) path.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
    N)ListOptionalTuple)Tensor   )ParamsTc                      ^  \ rS rSrSr         SS\S\S\\\4   S\S\S\S	\S
\S\	\   S\4U 4S jjjr
U 4S jr\R                  " 5       SS j5       rSrU =r$ )NAdamW   aW  Implements NAdamW algorithm.

See Table 1 in https://arxiv.org/abs/1910.05446 for the implementation of
the NAdam algorithm (there is also a comment in the code which highlights
the only difference of NAdamW and AdamW).

For further details regarding the algorithm we refer to
    - Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
    - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

Args:
    params: iterable of parameters to optimize or dicts defining parameter groups
    lr: learning rate
    betas: coefficients used for computing running averages of gradient and its square
    eps: term added to the denominator to improve numerical stability
    weight_decay: weight decay coefficient
    caution: enable caution
    corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
paramslrbetasepsweight_decaycautioncorrected_weight_decaymaximizeforeach
capturablec                 J  > SU::  d  [        SU 35      eSU::  d  [        SU 35      eSUS   s=::  a  S:  d  O  [        SUS    35      eSUS   s=::  a  S:  d  O  [        SUS    35      eSU::  d  [        S	U 35      e[        UUUUUUU	UU
S
9	n[        TU ]  X5        g )N        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: r   z#Invalid beta parameter at index 1: zInvalid weight_decay value: )	r   r   r   r   r   r   r   r   r   )
ValueErrordictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   defaults	__class__s               P/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/nadamw.pyr   NAdamW.__init__*   s     by6rd;<<cz6se<==eAh$$B58*MNNeAh$$B58*MNNl";L>JKK%#9!

 	*    c                   > [         TU ]  U5        [        U R                  R	                  5       5      n[        U5      S:g  =(       a    [        R                  " US   S   5      nU(       d.  U H(  n[        R                  " [        US   5      5      US'   M*     U R                   H'  nUR                  SS5        UR                  SS5        M)     g )Nr   stepr   Fr   )r   __setstate__liststatevalueslentorch	is_tensortensorfloatparam_groups
setdefault)r   r'   state_valuesstep_is_tensorsgroupr   s         r    r%   NAdamW.__setstate__N   s    U#DJJ--/0l+q0^eoolSToV\F]6^!!LLqy)9:&	 "&&EY.5u= 'r"   c                    U R                  5         SnUb%  [        R                  " 5          U" 5       nSSS5        U R                   GH  n/ n/ n/ n/ n/ nUS   u  pUS    GH$  nUR                  c  M  UR                  U5        UR                  R                  (       a  [        S5      eUR                  UR                  5        U R                  U   n[        U5      S:X  ae  [        R                  " S5      US'   [        R                  " U[        R                  S9US	'   [        R                  " U[        R                  S9US
'   UR                  US	   5        UR                  US
   5        UR                  US   5        GM'     [        UUUUUU	U
US   US   US   US   US   US   US   (       a  U R                  S   OSS9  GM     U$ ! , (       d  f       GN= f)zPerforms a single optimization step.

Args:
  closure (callable, optional): A closure that reevaluates the model
      and returns the loss.
Nr   r   z(NAdamW does not support sparse gradientsr   r   r$   )memory_formatexp_avg
exp_avg_sqr   r   r   r   r   r   r   	beta1beta2r   r   r   r   r   r   max_lr) _cuda_graph_capture_health_checkr*   enable_gradr.   gradappend	is_sparseRuntimeErrorr'   r)   r,   
zeros_likepreserve_formatnadamwr   )r   closurelossr3   params_with_gradgradsexp_avgsexp_avg_sqsstate_stepsr:   r;   pr'   s                r    r$   NAdamW.stepY   s    	--/""$y % &&E!EHKK >LE8_66> ''*66##&'QRRQVV$

1 u:?$)LL$4E&M','7'7I^I^'_E)$*/*:*:1ELaLa*bE,'i 01""5#67""5=1) %,  ;">2%Li(z* ..34L.Mt}}T*SW= '^ e %$s   G
G! )	MbP?)g?g+?g:0yE>g{Gz?FFFNFN)__name__
__module____qualname____firstlineno____doc__r   r-   r   boolr   r   r%   r*   no_gradr$   __static_attributes____classcell__)r   s   @r    r
   r
      s    . )5"&!+0"&*$"+"+ "+ &	"+
 "+  "+ "+ %)"+ "+ d^"+ "+ "+H	> ]]_= =r"   r
   r   rI   rJ   rK   rL   r   r   r:   r;   r   r   r   r   r   r<   returnc                |   [        S U 5       5      (       d  [        S5      eUcD   U(       + =(       d5    S[        R                  R                  R
                  R                  5       ;   nU(       a*  [        R                  R                  5       (       d  [        nO[        nU" U UUUUUUU	U
UUUUUS9  g!   Sn NQ= f)zYFunctional API that performs NAdamW algorithm computation.
See NAdamW class for details.
c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frQ   )
isinstancer*   r   ).0ts     r    	<genexpr>nadamw.<locals>.<genexpr>   s     @Kqz!U\\**Ks   ')zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNScalarFr9   )allrB   r*   opsaten_foreach_maximum_	overloadsjitis_scripting_multi_tensor_nadamw_single_tensor_nadamw)r   rI   rJ   rK   rL   r   r   r:   r;   r   r   r   r   r   r<   funcs                   r    rE   rE      s    , @K@@@!" 	" 	!k]X1Q1Q1[1[1]%]G uyy--//#$!	Gs   AB5 5B;c       	         ~   [        U 5       GH  u  pU(       d  X   OX   * nX.   nX>   nXN   nUS-  nUc  UOUS-  U-  nUR                  SUU-  -
  5        UR                  U5      R                  USU-
  S9  UR                  U5      R                  UUSU-
  S9  U(       Ga  UnS[        R
                  " UU5      -
  nS[        R
                  " UU5      -
  nUU-  nUR                  5       nUR                  5       nUR                  U5      R                  USU-
  S9nUR                  5       UU-  -  R                  U	U-  5      nU
(       a^  UU-  S:  R                  UR                  5      nUR                  UR                  5       R                  SS95        UR                  U5        UR                  UU5        GM  UR                  5       nSUU-  -
  nSUU-  -
  nUU-  n[         R                  " U5      nUR                  U5      R                  USU-
  S9nUR                  5       U-  R                  U	5      nU
(       a^  UU-  S:  R                  UR                  5      nUR                  UR                  5       R                  SS95        UR                  U5        UR                  UUU* S9  GM     g )	Nr      r   alpha)valuer   rP   )min)	enumeratemul_add_addcmul_r*   pownegsqrtmultodtypediv_meanclamp_addcdiv_itemmath)r   rI   rJ   rK   rL   r:   r;   r   r   r   r   r   r   r<   iparamr?   r7   r8   step_twd_scaler$   bias_correction1bias_correction2	step_sizestep_size_negbias_correction2_sqrtdenommasks                                r    rl   rl      s   $ f%'uxehY+ ^
 	!  2R1Wv-=

2<//0 	U  QY 7''d!e)'DD  !599UD#99 599UD#99--I%MMOM$4$9$9$;! kk%(--d!e)-DG__&*?-*OPVVWZ]jWjkE  $*..tzz:		$))+,,,67T"NN7E*;;=D 5D=0 5D=0--I$(II.>$?! kk%(--d!e)-DG__&)>>DDSIE$*..tzz:		$))+,,,67T"NN7E)N<} &r"   c       	            [        U 5      S:X  a  g U(       a'  [        S [        X5       5       5      (       d   S5       eU(       a  [        R                  " [        U5      5      nU Vs/ s H6  n[        R                  " U5      (       a  [        R                  " U5      OUPM8     nnU Vs/ s H6  n[        R                  " U5      (       a  [        R                  " U5      OUPM8     nnU Vs/ s H6  n[        R                  " U5      (       a  [        R                  " U5      OUPM8     nnU  Vs/ s H6  n[        R                  " U5      (       a  [        R                  " U5      OUPM8     n n[        R                  " US5        Uc  UOUS-  U-  n[        R                  " U SX-  -
  5        [        R                  " X%5        [        R                  " X!SU-
  S9  [        R                  " X65        [        R                  " X1USU-
  5        U(       Ga  U Vs/ s H  n[        R                  " UU5      PM     nnU Vs/ s H  n[        R                  " UU5      PM     nn[        R                  " US5        [        R                  " US5        [        R                  " U5        [        R                  " U5        [        R                  " UU5      n[        R                  " U5        [        R                  " U5        [        R                   " U5      n[        R"                  " X%5      n[        R                  " X!SU-
  S9  [        R                   " U5      n[        R$                  " U[        R"                  " UU5      5        [        R                  " UU	5      n[        R                  " U5        [        R&                  " UU5      nU
(       a  [        R"                  " X!5      n[        UU5       VVs/ s H$  u  nnUS:  R)                  UR*                  5      PM&     nnnU Vs/ s H  nUR-                  5       PM     nn[        R.                  " US5        [        R$                  " UU5        [        R                  " UU5        [        R0                  " XU5        g U Vs/ s H  nSUUR3                  5       -  -
  PM     nnU Vs/ s H  nSUUR3                  5       -  -
  PM     nnU Vs/ s H  nUU-  S-  PM     nnU Vs/ s H  n[4        R6                  " U5      PM     nn[        R"                  " X%5      n[        R                  " X!SU-
  S9  [        R                   " U5      n[        R$                  " UU5        [        R&                  " UU	5      nU
(       a  [        R"                  " X!5      n[        UU5       VVs/ s H$  u  nnUS:  R)                  UR*                  5      PM&     nnnU Vs/ s H  nUR-                  5       PM     nn[        R.                  " US5        [        R$                  " UU5        [        R                  " UU5        [        R0                  " XUU5        g s  snf s  snf s  snf s  snf s  snf s  snf s  snnf s  snf s  snf s  snf s  snf s  snf s  snnf s  snf )	Nr   c              3   d   #    U  H&  u  pUR                   =(       a    UR                   v   M(     g 7frQ   )is_cuda)r_   rM   r$   s      r    ra   '_multi_tensor_nadamw.<locals>.<genexpr>;  s&      
6N71AII&$,,&6Ns   .0z@If capturable=True, params and state_steps must be CUDA tensors.r   ro   rp   rP   )r)   rd   zipr*   _foreach_negtuple
is_complexview_as_real_foreach_add__foreach_mul__foreach_addcmul_rx   _foreach_sub__foreach_neg__foreach_div_foreach_reciprocal__foreach_sqrt_foreach_mul_foreach_div__foreach_addr|   r}   r   rg   _foreach_addcdiv_r   r   rz   )r   rI   rJ   rK   rL   r:   r;   r   r   r   r   r   r   r<   xr   r$   r   r   r   r   exp_avg_sq_sqrteps_over_step_sizer   masksmg
mask_scalebcs                                r    rk   rk   &  sk   " 6{a 
69&6N
 
 
 	NM	N 
 ""5<0JOP%Qe&6&6q&9&9U"q@%EPMUVX)9)9!)<)<""1%!CXHVP[\P[1E,<,<Q,?,?5%%a(QFP[K\KQR6au'7'7':':e  #A6FR 
Q' ^rq6)9H	X%< <= 
(	q5y9	+	Kq5yA?JK{tEIIeT2{K?JK{tEIIeT2{K,a0,a0,-,- &&'7<	""9-I& % 3 34D E %%h6H1u9=--k:4i@	
 #//	3?""#56""?4FG&&x7E585FG5FTQa!eZZ(5FEG,12Eq!&&(EJ2##J5z2%0%8ALMA 44MALMA 44M.>?.>b2g^.>	?9I J9I229I J %%h6H1u9=--k:O-BC""?C8&&x7E585FG5FTQa!eZZ(5FEG,12Eq!&&(EJ2##J5z2%0%Cq QV\R$ LK> H2 NM? J H2sT   )=Y8,=Y=/=Z2=Z!Z!Z9+Z+Z( Z! Z&4Z+ Z0+Z5>Z;)NF)rV   r   typingr   r   r   r*   r   _typesr   optim	Optimizerr
   rW   r-   rE   rl   rk   rO   r"   r    <module>r      su  	  ( (   BU[["" BV #' 6V6F|6 v,6 &\	6
 &\6 $6 6 6 6 6 6 6 6 6  !6" 
#6rP=VP=F|P= v,P= &\	P=
 &\P= P= P= P= P= P= P= P= P= P=ftDVtDF|tD v,tD &\	tD
 &\tD tD tD tD tD tD tD tD tD tDr"   