
    RЦi7                     J    S r SSKJr  SSKJr  SSKrSSKJr   " S S\5      rg)	am  PyTorch impl of LaProp optimizer

Code simplified from https://github.com/Z-T-WANG/LaProp-Optimizer, MIT License

Paper: LaProp: Separating Momentum and Adaptivity in Adam, https://arxiv.org/abs/2002.04839

@article{ziyin2020laprop,
  title={LaProp: a Better Way to Combine Momentum with Adaptive Gradient},
  author={Ziyin, Liu and Wang, Zhikang T and Ueda, Masahito},
  journal={arXiv preprint arXiv:2002.04839},
  year={2020}
}

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

    )Tuple)	OptimizerN   )ParamsTc                      ^  \ rS rSrSr      SS\S\S\\\4   S\S\S\S	\4U 4S
 jjjr	U 4S jr
\R                  " 5       SS j5       rSrU =r$ )LaProp   znLaProp Optimizer

Paper: LaProp: Separating Momentum and Adaptivity in Adam, https://arxiv.org/abs/2002.04839
paramslrbetasepsweight_decaycautioncorrected_weight_decayc           	        > SU::  d  [        SR                  U5      5      eSU::  d  [        SR                  U5      5      eSUS   s=::  a  S:  d  O  [        SR                  US   5      5      eSUS   s=::  a  S:  d  O  [        SR                  US   5      5      e[        UUUUUUS	9n[        [        U ]  X5        g )
N        zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})r   r   r   r   r   r   )
ValueErrorformatdictsuperr   __init__)
selfr
   r   r   r   r   r   r   defaults	__class__s
            P/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/laprop.pyr   LaProp.__init__    s     by8??CDDcz8??DEEeAh$$DKKERSHUVVeAh$$DKKERSHUVV%#9
 	fd$V6    c                    > [         TU ]  U5        U R                   H'  nUR                  SS5        UR                  SS5        M)     g )Nr   Fr   )r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r    LaProp.__setstate__<   sA    U#&&EY.5u= 'r   c                     SnUb%  [         R                  " 5          U" 5       nSSS5        U R                   GH@  nUS    GH2  nUR                  c  M  UR                  nUR                  (       a  [        S5      eU R                  U   n[        U5      S:X  aA  SUS'   [         R                  " U5      US'   SUS'   SUS	'   [         R                  " U5      US
'   US   US
   pUS   u  pUS==   S-  ss'   SU
-
  nSU	-
  nUR                  U
5      R                  XUUS9  US   U	-  XS   -  -   US'   US	   U
-  U-   US	'   US   S:w  a  US   US   -  OSnUS	   nSU-  nUR                  U5      R                  5       R                  US   5      nUU-  nUR                  U	5      R                  UUS   U-  S9  US   (       aQ  Xu-  S:  R                  UR                  5      nUR!                  UR#                  5       R%                  SS95        UU-  nUR                  X* S9  US   S:w  d  GM  US   (       a  US   S-  U R&                  S   -  nOUS   nUR                  UU* US   -  S9  GM5     GMC     U$ ! , (       d  f       GNb= f)zPerforms a single optimization step.

Arguments:
    closure (callable, optional): A closure that reevaluates the model
        and returns the loss.
Nr
   z(LaProp does not support sparse gradientsr   stepexp_avgr   exp_avg_lr_1exp_avg_lr_2
exp_avg_sqr   r   )valuer   r   r   )alphar   gMbP?)minr   r      )torchenable_gradr!   grad	is_sparseRuntimeErrorr#   len
zeros_likemul_addcmul_divsqrt_add_todtypediv_meanclamp_r   )r   closurelossr$   pr2   r#   r(   r+   beta1beta2one_minus_beta2one_minus_beta1bias_correction1bias_correction2	step_sizedenomstep_of_this_gradmaskwd_scales                       r   r'   LaProp.stepB   s    ""$y % &&E8_66>vv>>&'QRR

1 u:?$%E&M','7'7':E)$,.E.),.E.)*/*:*:1*=E,'&+I&6l8K$W~f""#e)"#e) &////R(-n(=(E`dZeHe(en%(-n(=(E(Wn% KPPT+Y[J[5#85;#Fac #(#8  00	"'78>>@EEeElS$(5L!U#(():%+P_B_(`##NQ.224::>DIIdiik00T0:;%nGwj1(A-56#(;!#3dmmD6I#I#(;FF1XIn0E$EFFo % 't { %$s   I>>
J )g-C6:?)g?g+?gV瞯<r   FF)N)__name__
__module____qualname____firstlineno____doc__r   floatr   boolr   r    r0   no_gradr'   __static_attributes____classcell__)r   s   @r   r   r      s     )5"$!+077 7 &	7
 7  7 7 %)7 78> ]]_F Fr   r   )	rU   typingr   torch.optimr   r0   _typesr   r   rP   r   r   <module>r^      s&   $  !  nY nr   