
""" RMSProp modified to behave like Tensorflow impl

Originally cut & paste from PyTorch RMSProp
https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py
Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Modifications Copyright 2021 Ross Wightman
"""
import torch
from torch.optim import Optimizer

from ._types import ParamsT


class RMSpropTF(Optimizer):
    """Implements RMSprop algorithm (TensorFlow style epsilon)

    NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
    and a few other modifications to more closely match Tensorflow for matching hyper-params.

    Noteworthy changes include:
    1. Epsilon applied inside square-root
    2. square_avg initialized to ones
    3. LR scaling of update accumulated in momentum buffer
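
    In update-rule form, the non-centered path with momentum and ``lr_in_momentum=True``
    reduces to the sketch below (pseudocode for what ``step`` implements, not a claim
    about TF's exact kernel)::

        square_avg += (1 - alpha) * (grad**2 - square_avg)
        avg = sqrt(square_avg + eps)              # change 1: eps inside the sqrt
        buf = momentum * buf + lr * grad / avg    # change 3: lr folded into the buffer
        p -= buf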

    Proposed by G. Hinton in his
    `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

    The centered version first appears in `Generating Sequences
    With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        momentum: momentum factor
        alpha: smoothing (decay) constant
        eps: term added to the denominator to improve numerical stability
        centered: if ``True``, compute the centered RMSProp, where the gradient is normalized by an estimate of its variance
        weight_decay: weight decay (L2 penalty) (default: 0)
        decoupled_decay: decoupled weight decay as per https://arxiv.org/abs/1711.05101
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr) when decoupled_decay is True
        lr_in_momentum: learning rate scaling is included in the momentum buffer update as per defaults in Tensorflow
        caution: apply the caution mask from 'Cautious Optimizers' (https://arxiv.org/abs/2411.16085) to the update
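
    A minimal usage sketch (``model`` and ``inputs`` are assumed placeholders; the
    hyper-parameter values are illustrative, not recommendations)::

        optimizer = RMSpropTF(model.parameters(), lr=0.045, alpha=0.9, momentum=0.9)
        optimizer.zero_grad()
        loss = model(inputs).sum()
        loss.backward()
        optimizer.step()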
paramslralphaepsweight_decaymomentumcentereddecoupled_decaycorrected_weight_decaylr_in_momentumcautionc                   > SU::  d  [        SR                  U5      5      eSU::  d  [        SR                  U5      5      eSU::  d  [        SR                  U5      5      eSU::  d  [        SR                  U5      5      eSU::  d  [        SR                  U5      5      e[        UUUUUUUU	U
US9
n[        [        U ]  X5        g )N        zInvalid learning rate: {}zInvalid epsilon value: {}zInvalid momentum value: {}zInvalid weight_decay value: {}zInvalid alpha value: {})
r
   r   r   r   r   r   r   r   r   r   )
ValueErrorformatdictsuperr   __init__)selfr	   r
   r   r   r   r   r   r   r   r   r   defaults	__class__s                T/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/rmsprop_tf.pyr   RMSpropTF.__init__3   s     by8??CDDcz8??DEEh9@@JKKl"=DD\RSSe|6==eDEE%+#9)
 	i'9    c                    > [         [        U ]  U5        U R                   HK  nUR	                  SS5        UR	                  SS5        UR	                  SS5        UR	                  SS5        MM     g )Nr   r   r   Fr   r   )r   r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r"   RMSpropTF.__setstate__Z   sc    i+E2&&EZ+Z/Y.5u=	 'r    c                    SnUb%  [         R                  " 5          U" 5       nSSS5        U R                   GH  nUS    GH  nUR                  c  M  UR                  nUR                  (       a  [        S5      eU R                  U   n[        U5      S:X  ac  SUS'   [         R                  " U5      US'   US   S:  a  [         R                  " U5      US'   US	   (       a  [         R                  " U5      US
'   US   nSUS   -
  nUS==   S-  ss'   US   S:w  a^  US   (       aB  US   (       a  US   S-  U R                  S   -  n	OUS   n	UR                  SXS   -  -
  5        OUR                  XCS   S9nUR                  UR                  S5      U-
  US9  US	   (       aH  US
   n
U
R                  XZ-
  US9  UR                  XSS9R                  US   5      R!                  5       nO"UR                  US   5      R!                  5       nUS   S:  a  US   nUR                  US   5        S nUS   (       a:  UR#                  X[US   S9  US   (       a  U" X5      nUR                  U* 5        GM<  UR#                  X[5        US   (       a  U" X5      nUR                  XS   * S9  GMu  UR#                  X[US   * S9  GM     GM     U$ ! , (       d  f       GN= f)zPerforms a single optimization step.

Arguments:
    closure (callable, optional): A closure that reevaluates the model
        and returns the loss.
Nr	   z)RMSprop does not support sparse gradientsr   step
square_avgr   momentum_bufferr   grad_avgg      ?r   r   r   r   r   r
      )r   )valuer   c                     X-  S:  R                  UR                  5      nUR                  UR                  5       R	                  SS95        X-  $ )Nr   gMbP?)min)todtypediv_meanclamp_)_m_gmasks      r   _apply_caution&RMSpropTF.step.<locals>._apply_caution   sD     "!//9		$))+"4"4"4">?!y(r    r   r   )torchenable_gradr#   grad	is_sparseRuntimeErrorr%   len	ones_like
zeros_liker   mul_addadd_powaddcmulsqrt_addcdiv_)r   closurelossr&   pr>   r%   r*   one_minus_alphawd_scaler,   avgbufr:   s                 r   r)   RMSpropTF.stepb   s    ""$y % &&E8_66>vv>>&'RSS

1 u:?$%E&M*///!*<E,'Z(1,383C3CA3F/0Z(,1,<,<Q,?j)"<0
"$uW~"5f"(A-./ !9:',T{a'7$--:M'MH',T{HrH^/D$DDE#xx~1FxG j 8P $$Z0HMM$/MI$,,Xr,JNNuUZ|\bbdC %..u6<<>C$q( 12CHHU:./) -.TeDkB +"0";Ct T/ +"0";Cs;,7JJttJ=G % 'L S %$s   K
K$ )
g{Gz?g?g|=r   r   FFFTF)N)__name__
__module____qualname____firstlineno____doc__r   floatboolr   r"   r<   no_gradr)   __static_attributes____classcell__)r   s   @r   r   r      s    B "# "$)+0#'!%:%: %: 	%:
 %:  %: %: %: "%: %)%: !%: %: %:N> ]]_R Rr    r   )rX   r<   torch.optimr   _typesr   r   rS   r    r   <module>r`      s#     ! a	 ar    