
""" PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb

This optimizer code was adapted from the following (starting with latest)
* https://github.com/HabanaAI/Model-References/blob/2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py
* https://github.com/cybertronai/pytorch-lamb

Use FusedLamb if you can (GPU). The reason for including this variant of Lamb is to have a version that is
similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or cannot install/use APEX.

In addition to some cleanup, this Lamb impl has been modified to support PyTorch XLA and has been tested on TPU.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Original copyrights for above sources are below.

Modifications Copyright 2021 Ross Wightman
    N)OptionalTuple)	Optimizer   )ParamsTc                      ^  \ rS rSrSr            SS\S\S\S\\\4   S\S\S	\S
\	\   S\S\S\S\S\4U 4S jjjr
U 4S jrS r\R                  " 5       SS j5       rSrU =r$ )LambC   a  Implements a pure pytorch variant of FuseLAMB (NvLamb variant) optimizer from apex.optimizers.FusedLAMB
    reference: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py

    LAMB was proposed in:
    - Large Batch Optimization for Deep Learning - Training BERT in 76 minutes: https://arxiv.org/abs/1904.00962
    - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: Iterable of parameters to optimize or dicts defining parameter groups.
        lr: Learning rate.
        bias_correction: Whether to apply bias correction to the first and second moment estimates.
        betas: Coefficients used for computing running averages of gradient and its norm.
        eps: Term added to the denominator to improve numerical stability.
        weight_decay: Weight decay coefficient.
        grad_averaging: Whether to apply (1 - beta1) to grad when calculating running averages of gradient.
        max_grad_norm: Value used to clip global grad norm.
        trust_clip: Enable LAMBC trust ratio clipping.
        always_adapt: Apply the layer-wise (trust ratio) adaptation even to parameters with 0.0 weight decay.
        caution: Apply cautious update masking (see 'Cautious Optimizers', https://arxiv.org/abs/2411.16085).
        decoupled_decay: Apply decoupled weight decay.
        corrected_weight_decay: Apply corrected weight decay (lr**2 / max_lr) when using decoupled_decay.
    """
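
    # Illustrative constructor call (argument values here are examples only):
    #   Lamb(model.parameters(), lr=3e-3, weight_decay=0.02, caution=True)
    # decoupled_decay=True applies the weight decay directly to the parameters instead of
    # folding it into the adaptive update; see the step() implementation below.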
paramslrbias_correctionbetasepsweight_decaygrad_averagingmax_grad_norm
trust_clipalways_adaptcautiondecoupled_decaycorrected_weight_decayc                 L   > [        UUUUUUUU	U
UUUS9n[        TU ]	  X5        g )N)r   r   r   r   r   r   r   r   r   r   r   r   )dictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   defaults	__class__s                  N/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/lamb.pyr   Lamb.__init__Z   sB      +%)'!%+#9
 	*    c                    > [         TU ]  U5        U R                   H9  nUR                  SS5        UR                  SS5        UR                  SS5        M;     g )Nr   Fr   r   )r   __setstate__param_groups
setdefault)r   stategroupr   s      r   r#   Lamb.__setstate__z   sR    U#&&EY..65u= 'r!   c                    U R                   S   nUc  g / nU R                   Ht  nUS    Hh  nUR                  c  M  UR                  nUR                  (       a  [	        S5      eUR                  [        R                  R                  U5      5        Mj     Mv     [        R                  R                  [        R                  " U5      5      nXa-  R                  SS9nU$ )Nr   r   zDLamb does not support sparse gradients, consider SparseAdam instead.      ?min)r   r$   grad	is_sparseRuntimeErrorappendtorchlinalgvector_normstackclamp_)r   r   normsr'   pr-   global_normclip_global_norms           r   _get_clip_grad_normLamb._get_clip_grad_norm   s    o6 &&E8_66>vv>>&'mnnU\\55d;< % ' ll..u{{5/AB'7??C?Hr!   c           
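
    # Update rule implemented by step() below, per parameter tensor p with gradient g:
    #   exp_avg    <- beta1 * exp_avg + beta3 * g            (beta3 = 1 - beta1 when grad averaging)
    #   exp_avg_sq <- beta2 * exp_avg_sq + (1 - beta2) * g^2
    #   update     = (exp_avg / bc1) / (sqrt(exp_avg_sq) / sqrt(bc2) + eps)  [+ weight_decay * p]
    #   p          <- p - lr * trust_ratio * update,  where trust_ratio = ||p|| / ||update||
    # with optional global grad-norm clipping, cautious masking, and LAMBC trust-ratio clipping.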
         SnUb%  [         R                  " 5          U" 5       nSSS5        U R                  5       nU R                   GH  nUS   (       a  SOSnUS   u  pgUS   (       a  SOSnU(       a  SU-
  OSn	SU;   a  US==   S-  ss'   OSUS'   U(       a  SXdS   -  -
  n
SXtS   -  -
  nOS	u  pUS
    GH  nUR                  c  M  UR                  nUb  UR                  U5        U R                  U   n[        U5      S:X  a2  [         R                  " U5      US'   [         R                  " U5      US'   US   US   nnUR                  U5      R                  XS9  UR                  U5      R                  XSU-
  S9  UR                  5       [        R                  " U5      -  R                  US   5      nX-  R                  U5      nUS   (       a^  UU-  S:  R                  UR                  5      nUR                  UR!                  5       R#                  SS95        UR                  U5        US   nUS:w  ad  UR%                  SS5      (       a=  US   (       a  US   S-  U R&                  S   -  nOUS   nUR                  UU* U-  S9  OUR                  UUS9  US:w  d
  US   (       a  UR)                  S5      nUR)                  S5      nUU-  n[         R*                  " US:  [         R*                  " US:  US5      S5      nUS   (       a  [         R,                  " USS9nUR                  U5        UR                  UUS   * S9  GM     GM     U$ ! , (       d  f       GNI= f)zPerforms a single optimization step.
Arguments:
    closure (callable, optional): A closure that reevaluates the model
        and returns the loss.
Nr   r   r   r   r   r*   step)r*   r*   r   exp_avg
exp_avg_sq)alpha)valuer   r   MbP?r+   r   r   Fr   r      r   g       @r   )max)r1   enable_gradr:   r$   r-   div_r&   len
zeros_likemul_add_addcmul_sqrtmathtodtypemeanr5   getr   normwhereclamp)r   closurelossclip_grad_normr'   r   beta1beta2r   beta3bias_correction1bias_correction2r7   r-   r&   r>   r?   denomupdatemaskr   wd_scalew_normg_normtrust_ratios                            r   r=   	Lamb.step   sO    ""$y % 113&&E#():#;aO >LE"'(8"9QqN!/AISE f" !f#$uf'=#= #$uf'=#= 5=2 8_66>vv!-IIn-

1 u:?','7'7':E)$*/*:*:1*=E,'&+I&6l8K U#(((;&//!e)/L#*TYY7G-HHNNuUZ|\!4::5A#"TMA-11$**=DIIdiik00T0:;KK%$^41$yy!2E:: !9:',T{a'7$--:M'MH',T{Hq	L(@AA\:1$n(= VVC[F#[[-F"(6/K #(++
FQJSA#K
 \*&+kk+3&GKK,veDk\2{ %' 'd o %$s   M%%
M4 )rB   T)g?g+?gư>g{Gz?Tr*   FFFFF)N)__name__
__module____qualname____firstlineno____doc__r   floatboolr   r   r   r#   r:   r1   no_gradr=   __static_attributes____classcell__)r   s   @r   r	   r	   C   s    2 $()5"&#'-0$!&!$)+0++ + "	+
 &+ +  + !+ $E?+ + + + "+ %)+ +@> $ ]]__ _r!   r	   )rj   rM   typingr   r   r1   torch.optimr   _typesr   r	   re   r!   r   <module>rs      s*   r  "  ! p9 pr!   