
    RЦi&                     r    S r SSKrSSKJrJr  SSKrSSKJr   " S S\R                  R                  5      r
g)zAdafactor Optimizer

Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py

Modified by Ross Wightman to fix some issues with factorization dims for non nn.Linear layers

Original header/copyright below.
    N)OptionalTuple   )ParamsTc                     ^  \ rS rSrSr           SS\S\\   S\S\S\S\S	\\\\4      S
\S\	S\	S\
S\	4U 4S jjjrU 4S jr\S 5       r\SS j5       r\S 5       rS r\R&                  " 5       SS j5       rSrU =r$ )	Adafactor   a  Implements Adafactor algorithm.

This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
(see https://arxiv.org/abs/1804.04235)

Note that this optimizer internally adjusts the learning rate depending on the
*scale_parameter*, *relative_step* and *warmup_init* options.

To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
`relative_step=False`.

Ags:
    params: iterable of parameters to optimize or dicts defining parameter groups
    lr: external learning rate
    eps: regularization constants for square gradient and parameter scale respectively
    eps_scale: regularization constants for parameter scale respectively
    clip_threshold: threshold of root-mean-square of final gradient update
    decay_rate: coefficient used to compute running averages of square gradient
    beta1: coefficient used for computing running averages of gradient
    weight_decay: weight decay
    scale_parameter: if True, learning rate is scaled by root-mean-square of parameter
    warmup_init: time-dependent learning rate computation depends on whether warm-up initialization is being used
paramslreps	eps_scaleclip_threshold
decay_ratebetasweight_decayscale_parameterwarmup_initmin_dim_size_to_factorcautionc                    > U(       + nU
(       a  U(       d  [        S5      eUc  S OUS   n[        UUUUUUUU	UU
UUS9n[        [        U ]  X5        g )Nz'warmup_init requires relative_step=Truer   )r   r   r   r   r   beta1r   r   relative_stepr   r   r   )
ValueErrordictsuperr   __init__)selfr
   r   r   r   r   r   r   r   r   r   r   r   r   r   defaults	__class__s                   S/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/adafactor.pyr   Adafactor.__init__.   sk     }FGG58)!%+'##9
 	i'9    c                    > [         TU ]  U5        U R                   H'  nUR                  SS5        UR                  SS5        M)     g )Nr   Fr      )r   __setstate__param_groups
setdefault)r   stategroupr   s      r    r%   Adafactor.__setstate__R   sA    U#&&EY.5r: 'r"   c                     U S   (       a_  U S   (       a  SUS   -  OSn[        US[        R                  " US   5      -  5      nSnU S   (       a  [        U S   US	   5      nX4-  U S
'   U S
   $ )Nr   r   gư>stepg{Gz?      ?r   r   RMSr   )minmathsqrtmax)param_groupparam_statemin_steplr_tparam_scales        r    _get_lrAdafactor._get_lrX   s~    '5@5Otk&11UYHxtyyV1D'E!EFDK,-!+k":K<NO $ 2K4  r"   c                     U S   S LnS n[        U5      nUS:  a  US   U:  a  US   U:  a  SnXC4$ US:  a  US   U:  a  US   U:  a
  US-
  US-
  4nXC4$ )Nr      r   r   )r   r   )len)r3   param_shapemin_size_to_factoruse_first_momentfactoredndims         r    _get_optionsAdafactor._get_optionsc   s    &w/t;;
 !8A);;AQc@cH
 ))	 QY;r?-??KPROVhDhax)H))r"   c                 L    U R                  S5      U R                  5       S-  -  $ )Nr;   g      ?)normnumel)tensors    r    _rmsAdafactor._rmsu   s     {{1~3!677r"   c                     XR                  USS9-  R                  5       R                  U5      nUR                  U5      R                  5       n[        R
                  " XV5      $ )NT)dimkeepdim)meanrsqrt_	unsqueezersqrttorchmul)r   exp_avg_sq_rowexp_avg_sq_coldim_coldim_rowr_factorc_factors          r    _approx_sq_gradAdafactor._approx_sq_grady   sW    "%8%8Wd%8%SS[[]gghop!++G4::<yy,,r"   c                 	   SnUb%  [         R                  " 5          U" 5       nSSS5        U R                   GH  nUS    GH  nUR                  c  M  UR                  nUR                  [         R
                  [         R                  1;   a  UR                  5       nUR                  (       a  [        S5      eU R                  U   nU R                  UUR                  US   S9u  px[        U5      S:X  a  SUS'   U(       a  [         R                  " U5      US'   Ubz  Uu  pS	 n[         R                  " U" UR                  U
5      5      R!                  U5      US
'   [         R                  " U" UR                  U	5      5      R!                  U5      US'   O[         R                  " U5      US'   SUS'   OgU(       a  US   R!                  U5      US'   Ub/  US
   R!                  U5      US
'   US   R!                  U5      US'   OUS   R!                  U5      US'   UnUR                  [         R
                  [         R                  1;   a  UR                  5       nUS==   S-  ss'   U R#                  U5      US'   U R%                  X65      nS[&        R(                  " US   US   5      -
  nUS-  US   -   nUb  Uu  pUS
   nUS   nUR+                  U5      R-                  UR/                  U
S9SU-
  S9  UR+                  U5      R-                  UR/                  U	S9SU-
  S9  U R1                  UUX5      nUR+                  U5        OFUS   nUR+                  U5      R-                  USU-
  S9  UR3                  5       R+                  U5      nUR5                  U R#                  U5      US   -  R7                  SS95        UR+                  U5        U(       a  US   nUR+                  US   5      R-                  USUS   -
  S9  US   (       aS  UU-  S:  R!                  UR                  5      nUR5                  UR/                  5       R7                  SS95        UU-  nOUnUS   S:w  a  UR-                  XS   * U-  S9  UR-                  U* 5        UR                  [         R
                  [         R                  1;   d  GM  UR9                  U5        GM     GM     U$ ! , (       d  f       GN= f)zPerforms a single optimization step.
Arguments:
    closure (callable, optional): A closure that reevaluates the model and returns the loss.
Nr
   z,Adafactor does not support sparse gradients.r   )r@   r   r,   exp_avgc                     U S U XS-   S  -   $ )Nr    )shaperM   s     r    _remove_dim#Adafactor.step.<locals>._remove_dim   s    #(#;Qwx#@@r"   rU   rV   
exp_avg_sqr.   r   r-   r   r;   r   )rM   )alphar   )r/   r   r   MbP?r   )rS   enable_gradr&   graddtypefloat16bfloat16float	is_sparseRuntimeErrorr(   rD   ra   r>   
zeros_likezerostorJ   r8   r0   powmul_add_rO   r[   rR   div_clamp_copy_)r   closurelossr)   prh   r(   factored_dimsrA   rW   rX   rb   p_fp32r6   beta2tupdaterU   rV   rd   r^   masks                        r    r,   Adafactor.step   s    ""$y % &&E8_66>vv::%--!@@::<D>>&'UVV

1262C2CJJ',-E'F 3D 3/ u:?$%E&M'+0+;+;D+Ai($0+8(A27++k$**V]>^2_2b2bcg2h./27++k$**V]>^2_2b2bcg2h./.3.>.>t.Dl+#$E%L'+0+;+>+>t+Di($0278H2I2L2LT2R./278H2I2L2LT2R./.3L.A.D.DT.Jl+77u}}enn==#\\^Ff"#yy0e||E1txxfu\7JKKU5\1 ,'4$G%*+;%<N%*+;%<N"''/44V[[W[5MUX[aUa4b"''/44V[[W[5MUX[aUa4b "11..RYcFKK%!&|!4JOOF+00sV|0L'--/44T:FTYYv.7G1HHPPUXPYZD!##I.GLLw055fAgDV5WY' '$ 266tzzB		$))+"4"4"4">?!(4!((A-KK^/D.Dt.KKLVG$77u}}enn==GGFOq % 'v } %$s   S,,
S;r`   )NgKH9rf   r-   gNg        TFr$   F)r$   )N)__name__
__module____qualname____firstlineno____doc__r   r   rl   r   boolintr   r%   staticmethodr8   rD   rJ   r[   rS   no_gradr,   __static_attributes____classcell__)r   s   @r    r   r      s    6 #'#$' $37"%$( %*,!":": ": 	":
 ": "": ": E%,/0":  ": "": ": %(": ": ":H; ! ! * *" 8 8- ]]_e er"   r   )r   r0   typingr   r   rS   _typesr   optim	Optimizerr   r`   r"   r    <module>r      s0     "  P%% Pr"   