
    RЦih2              '       ,   S r SSKJrJrJrJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  S rS	\\S
4   S\S\S\\\\4      4S jr " S S\	5      rS\\   S\\   S\\\      S\\\      S\\\      S\\\      S\\   S\S\S\S\S\S\S\\   S\\\R(                  4   S\\   S \S!\S"\\   4&S# jrS\\   S\\   S\\\      S\\\      S\\\      S\\\      S\\   S\S\S\S\S\S\S\\   S\\\R(                  4   S\\   S \S!\S"\\   4&S$ jrg)%a  Adafactor (Big Vision variant) for PyTorch

Adapted from the implementation in big vision: https://github.com/google-research/big_vision

Described in 'Scaling Vision Transformers': https://arxiv.org/abs/2106.04560

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Adaptation and PyTorch modifications by Ross Wightman
    )ListOptionalTupleUnionN)Tensor)	Optimizer   )ParamsTc                  "    [         R                  $ )z6Get the scalar dtype that the optimizer uses for state)torchfloat64     V/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/adafactor_bv.py_get_scalar_dtyper      s    ==r   shape.factoredmin_dim_size_to_factorreturnc                     U(       a  [        U 5      S:  a  g[        S [        U 5       5       5      nXS   S      U:  a  g[        US   S   5      [        US   S   5      4$ )a  Whether to use a factored second moment estimator.

This function returns a tuple with the two largest axes to reduce over.
If no two dimensions have size >= min_dim_size_to_factor, return None.

Args:
  shape: an input shape
  factored: whether to use factored second-moment estimator for > 2d vars.
  min_dim_size_to_factor: only factor accumulator if two array dimensions have at least this size.

Returns:
  None or a tuple of ints
   Nc              3   ,   #    U  H
  u  pX!4v   M     g 7fNr   ).0ixs      r   	<genexpr>!_factored_dims.<locals>.<genexpr>/   s     >-=TQ1&-=s   r	   )lensorted	enumerateint)r   r   r   sorted_dimss       r   _factored_dimsr&      sh    $ s5zA~>Yu-=>?K_Q #99{2q!"CB(:$;;;r   c            !         ^  \ rS rSrSrSSSSSS\R                  S	S
S	SSS4SS.S\S\S\	S\S\	S\S\
\   S\\\R                  4   S\
\   S\S\
\   S\S\S\S\
\   4U 4S jjjjrU 4S jr\R"                  " 5       S S j5       rSrU =r$ )!AdafactorBigVision5   z
PyTorch implementation of BigVision's Adafactor variant with both single and multi tensor implementations.

Adapted from https://github.com/google-research/big_vision by Ross Wightman
      ?   g?r   g+?g?N        F)foreachparamslrr   
decay_ratedecay_offset	beta2_capmomentummomentum_dtypeepsweight_decayclipping_thresholdunscaled_wdcautioncorrected_weight_decayr-   c                  > [        U[        5      (       aN  US:X  a  [        R                  nO7US:X  a  [        R                  nO US:X  d
   U S35       e[        R
                  n[        UUUUUUUU	U
UUUUUS9n[        TU ]!  UU5        g )Nfloat16bfloat16float32z dtype not supported)r/   r   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r-   )	
isinstancestrr   r<   r=   r>   dictsuper__init__)selfr.   r/   r   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r-   defaults	__class__s                    r   rC   AdafactorBigVision.__init__<   s    & nc***!&:-!&%2[~6FFZ4[[2!& #9!%)%1##9
  	*r   c                 P  > [         TU ]  U5        U R                   GH  nUR                  SS5        UR                  SS5        UR                  SS 5        US    H  nU R                  R                  U0 5      n[        U5      S:w  aJ  [        R                  " US   5      (       d,  [        R                  " [        US   5      [        5       S9US'   S	U;   d  M  [        R                  " US	   5      (       d  M  US	   R                  U R                  S
   S9US	'   M     GM     g )Nr9   Fr:   r-   r.   r   stepdtypeexp_avgr4   )rB   __setstate__param_groups
setdefaultstategetr!   r   	is_tensortensorfloatr   torE   )rD   rP   grouppp_staterF   s        r   rM   AdafactorBigVision.__setstate__k   s    U#&&EY.5u=Y-8_**..B/w<1$U__WV_-M-M&+ll53IQbQd&eGFO'EOOGI<N,O,O *1);)>)>T]]ScEd)>)eGI& %	 'r   c                    S nUb%  [         R                  " 5          U" 5       nS S S 5        U R                   GH  n/ n/ n/ n/ n/ n/ n	/ n
US    GHf  nUR                  c  M  UR                  R                  (       a  [        S5      eUR                  U5        UR                  UR                  5        U R                  U   n[        U5      S:X  GaJ  [         R                  " S[        5       S9US'   UR                  R                  n[        USU R                  S   S	9nUb  Uu  nn[        UR                  R                  5      nS
UU'   [        UR                  R                  5      nS
UU'   UR                  R                  U5      US'   UR                  R                  U5      US'   O0[         R                   " UR                  [         R"                  S9US'   U R                  S   b/  [         R                   " UR                  U R                  S   S9US'   U	R                  US   5        UR                  UR%                  SS 5      5        UR                  UR%                  SS 5      5        UR                  UR%                  SS 5      5        U
R                  UR%                  SS 5      5        GMi     US   (       a  [&        nO[(        nU" S$0 SU_SU_SU_SU_SU_SU
_SU	_SUS   _SUS   _SUS   _SUS   _SUS   _SUS   _SUS   _SUS   _SUS   _S US    _S!US!   _S"US#   (       a  U R                  S   OS _6  GM     U$ ! , (       d  f       GN4= f)%Nr.   zSparse gradients not supportedr   r,   rJ   rI   Tr   )r   r   r	   exp_avg_sq_rexp_avg_sq_c)memory_format
exp_avg_sqr3   r4   rL   r-   gradsexp_avg_sq_rsexp_avg_sq_csexp_avg_sqsexp_avgsstate_stepsbeta2_decayr0   r2   r5   r/   r6   r7   r8   r9   max_lrr:   r   )r   enable_gradrN   grad	is_sparseRuntimeErrorappendrP   r!   rS   r   r   r&   rE   list	new_zeros
zeros_likepreserve_formatrQ   _multi_tensor_adafactor_single_tensor_adafactor)rD   closurelossrV   params_with_gradr_   r`   ra   rb   rd   rc   rW   rP   r   factored_dimsdcdr	row_shape	col_shapefuncs                       r   rI   AdafactorBigVision.step|   s   ""$y % &&E!EMMKKH8_66>66##&'GHH ''*QVV$

1u:?$)LL<M<O$PE&MFFLLE$2!%/3}}=U/V%M %0!.B$($6	()	"$($6	()	"010@0@0Kn-010@0@0Kn-.3.>.>qvvUZUjUj.kl+}}Z0<+0+;+;AFF$--XhJi+ji(""5=1$$UYY~t%DE$$UYY~t%DE""599\4#@A		)T :;S %V Y./ ' , ,	
 ( " ( ",/  , (--E'F %L ; #>2 z*  %%56  $))=#>!" "-0#$ i(%& /44L.Mt}}T*SW's '^ e %$s   M
Mr   r   )__name__
__module____qualname____firstlineno____doc__r   r=   r
   rT   r$   r   r   r@   rK   boolrC   rM   no_gradrI   __static_attributes____classcell__)rF   s   @r   r(   r(   5   s    *, # !$(+6;nn#'"%26 %!+0-+" ',#-+-+ -+ %(	-+
 -+ -+ -+ uo-+ "#u{{"23-+ %-+  -+ !)-+ -+ -+ %)-+" d^#-+ -+^f" ]]_U Ur   r(   r.   r_   r`   ra   rb   rc   rd   re   r2   r5   r/   r6   r3   r4   r7   r8   r9   rf   c                Z   [        U 5       GH  u  nnUU   nUU   nUU   nUU   nUU   nUU   nU
c"  UR                  [        R                  :X  a  SOSn
US-  n[	        US[        U5      U* -  -
  5      nSU-
  n[        R                  " U5      U
-   nUc  [        UR                  SU	S9u  nnUR                  UR                  USS9U5        UR                  UR                  USS9U5        UU:  a  US-
  OUn UR                  U SS9n!UU!-  R                  5       n"UR                  5       n#UU"-  U#-  n$O-Uc  Ub   eUR                  UU5        UUR                  5       -  n$UbF  U$R                  S5      U$R                  5       S	-  U-  -  R                  SS
9n%U$R                  U%5        Ub  Ub  UUR                  :w  a@  UR                  U$R!                  U5      SU-
  5        UR!                  UR                  5      n$O%UR                  U$SU-
  5        UR#                  5       n$U(       a^  U$U-  S:  R!                  UR                  5      n&U&R                  U&R                  5       R                  SS95        U$R%                  U&5        U$R%                  U5        US:w  aq  U(       a3  Uc  UR%                  SU-
  5        ORUR%                  SUU-  U-  -
  5        O7Uc  UR%                  SX-  -
  5        OUR%                  SUS-  U-  U-  -
  5        UR'                  U$SS9  GM     g )NgHz>gKH9r	   r*   T)r   )dimkeepdimr   g      ?)maxr   gMbP?)ming      )alpha)r#   rK   r   r<   r   rT   squarer&   r   lerp_meanrsqrtnormnumelclamp_div_rU   clonemul_add_)'r.   r_   r`   ra   rb   rc   rd   re   r2   r   r5   r/   r6   r3   r4   r7   r8   r9   rf   r   paramrh   r[   r\   r^   rL   step_tbeta2_tone_minus_beta2_tgrad_sqrrv   rw   	reduce_dcrow_col_mean
row_factor
col_factorupdatedenommasks'                                          r   rq   rq      s$   , f%5Qx$Q'$Q' ^
1+Q;**5$5C 	!iuV}+'F!FGK<<%+#DJJMcdFBx}}T}BDUVx}}T}BDUV"$r'QrI',,D,IL&5<<>J%++-JJ&3F  'L,@@@X'89J,,..F )[[^#(=AS'ST\\ad\eEKK G$7+fii7XF DJJ/fa(l3 )--djj9		$))+,,,67D! 	B 1>JJrL01 JJrR&[L$@@A >JJrB$556 JJrR1Wv%5$EEF 	

6
&[ &r   c                     S5       e)Nz2multi-tensor fn (foreach=True) not implemented yetr   )r.   r_   r`   ra   rb   rc   rd   re   r2   r   r5   r/   r6   r3   r4   r7   r8   r9   rf   s                      r   rp   rp   ;  s    . GFF5r   )r   typingr   r   r   r   r   r   torch.optimr   _typesr
   r   r$   r   tupler&   r(   rT   r@   rK   rq   rp   r   r   r   <module>r      s   0 /   ! 
<S#X<< !$< eCHo	<4] ]@c'Vc'F|c' HV,-c' HV,-	c'
 (6*+c' x'(c' &\c' c' c' !$c' c' c' c' 5/c'  c5;;./!c'" %UO#c'$ %c'& 'c'( )c'LGVGF|G HV,-G HV,-	G
 (6*+G x'(G &\G G G !$G G G G 5/G  c5;;./!G" %UO#G$ %G& 'G( )Gr   