
    RЦiؽ                        S r SSKrSSKJr  SSKJr  SSKJrJrJ	r	J
r
JrJrJrJrJrJr  SSKJr  SSKrSSKrSSKJr  SSKrSSKJrJr  SS	KJrJrJr  SS
KJr  SSK J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1  SSK2J3r3  SSK4J5r5  SSK6J7r7  SSK8J9r9  SSK:J;r;  SSK<J=r=  SSK>J?r?  SSK@JArA  SSKBJCrC  SSKDJErE  SSKFJGrG  SSKHJIrI  SS KJJKrK  SS!KLJMrM  \R                  " \O5      rPS"\QS#\4S$ jrR\" S%S&9 " S' S(5      5       rS " S) S*5      rTS+\TS#S4S, jrUS+\TS#S4S- jrVS+\TS#S4S. jrWS+\TS#S4S/ jrXS+\TS#S4S0 jrYS+\TS#S4S1 jrZS+\TS#S4S2 jr[S+\TS#S4S3 jr\\T" 5       r]SOS4 jr^\^" 5            SPS5\\Q\\Q   4   S6\\\Q      S7\_S#\\\Q\\Q\Q4   4      4S8 jjr`S9\QS#\S4S: jra SQS9\QS;\_S#\\\4   4S< jjrb            SRS=\\R                  \4   S>\QS?\\d   S@\dSA\dSB\\_   SC\_SD\	\Q   SE\_SF\\d   SG\dSH\\d   SI\\\R                  /\4      SJ\S#\R                  R                  4SK jjrgSL rh SQSM\\R                  \4   SC\_S#\R                  R                  4SN jjrig)SzrOptimizer Factory w/ custom Weight Decay & Layer Decay support

Hacked together by / Copyright 2021 Ross Wightman
    N)	dataclass)partial)
AnyCallable
CollectionDictListOptionalSetTupleTypeUnionfnmatch   )param_groups_layer_decayparam_groups_weight_decay)ParamsT	OptimTypeOptimizerCallable)	AdaBelief)	Adafactor)AdafactorBigVision)
Adahessian)AdamP)AdamWLegacy)Adan)Adopt)Kron)Lamb)LaProp)Lars)Lion)	Lookahead)MADGRAD)Mars)Muon)NAdamLegacy)NAdamW)
NvNovoGrad)RAdamLegacy)	RMSpropTF)SGDP)SGDWclass_stringreturnc                      U R                  SS5      u  p[        R                  " U5      n[        X25      $ ! [        [
        4 a  n[	        SU  SU 35      eSnAff = f)z)Dynamically import a class from a string..r   zCould not import z: N)rsplit	importlibimport_modulegetattrImportErrorAttributeError)r/   module_name
class_namemodulees        X/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/timm/optim/_optim_factory.py_import_classr>   .   sh    C"."5"5c1"=((5v**( C-l^2aSABBCs   47 AAAT)frozenc                       \ rS rSr% Sr\\S'   \\\4   \S'   Sr	\\S'   Sr
\\S'   S	r\\S
'   S	r\\S'   Sr\\S'   S	r\\S'   Sr\\\\4      \S'   Srg)	OptimInfo9   a$  Immutable configuration for an optimizer.

Attributes:
    name: Unique identifier for the optimizer
    opt_class: The optimizer class
    description: Brief description of the optimizer's characteristics and behavior
    has_eps: Whether the optimizer accepts epsilon parameter
    has_momentum: Whether the optimizer accepts momentum parameter
    has_betas: Whether the optimizer accepts a tuple of beta parameters
    num_betas: number of betas in tuple (valid IFF has_betas = True)
    defaults: Optional default parameters for the optimizer
name	opt_class descriptionThas_epsFhas_momentum	has_betas   	num_betassecond_orderNdefaults )__name__
__module____qualname____firstlineno____doc__str__annotations__r   r   rF   rG   boolrH   rI   rK   intrL   rM   r
   r   r   __static_attributes__rN       r=   rA   rA   9   sp     IS)^$$KGTL$ItIsL$)-HhtCH~&-rY   rA   c                      \ rS rSrSrS%S jrS\SS4S jrS\S	\SS4S
 jr	S\SS4S jr
   S&S\\\\   4   S\\\      S\S\\\\\\4   4      4S jjrS\S\4S jr S'S\\\4   S\S\\\4   4S jjr           S(S\\R,                  \4   S\S\\   S\S\S\\   S\S\\   S\S\\   S\\   S \\   S!\\\R,                  /\4      S"\S\R:                  R<                  4S# jjrS$r g))OptimizerRegistryR   zRegistry managing optimizer configurations and instantiation.

This class provides a central registry for optimizer configurations and handles
their instantiation with appropriate parameter groups and settings.
r0   Nc                 "    0 U l         S1U l        g )Nlion)_optimizers_foreach_defaults)selfs    r=   __init__OptimizerRegistry.__init__Y   s    13,28rY   infoc                     UR                   R                  5       nX R                  ;   a  [        R	                  SU S35        XR                  U'   g)zxRegister an optimizer configuration.

Args:
    info: The OptimInfo configuration containing name, type and description

Optimizer z  already registered, overwritingN)rC   lowerr_   _loggerwarning)ra   rd   rC   s      r=   registerOptimizerRegistry.register]   sD     yy ###OOj.NOP!%rY   aliastargetc                     UR                  5       nX R                  ;  a  [        SU 35      eU R                  U   U R                  UR                  5       '   g)zRegister an alias for an existing optimizer.

Args:
    alias: The alias name
    target: The target optimizer name

Raises:
    KeyError: If target optimizer doesn't exist
z/Cannot create alias for non-existent optimizer N)rg   r_   KeyError)ra   rl   rm   s      r=   register_alias OptimizerRegistry.register_aliash   sO     )))LVHUVV*.*:*:6*B'rY   rC   c                 V    U R                   R                  UR                  5       5        g)z4Register an optimizer as defaulting to foreach=True.N)r`   addrg   ra   rC   s     r=   register_foreach_default*OptimizerRegistry.register_foreach_defaultw   s    ""4::<0rY   filterexclude_filterswith_descriptionc                   ^
 [        U R                  R                  5       5      nU(       aT  [        U[        5      (       a  U/nOUn[        5       nU H  m
UR                  U
4S jU 5       5        M      [        U5      nU(       a-  U H'  nU Vs/ s H  n[        X5      (       a  M  UPM     nnM)     U(       a+  U V	s/ s H  oU R                  U	   R                  4PM     sn	$ U$ s  snf s  sn	f )aG  List available optimizer names, optionally filtered.

Args:
    filter: Wildcard style filter string (e.g., 'adam*')
    exclude_filters: Optional list of wildcard patterns to exclude
    with_description: If True, return tuples of (name, description)

Returns:
    List of either optimizer names or (name, description) tuples
c              3   N   >#    U  H  n[        UT5      (       d  M  Uv   M     g 7fNr   ).0nfs     r=   	<genexpr>4OptimizerRegistry.list_optimizers.<locals>.<genexpr>   s     %HA'!Q-aas   %	%)	sortedr_   keys
isinstancerT   setupdater   rF   )ra   rw   rx   ry   namesfiltersfiltered_namesexclude_filterr~   rC   r   s             @r=   list_optimizers!OptimizerRegistry.list_optimizers{   s      t'',,./&#&&!(  UN%%%H%HH >*E"1$)LEq1KEL #2 KPQ544++D1==>5QQ M Rs   C((C(?$C-c                 |    UR                  5       nXR                  ;  a  [        SU S35      eU R                  U   $ )Get the OptimInfo for an optimizer.

Args:
    name: Name of the optimizer

Returns:
    OptimInfo configuration

Raises:
    ValueError: If optimizer is not found
rf   z not found in registry)rg   r_   
ValueErrorrt   s     r=   get_optimizer_info$OptimizerRegistry.get_optimizer_info   sA     zz|'''z$/EFGG%%rY   name_or_infobind_defaultsc                 `   [        U[        5      (       a  U R                  U5      nO[        U[        5      (       d   eUn[        UR                  [        5      (       a  UR                  R                  S5      (       aA  [        R                  R                  5       (       d   S5       e [        UR                  5      nOUR                  R                  S5      (       aA  [        R                  R                  5       (       d   S5       e [        UR                  5      nO"[        UR                  5      nOUR                  nU(       a'  UR                  (       a  [        U40 UR                  D6nU$ ! [         a  n[        S5      UeSnAff = f! [         a  n[        S5      UeSnAff = f)a  Get the optimizer class with any default arguments applied.

This allows direct instantiation of optimizers with their default configs
without going through the full factory.

Args:
    name_or_info: Name of the optimizer
    bind_defaults: Bind default arguments to optimizer class via `partial` before returning

Returns:
    Optimizer class or partial with defaults applied

Raises:
    ValueError: If optimizer not found
zapex.z!CUDA required for APEX optimizersz,APEX optimizers require apex to be installedNzbitsandbytes.z)CUDA required for bitsandbytes optimizersz<bitsandbytes optimizers require bitsandbytes to be installed)r   rT   r   rA   rD   
startswithtorchcudais_availabler>   r7   rM   r   )ra   r   r   opt_inforD   r<   s         r=   get_optimizer_class%OptimizerRegistry.get_optimizer_class   sp   ( lC((..|<HlI6666#Hh((#..!!,,W55zz..00U2UU0] -h.@.@ AI ##..??zz..00]2]]0m -h.@.@ AI *(*<*<=	 **I X..	?X->->?I# # ]%&TU[\\] # m%&dekllms0   +E4 F 4
F>F

F
F-F((F-model_or_paramsoptlrweight_decaymomentumforeachweight_decay_exclude_1dfallback_listfallback_no_weight_decaylayer_decaylayer_decay_min_scalelayer_decay_no_opt_scaleparam_group_fnkwargsc                    [        U[        R                  5      (       ai  [        USS 5      " 5       nU(       a	  U" U5      nOHU
b  [	        UUU
UUU	UUUS9	nSnO1U(       a  U(       a  [        UUUUU	S9nSnOUR                  5       nOUnUR                  5       R                  S5      nUS   n[        U5      S	:  a  US
   S:H  OSnU R                  U5      nSU0UEnUb  UUS'   UR                  (       a6  UR                  R                  5        H  u  nnUR                  UU5        M     UR                  (       a  UR                  SU5        UR                  (       d  UR!                  SS5        UR"                  (       d  UR!                  SS5        Ub  UR                  SU5        U R%                  USS9nU" U40 UD6nU(       a  ['        U5      nU$ )a  Create an optimizer instance.

Args:
    model_or_params: Model or parameters to optimize
    opt: Name of optimizer to create
    lr: Learning rate
    weight_decay: Weight decay factor
    momentum: Momentum factor for applicable optimizers
    foreach: Enable/disable foreach operation
    weight_decay_exclude_1d: Whether to skip weight decay for 1d params (biases and norm affine)
    fallback_list: Collection of parameter name patterns to use fallback optimizer for hybrid optimizers
    fallback_no_weight_decay: If True, params in no_weight_decay list will use fallback optimizer (e.g., AdamW for Muon)
    layer_decay: Layer-wise learning rate decay
    layer_scale_min_scale: Minimum layer scale factor clamp value
    layer_scale_no_opt_scale: Layer scale below which optimization is disabled
    param_group_fn: Optional custom parameter grouping function
    **kwargs: Additional optimizer-specific arguments

Returns:
    Configured optimizer instance

Raises:
    ValueError: If optimizer not found or configuration invalid
no_weight_decayc                      [        5       $ r|   )r   rN   rY   r=   <lambda>4OptimizerRegistry.create_optimizer.<locals>.<lambda>  s    RURWrY   N)r   r   no_weight_decay_listr   r   r   	min_scaleno_opt_scale        )r   r   r   r   _r   r   	lookaheadFr   r   r   epsbetasr   r   )r   nnModuler6   r   r   
parametersrg   splitlenr   rM   items
setdefaultrH   rG   poprI   r   r$   )ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   params	opt_splitopt_nameuse_lookaheadr   opt_argskvrD   	optimizers                             r=   create_optimizer"OptimizerRegistry.create_optimizer   s   V oryy11%o7H-XZO'8(1#!- +)8"/-E,C3!9
  ""92#!-)8"/-E  "(335 %F IIK%%c*	R=7:9~7I	!3u**84 %3L#KF#K >HTN  ))//11##Aq) 2   
H5 LL%!!LL$' 	73 ,,XU,K	f11	 !),IrY   )r`   r_   r0   NrE   NFT)Nr   ?NTrN   FNNNN)!rO   rP   rQ   rR   rS   rb   rA   rj   rT   rp   ru   r   r	   r
   rV   r   r   r   r   r   r   r   r   r   floatr   r   r   r   optim	Optimizerr   rX   rN   rY   r=   r[   r[   R   s   4	&Y 	&4 	&CC C C C1S 1T 1 -/37%*	##tCy.)# &d3i0# #	#
 
eCsCx()	*#J&s &y &( #'1Y/1  1 
y++	,	1n #'"$!&*,0-/-2+/598<GKy"299g#56y y 	y
  y y d^y &*y &c?y '+y "%y $,E?y '/uoy %Xryyk7.B%CDy y  
		!y yrY   r[   registryc                 "   [        S[        R                  R                  SSSSS0S9[        S[        R                  R                  SSSSS0S9[        S	[        S
SSS0S9[        S[
        SSSSS0S9/nU H  nU R                  U5        M     g)zRegister SGD-based optimizerssgdzDtorch.Optim Stochastic Gradient Descent (SGD) with Nesterov momentumFTnesterovrC   rD   rF   rG   rH   rM   r   zEtorch.Optim Stochastic Gradient Descent (SGD) with classical momentumsgdpz0SGD with built-in projection to unit norm sphererC   rD   rF   rH   rM   sgdwz5SGD with decoupled weight decay and Nesterov momentumN)rA   r   r   SGDr-   r.   rj   )r   sgd_optimizersr   s      r=   _register_sgd_variantsr   `  s     	kkoo^ $'	
 	kkoo_ %(	
 	J $'	
 	O $'	
1 NB # rY   c                    [        S[        R                  R                  SSS9[        S[        R                  R                  SSS9[        S[
        SSS9[        S	[        S
SSSS.S9[        S[        R                  R                  SSS9[        S[        SSS9[        S[        SSS9[        S[        R                  R                  SSS9[        S[        SSS9[        S[        R                  R                  SSSS0S9[        S[        R                  R                  SSS9[        S[        SS9[        S [        S!S9[        S"[        S#S9[        S$[        S%S&S0S'9/nU H  nU R!                  U5        M     g())zRegister Adam-based optimizersadamz,torch.optim.Adam, Adaptive Moment EstimationTrC   rD   rF   rI   adamwz3torch.optim.AdamW, Adam with decoupled weight decayadamwlegacyz<legacy impl of AdamW that pre-dates inclusion to torch.optimadampz1Adam with built-in projection to unit norm sphere{Gz?)wd_ratior   rC   rD   rF   rI   rM   nadamz.torch.optim.NAdam, Adam with Nesterov momentumnadamlegacyz<legacy impl of NAdam that pre-dates inclusion in torch.optimnadamwz]Adam with Nesterov momentum and decoupled weight decay, mlcommons/algorithmic-efficiency implradamz:torch.optim.RAdam, Rectified Adam with variance adaptationradamlegacyz;legacy impl of RAdam that predates inclusion in torch.optimradamwzVtorch.optim.RAdamW, Rectified Adam with variance adaptation and decoupled weight decaydecoupled_weight_decayadamaxzCtorch.optim.Adamax, Adam with infinity norm for more stable updates	adafactorz?Memory-efficient implementation of Adam with factored gradientsrC   rD   rF   adafactorbvzPBig Vision variant of Adafactor with factored gradients, half precision momentumadoptuB   Modified Adam that can converge with any β2 with the optimal rateadoptwuU   Modified AdamW (decoupled decay) that can converge with any β2 with the optimal rate	decoupledrC   rD   rF   rM   N)rA   r   r   AdamAdamWr   r   NAdamr(   r)   RAdamr+   Adamaxr   r   r   rj   )r   adam_optimizersr   s      r=   _register_adam_variantsr     s    	kk&&F		
 	kk''M		
 	!V		
 	K"&D9	
 	kk''H		
 	!V		
 	w		
 	kk''T		
 	!U		
 	kk''p.5	
 	kk((]		
 	Y	

 	(j	

 	\	

 	o!4(		
iZOv # rY   c                 T   [        S[        SSS9[        S[        SSSS0S9[        S	[        S
SSS0S9[        S[        SSSSS.S9[        S[        SSS9[        S[        SSSS0S9[        S[        SSSS0S9[        S[        SSSSS.S9/nU H  nU R                  U5        M     g)zRegister LAMB and LARS variantslambz2Layer-wise Adaptive Moments for batch optimizationTr   lambcz,LAMB with trust ratio clipping for stability
trust_clipr   lambwz LAMB with decoupled weight decaydecoupled_decaylambcwz@LAMB with trust ratio clipping for stability and decoupled decay)r   r  larsz Layer-wise Adaptive Rate ScalingrC   rD   rF   rH   larcz,LARS with trust ratio clipping for stabilityr   nlarszLARS with Nesterov momentumr   nlarcz2LARS with Nesterov momentum & trust ratio clipping)r   r   N)rA   r    r"   rj   )r   lamb_lars_optimizersr   s      r=   _register_lamb_larsr	    s    	L		
 	F"D)	
 	:'.	
 	Z$(TB	
 	:		
 	F"D)	
 	5 $'	
 	L"&d;	
a7p $# $rY   c                    [        S[        SSSS0S9[        S[        SSSS0S9[        S[        S	S
SSSS.S9[        S[        SSSS.S9[        S[
        SSSSS.S9[        S[        SSSSS.S9[        S[        SS
SSS0S9[        S[        SSSS0S9[        S[        SSSSSS.S9[        S [        S!SS0S9/
nU H  nU R                  U5        M     [        S"[        S#SSSS$.S9[        S%[        S&SSSS'.S9[        S([        S)SSSS$.S9[        S*[        S+S
SSSSS,.S9[        S-[        S.S
SSSS$.S9[        S/[        S0SSS$.S9/nU H  nU R                  U5        M     g1)2z2Register corrected weight decay optimizer variantsadamcu7   AdamW with corrected weight decay (lr²/max_lr scaling)Tcorrected_weight_decayr   nadamcu8   NAdamW with corrected weight decay (lr²/max_lr scaling)sgdcu?   SGD with corrected decoupled weight decay (lr²/max_lr scaling)F)r   r  r   adoptcuA   Adopt with corrected decoupled weight decay (lr²/max_lr scaling))r   r  r   lambcdu@   LAMB with corrected decoupled weight decay (lr²/max_lr scaling))r  r  kroncuE   PSGD Kron with corrected decoupled weight decay (lr²/max_lr scaling)r   lioncu6   Lion with corrected weight decay (lr²/max_lr scaling)rC   rD   rF   rG   rI   rM   lapropcu8   LaProp with corrected weight decay (lr²/max_lr scaling)
rmsproptfcuL   RMSprop TF-style with corrected decoupled weight decay (lr²/max_lr scaling)r   )alphar  r  adafactorbvcuS   Adafactor Big Vision with corrected weight decay (lr²/max_lr or lr/max_lr scaling)cadamcu@   Cautious AdamW with corrected weight decay (lr²/max_lr scaling))cautionr  cadoptcuJ   Cautious Adopt with corrected decoupled weight decay (lr²/max_lr scaling))r   r  r  cnadamcuA   Cautious NAdamW with corrected weight decay (lr²/max_lr scaling)csgdcuH   Cautious SGD with corrected decoupled weight decay (lr²/max_lr scaling))r   r  r  clioncu?   Cautious Lion with corrected weight decay (lr²/max_lr scaling)cadafactorbvcz9Cautious Adafactor Big Vision with corrected weight decayN)rA   r   r)   r.   r   r    r   r#   r!   r,   r   rj   )r   corrected_optimizersr   cautious_correcteds       r=   $_register_corrected_decay_optimizersr!  &  s4    	!Q.5	
 	R.5	
 	Y"&$G	
 	[#'4H		
 	Z)-N	
 	_)-N	
 	P.5	
 	R.5	
 	f"tW[\	
 	(m.5		
CGP $# $
 	!Z!%F	
 	d#'DTXY		
 	[!%F	
 	b"&4SWX	
 	Y!%F	
 	 (S!%F		
K+X "# "rY   c                    [        S[        SSS0S9[        S[        SSS0S9[        S[        S	SSS0S
9[        S[        SSS0S9[        S[
        SSSS.SSS9[        S[
        SSSS.SSS9[        S[        SSSS.S9[        S[        SSSS0S
9[        S[        SSSSS.S
9[        S[        SSSS0S
9[        S[        S SSSS0S!9[        S"[        S#SSS0S
9[        S$[        S%SSS0S
9[        S&[        S'SS(SS).S*9[        S+[        S,SSSSS-.S.9[        S/[        S0SS1SSS2.S
9/nU H  nU R                  U5        M     g )3N
cadafactorzCautious Adafactorr  Tr   cadafactorbvzCautious Big Vision AdafactorcadamwzCautious AdamWr   cadoptzCautious Adoptcadanz-Cautious Adaptive Nesterov Momentum AlgorithmF)r  no_prox   rC   rD   rF   rM   rI   rK   cadanwz?Cautious Adaptive Nesterov Momentum with decoupled weight decaycadoptwz!Cautious AdoptW (decoupled decay))r   r  clambzCautious LAMBclambwz)Cautious LAMB with decoupled weight decay)r  r  clapropzCautious LaPropclionzCautious Lionr  cmarszCautious MARScnadamwzCautious NAdamW
crmsproptfz!Cautious TensorFlow-style RMSpropr   )r  r  r   csgdwz>Cautious SGD with decoupled weight decay and Nesterov momentum)r   r  r   cadampzQAdd the spherical cautious optimizer and the standard cautious optimizer to AdamPr   )r   r   r  )rA   r   r   r   r   r   r    r!   r#   r&   r)   r,   r.   r   rj   )r   cautious_optimizersr   s      r=   _register_cautious_optimizersr7    s   ,&		
 	(7&		
 	!(&	
 	(&		
 	G!%%8	
 	Y!%$7	
 	;#'D9		
 	'&	
 	C!%$?	
 	)&	
 	'!4(	
 	'&	
 	)&	
 	;"t4	
 	X"&48	
 	k"&DTJ	
Uqd ## #rY   c                    [        S[        SSSS0S9[        S[        SSSS0S9[        S	[        R                  R                  S
S9[        S[        R                  R
                  SSS0S9[        S[        SSS0SSS9[        S[        SSS0SSS9[        S[        SSSS9[        S[        SSSS9[        S[        SSSS S0S!9[        S"[        S#SS$9[        S%[        S&SSS'9[        S([        S)SS*9[        S+[        S,SS S0S-9[        S.[        S/SS$9[        S0[        S1SSSS29[        S3[        S4SSSS5S0S69[        S7[        S8SSSS9S70S69[        S:[        S;SSSS7SS<.S69[        S=[        S>SS$9[        S?[        R                  R                  S@SSASB0S-9[        SC[         SDSSASB0S-9/nU H  nU R#                  U5        M     U R%                  S%5        gE)Fz!Register miscellaneous optimizers	adabeliefz7Adapts learning rate based on gradient prediction errorTrectifyFr   
radabeliefz,Rectified AdaBelief with variance adaptationadadeltazQtorch.optim.Adadelta, Adapts learning rates based on running windows of gradientsr   adagradzMtorch.optim.Adagrad, Adapts learning rates using cumulative squared gradientsr   g:0yE>r   adanz$Adaptive Nesterov Momentum Algorithmr(  r)  r*  adanwz6Adaptive Nesterov Momentum with decoupled weight decay
adahessianz"An Adaptive Second Order Optimizer)rC   rD   rF   rI   rL   kronz5PSGD optimizer with Kronecker-factored preconditioner)rC   rD   rF   rG   rH   kronwzPPSGD optimizer with Kronecker-factored preconditioner and decoupled weight decayr  )rC   rD   rF   rH   rG   rM   lapropz*Separating Momentum and Adaptivity in Adamr   r^   z8Evolved Sign Momentum optimizer for improved convergence)rC   rD   rF   rG   rI   madgradz'Momentum-based Adaptive gradient methodr  madgradwz#MADGRAD with decoupled weight decayr   marszDUnleashing the Power of Variance Reduction for Training Large ModelsmuonzJMomentUm Orthogonalized by Newton-schulz with AdamW fallback for 1D params)rC   rD   rF   rH   rG   rI   nmuonzXMomentUm Orthogonalized by Newton-schulz with Nesterov and NAdamW fallback for 1D paramsr   )rC   rD   rF   rH   rG   rI   rM   adamuonzQAdaMuon: Muon with adaptive second moment estimation on orthogonalized directionsalgonadamuonz@AdaMuon with Nesterov momentum and NAdamW fallback for 1D params)rJ  r   novogradz3Normalized Adam with L2 norm gradient normalizationrmspropz1torch.optim.RMSprop, Root Mean Square Propagationr  r   	rmsproptfzETensorFlow-style RMSprop implementation, Root Mean Square PropagationN)rA   r   r   r   AdadeltaAdagradr   r   r   r!   r#   r%   r&   r'   r*   RMSpropr,   rj   ru   )r   other_optimizersr   s      r=   _register_other_optimizersrS    s    	Q'	
 	F&	
 	kk**k	

 	kk))gT]		
 	>'	
 	P&	
 	 <	
 	O	
 	j'.	
 	D		
 	R	
 	A		
 	='.	
 	^		
 	d	
 	r $'	
 	ki(	
 	Z'T:	
 	 M		
 	kk))Ks^	
 	_s^	
aWp  #  %%f-rY   c                     [        SSSSSSS0S9[        SS	S
SSS0S9[        SS	SSSS0S9[        SSSSS9[        SSSSSS0S9/nU H  nU R                  U5        M     g)z&Register APEX optimizers (lazy import)fusedsgdzapex.optimizers.FusedSGDz8NVIDIA APEX fused SGD implementation for faster trainingFTr   r   	fusedadamzapex.optimizers.FusedAdamz%NVIDIA APEX fused Adam implementationadam_w_moder   
fusedadamwz&NVIDIA APEX fused AdamW implementation	fusedlambzapex.optimizers.FusedLAMBz%NVIDIA APEX fused LAMB implementationr   fusednovogradzapex.optimizers.FusedNovoGradz)NVIDIA APEX fused NovoGrad implementationr   )gffffff?g\(\?NrA   rj   )r   apex_optimizersr   s      r=   _register_apex_optimizersr]    s     	0R $'	
 	1?#U+	
 	1@#T*	
 	1?		
 	 5C|,	
;$OJ # rY   c                 0   [        SSSSSSS0S9[        SS	S
SSSS0S9[        SSSSS9[        SSSSS9[        SSSSS9[        SSSSS9[        SSSSSS9[        SSSSSS9[        SSSSS S!9[        S"S#S$SS S!9/
nU H  nU R                  U5        M     g%)&z.Register bitsandbytes optimizers (lazy import)bnbsgdzbitsandbytes.optim.SGDzbitsandbytes SGDFTr   r   
bnbsgd8bitzbitsandbytes.optim.SGD8bitz0bitsandbytes 8-bit SGD with dynamic quantizationbnbadamzbitsandbytes.optim.Adamzbitsandbytes Adamr   bnbadam8bitz1bitsandbytes 8-bit Adam with dynamic quantizationbnbadamwzbitsandbytes.optim.AdamWzbitsandbytes AdamWbnbadamw8bitz2bitsandbytes 8-bit AdamW with dynamic quantizationbnblionzbitsandbytes.optim.Lionzbitsandbytes Lion)rF   rG   rI   bnblion8bitzbitsandbytes.optim.Lion8bitz1bitsandbytes 8-bit Lion with dynamic quantizationbnbademamixzbitsandbytes.optim.AdEMAMixzbitsandbytes AdEMAMixr)  )rF   rI   rK   bnbademamix8bitzbitsandbytes.optim.AdEMAMix8bitz5bitsandbytes 8-bit AdEMAMix with dynamic quantizationNr[  )r   bnb_optimizersr   s      r=   _register_bnb_optimizersrj    s$    	.* $'	
 	2J $'	
 	/+		
 	/K		
 	0,		
 	0L		
 	%+	
 	)K	
 	)/	
 	-O	
}ENL # rY   c                  L   [        [        5        [        [        5        [        [        5        [	        [        5        [        [        5        [        [        5        [        [        5        [        [        5        [        R                  SS5        [        R                  SS5        g)z7Register all default optimizers to the global registry.r   r   	nesterovwr   N)
r   default_registryr   r	  rS  r]  rj  r7  r!  rp   rN   rY   r=   _register_default_optimizersrn  2  sn     +,,-()/0./-.!"23()9: ##J6##K8rY   rw   rx   ry   c                 .    [         R                  XU5      $ )a  List available optimizer names, optionally filtered.

List all registered optimizers, with optional filtering using wildcard patterns.
Optimizers can be filtered using include and exclude patterns, and can optionally
return descriptions with each optimizer name.

Args:
    filter: Wildcard style filter string or list of filter strings
        (e.g., 'adam*' for all Adam variants, or ['adam*', '*8bit'] for
        Adam variants and 8-bit optimizers). Empty string means no filtering.
    exclude_filters: Optional list of wildcard patterns to exclude. For example,
        ['*8bit', 'fused*'] would exclude 8-bit and fused implementations.
    with_description: If True, returns tuples of (name, description) instead of
        just names. Descriptions provide brief explanations of optimizer characteristics.

Returns:
    If with_description is False:
        List of optimizer names as strings (e.g., ['adam', 'adamw', ...])
    If with_description is True:
        List of tuples of (name, description) (e.g., [('adam', 'Adaptive Moment...'), ...])

Examples:
    >>> list_optimizers()
    ['adam', 'adamw', 'sgd', ...]

    >>> list_optimizers(['la*', 'nla*'])  # List lamb & lars
    ['lamb', 'lambc', 'larc', 'lars', 'nlarc', 'nlars']

    >>> list_optimizers('*adam*', exclude_filters=['bnb*', 'fused*'])  # Exclude bnb & apex adam optimizers
    ['adam', 'adamax', 'adamp', 'adamw', 'nadam', 'nadamw', 'radam']

    >>> list_optimizers(with_description=True)  # Get descriptions
    [('adabelief', 'Adapts learning rate based on gradient prediction error'),
     ('adadelta', 'torch.optim Adadelta, Adapts learning rates based on running windows of gradients'),
     ('adafactor', 'Memory-efficient implementation of Adam with factored gradients'),
    ...]
)rm  r   )rw   rx   ry   s      r=   r   r   H  s    T ++FEUVVrY   rC   c                 ,    [         R                  U 5      $ )r   )rm  r   )rC   s    r=   r   r   u  s     ..t44rY   r   c                 (    [         R                  XS9$ )a)  Get optimizer class by name with option to bind default arguments.

Retrieves the optimizer class or a partial function with default arguments bound.
This allows direct instantiation of optimizers with their default configurations
without going through the full factory.

Args:
    name: Name of the optimizer to retrieve (e.g., 'adam', 'sgd')
    bind_defaults: If True, returns a partial function with default arguments from OptimInfo bound.
        If False, returns the raw optimizer class.

Returns:
    If bind_defaults is False:
        The optimizer class (e.g., torch.optim.Adam)
    If bind_defaults is True:
        A partial function with default arguments bound

Raises:
    ValueError: If optimizer name is not found in registry

Examples:
    >>> # Get SGD with nesterov momentum default
    >>> SGD = get_optimizer_class('sgd')  # nesterov=True bound
    >>> opt = SGD(model.parameters(), lr=0.1, momentum=0.9)

    >>> # Get raw optimizer class
    >>> SGD = get_optimizer_class('sgd')
    >>> opt = SGD(model.parameters(), lr=1e-3, momentum=0.9)

r   )rm  r   )rC   r   s     r=   r   r     s    D ///RRrY   r   r   r   r   r   r   filter_bias_and_bnr   r   r   r   r   r   r   c                 J    [         R                  " U 4UUUUUUUUU	U
UUS.UD6$ )aH  Create an optimizer instance via timm registry.

Creates and configures an optimizer with appropriate parameter groups and settings.
Supports automatic parameter group creation for weight decay and layer-wise learning
rates, as well as custom parameter grouping.

Args:
    model_or_params: A PyTorch model or an iterable of parameters/parameter groups.
        If a model is provided, parameters will be automatically extracted and grouped
        based on the other arguments.
    opt: Name of the optimizer to create (e.g., 'adam', 'adamw', 'sgd').
        Use list_optimizers() to see available options.
    lr: Learning rate. If None, will use the optimizer's default.
    weight_decay: Weight decay factor. Will be used to create param groups if model_or_params is a model.
    momentum: Momentum factor for optimizers that support it. Only used if the
        chosen optimizer accepts a momentum parameter.
    foreach: Enable/disable foreach (multi-tensor) implementation if available.
        If None, will use optimizer-specific defaults.
    filter_bias_and_bn: If True, bias, norm layer parameters (all 1d params) will not have
        weight decay applied. Only used when model_or_params is a model and
        weight_decay > 0.
    fallback_list: Collection of parameter name patterns to use fallback optimizer for
        hybrid optimizers (e.g., AdamW for Muon). Supports wildcard matching.
    fallback_no_weight_decay: If True, params in model's no_weight_decay() list will use
        fallback optimizer for hybrid optimizers (e.g., AdamW for Muon).
    layer_decay: Optional layer-wise learning rate decay factor. If provided,
        learning rates will be scaled by layer_decay^(max_depth - layer_depth).
        Only used when model_or_params is a model.
    param_group_fn: Optional function to create custom parameter groups.
        If provided, other parameter grouping options will be ignored.
    **kwargs: Additional optimizer-specific arguments (e.g., betas for Adam).

Returns:
    Configured optimizer instance.

Examples:
    >>> # Basic usage with a model
    >>> optimizer = create_optimizer_v2(model, 'adamw', lr=1e-3)

    >>> # SGD with momentum and weight decay
    >>> optimizer = create_optimizer_v2(
    ...     model, 'sgd', lr=0.1, momentum=0.9, weight_decay=1e-4
    ... )

    >>> # Adam with layer-wise learning rate decay
    >>> optimizer = create_optimizer_v2(
    ...     model, 'adam', lr=1e-3, layer_decay=0.7
    ... )

    >>> # Custom parameter groups
    >>> def group_fn(model):
    ...     return [
    ...         {'params': model.backbone.parameters(), 'lr': 1e-4},
    ...         {'params': model.head.parameters(), 'lr': 1e-3}
    ...     ]
    >>> optimizer = create_optimizer_v2(
    ...     model, 'sgd', param_group_fn=group_fn
    ... )

Note:
    Parameter group handling precedence:
    1. If param_group_fn is provided, it will be used exclusively
    2. If layer_decay is provided, layer-wise groups will be created
    3. If weight_decay > 0 and filter_bias_and_bn is True, weight decay groups will be created
    4. Otherwise, all parameters will be in a single group
)r   r   r   r   r   r   r   r   r   r   r   r   )rm  r   )r   r   r   r   r   r   rr  r   r   r   r   r   r   r   s                 r=   create_optimizer_v2rt    sN    f ,,! 2#!93!9%  rY   c                    U R                   U R                  U R                  U R                  S.n[	        U SS5      =nb  X!S'   [	        U SS5      =nb  X1S'   [	        U SS5      =nb  XAS'   [	        U SS5      =nb  XQS'   [	        U S	S5      =nb  XaS	'   [	        U S
S5      =nb  UR                  U5        [	        U SS5      =nb  XS'   U$ )zGConvert argparse-style `cfg` object to kwargs for an optimizer factory.)r   r   r   r   opt_epsNr   	opt_betasr   r   r   r   r   opt_foreachr   )r   r   r   r   r6   r   )	cfgr   r   r   r   ld_min	ld_no_optr   r   s	            r=   optimizer_kwargsr|    s     ))	F sIt,,9uk400=wsM488E +}#6==J*0&'S"<dCC	P-6)*CT22?h3t44A#yMrY   modelc                 0    [        U40 [        U S9DSU0D6$ )zbLegacy optimizer factory for backwards compatibility.
NOTE: Use create_optimizer_v2 for new code.
)ry  rr  )rt  r|  )argsr}  rr  s      r=   r   r   (  s+     
t
$ . rY   r   r   r   )r   Nr   r   NTrN   FNr   NN)jrS   loggingdataclassesr   	functoolsr   typingr   r   r   r   r	   r
   r   r   r   r   r   r4   r   torch.nnr   torch.optim_param_groupsr   r   _typesr   r   r   r9  r   r   r   adafactor_bvr   r@  r   r   r   r   r   r>  r   r   r   rA  r   r   r    rC  r!   r  r"   r^   r#   r   r$   rD  r%   rF  r&   rG  r'   r   r(   r   r)   
nvnovogradr*   r   r+   
rmsprop_tfr,   r   r-   r   r.   	getLoggerrO   rh   rT   r>   rA   r[   r   r   r	  r!  r7  rS  r]  rj  rm  rn  rV   r   r   r   r   r   r   r   rt  r|  r   rN   rY   r=   <module>r     s    !  [ [ [      N 9 9     , "                 "  !  


H
%C C C $. . .0K K\$%6 $4 $N^&7 ^D ^B;"3 ; ;|{3D { {|t,= t$ tl\.): \.t \.~((9 (d (VI'8 IT IX %& 9$  
 )+/3!&*Wc49n%*W!$s),*W *W 
%U38_$
%&	*WZ5S 5Y 5" #"S"S"S 9''("SN " "&#')+).'+'*48CGbryy'12bb UOb 	b
 b $b !b "#b #'b e_b  %b #+5/b !299+w*>!?@b b [[bJ: $(RYY'( ! [[	rY   