
    IЦi                        % S SK r S SKrS SKrS SKrS SKJrJrJr  S SKJ	r	  S SK
JrJrJrJrJrJrJrJrJrJrJrJr  S SKrS SKJr  S SKJr  S SKJr  S SKJ r J!r!J"r"J#r#J$r$J%r%  S SK&J'r'  S SK(J)r)J*r*J+r,J-r-J.r.J/r/J0r0J1r1  S S	K2J3r3J4r4  S S
K5J6r6  S SK7J8r8  S SK9J:r;  S SK<J=r=  / SQr>Sr?Sr@SrASrB\\C   rD\\6\\R                  \F\G\C4   rH\\H\\H   \\H   \\CS4   4   rI\\C\I4   rJ\\J   rK\\C\\J\K4   4   rL\M" 5       rN\\   \OS'   \ R                  S 5       rQ\ " S S5      5       rR\ " S S\R5      5       rS\R                  " SS9  SHS\R                  S\CS\VS\VS\D4
S  jj5       rW " S! S"5      rXS# rYSSS$.S\R                  S%\\R                  R                  S&4   S'\VS(\\\R                        S)\\R   S\S4S* jjr\S+\\C\I4   S,\LS-\SSS4S. jr]S/\\R                  \R                  R                  4   S0\CS\4S1 jr^S2\\C\4   S-\SS\\C\4   4S3 jr_\R                  " 5       S\R                  S-\SS\\C\I4   4S4 j5       ra\R                  " 5       S\R                  S2\\C\I4   S-\SS\84S5 j5       rbS6\R                  R                  SS4S7 jrcS2\LS\\C\I4   4S8 jrdS6\R                  R                  S2\\C\I4   S-\SS\L4S9 jre\R                  " 5       S\R                  S:\\R                  R                  S&4   S-\SS\L4S; j5       rfS\R                  S6\R                  R                  S,\LS-\SS\L4
S< jrg\R                  " 5       S\R                  S:\\R                  R                  S&4   S2\LS-\SSS4
S= j5       rhSSS$.S\R                  S(\\\R                        S)\\R   S\\C\I4   4S> jjriSSS$.S\R                  S:\\R                  R                  \\R                  R                     4   S(\\\R                        S)\\R   S\L4
S? jjrjSSS$.S\R                  S:\\R                  R                  \\R                  R                     4   S(\\\R                        S)\\R   S\\\C\I4   \L4   4
S@ jjrkS\R                  S2\\\R                  \\C\I4   4   \\C\I4   4   S\\C\I4   4SA jrlSSB.S\R                  S+\\C\I4   S)\\R   S\84SC jjrmSSB.S\R                  S:\\R                  R                  \\R                  R                     4   S,\LS)\\R   SS4
SD jjrnSSB.S\R                  S:\\R                  R                  \\R                  R                     4   S+\\C\I4   S,\LS)\\R   S\84SE jjro\SSB.S\R                  S)\\R   SS4SF jj5       rp\SSB.S\R                  S:\\R                  R                  S&4   S)\\R   SS4SG jj5       rqg)I    N)asdict	dataclassfield)chain)AnyCallablecastDict	GeneratorIterableListno_type_checkOptionalSetTupleUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater+   _patched_state_dictc               #      #    [         R                  " 5       n [         R                  " 5          S v   U (       a  [         R                  " 5         g g ! U (       a  [         R                  " 5         f f = f7fN)gc	isenableddisableenable)
is_enableds    f/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextrC   ]   sC     JJJLIIK :IIK s   +A2A A2A//A2c                   |    \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\\S'   Sr
\\S'   Sr\\S	'   Sr\\S
'   Sr\\S'   Srg)r/   h   a   
This dataclass specifies how get_state_dict/set_state_dict will work.

- ``full_state_dict``: if this is set to True, all the tensors in the
  returned state_dict will be gathered. No ShardedTensor and DTensor
  will be in the returned state_dict.

- ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
  ``full_state_dict`` is also true, then only the rank0 will get the
  state_dict and all other ranks will get empty state_dict.

- ``ignore_frozen_params``: if the value is True, the returned state_dict
  won't contain any frozen parameters -- the ``requires_grad`` is False.
  The default value is False.

- ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
  indicates whether to keep the submodule prefixes from the state_dict keys.
  or example, if the submodule is ``module.pretrain`` and the full FQN of
  the parameter is ``pretrain.layer1.weight`` of the param. When this option
  is True, the parameter's key in the returned state_dict will be
  ``pretrain.layer1.weight``. If the options is False, the key will be
  ``layer1.weight``.
  Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
  FQNs, hence there should be only one submodule in ``submodules``.

- ``strict``: the ``strict`` option when ``set_state_dict`` calls
  model.load_state_dict().

- ``broadcast_from_rank0``: when the option is True, rank0 should receive a
   full state_dict and will broadcast the tensors in the state_dict/
   optim_state_dict one by one to other ranks. Other ranks will receive
   the tensors and shard according to the local shards in the model and
   optimizer. ``full_state_dict`` must be set to True when using this option.
   This option currently only supports DTensor, not the legacy ShardedTensor.
Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict N)__name__
__module____qualname____firstlineno____doc__rF   bool__annotations__rG   rH   rI   rJ   rK   rL   __static_attributes__rM       rB   r/   r/   h   sT    "H "OT!K!&$&$(T(FD!&$&). $.rV   r/   c                   X   \ rS rSr% \" \S9r\\\	\
R                  4   \\\
R                  4   4   \S'   \" \S9r\\\	\
R                  4   \\\
R                  4   4   \S'   \" \S9r\\	   \S'   Sr\\S'   Sr\\S'   \R,                  r\\S	'   \" \S9r\\R:                     \S
'   Srg)_StateDictInfo   )default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesrM   N)rN   rO   rP   rQ   r   dictr[   r
   r   strtorchTensorr)   rT   r\   setr]   r   r^   rS   r_   
contextlibnullcontextr`   r   listra   r   nnModulerU   rM   rV   rB   rX   rX      s     	d# tc5<< %(<"== $
 	d# 4c5<< %(<"== $ $)#=C=L$L$'33L(3$)$$?L$ryy/?rV   rX   )maxsizemodelnameskip_ddp_prefixskip_compiler_prefixreturnc                    UR                  [        S5      nSU;  a  U1$ UR                  S5      n/ nU n[        U5       GH  u  px[	        U[
        5      (       a0  US:X  d   eUR                  nU(       d  UR                  U5        MI  MK  [	        U[        5      (       a  U[        U5      S-
  :  a^  XGS-      [        :X  aO  SR                  U5      n	[        U[        5      n
U	(       a  U	 S3n	U
R                   Vs1 s H  o U 3iM
     sns  $ [        U[        5      nU[        :w  a  UR                  U5        [        Xh5      nGM	  GM  [	        U[        R                   R"                  R$                  5      (       a2  US:X  d   eUR&                  nU(       d  UR                  U5        GMn  GMq  UR                  U5        U[(        R*                  R                  R,                  :X  a   U[        U5      S-
  :w  a  [/        S5      eGM  [        Xh5      nGM     SR                  U5      R                  [        S5      1$ s  snf )a  
This API is used to convert the name of a parameter to the FQNs. For FSDP
without `use_orig_params`, the name of FlatParameter can be mapped to
multiple original parameters. As a result, the return type of this function
is `Set[str]`.

Args:
    module (nn.Module): the root model.
    name (str): the name
    skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

Returns:
    The canonical FQNs based on the model traversal.
 .module   	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPru   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr$   rd   _dynamo
eval_frameOptimizedModulerw   rj   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)rm   rn   ro   rp   	obj_namesfqn_obj_namescurr_objicurr_obj_nameprefix
flat_paramfqns               rB   	_get_fqnsr      s   . <<*B/D
$v

3IMH%i0h$$ H,,,H"$$]3 #$''3y>A%%)E*:k*I-0$X{;
 &xq\F4>4D4DE4DS(3%(4DEEx)<=H 33$$]3"8; 4 %--":":"J"JKK K///))H'$$]3 (   /

 1 1 I III**&'VWW + #8;9 1< HH]#++,>CDD% Fs   <I c                       \ rS rSrSrg)_EXTRA_STATE   rM   N)rN   rO   rP   rQ   rU   rM   rV   rB   r   r      s    rV   r   c              #      ^^#    [        5       mS[        R                  S[        S[        4UU4S jjmT" U S5       S h  vN   g  N7f)Nru   curr_fqnrq   c              3   X  >#    TR                  U 5        U(       a  U S3OSnU R                  5        H#  u  p#UT;   a  M  U U 3nT" X45       S h  vN   M%     [        U R                  SS9U R	                  SS95       H!  u  p%X R
                  ;   a  M  U U 3nXE4v   M#     [        U R                  S[        R                  R                  5      [        R                  R                  :w  a7  U [        R                  R                  R                   3nU[        5       4v   g g  N7f)Nrt   rs   F)recurseget_extra_state)addnamed_childrenr   named_buffersnamed_parameters_non_persistent_buffers_setr   	__class__rj   rk   r   r   ru   r   r   )ru   r   rn   	submodulenew_fqnobjr   visited_moduless         rB   r   +_iterate_valid_model_state.<locals>.recurse   s$    F#%-hZq>2%446ODO+!
4&)Gy222	  7    /1H1HQV1H1W
ID 999!
4&)G,
 F$$&79R9RSyy(() "
2::#4#4#L#L"MNG<>))	) 3s   AD*D(CD*rs   )rf   rj   rk   rc   r   )rm   r   r   s    @@rB   _iterate_valid_model_stater      sB     &)eO*		 *S *Y * *2 ub!!!s   <A AA)
submodulesoptionsoptims.
optim_onlyr   r   c                z   U(       a  [         R                  " S[        5        U(       a  U(       d  [        S5      eU=(       d
    [	        5       n0 n0 n[        U 5       H  u  px[        U[        5      (       a  M  [        X5      n	UR                  US5      n
U
b/  [        [        [           XX   5      R                  U	5        XX   Xh'   OU	R                  5       XX'   U	 H  n
[        U[        5      (       a  M  XU
'   M      M     [        UR!                  5       5       H*  u  pU H  n
[        ["        R$                  U5      Xj'   M!     M,     ['        5       nU(       ad  ['        U5      nU R)                  5        HE  u  p~X;  a  M  [        X5      n	[+        U	5      S:X  d   S5       eUR                  S U	 5       5        MG     UR,                  (       a  UR.                  (       d  [1        S5      e[2        R4                  " U 5      nU(       a  UR.                  (       a`  [7        UR8                  UR8                  S9n[;        UR8                  UR8                  =(       d    UR,                  S9n[<        R>                  nO6[A        UR8                  S	9n[C        UR8                  S	9n[<        RD                  n[F        RH                  S
 5       n[J        RL                  " UU UUUS9nO[F        RN                  n[Q        S0 [S        U5      DUUUU[        [T        [V        RX                     U5      U(       + [+        U5      S:  S.D6$ )zO
Verify the model and options passed by the user and generates _StateDictInfo.
zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrv   z)Submodule FQN should only have 1 instancec              3   *   #    U  H	  o S 3v   M     g7f)rt   NrM   ).0r   s     rB   	<genexpr>"_verify_options.<locals>.<genexpr>C  s     %@4CQi4s   z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              3     #    [         R                  " 5          [         R                  " SS[        S9  [        R
                  " U UUUS9   S v   S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f7f)NignorezFSDP.state_dict_type)messagecategoryru   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningr~   r   r   s       rB   $fsdp_state_dict_type_without_warning=_verify_options.<locals>.fsdp_state_dict_type_without_warninga  sj      ((*''&<} ))!$3&7,C	 	 +* 	 +*s4   B2A1
A A1	B 
A.	*A11
A?;Br   r   )r[   r\   r]   r`   ra   r^   r_   rM   )-r   warnr   r   r/   r   r{   r   r   getr	   r   rc   updatecopyri   itemsrd   re   rf   named_modulesr   rK   rF   
ValueErrorr~   ra   r   rG   r   r"   FULL_STATE_DICTr    r   SHARDED_STATE_DICTrg   contextmanager	functoolspartialrh   rX   r   r   rj   rk   )rm   r   r   r   r   r[   r\   rn   paramfqnsr   param_fqns_r]   ru   ra   r   r   r   r   r`   s                        rB   _verify_optionsr     s    I 		
 &I
 	
 +)+G 	 
 	  2%8e\**%##E40?S,34;;DA+<+C!( (,yy{$Ce\22).#&  9  399;<C)-ellF)C!&  = $'5_
!//1LD'U)Dt9>N#NN>%%%@4%@@ 2 ##G,C,CM
 	
 $$U+L "" 3&22w?R?R! '?&22#//O73O3O'# ,;;O 6&22! 'B&22'# ,>>O		"	"	 
#	$ !((0+/$;
 "-- 	
/	+3-!$ryy/<8#^&kAo	 	rV   model_state_dictoptim_state_dictinfoc                    UR                    H  n[        U5      nUb  M   S5       e   UR                  (       a  U (       d  UR                  (       dx  UR                  (       dg  UR
                  (       a  UR                  (       dE  UR                  (       a4  UR                  (       d#  [        S[        R                  " 5       < S35      eUR                  (       aH  U(       dA  UR
                  (       a  UR                  (       d  UR                  (       d  [        SU 35      eU R                  5        H"  n[        U;   d  M  [        U S[         S35      e   g )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rt   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)ra   r#   r^   r]   rH   rG   rF   rJ   rK   r   distget_rankr_   keysr   )r   r   r   ru   
fsdp_statekeys         rB   _verify_state_dictr     s   
 ##CFK
%R'RR% $ 	 ''))!!d&:&:KK))'mmo'q*
 	
  %%$*>*>..::J9KM 
  $$&#%z+ /* *  'rV   r   apic                     [        X5      nU[        ;   a)  [        R                  " [        U R                  U5      U S9nU$ )N)self)r   r:   r   r   r   )r   r   calls      rB   _state_dict_fnr     s7    3D""  !<3GKrV   
state_dictc                     UR                   (       aL  UR                  (       a#  [        R                  R	                  5       (       d  SOSn[        XR                  US9$ UR                  (       a  [        U 5      $ U $ )NrM   )r   )rG   
ranks_only)rF   rG   rd   distributedis_initializedr   r   )r   r   r   s      rB   _maybe_full_or_cpu_state_dictr     so      $$E,=,=,L,L,N,N  	
 "$4$4
 	
 
		)*55rV   c                 z   UR                   (       d  0 $ UR                  5          [        U S5      " 5       nS S S 5        [        WR	                  5       5       H|  n[        X5      n[        U5      S:X  d   X445       e[        [        U5      5      nXS:w  d  M@  S[        4S jnU" X55      (       d  [        SU SU 35      eUR                  U5      X%'   M~     UR                  (       as  0 nUR	                  5        H[  nUR                   HH  nUR                  U5      (       d  M  UR                  (       a  X%   Xu'   M4  U[        U5      S  n	X%   Xy'   MJ     M]     UnUR                  (       aQ  U R!                  5        H=  u  p:U
R"                  (       a  M  [        X5      nU H  nUR                  U5        M     M?     [        UR%                  5       5       HF  u  p;[&        R(                  " U5      (       d  M"  UR*                  (       d  M5  UR                  U5        MH     [-        X!5      $ ! , (       d  f       GN= f)Nr   rv   rq   c                    [        U5      [        U 5      :  a  gUR                  S5      nU R                  S5      nSn[        U5       H>  u  pVXbU   :X  a)  US-  nU[        U5      :X  a  U[        U5      S-
  :H  s  $ M6  US;   a  M>    g   g)NFrt   r   rv   )ru   rw   T)r   ry   rz   )r   r   	fqn_split	key_splitfqn_idxkey_idxkey_names          rB   verify%_get_model_state_dict.<locals>.verify  s    s8s3x' IIcN	IIcN	)29)=%GW#551"c)n4#*c)nq.@#@@ 5!%<< $ *> rV   zAn unexpected key, z, exists. FQN is )r^   r`   r   ri   r   r   r   nextiterrS   r   popr]   
startswithrI   rH   r   requires_gradr   rd   	is_tensoris_metar   )rm   r   r   r   r   r   r   new_state_dictr   r   r   ps               rB   _get_model_state_dictr     s    					#E<8:
 
 JOO%&$4yA~*{*~4::D " ###"%8=Nse#TUU(nnS1JO7 ': /1??$C11~~f--//*4/N'!#f+-0G.8oN+ 2 % $
  002JC""U(Ds# 	 3 z'')*??1!)))NN3 + )::u 
	s   H++
H:c           
         UR                   (       a  U(       d  UR                  (       d  [        0 0 5      $ 0 n[        U 5       Hu  u  pE[	        X5      n[	        XSSS9n[        Xg5       HK  u  pUR                  (       a  [        R                  " 5       S:X  a  X:w  a  UR                  U5      X'   XSU	'   MM     Mw     Sn
UR                  (       d  UR                  (       Ga  S nUR                  5        HZ  u  pE[        R                  " U5      (       d  M"  UR                  5       S:  d  M8  Uc  UR                  nMI  XR                  :X  a  MZ   e   Uc   eU[        R                  " S5      :X  a   [        R                  R!                  5       nSn
UR                  (       a  [#        XXR$                  S9  OUR                  (       a
  ['        XUS9  UR                  5        H	  u  pXU'   M     UR)                  5          [+        [        [-        U S5      " XR$                  U
S	95      sS S S 5        $ ! , (       d  f       g = f)
NF)ro   rp   r   metaT)devicerJ   r   load_state_dict)r   rJ   assign)r^   rK   r&   r   r   zipr   r   r   rF   r   rd   r   dimr   distributed_c10d_get_pg_default_devicer   rJ   r   r`   r	   r   )rm   r   r   local_state_dictr   valuer   fqns_with_prefixr   fqn_with_prefixr   r   local_states                rB   _load_model_state_dictr    s    Z8Q8Q R((07
$$E
 %($? C--A1E(.8nnS.A
+05_- %@ 8 F  D$8$8$8*002JCu%%%))+/>"\\F!\\111 3 !!!U\\&))**AACFF$$!VKK !!":O 0 6 6 8C)sO !9 
			5"34%kk&
 
		s   >)H11
H?optimc                    U R                   (       a  gU R                   H#  nU[            H  nUR                  c  M      g   M%     U R                   HA  nU[            H1  nUR                  (       d  M  [
        R                  " U5      Ul        M3     MC     / nU R                   H\  nSU;   d  M  UR                  US   5        [        US   [
        R                  5      (       a  [
        R                  " S5      OSUS'   M^     U R                  SS9  U R                   H  nSU;   d  M  UR                  S5      US'   M!     U R                  SS9  g)z@
Initialize optim states by calling the step() with zero grads.
Nlrg        )closurer   T)set_to_none)r9   r7   _PARAMSgradr   rd   
zeros_liker}   r{   re   tensorstepr   	zero_grad)r	  param_groupr   lrss       rB   _init_optim_stater  J  s*    {{ )) )Ezz% * *
 )) )E""""--e4
 * * C));JJ{4() k$/>> S!  * 
JJtJ )); #
K * 
OOO%rV   c           
         S n0 n[        [        U [           5      R                  5        HD  u  p4[        [        U5      R                  5        H  u  pVU" U5        Xb[         SU SU 3'   M     MF     [        [        U [
           5       H_  nUR                  [        5      n[        [        [           U5       H,  nUR                  5        H  u  pVXb[
         SU SU 3'   M     M.     Ma     U$ )a  
This API flattens the optimizer state_dict to support optimizer resharding for
MPMD, e.g., pipeline parallelism.

Without the API, the original optimizer state_dict looks like:
{
    "state": {
        "layer1.weight": {
            "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
        },
        "layer2.weight": {
            "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
        },
    },
    "param_group": [
        {
            "lr": 0.0,
            "betas": (0.9, 0.95), ...,
            "params": ["layer1.weight", "layer2.weight"]
        }
    ]
}

With this API, the optimizer state_dict looks like:
{
    "state.layer1.weight.step": 10,
    "state.layer2.weight.step": 10,
    "state.layer1.weight.exp_avg": SomeTensor,
    "state.layer2.weight.exp_avg": SomeTensor,
    "state.layer1.weight.exp_avg_sq": SomeTensor,
    "state.layer2.weight.exp_avg_sq": SomeTensor,
    "param_group.layer1.weight.lr" : 0.1,
    "param_group.layer2.weight.lr" : 0.1,
    "param_group.layer1.weight.betas" : (0.9, 0.95),
    "param_group.layer2.weight.betas" : (0.9, 0.95),
}

Note that if any of the value is a container, like the betas in the example,
this API won't flattent it.
c                     [        U [        R                  [        [        45      (       d  [        S[        U 5       S35      eg )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rt   )r{   rd   re   intfloatNotImplementedErrortype)vs    rB   _raise_if_type_not_supported?_flatten_optim_state_dict.<locals>._raise_if_type_not_supported  sA    !ellC788%7)1&  9rV   rt   )
r	   r,   _STATEr   r-   _PGr   r  r   rc   )	r   r  retr   r9   kr  r  r   s	            rB   _flatten_optim_state_dictr$  t  s    T !#C=*V*<=CCE
.446DA(+)*6(!C5!%& 7 F
 -z#?w'S	4(C#))+*+se1SE1#&' , ) @
 JrV   c                    0 n/ n[         U[        U0nU R                   GHg  nUR                  [        / 05        U[            H  nUR
                  U    H  nUS   [           n	[        U	[        5      (       d   eU	R                  U5        UR                  (       d  MJ  0 X8'   U R                  U   R                  5        H'  n
U[          SU SU
 3   [        [        X8   5      U
'   M)     M     M     [        [        [           US   [           5      S   nUR                  5        H[  nU[        :X  a  M  U[         SU SU 3   nXS   ;  a	  XS   U'   M1  US   U   U:w  d  M?  [        SU SU SU SUS   U    S3	5      e   GMj     U$ )z
This API unflattens the state_dict generated by _flatten_optim_state_dict().
See the docstring of _flatten_optim_state_dict() for more detail.
rt   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r   r!  r7   r}   r  r[   r{   ri   r   r9   r   r	   r,   r   rc   r   )r	  r   r   r9   pg_state
return_osdr  r   r   r8   
state_namefirst_param_fqnr#  r  s                 rB   _unflatten_optim_state_dictr+    s    E"$H&,eS(%CJ))"& )E--e4!"g.!&$////c"**
"'++e"4"9"9";JBL!(!C5*6CD
3J? #< 5 * tCy(2,w*?@C!!#AG|#a'8!=>E$"'Q"aE)"==L<MQqc R 3HRLO3DAG  $! *: rV   
optimizersc                 @   UR                   (       d  0 $ [        0 [        / 0nU GH  n[        U5        [	        US5      " 5       nUR
                  (       a  UR                  5          [        R                  " XU5      nS S S 5        U(       d  Mj  [        U[           R                  5       5       H=  nSU;   d  M  U[           R                  U5      U[           UR                  SS5      '   M?     U[            H3  nU[            Vs/ s H  ofR                  SS5      PM     nnX[        '   M5     GO5[        [        R                  " S UR                    5       5      5      n[#        [%        U['        [)        U5      5      5      5      n	0 n
U R+                  5        HH  u  p[-        X5      n[)        U5      S:X  d   e[/        [1        U5      5      nX;  a  M<  X   nXU'   XU'   MJ     [        U[           R                  5       5       H)  nX   nU[           R                  U5      U[           U'   M+     U[            H&  nU[            Vs/ s H  oU   PM	     snU[        '   M(     U(       d  GML  [3        [4        U[           5      R7                  U[           5        [3        [8        U[           5      R;                  U[           5        GM     UR<                  (       a  [3        [>        [A        U5      5      n[C        X25      $ ! , (       d  f       GN= fs  snf s  snf )Nr   rw   z
_orig_mod.rs   c              3   2   #    U  H  o[            v   M     g 7fr<   )r  )r   gs     rB   r   (_get_optim_state_dict.<locals>.<genexpr>  s     -UBTQjBTs   rv   )"r_   r   r!  r  r   ra   r`   r~   r   ri   r   r   rx   r  r   from_iterabler7   rb   r   ranger   r   r   r   r   r	   r,   r   r-   extendrL   r.   r$  r   )rm   r,  r   r   r	  osdr#  r/  r8   param_pid_mappingfqn_pid_mappingr   r   r   r   pidgroups                    rB   _get_optim_state_dictr9    s    	,2BR+@% UL13""$++E#> % #f+**,-!#?B6{q?QCK		, ;< . X?@zJz!))L"5zJ#'
  %---U%BTBT-UUVF $Ss6{1C%D E O#446
 ,4yA~%~4:&1'.'*$'*$ 7 CK,,./%*#&v;??3#7FC  0 SBG.!Q.3#"6.!Qg " ],V45<<S[I 0 56==c#hGY \ (( 9:J K
 ))9@@_ %$ K* "Rs   %LL
L
L	c           
      x   0 n/ n[         U[        U0n0 n[        S [        [        U[            5      R                  5        5       5      (       a  U$ UR                   GHj  nUR                  [        / 05        U[            GHA  n	UR                  U	    GH)  n
XR                  ;   aG  Sn[        [        U[           5       H)  nU
[        [        [           U[           5      ;   d  M'  Sn  O   OSnU(       d  Me  US   [           n[        U[        5      (       d   eUR                  U
5        U	R                   (       a  [        [        U[            5      U
   XJ'   [        [        U[           5       HH  nU
[        [        [           U[           5      ;   d  M'  [#        U[           5      S-
  U[%        U5      '   MJ     GM,     GMD     GMm     [        [        U[           5       HS  nUR'                  [%        U5      S5      nUS:X  a  M&  UR)                  5        H  u  nnU[        :X  a  M  UX^   U'   M     MU     U$ )a  
Extract the corresponding optim state_dict from ``optim_state_dict`` for
``optim`` and return the result optim state_dict.

Args:
    model (nn.Module): the root model.
    optim (torch.optim.Optimizer): the optimizer.
    optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
        contains the optim state_dict of ``optim``.
    info (_StateDictInfo): state dict information.

Returns:
    The optim state_dict of ``optim``.
c              3   B   #    U  H  n[        U[        5      v   M     g 7fr<   )r{   r  )r   r#  s     rB   r   *_split_optim_state_dict.<locals>.<genexpr>;  s      $Xq
1c$Xs   FTr&  rv   )r   r!  allr	   r,   r   r7   r}   r  r[   r\   r-   r   rc   r{   ri   r   r   idr   r   )rm   r	  r   r   r9   r'  r(  
pg_mappingr  r   r   	in_paramsloaded_param_groupr8   idxr   r  s                    rB   _split_optim_state_dictrC  !  s   * E"$H&,eS(%CJ!#J
 $(8H8P$Q$V$V$X    ))"& )E--e4444 %I.2)+;C+@/* $tCy2DW2M"NN(,I!/ !%I !"g.!&$////c"&&!%m5Ef5M!Ns!SEJ*.%'7'<+& d49.@.IJJ=@C=QTU=U
2&8#9:	+' 5 * *8 -/?/DEnnR_b1"9%++-JCg~!&HM#	 .	 F rV   c           
        ^ UR                   (       d  g U GH  n[        U5        U(       a@  [        U;   a  [        XX#5      nO+[	        U[        [        [        [        4   U5      U5      nO0 nUR                  (       GaW  U R                  5        GH  u  pg[        X5      n[        XSS9n	X:X  a  M"  [        U5      S:X  d   eUR                  5       n
U	R                  5       nU[            HO  n[        [        [        [        4   U5      nU[             Vs/ s H  oR#                  X5      PM     nnX[         '   MQ     [        [$        U[           5      n['        UR)                  5       5       H.  nU
U;   d  M  UR                  U5      UUR#                  X5      '   M0     GM     UR+                  5          [,        R.                  " XU5      nS S S 5        OUR0                  (       a  SUl        [3        X4U5      nSUl        S mU4S jn[5        [6        R8                  UU5      nTc   e[;        U5      u  nn[;        U5      u  nnUR<                  (       a  [?        UUTS9  O[A        UUTS9  UR)                  5        H#  nUU;  d  M  UU;   d   eUU   UU'   UU   UU'   M%     [C        UU5      n[E        US5      " US9  GM     g s  snf ! , (       d  f       N)= f)	NF)rp   rv   Tc                    > U R                  5       S:  a,  Tc  U R                  mU $ TU R                  :w  a  [        S5      eU $ )Nr   zDevice mismatch)r   r   r   )tr   s    rB   _device'_load_optim_state_dict.<locals>._device  sD    557Q;~!"   188+():;;rV   r   r   )r   )#r_   r  r   rC  r+  r	   r
   rc   r+   ra   r   r   r   r   r!  r   r  rx   r,   ri   r   r`   r~   optim_state_dict_to_loadrF   r9  r(   rd   re   r   rK   r   r   r   r   )rm   r,  r   r   r	  r   original_fqn_r   fqns_with_compilerr   fqn_with_compilerr/  valr   r8   	osd_stater#  r  rG  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keyr   s                            @rB   _load_optim_state_dictrU  i  s    % ##:*$  $?4S)^ 4jA4$   " $)#9#9#; 5%.e&" -4yA~%~hhj$6$:$:$<!)#.AtCH~q1CGJ7|GSC;|   $*L / !0@0HI	inn./AaxGP}}UVGW	!))C"CD 0% $<, ""$#'#@#@"2$  %$ !!#(D 4UHdK#'D F ellG5EFA%%%':;K'L$K3FGW3X00((%k3DVT&{4EfU
 )--/	$55$3333>y3I%i03>y3I%i0	 0
  5!#4  	u/0<LMc : %$s   K
6K
K	c          	          [        5          [        U SSUUS9n[        X5      n[        U0 U5        UsSSS5        $ ! , (       d  f       g= f)a  
Return the model state_dict of ``model``.

See ``get_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
        that belong to the submodules.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be returned. See
        `StateDictOptions` for the details.

Returns:
    The state_dict for ``model``.

:rtype: typing.Dict[str, ValueType]
rM   Fr   r   r   N)rC   r   r   r   )rm   r   r   r   r   s        rB   r0   r0     sI    0 
!
 1=+R6 
s	   '<
A
c          	         [        5          [        U[        R                  R                  5      (       a  U4O
[        U5      n[        U USUUS9n[        XU5      n[        0 XT5        UsSSS5        $ ! , (       d  f       g= f)a  
Return the combined state_dict for optimizers.

See ``get_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
        that belong to the submodules.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be returned. See
        `StateDictOptions` for the details.

Returns:
    The state_dict for ``optimizers``.

:rtype: OptimizerStateType
TrW  N)	rC   r{   rd   r	  	Optimizertupler   r9  r   )rm   r,  r   r   r   r   s         rB   r1   r1     sx    6 
 *ekk&;&;<< Mz" 	
 !
 1DI2/6 
s   AA33
Bc          	          [        5          [        U[        R                  R                  5      (       a  U4O
[        U5      n[        U USUUS9n[        X5      n[        XU5      n[        XVU5        XV4sSSS5        $ ! , (       d  f       g= f)a  
Return the model state_dict and optimizers state_dict.

``get_state_dict`` can process any module that is parallelized by PyTorch
FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
combination of these parallelisms. The main functions of ``get_state_dict``
are: 1.) returning a model and optimizer state_dict that can be resharded
with a different number of trainers and/or different parallelisms.
2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
these APIs.
3.) sanity checking the result state_dict.

The keys of the result state dictionary are the canonical FQNs (Fully
Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
position in an nn.Module hierarchy. More specifically, a canonical FQN to a
parameter is the FQN returned by ``module.named_parameters()`` or
``module.named_buffers()`` when the module is not distributed by any
parallelisms. Since the optimizer internally uses parameter IDs to represent
a parameter, there will be a conversion from the parameter IDs to the
canonical FQNs when calling this API.

``get_state_dict`` can also process a module that is not parallelized. In
such a case, ``get_state_dict`` only performs one function -- converting the
optimizer parameter IDs to the canonical FQNs.

Example:
    >>> # xdoctest: +SKIP
    >>> import torch
    >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    >>> from torch.nn.parallel import DistributedDataParallel as DDP
    >>> from torch.distributed.checkpoint.state_dict import get_state_dict

    >>> fsdp_model = FSDP(copy.deepcopy(model))
    >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    >>> ddp_model = DDP(copy.deepcopy(model))
    >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


    >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
    >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

    >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
    >>> # the asserts will fail.
    >>> assert ddp_state_dict == fsdp_state_dict
    >>> assert ddp_optim_state == fsdp_optim_state_dict


Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
        that belong to the submodules.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be returned. See
        `StateDictOptions` for the details.

Returns:
    ``Tuple`` that contain model state_dict and optimizer state_dict.

:rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
FrW  N)
rC   r{   rd   r	  rY  rZ  r   r   r9  r   )rm   r,  r   r   r   r   r   s          rB   r2   r2     s    L 
 *ekk&;&;<< Mz" 	
 !
 1=0DI+tD1! 
s   A*A??
Bc           
         U(       d  0 $ [        [        [        UR                  5       5      5      [        R
                  5      (       a  [        R                  " S[        5        [        [        [        R
                  [        [        [        4   4   U5      n0 nUR                  5        H  u  pEU R                  5        H{  u  pgXt:w  a  M  [        X5      n[!        U5      S:X  d   S5       e[        [        U5      5       S3n	UR#                  UR                  5        V
Vs0 s H
  u  pX-   U_M     snn
5        M}     M     U$ [        [        [        [        4   U5      $ s  snn
f )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rv   z/FQNs for a submodule should only have 1 elementrt   )r{   r   r   r   rj   rk   r   r   r   r	   r
   rc   r+   r   r   r   r   r   )rm   r   cast_state_dictr   r   sub_state_dictrn   mr   r   subfqnr  s               rB   _unflatten_model_state_dictra  r  s0    	$tJOO-./;;" 	
 tBIItCN/C$CDjQ/1)8)>)>)@%I ..0> -4yA~X'XX~ d,-Q/%%AOAUAUAWXAWV_e+AWX 1 *A Di(*55	 Ys   !E!)r   c                    [        X5      n[        5          [        U SSUS9n[        U0 U5        [	        XU5      sSSS5        $ ! , (       d  f       g= f)a  Load the model state_dict.

The counterpart of ``get_model_state_dict`` to set the state_dict to the
model. See ``set_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    model_state_dict: (Dict[str, ValueType]):
       the model state_dict to load. If the key of the ``model_state_dict``
       is nn.Module, the key is a submodule of ``model`` and the value should
       be the state_dict of the submodule. When loading the state_dict,
       the prefix of the submodule will be append to the state_dict.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.

Returns:
    ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
        * **missing_keys** is a list of str containing the missing keys
        * **unexpected_keys** is a list of str containing the unexpected keys

:type model_state_dict: typing.Dict[str, ValueType]
rM   Fr   r   N)ra  rC   r   r   r  )rm   r   r   r   s       rB   r3   r3     sK    : .I. 
ubUGL+R6%etD	 
s   %A
Ac                    [        5          [        U[        R                  R                  5      (       a  U4O
[        U5      n[        XSUS9n[        0 X$5        [        XX$5        SSS5        g! , (       d  f       g= f)a  Load the optimizers state_dict.

The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
optimizers. See ``set_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    optim_state_dict: OptimizerStateType:
        the optimizer state_dict to load.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.

Returns:
    None

:type optim_state_dict: typing.OptimizerStateType
Trc  N)	rC   r{   rd   r	  rY  rZ  r   r   rU  )rm   r,  r   r   r   s        rB   r4   r4     sf    6 
 *ekk&;&;<< Mz" 	
 uT7S2/6u2BI 
s   AA//
A=c                8   [        X5      n[        5          [        U[        R                  R
                  5      (       a  U4O
[        U5      n[        XU(       + US9n[        X#U5        [        XX55        [        XU5      sSSS5        $ ! , (       d  f       g= f)a  Load the model state_dict and optimizers state_dict.

The counterpart of ``get_state_dict`` to set the state_dict to the model and
optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
have to be returned by ``get_state_dict`` but must meet the following
requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
3) optimizer state_dict cannot contain the parameter IDs; the keys should be
the canonical FQNs.

Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
       the model state_dict to load. If the key of the ``model_state_dict``
       is nn.Module, the key is a submodule of ``model`` and the value should
       be the state_dict of the submodule. When loading the state_dict,
       the prefix of the submodule will be append to the state_dict.
    optim_state_dict: OptimizerStateType:
        the optimizer state_dict to load.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.

Returns:
    ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
        * **missing_keys** is a list of str containing the missing keys of the model state_dict.
        * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

:type model_state_dict: typing.Dict[str, ValueType]
:type optim_state_dict: typing.OptimizerStateType
rc  N)ra  rC   r{   rd   r	  rY  rZ  r   r   rU  r  )rm   r,  r   r   r   r   s         rB   r5   r5     s    T .I. 
 *ekk&;&;<< Mz" 	
 .>*>
 	+tDu2BI%etD 
s   A+B
Bc                  ^^ [         R                  " [        U US9mU4S jnX l        [         R                  " [        U US9mS[
        [        [        4   4U4S jjnX0l        [        R                  U5        [        R                  U5        g)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
be a partial function to call ``get_state_dict`` and ``set_state_dict``.

Example:
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.checkpoint.state_dict import patch_model_state_dict

    model = fsdp(model)
    patch_model_state_dict(model)

Args:
    model (nn.Module): the nn.Module to the model.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.
Returns:
    None
)rm   r   c                     > T " 5       $ r<   rM   _state_dict_calls   rB   state_dict_call0_patch_model_state_dict.<locals>.state_dict_call@      !!rV   r   c                    > T" U S9  g )N)r   rM   r   _load_state_dict_calls    rB   load_state_dict_call5_patch_model_state_dict.<locals>.load_state_dict_callK      z:rV   N)r   r   r0   r   r3   r
   rc   r   r   r:   r   )rm   r   rj  rp  ro  ri  s       @@rB   _patch_model_state_dictrs    s    6 !((" '%--;c3h ; 1O,01rV   c                  ^^ [         R                  " [        U UUS9mU4S jn[         R                  " [        U UUS9mS[        [
        [        4   4U4S jjn[        R                  U5        [        R                  U5        [        U[        R                  R                  5      (       a  U4O
[        U5      nU H  nX5l        XEl        M     g)a`  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
be a partial function to call ``get_state_dict`` and ``set_state_dict``.

Note that if there are multiple optimizers, all of the optimizers will be patched.
So users only need to call one of the state_dict() to get the full result.

Example:
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.checkpoint.state_dict import patch_model_state_dict

    model = fsdp(model)
    patch_model_state_dict(model)

Args:
    model (nn.Module): the nn.Module to the model.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.
Returns:
    None
)rm   r,  r   c                     > T " 5       $ r<   rM   rh  s   rB   rj  4_patch_optimizer_state_dict.<locals>.state_dict_call|  rl  rV   r   c                    > T" U S9  g )N)r   rM   rn  s    rB   rp  9_patch_optimizer_state_dict.<locals>.load_state_dict_call  rr  rV   N)r   r   r1   r4   r
   rc   r   r:   r   r{   rd   r	  rY  rZ  r   r   )rm   r,  r   rj  rp  r	  ro  ri  s         @@rB   _patch_optimizer_state_dictry  V  s    > !(( 	" &-- 	;c3h ; O,01 j%++"7"788 
: 
 * 4 rV   )TT)rrg   r   r=   r   dataclassesr   r   r   	itertoolsr   typingr   r   r	   r
   r   r   r   r   r   r   r   r   rd   torch.distributedr   r   torch.nnrj   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   r~   r   r   r    r!   r"   $torch.distributed.fsdp._common_utilsr#   r$   torch.distributed.tensorr%   torch.nn.modules.moduler&   torch.nn.parallelr'   r|   torch.utils._pytreer(   __all__r   r!  r  r   rc   r)   re   r  r  r*   r+   r,   r-   r.   rf   r:   rT   r   rC   r/   rX   	lru_cacherk   rS   r   r   r   r	  rY  r   r   r   r   no_gradr   r  r  r$  r+  r9  rC  rU  r0   r1   r2   ra  r3   r4   r5   rs  ry  rM   rV   rB   <module>r     s     	  0 0         A 	 	 	 - 5 < -" 
		Sg}ellCKL4&m(<d3CS>TT	 S)^$' #u]4E%EFFG  &)U S] *   +/ +/ +/\ @% @ @ T" !!%	;E99;E
;E ;E 	;E
 ;E #;E|	 	"H ,0*.99%++'',- 
 RYY( &' D*3	>**(* * 
	*Zbii)>)>>? c h S#X&4	#s(^$ @;99@;*@;	#y.@; @;F 2
992
S)^$2
 2
 	2
 2
j'&U[[22 '&t '&T=*< =c9nAU =@*;;  *S)^$* * 	*Z <A99<Aekk++S01<A <A 	<A <A~E99E;;  E )E 	E
 EP ZN99ZNekk++S01ZN #ZN 	ZN
 
ZN ZN@ ,0*.	" 99"  RYY("  &'	" 
 
#y." R ,0*.* 99* ekk++Xekk6K6K-LLM*  RYY(	* 
 &'*  * b ,0*.V299V2ekk++Xekk6K6K-LLMV2 RYY(	V2
 &'V2 4Y!334V2r6996d299d3	>&::;T#y.=QQR6 
#y.6J +/	$E99$E3	>*$E &'	$E
 $EX +/$J99$Jekk++Xekk6K6K-LLM$J )$J
 &'$J 
$JZ +/9E999Eekk++Xekk6K6K-LLM9E 3	>*	9E
 )9E &'9E 9E|  +/129912 &'12 
	12 12l 
 +/	;599;5 ekk++S01;5 &'	;5
 
;5 ;5rV   