
    ΑiR                     <   S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJrJ	r	  S SK
JrJr  SSKJrJrJr  SqSqSq\	R(                  " 5       rSrSrS	rS
rSq0 rSS jrS r " S S5      r " S S\5      r " S S\5      r S r!S r"S r#S r$S r%S r&SS jr'S r(S r)g)    N)current_thread)compilerunique_name)Programin_dygraph_mode   )CheckpointSaverPaddleModelSerializableBase
checkpointmemory_initdacpacpc                 6   [         b  [         $ [        R                  " U5      q [         R                  U 5        S[         l        [        R
                  " 5       n[        R                  " S5      nUR                  U5        [         R                  U5        [         $ )NFz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)	loggerlogging	getLoggersetLevel	propagateStreamHandler	FormattersetFormatter
addHandler)	log_levelnamelog_handler
log_formats       o/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/base/incubate/checkpoint/auto_checkpoint.py_get_loggerr   -   su    t$F
OOIF'')K""HJ Z(
k"M    c                  B    [        5       R                  S:X  d   S5       eg )N
MainThreadz*auto checkpoint must run under main thread)r   r    r    r   _thread_checkerr$   @   s#      L0 40r    c                       \ rS rSrS rS rS rS r\S 5       r	S r
S r\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rSrg)AutoCheckpointCheckerF   c                     S U l         S U l        S U l        S U l        S U l        S U l        S U l        S U l        S U l        [        R                  " S5      U l         U R                   S:w  a  g  [        R                  S   U l        [        R                  S   U l        [        R                  S   U l        [        R                  S   U l        [        R                  S   U l        [        R                  S   U l        [        [        R                  S	   5      U l        [        [        R                  " S
S5      5      U l        [        R                  " SS5      U l        [        [        R                  " SS5      5      U l        U R                  (       dl  [        U R                  5      S:  aK  [        U R                  5      S:  a2  [        U R
                  5      S:  a  [        U R                  5      S:  d   S5       eg [        U R                  5      S:  a  [        U R                  5      S:  d   S5       eg ! [          a8  n["        R%                  SU 35        [&        R(                  " S5         S nAg S nAff = f)NPADDLE_RUNNING_ENVPADDLE_EDL_AUTO_CHECKPOINTPADDLE_RUNNING_PLATFORMPADDLE_JOB_IDPADDLE_EDL_HDFS_HOMEPADDLE_EDL_HDFS_NAMEPADDLE_EDL_HDFS_UGIPADDLE_EDL_HDFS_CHECKPOINT_PATHPADDLE_TRAINER_IDPADDLE_EDL_ONLY_FOR_CE_TEST0PADDLE_EDL_FS_CACHEz.cache PADDLE_EDL_SAVE_CHECKPOINT_INTER900      r   zhdfs environ must setz
exception:r   )_run_env	_platform_job_id
_hdfs_home
_hdfs_name	_hdfs_ugi_hdfs_checkpoint_path_trainer_id_ce_testosgetenvenvironint	_fs_cache_save_checkpoint_interlen	Exceptionr   fatalsysexit)selfes     r   __init__AutoCheckpointChecker.__init__G   s   %)"		"67==88!	ZZ(ABDN::o6DL jj)?@DO jj)?@DOZZ(=>DN)+1*D&  #2::.A#BCD		*G MNDMYY'<hGDN*-		<eD+D' ==(1,DOO,q0DNN+a/D667!;	+
 ++< < (1,D667!;+ ++<;  	LL:aS)*HHQKK	s   -FH; 9H; ;
I=.I88I=c                 >    U R                    SU R                   SU 3$ )N/z/range/hdfs_checkpoint_pathjob_idrM   r   s     r   get_range_checkpoint_path/AutoCheckpointChecker.get_range_checkpoint_pathy   s$    ++,Adkk]'$HHr    c                 >    U R                    SU R                   SU 3$ )NrR   z/exe/rS   rV   s     r   get_exe_checkpoint_path-AutoCheckpointChecker.get_exe_checkpoint_path|   s$    ++,Adkk]%vFFr    c                 8    U R                    SU R                   3$ )NrR   rS   rM   s    r   get_job_path"AutoCheckpointChecker.get_job_path   s    ++,Adkk];;r    c                     U R                   $ N)rG   r]   s    r   save_checkpoint_inter+AutoCheckpointChecker.save_checkpoint_inter   s    ***r    c                 d   [        5       (       a  gU R                  S L=(       a    U R                  S L=(       aw    U R                  S L=(       ab    U R                  S L=(       aM    U R
                  S L=(       a8    U R                  S L=(       a#    U R                  S L=(       a    U R                  S L$ )NF)	r   r9   r:   r;   r<   r=   r>   r?   r@   r]   s    r   validAutoCheckpointChecker.valid   s     MM% -d*-D(- t+- t+	-
 d*- **$6-   ,		
r    c                     SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   SU R                   SU R                   S	3$ )
Nzrun_env:z
 platform:z job_id:z             hdfs_home:z hdfs_name:z
 hdfs_ugi:z"             hdfs_checkpoint_path:z trainer_id:z ce_test)r9   r:   r<   r=   r>   r?   r@   rA   r]   s    r   __str__AutoCheckpointChecker.__str__   su    $--
4>>2B(4??J[ \'{4>>2B*TMgMgLh i""&"2"2!3<hX 	Xr    c                     U R                   $ ra   )r@   r]   s    r   
trainer_id AutoCheckpointChecker.trainer_id   s    r    c                     U R                   $ ra   )r9   r]   s    r   run_envAutoCheckpointChecker.run_env       }}r    c                     U R                   $ ra   )r:   r]   s    r   platformAutoCheckpointChecker.platform       ~~r    c                     U R                   $ ra   )r;   r]   s    r   rU   AutoCheckpointChecker.job_id   s    ||r    c                     U R                   $ ra   )r<   r]   s    r   	hdfs_homeAutoCheckpointChecker.hdfs_home       r    c                     U R                   $ ra   )r=   r]   s    r   	hdfs_nameAutoCheckpointChecker.hdfs_name   rz   r    c                     U R                   $ ra   )rA   r]   s    r   ce_testAutoCheckpointChecker.ce_test   rp   r    c                     U R                   $ ra   )r>   r]   s    r   hdfs_ugiAutoCheckpointChecker.hdfs_ugi   rt   r    c                     U R                   $ ra   )r?   r]   s    r   rT   *AutoCheckpointChecker.hdfs_checkpoint_path   s    )))r    c                      [        S5      $ )N_range_)	generatorr#   r    r   generate_range_name)AutoCheckpointChecker.generate_range_name   s    ##r    )rA   rF   r?   r<   r=   r>   r;   r:   r9   rG   r@   N)__name__
__module____qualname____firstlineno__rO   rW   rZ   r^   propertyrb   re   rh   rk   rn   rr   rU   rx   r|   r   r   rT   staticmethodr   __static_attributes__r#   r    r   r&   r&   F   s    0dIG< + +
X
                   * * $ $r    r&   c                   R    \ rS rSrS rS rS rS rS/4S jrS r	S	 r
S
 rS rSrg)ExeTrainStatus   c                     SU l         S U l        S U l        S U l        S U l        S U l        S U l        S U l        S U l        S U l	        SU l
        g )Nexe_train_status)	_epoch_no	_hash_key_key_checkpoint_path_checkpoint_no_restored_from_exe_program	_exe_name_program_name
_file_namer]   s    r   rO   ExeTrainStatus.__init__   sS    	 $""	!,r    c                    U R                   UR                   :H  =(       a    U R                  UR                  :H  =(       a    U R                  UR                  :H  =(       ay    U R                  UR                  :H  =(       aY    U R                  UR                  :H  =(       a9    U R
                  UR
                  :H  =(       a    U R                  UR                  :H  $ ra   )r   r   r   r   r   r   r   rM   ts     r   __eq__ExeTrainStatus.__eq__   s    NNakk) 6!++-6		QVV#6 %%););;6 ##q'7'77	6
 !++-6 ""aoo5	
r    c                     X:X  + $ ra   r#   r   s     r   __ne__ExeTrainStatus.__ne__   s
    }r    c                     U SU R                    3n[        US5       nU R                  5       nUR                  U5        S S S 5        g ! , (       d  f       g = fNrR   wr   open
_serializewriterM   path	file_namefss        r   	serializeExeTrainStatus.serialize   F    fAdoo./	)S!Q!AGGAJ "!!   "A		
Arestored_fromc                     U R                  5       nU H  nUR                  US 5        M     [        R                  " U5      $ ra   )_to_dictpopjsondumps)rM   pop_keysdks       r   r   ExeTrainStatus._serialize   s2    MMOAEE!TN zz!}r    c                     S nU SU R                    3n[        US5       nUR                  5       nU R                  U5        S S S 5        g ! , (       d  f       g = f)NrR   r)r   r   read_deserialize)rM   r   r   r   r   r   s         r   deserializeExeTrainStatus.deserialize   sM    fAdoo./	)S!QAa  "!!s   "A
Ac                     [         R                  " U5      nUS   U l        US   U l        US   U l        US   U l        US   U l        US   U l        US   U l        g )Nepoch_nokeyhash_keycheckpoint_pathcheckpoint_noexe_nameprogram_name)	r   loadsr   r   r   r   r   r   r   )rM   r   r   s      r   r   ExeTrainStatus._deserialize   sc    JJqM:eH	: !"3 40:~.r    c           	          U R                   U R                  U R                  U R                  U R                  U R
                  U R                  U R                  S.$ )N)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r]   s    r   r   ExeTrainStatus._to_dict   sL    99#44!00 ..!00	
 		
r    c                 $    U R                  / 5      $ ra   r   r]   s    r   rh   ExeTrainStatus.__str__	      r""r    )r   r   r   r   r   r   r   r   r   r   r   N)r   r   r   r   rO   r   r   r   r   r   r   r   rh   r   r#   r    r   r   r      s7    -	
 $3"3 !/

#r    r   c                       \ rS rSr SS jrS rS rS rS r\	S 5       r
S	 rS
S/4S jr\	S 5       rS rS rS rS rS rS rSrg)TrainEpochRangei  Nc                 8   Xl         SU l        X l        S U l        0 U l        SU l        [        U l        Ub  X0l        OU R                  R                  U l        U R                  S:  d   SU R                   S35       e[        R                  " 5       U l        S U l        S U l        U R                  R                  5       (       d  g SU l        U(       d  g U R                  R!                  U5      U l        U R                  R$                  U R                  R&                  S.nU R                  R(                  (       a  S nSSKJn  U" U R                  R.                  U5      U l        [3        U R0                  5      U l        [7        5         U R9                  5         g )	Nr   Fr   zcheckpoint inter:z	 must >=0range_train_status)zfs.default.namezhadoop.job.ugi)
HDFSClient)_max_epoch_numr   _namer   _exe_status_flag_generated	g_checker_checkerrG   rb   time_last_checkpoint_time_load_cp_nos_checkpoint_epoch_nore   r   rW   r   r|   r   r   !paddle.distributed.fleet.utils.fsr   rx   _hdfsr	   _cperr$   _get_last_valid_checkpoint)rM   max_epoch_numr   checkpoint_interrestoredconfigr   s          r   rO   TrainEpochRange.__init__  sO    ,
"$!'*:'*.--*M*MD'**a/ 	
 ; ;<IF	
/ &*YY[" $(!}}""$$. $ G G M  $}}66"mm44

 ==  F@ 7 7@
$TZZ0
'')r    c           	         / nSnUS S S2    H  n[        U R                  U R                  SS9nU R                  R	                  U R
                  U/U R                  R                  UU R                  R                  S9  UR                  U5        [        R                  SU SUR                  5        35        US:  a  UR                  nM  X5R                  -
  S:  d  M  XT4s  $    g	)
Nr   F)r   r   local_cache_pathzlook for valid:z t:r   r   )NN)r   r   r   r   load_checkpointr   r   rk   rF   appendr   debugr   r   )rM   cp_noscpsr   ir   s         r   _look_for_validTrainEpochRange._look_for_validA  s    "A 3 3TYYOAJJ&&%%((!%!8!8 '  JJqMLL?1#S0@AB!|;;kk)Q.4K   r    c                    U R                   R                  U R                  5      U l        [        R                  SU R                   35        [        U R                  5      S:  a  [        U l        g [        [        :X  a  U R                   R                  U R                  U /U R                  R                  U R                  R                  S9  [        U l        U R                   U l        [        R                  SU R%                  5        35        g [        [&        :X  a  U R)                  U R                  5      u  pUc  [        U l        g U R                   R                  U R                  U /U R                  R                  UU R                  R                  S9  [        U l        U R                   U l        [        R                  SU R%                  5        35        g [+        S[         35      e)Nzfind checkpoint nos:r   r   z!load tain_epoch_range checkpoint:r   znot supported acp_type:)r   get_checkpoint_nor   r   r   inforH   CONST_MEMORYINITr   
g_acp_typeCONST_ACP_TYPEr   r   rk   rF   CONST_CHECKPOINTr   r   r   CONST_DACP_TYPEr   AssertionError)rM   r   r   s      r   r   *TrainEpochRange._get_last_valid_checkpointV  s    JJ889N9NO*4+<+<*=>?t  !A%"2D'JJ&&%%((!%!8!8	 '  #3D(,D%KK;DOO<M;NOP?*''(9(9:DAy&6#JJ&&%%((!%!8!8 '  #3D(,D%KK;DOO<M;NOP #::,!GHHr    c                     U R                   U R                  U R                  U R                  U R                  U R
                  S.nU$ )N)r   r   r   r   r   checkpoint_epoch_no)r   r   r   r   r   r   )rM   r   s     r   r   TrainEpochRange._to_dict  sB    !00JJ#44!00#'#<#<
 r    c                 $    U R                  / 5      $ ra   r   r]   s    r   rh   TrainEpochRange.__str__  r   r    c                     U R                   $ ra   )r   r]   s    r   r   TrainEpochRange.name  s    zzr    c                     U SU R                    3n[        US5       nU R                  5       nUR                  U5        S S S 5        g ! , (       d  f       g = fr   r   r   s        r   r   TrainEpochRange.serialize  r   r   r   r  c                    U R                  5       nU H  nUR                  US 5        M     0 US'   US   nU R                  R                  5        H!  u  p5UR	                  5       XER
                  '   M#     [        R                  " U5      $ )N
exe_status)r   r   r   itemsr   r   r   r   )rM   r   r   r   rN   r   s         r   r   TrainEpochRange._serialize  sr    MMOAEE!TN  ,lO$$**,DAAffI -zz!}r    c                     U R                   $ ra   )r   r]   s    r   r   TrainEpochRange.restored_from  s    """r    c                    S nU SU R                    3n[        US5       n[        R                  " U5      nS S S 5        US   U l        US   U l        US   U l        US   U l        US   nUR                  5        H.  u  pg[        5       nUR                  U5        XR                  U'   M0     g ! , (       d  f       N~= f)NrR   r   r   r   r   r   r  )r   r   r   loadr   r   r   r   r  r   r   r   )	rM   r   r   r   r   rN   r   vr   s	            r   r   TrainEpochRange.deserialize  s    fAdoo./	)S!Q		!A "  0:vY
 !"3 4 lOGGIDA ANN1"#Q  "!s   B//
B=c              #     #    [        5         U R                  S:  a  [        R                  U l        U R                  S:  d   SU R                   S35       e[
        R
                  " 5       U l        U R                  S-   n[        R                  SU SU R                   35        [        XR                  5       H  nX l        Uv   U R                  5         M     g 7f)Nr   r   zself._epoch_no:z
 must >=-1r   zstarted epoch_no:z max_epoch_num:)r$   r   rK   maxsizer   r   r   r   r  rangesave_checkpoint)rM   startr   s      r   nextTrainEpochRange.next  s     ""%++D~~# 	
dnn-Z8	
# &*YY[""wod6I6I5JK	
 u112ANG  "	 3s   CCc                     U R                   $ ra   )r   r]   s    r   getTrainEpochRange.get  s    ~~r    c                    U R                   R                  S:X  a  [        R                  " 5       U R                  -
  U R                  :  av  [
        [        :X  a>  U R                  S:  a-  U R                  U R                  S-
  :w  a  U R                  5         O*[
        [        :X  a  U R                  5         O[        S5      e[        R                  " 5       U l        g g )Nr   r   z#not supported acp_type:{g_acp_type})r   rk   r   r   rG   r  r  r   r   _save_checkpointr  r  r]   s    r   r   TrainEpochRange.save_checkpoint  s    ==##q(		d888../ / ++a/ NNd.A.AA.EE--/?2))+()NOO)-D&! )r    c                 x   U R                   R                  5       (       d  gU R                  nU R                  R                  5        H  u  p#[	        UR
                  UR                  5      nU R                   R                  UR                  5      nU R                  5       Ul
        U R                  R                  UU/U R                   R                  U R                   R                  S9u  pgXcl        Xsl        X1UR"                  '   [$        R'                  SUR)                  5        35        M     [+        U R                  5      S:  aq  U R                  R                  U R                  U /U R                   R                  S9  [$        R-                  SU R)                  5        35        U R/                  5         gg)zK
status => /jobid/xxx_range_xx/range/
model =>                       /exe/
Nr   zsave executor checkpoint:r   z"save train_epoch_range checkpoint:)r   re   r   r  r
   r   r   rZ   r   r%  r   r   r   rk   rF   r   r   r   r   r   r   rH   r  _generate_flag)rM   rN   r   r   mpr   r   s           r   r(   TrainEpochRange._save_checkpoint  se   
 }}""$$$$**,DAAFFAJJ/A55akkBA((*AK"&**"<"<((!%!8!8	 #= #D "&,affILL4Q\\^4DEF! -$ t 1$JJ&&%%!%!8!8 ' 
 KK4T__5F4GH ! %r    c                 >   U R                   (       a  g SnU R                  R                  5       S-   U-   n[        R	                  S5        U R
                  R                  U R                  R                  5       5        U R
                  R                  USS9  SU l         g )Nzcan_be_auto_checkpoint.flagrR   zthis job can_be_auto_checkpointT)exist_ok)r   r   r^   r   r  r   mkdirstouch)rM   r   r   s      r   r+  TrainEpochRange._generate_flag  sz    ,}}))+c1D856

$--4467

-#r    )r   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   )NT)r   r   r   r   rO   r   r   r   rh   r   r   r   r   r   r   r"  r%  r   r(  r+  r   r#   r    r   r   r     s}    CG1*f*'IR	#   $34I"J  # #$&#,5(%"N
$r    r   c                      [         $ ra   )g_train_epoch_ranger#   r    r   _get_train_epoch_ranger6    s    r    c                     U R                  5       nSnSn[        UR                  5       HE  u  pEUR                  5       (       a  SnUR	                  5       (       a  SnU(       d  M<  U(       d  ME    g   g)NFT)global_block	enumerateops_is_backward_op_is_optimize_op)programr8  has_backwardhas_optidxops         r   _check_program_oprolerB     sj    '')LLG\--.LG<GG / r    c                    [        U [        R                  5      (       d  [        U [        5      (       d  g[        U [        R                  5      (       a*  U R                  b  U R                  R
                  (       a  gOU R
                  (       a  g[        U 5      nUR                  [        ;   a  [        UR                     (       d  gOSn[        U[        R                  5      (       a  [        UR                  5      nO[        U5      nU[        UR                  '   U(       d$  [        R                  SUR                   S35        g[        R                  5       =(       a    [        S L$ )NFzprogram z need't to auto checkpoint)
isinstancer   CompiledProgramr   r   _is_distributed_get_valid_program_auto_checkpoint_nameg_program_attrrB  r   r   r   re   r5  )progr=  rets      r   _can_auto_checkpointrL  1  s   dH4455jg? ? $0011== DMM$A$A %B  &G$$6g;;< = gx7788'(8(89C'0C8;w445LL78899ST ??@!4D!@@r    c                     U  SU 3$ )N_r#   )r   r   s     r   _get_running_keyrO  T  s    Zq''r    c                  F    [        S5        [        c
  [        5       q[        $ )N   )r   r   r&   r#   r    r   _get_checkerrR  X  s    O)+	r    c              #   d   #    U S:  a  [         R                  n [        SU 5       S h  vN   g  N7f)Nr   )rK   r  r  )r   s    r   _normal_yieldrT  a  s&     qQ&&&s   &0.0c              #     #    [        5       R                  5       (       d,  [        R                  S5        [	        U 5       H  nUv   M	     g [
        [        :X  a  [	        U 5       H  nUv   M	     g [        q[        R                  S[
         35         [        U [        R                  5       US9q[        R                  5        H  nUv   M	     S qg ! S qf = f7f)Nz=auto checkpoint will take effect automatically on PaddleCloudz	acp_type:)r   )rR  re   r   warningrT  r  r  r  r  r   r   r   r5  r"  )r   rb   r   s      r   train_epoch_rangerW  g  s     >!!K	
 }-AG . 	_$}-AG . 	J
KK)J<()
#-))+2
 %))+AG , #ds   BC<C CCCc                 \    [        U [        R                  5      (       a  U R                  $ U $ ra   )rD  r   rE  r   )rJ  s    r   rG  rG    s$    $0011}}Kr    c                    [        5         U R                  c   e[        U5      (       d  g [        U5      nUR                  c   e[        R
                  n[        U R                  UR                  5      n[        R                  [        :X  a  XC;   d   SU S[         35       eS nXC;   a  X4   nUR                  c  [        [        R                  5      n[        X5      nUR                  [        R                  U5      U/[        R                   UR"                  [        R$                  S9  [        Ul	        [&        R)                  SU 35        Xl        X%l        [        R/                  5       Ul        O[3        5       n[        R/                  5       Ul        XEl        XEl        [8        Ul	        Xl        X%l        U R                  Ul        UR                  Ul        XSU'   [&        R)                  S5        [?        5         g )Nzwhen restored key:z must be in train_epoch_range:)rk   r   r   zload executor checkpoint z+not found checkpoint, so train from epoch 0) rR  rH  rL  rG  r5  r   rO  r   r  r   r	   r   r
   r   r   rZ   rk   r   rF   r   r  r   r   r%  r   r   r   r   r  r   r   r$   )exerJ  r=  r  r   r   ar,  s           r   _auto_checkpointr\    s   N$$000%% &G((444$00J
!!7#@#@C ((,<<  	
 %CDWCXY	
  	A
O# 3 9 9:AC)A11#6$//..!*!4!4    0AKK3A378
)--/)--/+
//!77 3ABr    )auto_checkpointra   )*r   r   rB   rK   r   	threadingr   paddle.baser   r   paddle.base.frameworkr   r   checkpoint_saverr	   r
   r   r5  r   r   UniqueNameGeneratorr   r  r  r  r  r  rI  r   r$   r&   r   r   r6  rB  rL  rO  rR  rT  rW  rG  r\  r#   r    r   <module>rc     s      	 
  $ - : L L 		++-	    
&z$ z$zG#% G#TL$& L$^" AF('#D6r    