
    x-jR                     H   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 d dl
mZmZ ddlmZmZmZ dadada e	j                    ZdZdZd	Zd
Zdai ZddZd Z G d d          Z G d de          Z G d de          Z d Z!d Z"d Z#d Z$d Z%d Z&ddZ'd Z(d Z)dS )    N)current_thread)compilerunique_name)Programin_dygraph_mode   )CheckpointSaverPaddleModelSerializableBase
checkpointmemory_initdacpacpauto_checkpointc                 L   t           t           S t          j        |          a t                               |            dt           _        t          j                    }t          j        d          }|                    |           t                               |           t           S )NFz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)	loggerlogging	getLoggersetLevel	propagateStreamHandler	FormattersetFormatter
addHandler)	log_levelnamelog_handler
log_formats       o/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/base/incubate/checkpoint/auto_checkpoint.py_get_loggerr    -   s    t$$F
OOIF'))K"H J Z(((
k"""M    c                  H    t                      j        dk    s
J d            d S )N
MainThreadz*auto checkpoint must run under main thread)r   r    r!   r   _thread_checkerr%   @   s-     L0004 10000r!   c                   $   e Zd Zd Zd Zd Zd Zed             Zd Z	d Z
ed             Zed	             Zed
             Zed             Zed             Zed             Zed             Zed             Zed             Zed             ZdS )AutoCheckpointCheckerc                    d | _         d | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _        t          j
        d          | _         | j         dk    rd S 	 t          j        d         | _        t          j        d         | _        t          j        d         | _        t          j        d         | _        t          j        d         | _        t          j        d         | _        t          t          j        d	                   | _        t          t          j
        d
d                    | _        t          j
        dd          | _        t          t          j
        dd                    | _        | j        slt          | j                  dk    rHt          | j                  dk    r0t          | j                  dk    rt          | j                  dk    s
J d            d S t          | j                  dk    rt          | j                  dk    s
J d            d S # t           $ r<}t"                              d|            t'          j        d           Y d }~d S d }~ww xY w)NPADDLE_RUNNING_ENVPADDLE_EDL_AUTO_CHECKPOINTPADDLE_RUNNING_PLATFORMPADDLE_JOB_IDPADDLE_EDL_HDFS_HOMEPADDLE_EDL_HDFS_NAMEPADDLE_EDL_HDFS_UGIPADDLE_EDL_HDFS_CHECKPOINT_PATHPADDLE_TRAINER_IDPADDLE_EDL_ONLY_FOR_CE_TEST0PADDLE_EDL_FS_CACHEz.cache PADDLE_EDL_SAVE_CHECKPOINT_INTER900      r   zhdfs environ must setz
exception:r   )_run_env	_platform_job_id
_hdfs_home
_hdfs_name	_hdfs_ugi_hdfs_checkpoint_path_trainer_id_ce_testosgetenvenvironint	_fs_cache_save_checkpoint_interlen	Exceptionr   fatalsysexit)selfes     r   __init__zAutoCheckpointChecker.__init__G   sL   %)"	"677=888F!	Z(ABDN:o6DL j)?@DO j)?@DOZ(=>DN)+1*D&  #2:.A#BCCD	*G M MNNDMY'<hGGDN*-	<eDD+ +D' = +((1,,DO,,q00DN++a//D677!;;;* <;< <; ((1,,D677!;;;* <;<;;  	 	 	LL)a))***HQKKKKKKKKK	s   'FH, 0:H, ,
I261I--I2c                 *    | j          d| j         d| S )N/z/range/hdfs_checkpoint_pathjob_idrM   r   s     r   get_range_checkpoint_pathz/AutoCheckpointChecker.get_range_checkpoint_pathy   s#    +HHdkHH$HHHr!   c                 *    | j          d| j         d| S )NrQ   z/exe/rR   rU   s     r   get_exe_checkpoint_pathz-AutoCheckpointChecker.get_exe_checkpoint_path|   s#    +FFdkFFFFFr!   c                 $    | j          d| j         S )NrQ   rR   rM   s    r   get_job_pathz"AutoCheckpointChecker.get_job_path   s    +;;dk;;;r!   c                     | j         S N)rG   rZ   s    r   save_checkpoint_interz+AutoCheckpointChecker.save_checkpoint_inter   s    **r!   c                     t                      rdS | j        d uo>| j        d uo5| j        d uo,| j        d uo#| j        d uo| j        d uo| j        d uo| j        d uS )NF)	r   r9   r:   r;   r<   r=   r>   r?   r@   rZ   s    r   validzAutoCheckpointChecker.valid   s     	5 M% -d*-D(- t+- t+	-
 d*- *$6-  ,		
r!   c                     d| j          d| j         d| j         d| j         d| j         d| j         d| j         d| j         d	S )
Nzrun_env:z
 platform:z job_id:z             hdfs_home:z hdfs_name:z
 hdfs_ugi:z"             hdfs_checkpoint_path:z trainer_id:z ce_test)r9   r:   r<   r=   r>   r?   r@   rA   rZ   s    r   __str__zAutoCheckpointChecker.__str__   s    X$- X X4> X X4? X XX X37>X XMQMgX X"&"2X X@DX X X 	Xr!   c                     | j         S r]   )r@   rZ   s    r   
trainer_idz AutoCheckpointChecker.trainer_id   s    r!   c                     | j         S r]   )r9   rZ   s    r   run_envzAutoCheckpointChecker.run_env   
    }r!   c                     | j         S r]   )r:   rZ   s    r   platformzAutoCheckpointChecker.platform   
    ~r!   c                     | j         S r]   )r;   rZ   s    r   rT   zAutoCheckpointChecker.job_id   s
    |r!   c                     | j         S r]   )r<   rZ   s    r   	hdfs_homezAutoCheckpointChecker.hdfs_home   
    r!   c                     | j         S r]   )r=   rZ   s    r   	hdfs_namezAutoCheckpointChecker.hdfs_name   rn   r!   c                     | j         S r]   )rA   rZ   s    r   ce_testzAutoCheckpointChecker.ce_test   rg   r!   c                     | j         S r]   )r>   rZ   s    r   hdfs_ugizAutoCheckpointChecker.hdfs_ugi   rj   r!   c                     | j         S r]   )r?   rZ   s    r   rS   z*AutoCheckpointChecker.hdfs_checkpoint_path   s    ))r!   c                       t          d          S )N_range_)	generatorr$   r!   r   generate_range_namez)AutoCheckpointChecker.generate_range_name   s    ###r!   N)__name__
__module____qualname__rO   rV   rX   r[   propertyr^   r`   rb   rd   rf   ri   rT   rm   rp   rr   rt   rS   staticmethodry   r$   r!   r   r'   r'   F   s       0 0 0dI I IG G G< < < + + X+
 
 
X X X
     X    X   X   X   X   X   X   X * * X* $ $ \$ $ $r!   r'   c                   J    e Zd Zd Zd Zd Zd ZdgfdZd Zd Z	d	 Z
d
 ZdS )ExeTrainStatusc                     d| _         d | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _	        d| _
        d S )Nexe_train_status)	_epoch_no	_hash_key_key_checkpoint_path_checkpoint_no_restored_from_exe_program	_exe_name_program_name
_file_namerZ   s    r   rO   zExeTrainStatus.__init__   sW    	 $""	!,r!   c                     | j         |j         k    o_| j        |j        k    oO| j        |j        k    o?| j        |j        k    o/| j        |j        k    o| j        |j        k    o| j        |j        k    S r]   )r   r   r   r   r   r   r   rM   ts     r   __eq__zExeTrainStatus.__eq__   s    Nak) 6!+-6	QV#6 %);;6 #q'77	6
 !+-6 "ao5	
r!   c                     | |k     S r]   r$   r   s     r   __ne__zExeTrainStatus.__ne__   s    19}r!   c                     | d| j          }t          |d          5 }|                                 }|                    |           d d d            d S # 1 swxY w Y   d S NrQ   wr   open
_serializewriterM   path	file_namefss        r   	serializezExeTrainStatus.serialize       //do//	)S!! 	Q!!AGGAJJJ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	   *AAArestored_fromc                     |                                  }|D ]}|                    |d            t          j        |          S r]   )_to_dictpopjsondumps)rM   pop_keysdks       r   r   zExeTrainStatus._serialize   s?    MMOO 	 	AEE!TNNNNz!}}r!   c                     d }| d| j          }t          |d          5 }|                                }|                     |           d d d            d S # 1 swxY w Y   d S )NrQ   r)r   r   read_deserialize)rM   r   r   r   r   r   s         r   deserializezExeTrainStatus.deserialize   s    //do//	)S!! 	!QAa   	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!s   *AAAc                     t          j        |          }|d         | _        |d         | _        |d         | _        |d         | _        |d         | _        |d         | _        |d         | _        d S )Nepoch_nokeyhash_keycheckpoint_pathcheckpoint_noexe_nameprogram_name)	r   loadsr   r   r   r   r   r   r   )rM   r   r   s      r   r   zExeTrainStatus._deserialize   si    JqMM:eH	: !"3 40:~.r!   c           	      h    | j         | j        | j        | j        | j        | j        | j        | j        dS )N)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   rZ   s    r   r   zExeTrainStatus._to_dict   s=    9#4!0 .!0	
 	
 		
r!   c                 ,    |                      g           S r]   r   rZ   s    r   rb   zExeTrainStatus.__str__	      r"""r!   N)rz   r{   r|   rO   r   r   r   r   r   r   r   rb   r$   r!   r   r   r      s        - - -	
 	
 	
     $3"3    ! ! !/ / /

 

 

# # # # #r!   r   c                       e Zd Z	 ddZd Zd Zd Zd Zed             Z	d	 Z
d
dgfdZed             Zd Zd Zd Zd Zd Zd ZdS )TrainEpochRangeNTc                    || _         d| _        || _        d | _        i | _        d| _        t          | _        ||| _        n| j        j	        | _        | j        dk    sJ d| j         d            t          j
                    | _        d | _        d | _        | j                                        sd S d| _        |sd S | j                            |          | _        | j        j        | j        j        d}| j        j        rd }ddlm}  || j        j        |          | _        t3          | j                  | _        t7                       |                                  d S )	Nr   Fr   zcheckpoint inter:z	 must >=0range_train_status)zfs.default.namezhadoop.job.ugi)
HDFSClient)_max_epoch_numr   _namer   _exe_status_flag_generated	g_checker_checkerrG   r^   time_last_checkpoint_time_load_cp_nos_checkpoint_epoch_nor`   r   rV   r   rp   rt   rr   !paddle.distributed.fleet.utils.fsr   rm   _hdfsr	   _cperr%   _get_last_valid_checkpoint)rM   max_epoch_numr   checkpoint_interrestoredconfigr   s          r   rO   zTrainEpochRange.__init__  s|    ,
"$!'*:D''*.-*MD'*a///F ;FFF 0// &*Y[[" $(!}""$$ 	F. 	F $ G G M M  $}6"m4
 

 =  	F@@@@@@Z 7@@
$TZ00
'')))))r!   c                    g }d}|d d d         D ]}t          | j        | j        d          }| j                            | j        |g| j        j        || j        j                   |	                    |           t                              d| d|                                            |dk     r|j        }||j        z
  dk    r||fc S d	S )
Nr   F)r   r   local_cache_pathzlook for valid:z t:r   r   )NN)r   r   r   r   load_checkpointr   r   rd   rF   appendr   debugr   r   )rM   cp_noscpsr   ir   s         r   _look_for_validzTrainEpochRange._look_for_validA  s    " 	  	 A 3TYOOOAJ&&%(!%!8 '    JJqMMMLLA1AAAABBB!||;ak)Q..a4KKK /zr!   c                    | j                             | j                  | _        t                              d| j                    t          | j                  dk     rt          | _        d S t          t          k    r| j                             | j        | g| j        j        | j        j                   t          | _        | j        | _        t                              d|                                             d S t          t&          k    r|                     | j                  \  }}|t          | _        d S | j                             | j        | g| j        j        || j        j                   t          | _        | j        | _        t                              d|                                             d S t+          dt                     )Nzfind checkpoint nos:r   r   z!load tain_epoch_range checkpoint:r   znot supported acp_type:)r   get_checkpoint_nor   r   r   inforH   CONST_MEMORYINITr   
g_acp_typeCONST_ACP_TYPEr   r   rd   rF   CONST_CHECKPOINTr   r   r   CONST_DACP_TYPEr   AssertionError)rM   r   r   s      r   r   z*TrainEpochRange._get_last_valid_checkpointV  s    J889NOO>4+<>>???t !!A%%"2DF''J&&%(!%!8	 '    #3D(,D%KKODOO<M<MOOPPPPP?**''(9::DAqy&6#J&&%(!%!8 '    #3D(,D%KKODOO<M<MOOPPPPP !G:!G!GHHHr!   c                 T    | j         | j        | j        | j        | j        | j        d}|S )N)r   r   r   r   r   checkpoint_epoch_no)r   r   r   r   r   r   )rM   r   s     r   r   zTrainEpochRange._to_dict  s7    !0J#4!0#'#<
 
 r!   c                 ,    |                      g           S r]   r   rZ   s    r   rb   zTrainEpochRange.__str__  r   r!   c                     | j         S r]   )r   rZ   s    r   r   zTrainEpochRange.name  s
    zr!   c                     | d| j          }t          |d          5 }|                                 }|                    |           d d d            d S # 1 swxY w Y   d S r   r   r   s        r   r   zTrainEpochRange.serialize  r   r   r   r   c                    |                                  }|D ]}|                    |d            i |d<   |d         }| j                                        D ]!\  }}|                                ||j        <   "t          j        |          S )N
exe_status)r   r   r   itemsr   r   r   r   )rM   r   r   r   rN   r   s         r   r   zTrainEpochRange._serialize  s    MMOO 	 	AEE!TNNNN ,lO$**,, 	' 	'DAqAafIIz!}}r!   c                     | j         S r]   )r   rZ   s    r   r   zTrainEpochRange.restored_from  s    ""r!   c                    d }| d| j          }t          |d          5 }t          j        |          }d d d            n# 1 swxY w Y   |d         | _        |d         | _        |d         | _        |d         | _        |d         }|                                D ]2\  }}t                      }|
                    |           || j        |<   3d S )NrQ   r   r   r   r   r   r   )r   r   r   loadr   r   r   r   r   r   r   r   )	rM   r   r   r   r   rN   r   vr   s	            r   r   zTrainEpochRange.deserialize  s   //do//	)S!! 	Q	!A	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  0:vY
 !"3 4 lOGGII 	$ 	$DAq  ANN1"#DQ	$ 	$s   A  AAc              #     K   t                       | j        dk     rt          j        | _        | j        dk    sJ d| j         d            t          j                    | _        | j        dz   }t                              d| d| j                    t          || j                  D ]!}|| _        |V  | 
                                 "d S )Nr   r   zself._epoch_no:z
 must >=-1r   zstarted epoch_no:z max_epoch_num:)r%   r   rK   maxsizer   r   r   r   r   rangesave_checkpoint)rM   startr   s      r   nextzTrainEpochRange.next  s      """%+D~###8dn888 $## &*Y[[""KKKd6IKK	
 	
 	
 ud122 	# 	#ADNGGG  """"		# 	#r!   c                     | j         S r]   )r   rZ   s    r   getzTrainEpochRange.get  s
    ~r!   c                    | j         j        dk    rt          j                    | j        z
  | j        k    rwt
          t          k    r3| j        dk    r'| j        | j        dz
  k    r| 	                                 n4t
          t          k    r| 	                                 nt          d          t          j                    | _        d S d S )Nr   r   z#not supported acp_type:{g_acp_type})r   rd   r   r   rG   r   r   r   r   _save_checkpointr   r   rZ   s    r   r   zTrainEpochRange.save_checkpoint  s    =#q((	d88./ / // +a// Nd.AA.EEE--///?22))++++()NOOO)-D&&&! )(r!   c                 :   | j                                         sdS | j        }| j                                        D ]\  }}t	          |j        |j                  }| j                             |j                  }| 	                                |_
        | j                            ||g| j         j        | j         j                  \  }}||_        ||_        |||j        <   t$                              d|                                            t+          | j                  dk    rr| j                            | j        | g| j         j                   t$                              d|                                             |                                  dS dS )zc
        status => /jobid/xxx_range_xx/range/
        model =>                       /exe/
        Nr   zsave executor checkpoint:r   z"save train_epoch_range checkpoint:)r   r`   r   r   r
   r   r   rX   r   r   r   r   r   rd   rF   r   r   r   r   r   r   rH   r   _generate_flag)rM   rN   r   r   mpr   r   s           r   r   z TrainEpochRange._save_checkpoint  s   
 }""$$ 	F$**,, 	G 	GDAqAFAJ//A55akBBA((**AK"&*"<"<(!%!8	 #= # #D- "&A,AAafILLEQ\\^^EEFFFFt  1$$J&&%!%!8 '   
 KKHT__5F5FHH   !!!!! %$r!   c                 6   | j         rd S d}| j                                        dz   |z   }t                              d           | j                            | j                                                   | j                            |d           d| _         d S )Nzcan_be_auto_checkpoint.flagrQ   zthis job can_be_auto_checkpointT)exist_ok)r   r   r[   r   r   r   mkdirstouch)rM   r   r   s      r   r  zTrainEpochRange._generate_flag  s     	F,}))++c1D85666
$-4466777
---#r!   )NT)rz   r{   r|   rO   r   r   r   rb   r}   r   r   r   r   r   r   r   r   r   r  r$   r!   r   r   r     s+       CG1* 1* 1* 1*f  *'I 'I 'IR	 	 	# # #   X   $34I"J     # # X#$ $ $&# # #,  5 5 5(%" %" %"N
$ 
$ 
$ 
$ 
$r!   r   c                      t           S r]   )g_train_epoch_ranger$   r!   r   _get_train_epoch_ranger
    s    r!   c                     |                                  }d}d}t          |j                  D ]8\  }}|                                rd}|                                rd}|r|r dS 9dS )NFT)global_block	enumerateops_is_backward_op_is_optimize_op)programr  has_backwardhas_optidxops         r   _check_program_oproler     s    ''))LLG\-..  R 	 L 	G 	G 	445r!   c                 j   t          | t          j                  st          | t                    sdS t          | t          j                  r| j        | j        j        rdS n	| j        rdS t          |           }|j        t          v rt          |j                 sdS nvd}t          |t          j                  rt          |j                  }nt          |          }|t          |j        <   |s%t                              d|j         d           dS t                                          ot          d uS )NFzprogram z need't to auto checkpoint)
isinstancer   CompiledProgramr   r   _is_distributed_get_valid_program_auto_checkpoint_nameg_program_attrr  r   r   r   r`   r	  )progr  rets      r   _can_auto_checkpointr   1  sE   dH455 jg? ?  u$011 = DM$A 5 !  	5 &&G$66g;< 	5	 gx788 	1'(899CC'00C8;w45 	LLT78TTT   5??@!4D!@@r!   c                     |  d| S )N_r$   )r   r   s     r   _get_running_keyr#  T  s    '''''r!   c                  X    t          d           t          t                      at          S )N   )r    r   r'   r$   r!   r   _get_checkerr&  X  s$    OOO)++	r!   c              #   Z   K   | dk     rt           j        } t          d|           E d {V  d S )Nr   )rK   r   r   )r   s    r   _normal_yieldr(  a  s@      qQ&&&&&&&&&&&r!   c              #     K   t                                                      s2t                              d           t	          |           D ]}|V  d S t
          t          k    rt	          |           D ]}|V  d S t          at                              dt
                      	 t          | t                                          |          at                                          D ]}|V  	 d ad S # d aw xY w)Nz=auto checkpoint will take effect automatically on PaddleCloudz	acp_type:)r   )r&  r`   r   warningr(  r   r   r   r   r   r   ry   r	  r   )r   r^   r   s      r   train_epoch_ranger+  g  s.     >>!! K	
 	
 	
 }-- 	 	AGGGG_$$}-- 	 	AGGGGJ
KK(J(()))
#-))++2
 
 
 %))++ 	 	AGGGG	 #d""""s   'A	C5 5C9c                 H    t          | t          j                  r| j        S | S r]   )r  r   r  r   )r  s    r   r  r    s$    $011 }Kr!   c                     t                       | j        J t          |          sd S t          |          }|j        J t          j        }t          | j        |j                  }t          j        t          k    r||v sJ d| dt                       d }||v r||         }|j	        t          t          j                  }t          | |          }|                    t                              |          |gt          j        |j        t          j                   t          |_	        t&                              d|            | |_        ||_        t                                          |_        nt3                      }t                                          |_        ||_        ||_        t8          |_	        | |_        ||_        | j        |_        |j        |_        |||<   t&                              d           t?                       d S )Nzwhen restored key:z must be in train_epoch_range:)rd   r   r   zload executor checkpoint z+not found checkpoint, so train from epoch 0) r&  r  r   r  r	  r   r#  r   r   r   r	   r   r
   r   r   rX   rd   r   rF   r   r   r   r   r   r   r   r   r   r   r   r   r%   )exer  r  r   r   r   ar  s           r   _auto_checkpointr0    s   NNN$000%%  &&G(444$0J
!7#@ C (,<<<j   YYYDWYY !   	A
jsO# 3 9::AC))A11#66$/.!*!4      0AKK7A77888
)--//)--//+
/!7 
3ABBBr!   )r   r]   )*r   r   rB   rK   r   	threadingr   paddle.baser   r   paddle.base.frameworkr   r   checkpoint_saverr	   r
   r   r	  r   r   UniqueNameGeneratorrx   r   r   r   r   r   r  r    r%   r'   r   r   r
  r  r   r#  r&  r(  r+  r  r0  r$   r!   r   <module>r6     s-     				 



  $ $ $ $ $ $ - - - - - - - - : : : : : : : : L L L L L L L L L L 		+K+--	    
   &  z$ z$ z$ z$ z$ z$ z$ z$zG# G# G# G# G#% G# G# G#TL$ L$ L$ L$ L$& L$ L$ L$^    " A  A  AF( ( (  ' ' '# # # #D  6 6 6 6 6r!   