
    Αi3V                        S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJrJr  S SKJr  SSKJr  \" SS5      rS	rS
rSrSr " S S5      r " S S5      r " S S5      r " S S5      rg)    )annotationsN)cloud_utilslaunch_utils)
get_logger   )getenv_or_backupINFOELASTICe   f   x   <   c                      \ rS rSrSrSrSrg)ElasticLevel,          N)__name__
__module____qualname____firstlineno__FAULT_TOLERANCEr
   __static_attributes__r       h/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/fleet/elastic/manager.pyr   r   ,   s    OGr   r   c                  (    \ rS rSrSrSrSrSrSrSr	g)	ElasticStatus1   	completederrorholdrestartexitr   N)
r   r   r   r   	COMPLETEDERRORHOLDRESTARTEXITr   r   r   r   r   r   1   s    IEDGDr   r   c                  8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
LauncherInterface9   c                    Xl         / U l        g Nargsprocs)selfr0   s     r   __init__LauncherInterface.__init__:   s    	
r   c                   [         R                  S:w  a  U R                   H  nUR                  R	                  5       b  M   [         R
                  " [         R                  " UR                  R                  5      [        R                  5        UR                  (       a  UR                  R                  5         [        R                  SUR                  R                   35        M     [        R                  " S5        U R                   H  nUR                  R	                  5       b  M   UR                  R!                  5         UR                  (       a  UR                  R                  5         [        R                  SUR                  R                   35        M     [#        SS5       H  nSnU R                   H[  nUR                  R	                  5       b  M   [         R$                  " UR                  R                  [        R&                  5        SnM]     U(       d  [        R                  S	5          g[        R                  " S5        M     g)
Nntzterminate process group gid:r   zterminate process id:r   2   FTzterminated all the procs)osnamer1   procpollkillpggetpgidpidsignalSIGTERMlog_fncloseloggerinfotimesleep	terminaterangekillSIGKILL)r2   pstepalives       r   _terminate_procs"LauncherInterface._terminate_procs>   sd   77d?ZZ66;;=(IIbjj4fnnExx(KK">qvvzzl KL   JJqMAvv{{}$  "88HHNN$3AFFJJ<@A  !RLDEZZ66;;=(GGAFFJJ7 E  
 67JJqM ! r   c                \   SnS nU R                    H  nUR                  R                  5       nUc  SnM$  US:w  d  M,  U[        :X  a  [        R                  S5        Us  $ [        R                  S5        [        R                  SUR                   SU S35        UnM     U(       d  Uc  gU$ )	NFTr   z+return form elastic auto parallel re-launchzABORT!!! ABORT!!! ABORT!!!zERROR rank z error with exit code z, check log for detail.)r1   r:   r;   ELASTIC_AUTO_PARALLEL_EXIT_CODErC   rD   r!   rank)r2   rM   resultrK   rets        r   _check_procsLauncherInterface._check_procs^   s    A&&++-C{99KK MNJ9:!!&&)?uD[\   Mr   c                    [         er.   NotImplementedErrorr2   s    r   launchLauncherInterface.launchs       !!r   c                    [         er.   rX   rZ   s    r   stopLauncherInterface.stopv   r]   r   c                    [         er.   rX   rZ   s    r   watchLauncherInterface.watchy   r]   r   r/   N)r   r   r   r   r3   rN   rU   r[   r_   rb   r   r   r   r   r+   r+   9   s!    @*"""r   r+   c                      \ rS rSrS r S       SS jjrSS jrS rSS jrS r	S r
SSS
 jjrS rS rS rS rS rS rS rS rS rSrg	)ElasticManager}   c                F  ^ ^^ UT l         UR                  =(       d    [        R                  " S5      nUR                  =(       d    [        R                  " S5      nT R                  UR                  5      u  T l        T l        UR                  =(       d    [        R                  " S5      nUR                  =(       d     [        [        R                  " SS5      5      nUR                  =(       d    [        R                  " S5      nU(       a  UOT R                  5       T l	        [        R                  " U5      u  T l        T l        [        [        R                  " S[$        5      5      T l        [        [        R                  " S[(        5      5      mS T l        [,        R.                  " 5       (       a  [        R                  " S	S
5      T l        [3        T R0                  R5                  S5      5      T l        [        [        R                  " SS5      5      T l        [        R                  " SS
5      T l        [9        SS
5      nUR5                  S5      T l        OUR<                  =(       d    [        R                  " S	S
5      T l        T R0                  R5                  S5      n	[3        U	5      T l        [        [        R                  " SS5      5      T l        T R?                  U	T R"                  T R*                  5      T l        U	 V
s/ s H  o ST R*                   3PM     sn
T l        T R                   ST R*                   3T l         [B        RE                  ST R                   35        [B        RE                  ST R0                   ST R:                   35        [        [        R                  " S[F        RH                  5      5      T l%        T R                  T R                  :X  d   T R                  S:  a:  T R                  S:X  a*  [F        RH                  T l%        [B        RE                  S5        T R                  S:  aD  T R                  T R                  :  a*  [F        RL                  T l%        [B        RE                  S5        U(       dp  [        R                  " S5      (       aU  [        R                  " S5      (       a:  SRO                  [        R                  " S5      [        R                  " S5      5      n[B        RQ                  SU SU 35        / T l)        ST l*        ST l+        ST l,        S T l-        U(       a  SU;  d  U(       a  T R                  (       d0  [B        RE                  SU SU S T R                   35        ST l.        g S!T l.        UT l/        S"U-   T l0        T R`                  S#-   T l1        T R`                  S$-   T l2        T R`                  S%-   T l3        S
Ri                  S& [k        S'5       5       5      nT Rb                   S(U [l        Rl                  " 5        3T l7         T R^                  Rq                  T R`                  S)5        U 4S* jnT R^                  Rs                  T Rb                  U5      nT R^                  Ru                  T5      mUUU 4S+ jn[v        Rx                  " S,US!S-9nUR{                  5         T R^                  Rq                  T Rn                  T R@                  R}                  S.5      TS/9  T R^                  Rq                  T Rf                  T R6                   S0T R0                   3R}                  S.5      5        U 4S1 jnT R^                  R                  T Rf                  U5      nUU/T l@        S T lA        g s  sn
f )2NPADDLE_ELASTIC_SERVERPADDLE_ELASTIC_JOB_IDPOD_IPPADDLE_ELASTIC_SCALEr   PADDLE_ELASTIC_FORCEPADDLE_ELASTIC_TIMEOUTPADDLE_ELASTIC_TTLPADDLE_TRAINERS ,PADDLE_PORT6170DISTRIBUTED_TRAINER_ENDPOINTSPADDLE_TRAINER_ENDPOINTSFLAGS_START_PORT:zstart job with np=z	trainers=z, trainer_endpoints_list=#PADDLE_ELASTIC_FAULT_TOLERANC_LEVELz+start job with ElasticLevel.FAULT_TOLERANCEz#start job with ElasticLevel.ELASTIC PADDLE_ELASTIC_ETCD_SERVICE_HOST PADDLE_ELASTIC_ETCD_SERVICE_PORTz{}:{}zinit with server z host Fz#Elastic is not enabled with server z name z and np Tz/paddle/z/nodesz/npz
/endpointsc              3  N   #    U  H  n[         R                  " S 5      v   M     g7f)abcdefghijklmnopqrstuvwxyzN)randomchoice).0_s     r   	<genexpr>*ElasticManager.__init__.<locals>.<genexpr>   s       
AIAFMM677s   #%   /   0c                  > TR                   R                  TR                  5       Vs/ s H  oS   R                  5       PM     snTl        TR                  (       a  [        [        TR                  5      5      OTR                  Tl        [        R                  STR                   STR                   35        STl
        S Tl        g s  snf )Nr   zhost_call_back curr_host=z, hosts:T)etcd
get_prefixnode_prefixdecodehostslistsetrC   rD   	curr_host	need_syncelastic_startup_time)eventir2   s     r   host_call_back/ElasticManager.__init__.<locals>.host_call_back   s    '+yy';';D<L<L'M'M!!'MDJ 37**c$**o.$**DJKK+DNN+;8DJJ<P "DN(,D%s   Cc                   >   TR                  5         TR                  R                  TR                  5       V s/ s H  n U S   R	                  5       PM     nn U(       a  [        [        U5      5      OUn[        R                  STR                   SU 35        TR                  U;  a`  [        R                  STR                   35        TR                  R                  TR                  TR                  R                  S5      TS9  ["        R$                  " TS	-  5        GM%  s  sn f ! [         a8  n[        R                  SU S[        R                   " 5        35         S nAg S nAff = f)
Nr   z[lease_heartbeat] curr_host=, hosts=z [lease_heartbeat] register host=latin-1leasez![lease_heartbeat] internal error: r   )refreshr   r   r   r   r   r   rC   rD   r   put	host_pathencode	Exceptionr!   	traceback
format_excrE   rF   )r   r   eelastic_ttl
host_leaser2   s      r   lease_heartbeat0ElasticManager.__init__.<locals>.lease_heartbeat   sB   &&( "&!5!5d6F6F!G!GA !!G   16DU,5EKK6t~~6FhugV ~~U2>t~~>NO 		 NN NN11)<", &  

;?+5 " ! LL;A3a	@T@T@V?WX 	s)   7D- D(B4D- (D- -
E/7.E**E/r   )r9   targetdaemonr   r   |c                l  > TR                   (       d  g TR                  R                  TR                  5      S   nUb  UR	                  5       OSnUR                  S5      u  Tl         Tl        [        R                  STR                    S35        [        R                  STR                   S35        g )Nr   rp   r   z"set DISTRIBUTED_TRAINER_ENDPOINTS r   zset PADDLE_TRAINERS )	dist_endpointsr   getendpoints_pathr   splittrainersrC   rD   )r   valueedpsr2   s      r   endpoints_call_back4ElasticManager.__init__.<locals>.endpoints_call_back*  s    &&IIMM$"5"56q9E%*%65<<>BD15C.DKK4T5H5H4IK KK.t}}oQ?@r   )Br0   elastic_serverr8   getenvjob_id	_parse_npnpmin_npmax_nphostscaleintforce	_get_hostr   get_device_proc_infodevice_modedevices_per_procELASTIC_TIMEOUTelastic_timeoutELASTIC_TTL
start_portr   use_paddlecloudr   lenr   r   r   trainer_endpoints_listips_host_to_endpointsr   rC   rD   r   r   elastic_levelr
   formatdebugr   stoppedsigintr   r   enabler   prefixr   np_pathr   joinrH   rE   r   r   add_watch_prefix_callbackr   	threadingThreadstartr   add_watch_callbackwatcheslauncher)r2   r0   etcd_clientserverr9   r   r   r   trainer_endpointsnode_ipsipnode_tagr   
host_watchr   keepalived_threadr   endpoints_watchr   r   s   `                 @@r   r3   ElasticManager.__init__~   s   	$$J		2I(J{{@bii(?@#'>>$''#: T[yy/BIIh/

Gc")),BA"FG

?bii(>? Ddnn&6	
 --d3	
!  #II.@ 
 "))$8+FG&&((II&7<DM$----c23DG!"))M6"BCDO"$)),KR"PD 01KR P*;*A*A#*FD' HHH		2CR(HDM}}**3/H(mDG!")),>"GHDO"&"9"9$//#D 5=+4<b$a()H+D' !II;a'89(	23&?@[@[?\]	
 !II5,,
 ;;$++%$++/dkkQ>N!-!=!=DKKEF;;?t{{T[[8!-!5!5DKK=> 		<==		<==^^		<=		<=F
 	(tf=>
$(!F*$dggKK5fXVD6RVRYRYQZ[  DKDK	 !4';;1{{U*"kkL877 
AFq
 
 !,,-QxjF	 			dkk4(		- YY88n

 YY__[1
	,: &,,"?4
 	!		NNDNN11)<J 	 	

 			""#1T]]O4;;IF	

		A ))66!4
 #O4o+s   ^c           
     D   / nU H  nUR                  S5      n[        U5      S:X  a  US   n[        US   5      nOUnUn[        [	        X[        U5      -   5      5      n	UR                  U	 Vs/ s H	  o SU 3PM     sn5        M     SR                  U5      n
U
$ s  snf )Nrw   r   r   r   rq   )r   r   r   r   rH   extendr   )r2   ip_port_listr   r   endpoint_listip_port	endpointsr   portportsr   s              r   r   !ElasticManager._host_to_endpoints<  s     #Gc*I9~"q\9Q<(!tC0@,A%ABCE  U!CUTD$.U!CD $ -0 "Ds   /B
c                t   [         R                  SU 35        U R                  (       a  U R                  R                  5         U R                  (       d  g U(       a&  U R
                  R                  U R                  S5        U R                   H  nU R
                  R                  U5        M      U R
                  R                  U R                  5        [        U R
                  R                  U R                  5      5      n[        U5      S:X  a&  U R
                  R!                  U R                  5        g g )Nzmanager exist completed    1r   )rC   rD   r   r_   r   r   r   r   r   cancel_watchdeleter   r   r   r   r   delete_prefix)r2   r    rb   r   s       r   r$   ElasticManager.exitO  s    .yk:;==MM {{IIMM$++t,\\EII""5) "		(TYY))$*:*:;<u:?II##DKK0 r   c                H   U R                   R                  (       d  [        R                  S5        g [        R                  S5        [        R                  " [
        R                  R	                  5       5      n[        R                  " U R                   R                  U[        R                  [        R                  SS9R                  5       u  p#U(       a  [        R                  S5        g [        R                  SUR                  S5      R                  5        35        g )Nzskip pre_hookzexecute pre_hook...T)envstdoutstderrshellzpre_hook exec failedzpre_hook exec result: zutf-8)r0   elastic_pre_hookrC   rD   copyr8   environ
subprocessPopenPIPEcommunicatewarningr   strip)r2   current_envouterrs       r   pre_hookElasticManager.pre_hookc  s    yy))KK()*ii

 12##II&&????
 +- 	 NN12KK0G1D1J1J1L0MNOr   c                f   U=(       d    [         R                  " SS5      nUR                  S5      nS=pE[        U5      S:X  a  [	        US   5      nUS::  a  SOUnSnXE4$ [        U5      S:X  a4  [	        US   5      n[	        US   5      nUS::  a  SOUn[        XT5      nXE4$ [        SU S35      e)	z!
np format is "MIN" or "MIN:MAX"
PADDLE_ELASTIC_NP0rw   r   r   r   zthe np=z) needs to be in "MIN" or "MIN:MAX" format)r8   r   r   r   r   max
ValueError)r2   r   np_strnp_dictr   r   s         r   r   ElasticManager._parse_npu  s     :ryy!4c:,,s#w<1_F A+Q6FF ~ \Q_F_F A+Q6F(F ~	 "FG r   c                     [         R                  " [         R                  " [         R                  " 5       5      5      $ !    g= f)Nz	127.0.0.1)socketgethostbynamegetfqdngethostnamerZ   s    r   r   ElasticManager._get_host  s4    	''v7I7I7K(LMM	s	   <? Ac                    U R                   (       d  g[        U R                  R                  U R                  5      S   5      S:H  $ )NTr   r   )r   r   r   r   r   rZ   s    r   
_completedElasticManager._completed  s2    {{499==-a01Q66r   Nc           
        U(       a  Xl         OKU R                  R                  U R                  5       Vs/ s H  o"S   R	                  5       PM     snU l         U R                   (       a  [        [        U R                   5      5      OU R                   U l         U R                  [        R                  :X  a%  [        U R                   5      U R                  :X  a  ggU R                  [        R                  :X  a  [        U R                   5      nX0R                  :X  a  gU R                  (       d  [        R                  " 5       U l        X0R                  :X  a  S U l        gX0R                   :  aw  X0R                  :  ah  [        R                  " 5       U R                  -
  nX@R"                  ::  a6  [$        R'                  SU SU R                    SU SU R"                   35        ggS U l        ggs  snf )Nr   TFzawait for timeout, you can set value by PADDLE_ELASTIC_TIMEOUT,                         hosts_num=z	, min_np=z(,                         interval_time=z, elastic_timeout=)r   r   r   r   r   r   r   r   r   r   r   r   r
   r   rE   r   r   r   rC   rD   )r2   	host_listr   	hosts_numinterval_times        r   _matchElasticManager._match  s   "J (,yy';';D<L<L'M'M!!'MDJ /3jjT#djj/*djj
!=!==4::$'')!5!55DJJIGG#,,,0IIK)KK',0)kk)i++.E $		d.G.G G $8$88KK##,+Yt{{m D''4o5GH\H\G]_
 !,0)Gs   G1c                x    U R                   R                  U R                  U SU 3R                  S5      5        g )Nr   r   )r   r   r   r   )r2   r   r   s      r   _update_endpointElasticManager._update_endpoint  s4    		k5'")))4	
r   c                   [        [        R                  " SS5      5      n[        R	                  SU R
                   SU R                   35        U R
                  U R                  ;   a  U R                  [        R                  S'   U R                  [        R                  S'   [        R                  SU R                   S35        [        R                  S	U R                   S35        g U R                  R                  U R
                  5      nUS
:  a6  U R                  U   U R                  U'   U R
                  U R                  U'   OU [        R                  S'   SR                  U R                   Vs/ s H  o3R                  S5      S
   PM     sn5      nX@R                  l        U[        R                  S'   g s  snf )NPADDLE_TRAINER_IDzself.curr_host=z, self.dist_endpoints=rt   ro   z)update env DISTRIBUTED_TRAINER_ENDPOINTS r   zupdate env PADDLE_TRAINERS r   rq   rw   )r   r8   r   rC   r   r   r   r   r   rD   r   indexr   r   r0   r   )r2   rR   idx	host_portr   s        r   _update_fault_tolerance&ElasticManager._update_fault_tolerance  sc   2990"56dnn--CDDWDWCXY	
 >>T000:>:M:MBJJ67,0MMBJJ()KK;D<O<O;PPQR KK5dmm_AFG jjt~~. 19"jj.DJJsO#~~DJJt14BJJ*+4::N:i//#.q1:NO		(-

$% Os   9Gc           
        [         R                  " U R                  5      n[        R	                  S[        U R                  5       SU R                   SU R                   SU 35        U R                   H  nX!;  d  M
  UR                  U5        M     [        UR                  U R                  5      5      [        R                  S'   SR                  U Vs/ s H  o3R                  S5      S   PM     sn5      nX@R                   l        U[        R                  S	'   [        U5      U l        SR                  U5      [        R                  S
'   U R$                  [        R                  S'   Xl        g s  snf )Nzelastic scale out, from  to r   , host_endpoints=r#  rq   rw   r   ro   ru   rt   )r   deepcopyr   rC   rD   r   r   r   appendstrr%  r   r8   r   r   r   r0   r   r   )r2   host_endpointscurr_host_portr'  r   s        r   _update_elastic_scale_out(ElasticManager._update_elastic_scale_out  s+   t'B'BC&s4::&7tDGG9HTZZLXijxiyz	
 #jjN3%%n5 ) +.  0+


&' 6DEn__S!!$nE
 		(-

$%n%14.1I

-.6:6I6I

23&4# Fs   E5c           
        [         R                  " U R                  5      n[        R	                  SU R
                   S[        U R                  5       SU R                   SU 35        0 n/ n[        U R                  5       H_  u  pEUR                  U5      nU[        U R                  5      S-
  ::  a  UR                  U5      (       d  XRU'   MN  UR                  U5        Ma     Sn/ n[        [        U R                  5      5       HS  nUR                  U5      (       d  [        U5      S:  a  X7   X&'   US-  nUR                  UR                  U5      5        MU     [        R	                  SU 35        Xl        U V	s/ s H  oR                  S5      S   PM     n
n	S	R                  U
5      nU R                  XR                   5      nXR"                  l        ['        UR                  U R(                  5      5      [*        R,                  S
'   U[*        R,                  S'   [        U5      U l        S	R                  U5      [*        R,                  S'   U[*        R,                  S'   U R/                  X5        g s  sn	f )Nzelastic scale in, from r+  r   r,  r   r   z#elastic scale in, sorted_endpoints=rw   rq   r#  ro   ru   rt   )r   r-  r   rC   rD   r   r   r   	enumerater%  r   r.  rH   r   r   r   r   r0   r   r/  r   r8   r   r   )r2   r0  endpoints_dictunsorted_endpointsidr'  r&  
idle_indexsorted_endpointsr   ip_listr   new_endpointss                r   _update_elastic_scale_in'ElasticManager._update_elastic_scale_in  s    t'B'BC%dggYd3tzz?2C8DJJ<Whiwhxy	
 &tzz2MB &&y1Cc$**o)).2D2DS2I2I&/s#")))4 3 
TZZ)C!%%c**s3E/F/J&8&D#a
##N$6$6s$;< * 	9:J9KLM&6#8HI8HW==%a(8HI!//33
 		*-""4>>2+


&' ).

$%&'14:J1K

-.6C

23m3 Js    I:c                   [        U R                  5      S:w  d   S5       eU R                  [        R                  :X  a  U R                  5         g [        U R                  5      U R                  :X  a3  [        R                  SU R                   35        U R                  5         g [        U R                  5      U R                  :  a  U R                  5         g U R                  5         g )Nr   zhosts emptyzelastic startup, hosts=)r   r   r   r   r   r(  r   rC   rD   r2  r=  rZ   s    r   _update_hostsElasticManager._update_hosts,  s    4::!#2]2#!=!==((* 4::$'')5djj\BC,,.TZZ477*..0 --/r   c                   U R                   (       d  g SnU R                  (       d  U R                  5       (       a3  [        R	                  SU R
                   35        U R                  5         g [        R	                  SU R                   SU R
                   35        US-  n[        R                  " S5        U R                  (       d  M  g )Nr   zready with hosts znot ready for np z with hosts r   )
r   r   r  rC   rD   r   r@  r   rE   rF   )r2   r&  s     r   waitElasticManager.wait=  s    {{,,{{}}/

|<=""$KK+DGG9LMN1HCJJqM ,,, 	r   c                    U R                   (       a  g U" U R                  5      U l        U R                  R                  5         g r.   )r   r0   r   r[   )r2   r   s     r   runElasticManager.runL  s,    << +r   c                   U R                   (       a  SU l         U R                  (       Gd  U R                  R                  5       n[        R                  SU 35        Ub  [        R                  SU 35        U[        :X  a?  [        R                  S5        U R                  R                  5         [        R                  $ US:X  a  SOSnU R                  US9  U(       a  [        R                  $ U R                  [        R                  :X  a  [        R                   $ [        R"                  $ U R%                  5       (       dP  U R'                  5       (       a  U R                   (       a*  U R                  R                  5         [        R                  $ [(        R*                  " S5        U R                  (       d  GM  U R                  (       a  U R                  R                  5         [        R,                  $ )	NFzlauncher.watch():zjob exit with code zjob re-launch for auto parallelr   T)r    r   )r   r   r   rb   rC   r   rD   rQ   r_   r   r'   r$   r%   r   r   r   r(   r&   r  r  rE   rF   r)   )r2   rT   r    s      r   rb   ElasticManager.watchS  sV   >>"DN,,,--%%'CLL,SE231#7899KK ABMM&&((--- %(1HD%			I	.(222%%)E)EE(000(...??$$dkkmmt~~""$$)))JJqM3 ,,,6 ==MM !!!r   c                `    U R                   (       a  U R                  5         Xl        SU l        g )NT)r   r$   r   r   )r2   r   frames      r   signal_handlerElasticManager.signal_handlerw  s    ;;IIKr   )r0   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )i  )r   r   r   r   r   r   returnr/  )F)r   r/  r.   )r  zlist | None)r   r   r   r   r3   r   r$   r  r   r   r  r  r   r(  r2  r=  r@  rC  rF  rb   rL  r   r   r   r   re   re   }   s    |~ MQ 48FI	&1(P$27'R
.65./4b0"""Hr   re   )
__future__r   r   r8   r}   r?   r  r   r   rE   r   paddle.distributed.fleetr   r   "paddle.distributed.utils.log_utilsr   
backup_envr   rC   ELASTIC_EXIT_CODErQ   r   r   r   r   r+   re   r   r   r   <module>rT     s    #  	        > 9 *	FI	& "%    
 A" A"H~ ~r   