
    x-j3V                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ ddlmZ  edd          Zd	Zd
ZdZdZ G d d          Z G d d          Z G d d          Z G d d          ZdS )    )annotationsN)cloud_utilslaunch_utils)
get_logger   )getenv_or_backupINFOELASTICe   f   x   <   c                      e Zd ZdZdZdS )ElasticLevel      N)__name__
__module____qualname__FAULT_TOLERANCEr
        h/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/fleet/elastic/manager.pyr   r   ,   s        OGGGr   r   c                  "    e Zd ZdZdZdZdZdZdS )ElasticStatus	completederrorholdrestartexitN)r   r   r   	COMPLETEDERRORHOLDRESTARTEXITr   r   r   r   r   1   s'        IEDGDDDr   r   c                  2    e Zd Zd Zd Zd Zd Zd Zd ZdS )LauncherInterfacec                "    || _         g | _        d S N)argsprocs)selfr*   s     r   __init__zLauncherInterface.__init__:   s    	


r   c                   t           j        dk    r| j        D ]}|j                                        t          j        t          j        |j        j                  t          j	                   |j
        r|j
                                         t                              d|j        j                    t          j        d           | j        D ]{}|j                                        `|j                                         |j
        r|j
                                         t                              d|j        j                    |t#          dd          D ]}d}| j        D ]F}|j                                        +t          j        |j        j        t          j                   d}G|st                              d	            dS t          j        d           dS )
Nntzterminate process group gid:r   zterminate process id:r   2   FTzterminated all the procs)osnamer+   procpollkillpggetpgidpidsignalSIGTERMlog_fncloseloggerinfotimesleep	terminaterangekillSIGKILL)r,   pstepalives       r   _terminate_procsz"LauncherInterface._terminate_procs>   s   7d??Z M M6;;==(Ibj44fnEEEx )(((KK Kqvz K KLLLJqMMM 	B 	BAv{{}}$  """8 %HNN$$$@AFJ@@AAA!RLL 	 	DEZ ! !6;;==(GAFJ777 E 6777ttJqMMMMur   c                L   d}d }| j         D ]}|j                                        }|d} |dk    rk|t          k    rt                              d           |c S t                              d           t                              d|j         d| d           |}|s|dS |S )	NFTr   z+return form elastic auto parallel re-launchzABORT!!! ABORT!!! ABORT!!!zERROR rank z error with exit code z, check log for detail.)r+   r3   r4   ELASTIC_AUTO_PARALLEL_EXIT_CODEr<   r=   r   rank)r,   rF   resultrD   rets        r   _check_procszLauncherInterface._check_procs^   s     	 	A&++--C{999KK MNNNJJJ9:::\!&\\\\\    	1Mr   c                    t           r)   NotImplementedErrorr,   s    r   launchzLauncherInterface.launchs       !!r   c                    t           r)   rO   rQ   s    r   stopzLauncherInterface.stopv   rS   r   c                    t           r)   rO   rQ   s    r   watchzLauncherInterface.watchy   rS   r   N)	r   r   r   r-   rG   rM   rR   rU   rW   r   r   r   r'   r'   9   so            @  *" " "" " "" " " " "r   r'   c                      e Zd Zd Z	 dd d
Zd!dZd Zd"dZd Zd Z	d#d$dZ
d Zd Zd Zd Zd Zd Zd Zd Zd ZdS )%ElasticManagerc                |    | _         |j        pt          j        d          }|j        pt          j        d          }                     |j                  \   _         _        |j	        pt          j        d          }|j
        p!t          t          j        dd                    }|j        pt          j        d          }|r|n                                  _	        t          j        |          \   _         _        t          t          j        dt$                               _        t          t          j        dt(                              d  _        t-          j                    rt          j        d	d
           _        t3           j                            d                     _        t          t          j        dd                     _        t          j        dd
           _        t9          dd
          }|                    d           _        n|j        pt          j        d	d
           _         j                            d          }	t3          |	           _        t          t          j        dd                     _                             |	 j         j                   _         fd|	D              _         j	         d j          _         tB          "                    d j                    tB          "                    d j         d j                    t          t          j        dtF          j$                             _%         j         j        k    s j        dk    r6 j        dk    r+tF          j$         _%        tB          "                    d            j        dk    r; j         j        k    r+tF          j&         _%        tB          "                    d           |sbt          j        d          rNt          j        d          r:d'                    t          j        d          t          j        d                    }tB          (                    d| d|            g  _)        d _*        d _+        d _,        d  _-        |rd|vs	|r j        s1tB          "                    d| d | d! j                    d _.        d S d" _.        | _/        d#|z    _0         j0        d$z    _1         j0        d%z    _2         j0        d&z    _3        d
4                    d' tk          d(          D                       }
 j1         d)|
 tm          j6                      _7        	  j/        8                     j0        d*            fd+} j/        9                     j1        |          } j/        :                               fd,}tw          j<        d-|d".          }|=                                  j/        8                     j7         j         >                    d/          0            j/        8                     j3         j         d1 j         >                    d/                      fd2} j/        ?                     j3        |          }||g _@        d  _A        d S )3NPADDLE_ELASTIC_SERVERPADDLE_ELASTIC_JOB_IDPOD_IPPADDLE_ELASTIC_SCALEr   PADDLE_ELASTIC_FORCEPADDLE_ELASTIC_TIMEOUTPADDLE_ELASTIC_TTLPADDLE_TRAINERS ,PADDLE_PORT6170DISTRIBUTED_TRAINER_ENDPOINTSPADDLE_TRAINER_ENDPOINTSFLAGS_START_PORTc                (    g | ]}| d j          S :)
start_port).0ipr,   s     r   
<listcomp>z+ElasticManager.__init__.<locals>.<listcomp>   s6     + + +.02))))+ + +r   rl   zstart job with np=z	trainers=z, trainer_endpoints_list=#PADDLE_ELASTIC_FAULT_TOLERANC_LEVELz+start job with ElasticLevel.FAULT_TOLERANCEz#start job with ElasticLevel.ELASTIC PADDLE_ELASTIC_ETCD_SERVICE_HOST PADDLE_ELASTIC_ETCD_SERVICE_PORTz{}:{}zinit with server z host Fz#Elastic is not enabled with server z name z and np Tz/paddle/z/nodesz/npz
/endpointsc              3  >   K   | ]}t          j        d           V  dS )abcdefghijklmnopqrstuvwxyzN)randomchoice)rn   _s     r   	<genexpr>z*ElasticManager.__init__.<locals>.<genexpr>   s>       
 
<=FM677
 
 
 
 
 
r      /   0c                <   d j                             j                  D             _        j        r!t	          t          j                            nj        _        t                              dj         dj                    d_	        d _
        d S )Nc                B    g | ]}|d                                           S r   decodern   is     r   rp   zCElasticManager.__init__.<locals>.host_call_back.<locals>.<listcomp>   1       "#!  r   zhost_call_back curr_host=z, hosts:T)etcd
get_prefixnode_prefixhostslistsetr<   r=   	curr_host	need_syncelastic_startup_time)eventr,   s    r   host_call_backz/ElasticManager.__init__.<locals>.host_call_back   s     '+y';';D<L'M'M  DJ 37*Lc$*oo...$*DJKKPDNPPDJPP   "DN(,D%%%r   c                    	 	                                   d j                            j                  D             } | rt	          t          |                     n| } t                              dj         d|             j        | vr\t                              dj                    j        	                    j
        j                            d                     nI# t          $ r<}t                              d| d	t          j                                Y d }~d S d }~ww xY wt!          j        d
z             K)NTc                B    g | ]}|d                                           S r   r   r   s     r   rp   zDElasticManager.__init__.<locals>.lease_heartbeat.<locals>.<listcomp>  s4        !  r   z[lease_heartbeat] curr_host=, hosts=z [lease_heartbeat] register host=latin-1leasez![lease_heartbeat] internal error: r   )refreshr   r   r   r   r   r<   r=   r   put	host_pathencode	Exceptionr   	traceback
format_excr>   r?   )r   eelastic_ttl
host_leaser,   s     r   lease_heartbeatz0ElasticManager.__init__.<locals>.lease_heartbeat   s   ,&&((( !%!5!5d6F!G!G  E 16@DU,,,5EKKVt~VVuVV   ~U22Ot~OO   	 N N11)<<", &   
 !   LLXAXX	@T@V@VXX   EEEEE	
 
;?+++5,s   C'C, ,
D261D--D2r   )r2   targetdaemonr   r   |c                d   j         sd S j                            j                  d         }||                                nd}|                    d          \  _         _        t                              dj          d           t                              dj         d           d S )Nr   rc   r   z"set DISTRIBUTED_TRAINER_ENDPOINTS r   zset PADDLE_TRAINERS )	dist_endpointsr   getendpoints_pathr   splittrainersr<   r=   )r   valueedpsr,   s      r   endpoints_call_backz4ElasticManager.__init__.<locals>.endpoints_call_back*  s    & IMM$"566q9E%*%65<<>>>BD15C.DKKKT5HKKK   KK?t}???@@@@@r   )Br*   elastic_serverr1   getenvjob_id	_parse_npnpmin_npmax_nphostscaleintforce	_get_hostr   get_device_proc_infodevice_modedevices_per_procELASTIC_TIMEOUTelastic_timeoutELASTIC_TTLrm   r   use_paddlecloudr   lenr   r   r   trainer_endpoints_listips_host_to_endpointsr   r<   r=   r   r   elastic_levelr
   formatdebugr   stoppedsigintr   r   enabler   prefixr   np_pathr   joinrA   r>   r   r   add_watch_prefix_callbackr   	threadingThreadstartr   add_watch_callbackwatcheslauncher)r,   r*   etcd_clientserverr2   r   r   r   trainer_endpointsnode_ipsnode_tagr   
host_watchr   keepalived_threadr   endpoints_watchr   r   s   `                @@r   r-   zElasticManager.__init__~   s   	$J	2I(J(J{@bi(?@@#'>>$'#:#: T[y/BIh//
Gc"),BA"F"FGG
?bi(>?? 6DDdnn&6&6	
 -d33	
!  #I.@@ 
  
 ")$8+FFGG&(( 	I&7<<DM$---c2233DG!")M6"B"BCCDO"$),KR"P"PD 01KR P P*;*A*A#*F*FD'' HH	2CR(H(HDM}**3//H(mmDG!"),>"G"GHHDO"&"9"9$/# #D+ + + +4<+ + +D' !I9999222333]]]@[]]	
 	
 	
 !I5, 
 
 ;$+%%$+//dkQ>N>N!-!=DKKEFFF;??t{T[88!-!5DKK=>>> 		<==	 	<==	
 ^^	<==	<== F
 	===t==>>>
$(! 	F**$*dg*KK[f[[D[[RVRY[[    DKFDK	 !4';1{U*"kL877 
 
AFq
 
 
 
 
 !,FFxFFF	 		dk4(((		- 		- 		- 		- 		- Y88n
 

 Y__[11
	, 	, 	, 	, 	, 	, 	,: &,"?4
 
 
 	!!!	NDN11)<<J 	 	
 	
 	

 		"44T]44;;IFF	
 	
 	

		A 		A 		A 		A 		A )66!4
 
 #O4r     ip_port_listr   r   rm   r   returnstrc           
     p  
 g }|D ]}|                     d          }t          |          dk    r|d         
t          |d                   }n|
|}t          t	          ||t          |          z                       }|                    
fd|D                        d                    |          }	|	S )Nrl   r   r   r   c                    g | ]	} d | 
S rk   r   )rn   portro   s     r   rp   z5ElasticManager._host_to_endpoints.<locals>.<listcomp>J  s#    !C!C!CTR..$..!C!C!Cr   rd   )r   r   r   r   rA   extendr   )r,   r   r   rm   endpoint_listip_port	endpointsr   portsr   ro   s             @r   r   z!ElasticManager._host_to_endpoints<  s     # 
	E 
	EGc**I9~~""q\9Q<((!tTC0@,A,A%ABBCCE  !C!C!C!CU!C!C!CDDDD-00r   Fc                   t                               d|            | j        r| j                                         | j        sd S |r | j                            | j        d           | j        D ]}| j        	                    |           | j        
                    | j                   t          | j                            | j                            }t          |          dk    r!| j                            | j                   d S d S )Nzmanager exist completed    1r   )r<   r=   r   rU   r   r   r   r   r   cancel_watchdeleter   r   r   r   r   delete_prefix)r,   r   rW   r   s       r   r    zElasticManager.exitO  s    :y::;;;= 	!M   { 	F 	-IMM$+t,,,\ 	* 	*EI""5))))	(((TY))$*:;;<<u::??I##DK00000 ?r   c                B   | j         j        st                              d           d S t                              d           t	          j        t
          j                                                  }t          j        | j         j        |t          j	        t          j	        d          
                                \  }}|rt                              d           d S t                              d|                    d                                                      d S )Nzskip pre_hookzexecute pre_hook...T)envstdoutstderrshellzpre_hook exec failedzpre_hook exec result: zutf-8)r*   elastic_pre_hookr<   r=   copyr1   environ
subprocessPopenPIPEcommunicatewarningr   strip)r,   current_envouterrs       r   pre_hookzElasticManager.pre_hookc  s    y) 	KK(((F)***i
 1 122#I&??
 
 
 +-- 	S  	PNN122222KKNG1D1D1J1J1L1LNNOOOOOr   r   c                   |pt          j        dd          }|                    d          }dx}}t          |          dk    r"t	          |d                   }|dk    rdn|}d}nkt          |          dk    rEt	          |d                   }t	          |d                   }|dk    rdn|}t          ||          }nt          d| d          ||fS )	z1
        np format is "MIN" or "MIN:MAX"
        PADDLE_ELASTIC_NP0rl   r   r   r   zthe np=z) needs to be in "MIN" or "MIN:MAX" format)r1   r   r   r   r   max
ValueError)r,   r   np_strnp_dictr   r   s         r   r   zElasticManager._parse_npu  s     :ry!4c::,,s##w<<1__F A++QQ6FFF\\Q__F__F A++QQ6F((FFG"GGG   v~r   c                    	 t          j        t          j        t          j                                        S #  Y dS xY w)Nz	127.0.0.1)socketgethostbynamegetfqdngethostnamerQ   s    r   r   zElasticManager._get_host  s;    	'v7I7K7K(L(LMMM	;;s   69 >c                    | j         sdS t          | j                            | j                  d                   dk    S )NTr   r   )r   r   r   r   r   rQ   s    r   
_completedzElasticManager._completed  s9    { 	449==--a011Q66r   N	host_listlist | Nonec           
        |r|| _         n.d | j                            | j                  D             | _         | j         r!t	          t          | j                             n| j         | _         | j        t          j        k    r!t          | j                   | j
        k    rdS dS | j        t          j        k    rt          | j                   }|| j
        k    rdS | j        st          j                    | _        || j        k    r	d | _        dS || j        k    re|| j        k     rZt          j                    | j        z
  }|| j        k    r2t"                              d| d| j         d| d| j                    dS dS d | _        dS dS )Nc                B    g | ]}|d                                           S r   r   r   s     r   rp   z)ElasticManager._match.<locals>.<listcomp>  r   r   TFzawait for timeout, you can set value by PADDLE_ELASTIC_TIMEOUT,                         hosts_num=z	, min_np=z(,                         interval_time=z, elastic_timeout=)r   r   r   r   r   r   r   r   r   r   r   r
   r   r>   r   r   r   r<   r=   )r,   r  	hosts_numinterval_times       r   _matchzElasticManager._match  s    	"DJJ '+y';';D<L'M'M  DJ /3jHT#dj//***dj
!===4:$'))tu!555DJIDG##t, 8,0IKK)DK'',0)tdk))i$+.E.E $	d.G G D$888KK_#,_ _7;{_ _'4_ _HLH\_ _  
 !5t,0)uur   c                v    | j                             | j        | d|                     d                     d S )Nr   r   )r   r   r   r   )r,   r   r   s      r   _update_endpointzElasticManager._update_endpoint  sI    	""5"")))44	
 	
 	
 	
 	
r   c                   t          t          j        dd                    }t                              d| j         d| j                    | j        | j        v rp| j        t          j        d<   | j        t          j        d<   t          	                    d| j         d           t          	                    d	| j         d           d S | j
                            | j                  }|d
k    r%| j
        |         | j
        |<   | j        | j
        |<   n| t          j        d<   d                    d | j
        D                       }|| j        _        |t          j        d<   d S )NPADDLE_TRAINER_IDzself.curr_host=z, self.dist_endpoints=rg   rb   z)update env DISTRIBUTED_TRAINER_ENDPOINTS r   zupdate env PADDLE_TRAINERS r   rd   c                D    g | ]}|                     d           d         S rl   r   r   rn   	host_ports     r   rp   z:ElasticManager._update_fault_tolerance.<locals>.<listcomp>  s)    NNNi)//#..q1NNNr   )r   r1   r   r<   r   r   r   r   r   r=   r   indexr   r*   r   )r,   rJ   idxr   s       r   _update_fault_tolerancez&ElasticManager._update_fault_tolerance  s]   290"5566YdnYYDDWYY	
 	
 	
 >T000:>:MBJ67,0MBJ()KKRD<ORRR   KKFdmFFFGGGF jt~.. 199"j.DJsO#~DJt14hBJ*+NN4:NNNOO	(-
$%%%r   c           
        t          j        | j                  }t                              dt          | j                   d| j         d| j         d|            | j        D ]}||vr|                    |           t          |
                    | j                            t          j        d<   d                    d |D                       }|| j        _        |t          j        d<   t          |          | _        d                    |          t          j        d	<   | j        t          j        d
<   || _        d S )Nzelastic scale out, from  to r   , host_endpoints=r  rd   c                D    g | ]}|                     d           d         S r  r  r  s     r   rp   z<ElasticManager._update_elastic_scale_out.<locals>.<listcomp>  s)    EEEY__S!!!$EEEr   rb   rh   rg   )r   deepcopyr   r<   r=   r   r   r   appendr   r  r   r1   r   r   r*   r   r   )r,   host_endpointscurr_host_portr   s       r   _update_elastic_scale_outz(ElasticManager._update_elastic_scale_out  s;   t'BCCzs4:zzDGzzTZzzjxzz	
 	
 	
 #j 	6 	6N^33%%n555*-  00+
 +

&' EEnEEE
 
 	(-
$%n%%14.1I1I
-.6:6I
23&4###r   c           
        t          j        | j                  }t                              d| j         dt          | j                   d| j         d|            i }g }t          | j                  D ]e\  }}|	                    |          }|t          | j                  dz
  k    r|
                    |          s|||<   P|                    |           fd}g }t          t          | j                            D ]b}|
                    |          s#t          |          dk    r||         ||<   |dz  }|                    |
                    |                     ct                              d|            || _        d |D             }	d	                    |	          }
|                     || j                  }|
| j        _        t%          |	                    | j                            t(          j        d
<   |
t(          j        d<   t          |          | _        d	                    |          t(          j        d<   |t(          j        d<   |                     ||
           d S )Nzelastic scale in, from r#  r   r$  r   r   z#elastic scale in, sorted_endpoints=c                D    g | ]}|                     d           d         S r  r  )rn   r   s     r   rp   z;ElasticManager._update_elastic_scale_in.<locals>.<listcomp>  s)    IIIW7==%%a(IIIr   rd   r  rb   rh   rg   )r   r&  r   r<   r=   r   r   r   	enumerater  r   r'  rA   r   r   r   r*   r   r   r   r1   r   r  )r,   r(  endpoints_dictunsorted_endpointsidr  r   
idle_indexsorted_endpointsip_listr   new_endpointss               r   _update_elastic_scale_inz'ElasticManager._update_elastic_scale_in  ss   t'BCCydgyy3tz??yyDJyyiwyy	
 	
 	
 &tz22 	5 	5MB	 &&y11Cc$*oo))).2D2DS2I2I)&/s##")))4444
TZ)) 	= 	=C!%%c**  s3E/F/F/J/J&8&Ds#a
##N$6$6s$;$;<<<<L:JLLMMM&6#II8HIII!!//d3
 
 	*-""4>22+
 +

&' ).
$%&''14:J1K1K
-.6C
23mU33333r   c                   t          | j                  dk    s
J d            | j        t          j        k    r|                                  d S t          | j                  | j        k    r8t                              d| j                    |                                  d S t          | j                  | j        k    r| 	                                 d S | 
                                 d S )Nr   zhosts emptyzelastic startup, hosts=)r   r   r   r   r   r!  r   r<   r=   r*  r5  rQ   s    r   _update_hostszElasticManager._update_hosts,  s    4:!###]###!===((***** 4:$'))BdjBBCCC,,.....TZ47**..00000 --/////r   c                V   | j         sd S d}| j        s|                                 r8t                              d| j                    |                                  d S t                              d| j         d| j                    |dz  }t          j	        d           | j        d S )Nr   zready with hosts znot ready for np z with hosts r   )
r   r   r  r<   r=   r   r7  r   r>   r?   )r,   r   s     r   waitzElasticManager.wait=  s    { 	F, 	{{}} <
<<===""$$$KKMDGMMMMNNN1HCJqMMM , 	 	r   c                t    | j         rd S  || j                  | _        | j                                         d S r)   )r   r*   r   rR   )r,   r   s     r   runzElasticManager.runL  s>    < 	F ++r   c                Z   | j         rd| _         | j        sj| j                                        }t                              d|            |t                              d|            |t          k    r?t                              d           | j                                         t          j
        S |dk    rdnd}|                     |           |rt          j        S | j        t          j        k    rt          j        S t          j        S |                                 s@|                                 r| j         r%| j                                         t          j
        S t)          j        d           | j        j| j        r| j                                         t          j        S )	NFzlauncher.watch():zjob exit with code zjob re-launch for auto parallelr   T)r   r   )r   r   r   rW   r<   r   r=   rI   rU   r   r#   r    r!   r   r   r   r$   r"   r  r  r>   r?   r%   )r,   rL   r   s      r   rW   zElasticManager.watchS  s   > 	#"DN, 	-%%''CLL2S223337#77888999KK ABBBM&&((((-- %(1HHDD%			I	... 3(22%)EEE(00(..??$$ *dkkmm *t~ *""$$$$))JqMMM3 , 	6 = 	!M   !!r   c                X    | j         r|                                  || _        d| _        d S )NT)r   r    r   r   )r,   r   frames      r   signal_handlerzElasticManager.signal_handlerw  s+    ; 	IIKKKr   )r   )r   r   r   r   rm   r   r   r   )F)r   r   r)   )r  r  )r   r   r   r-   r   r    r   r   r   r  r  r  r!  r*  r5  r7  r9  r;  rW   r?  r   r   r   rY   rY   }   sB       | | |~ MQ    &1 1 1 1(P P P$   2  7 7 7' ' ' ' 'R
 
 
. . .65 5 5./4 /4 /4b0 0 0"    "" "" ""H    r   rY   )
__future__r   r   r1   rv   r8   r  r   r   r>   r   paddle.distributed.fleetr   r   "paddle.distributed.utils.log_utilsr   
backup_envr   r<   ELASTIC_EXIT_CODErI   r   r   r   r   r'   rY   r   r   r   <module>rE     s   # " " " " "  				                 > > > > > > > > 9 9 9 9 9 9 * * * * * *	FI	&	& "%          
       A" A" A" A" A" A" A" A"H~ ~ ~ ~ ~ ~ ~ ~ ~ ~r   