
    Αi                     &   S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKJr  S SKJs  Js  Jr  S SKJr  S SKJr  \R.                  " S5      rS\l         " S S5      r " S	 S
5      r " S S5      r " S S5      r " S S5      r " S S5      rS0S jr S r!S r"S r#S r$S r%S r&S1S jr' " S S5      r(Sq)S r* S2S jr+S r,S  r-S! r.S" r/S# r0S$ r1S% r2S3S& jr3S' r4S( r5S) r6S* r7 " S+ S,5      r8S- r9S. r:S/ r;g)4    N)closing)	framework)	strtoboolrootFc                   $    \ rS rSrSrSrSrSrSrg)DistributeMode&   zT
There are various mode for fleetrun, each of them is designed for different model.
r          N)	__name__
__module____qualname____firstlineno____doc__
COLLECTIVEPSPS_HETER__static_attributes__r       e/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/fleet/launch_utils.pyr   r   &   s     J	
BHr   r   c                   ,    \ rS rSrSrSrSrSrSrSr	Sr
g)	
DeviceMode0   z
Training devices type
r   r
   r   r   N)r   r   r   r   r   UNKNOWNCPUGPUKUNLUNXPUr   r   r   r   r   r   0   s"     G
C
CF
Cr   r   c                   V    \ rS rSrS rS rS rS rS rS r	S r
S	 rS
 rS rS rSrg)Cluster<   c                 <    S U l         / U l        S U l        S U l        g N)
job_serverpodshdfsjob_stage_flag)selfr(   s     r   __init__Cluster.__init__=   s    		"r   c           	          SU R                    SU R                   Vs/ s H  n[        U5      PM     sn SU R                   SU R                   3$ s  snf )Nzjob_server:z pods:z job_stage_flag:z hdfs:)r&   r'   strr)   r(   r*   pods     r   __str__Cluster.__str__C   si    T__-V4S#SX4S3TTdeiexexdyy  AE  AJ  AJ  @K  L  	L4Ss   A
c                     [        U R                  5      [        UR                  5      :w  a  g[        U R                  UR                  5       H  u  p#X#:w  d  M    g   U R                  UR                  :w  a  ggNFT)lenr'   zipr)   )r*   clusterabs       r   __eq__Cluster.__eq__F   s]    tyy>S..		7<<0DAv 1 '"8"88r   c                 .    U R                  U5      (       + $ r%   )r:   r*   r7   s     r   __ne__Cluster.__ne__S   s    ;;w'''r   c                 N    [         R                   " UR                  5      U l        g r%   )copyr'   r=   s     r   update_podsCluster.update_podsV   s    IIgll+	r   c                 4    [        U R                  5       5      $ r%   )r5   trainers_endpointsr*   s    r   trainers_nranksCluster.trainers_nranksY   s    4**,--r   c                 ,    [        U R                  5      $ r%   )r5   r'   rF   s    r   pods_nranksCluster.pods_nranks\   s    499~r   c                     / nU R                    H1  nUR                   H  nUR                  UR                  5        M      M3     U$ r%   )r'   trainersappendendpoint)r*   rr0   ts       r   rE   Cluster.trainers_endpoints_   s:    99C\\$ "  r   c                     / nU R                    HL  nUR                   H9  nUR                   Vs/ s H  n[        U5      PM     nnUR	                  U5        M;     MN     U$ s  snf r%   )r'   rM   acceleratorsr.   rN   )r*   rP   r0   rQ   accstr_acceleratorss         r   world_device_idsCluster.world_device_idsf   sZ    99C\\89#GCH #G)* "   $Hs   A!c                     / nU R                    HS  nUR                   SUR                   3nUR                  b  UR                  c
   U S35       eUR                  U5        MU     U$ )N:z not a valid endpoint)r'   addrportrN   )r*   rP   r0   eps       r   pods_endpointsCluster.pods_endpointsn   sl    99CHH:Qsxxj)B88'CHH,@ $+,@ HHRL  r   c                 v    U R                    H)  n[        U5      [        UR                  5      :X  d  M'  Us  $    g r%   )r'   r.   id)r*   pod_idr0   s      r   get_pod_by_idCluster.get_pod_by_idx   s.    99C6{c#&&k)
  r   )r(   r&   r)   r'   N)r   r   r   r   r+   r1   r:   r>   rB   rG   rJ   rE   rW   r^   rc   r   r   r   r   r"   r"   <   s:    #L(,.r   r"   c                   ,    \ rS rSrS rS rS rS rSrg)	JobServer   c                     S U l         g r%   rO   rF   s    r   r+   JobServer.__init__   s	    r   c                     U R                    $ r%   ri   rF   s    r   r1   JobServer.__str__   s    --!r   c                 4    U R                   UR                   :H  $ r%   ri   r*   js     r   r:   JobServer.__eq__   s    }}

**r   c                     X:X  + $ r%   r   rn   s     r   r>   JobServer.__ne__   
    }r   ri   N)	r   r   r   r   r+   r1   r:   r>   r   r   r   r   rf   rf      s    "+r   rf   c                   2    \ rS rSrS rS rS rS rS rSr	g)	Trainer   c                 <    / U l         S U l        S U l        S U l        g r%   rT   rO   rankstagerF   s    r   r+   Trainer.__init__   s    	
r   c                 T    SU R                    SU R                   SU R                   3$ )Nzaccelerator:z
 endpoint:z rank:)rT   rO   ry   rF   s    r   r1   Trainer.__str__   s-    d//0
4==/PTPYPY{[[r   c                 (   [        U R                  5      [        UR                  5      :w  a  gU R                  UR                  :w  d  U R                  UR                  :w  a  g[	        U R                  UR                  5       H  u  p#X#:w  d  M    g   gr4   )r5   rT   rO   ry   r6   )r*   rQ   r8   r9   s       r   r:   Trainer.__eq__   sm    t  !S%88==AJJ&$))qvv*=))1>>:DAv ; r   c                     X:X  + $ r%   r   )r*   rQ   s     r   r>   Trainer.__ne__   rs   r   c                     U R                   $ r%   ry   rF   s    r   ry   Trainer.rank       yyr   rx   N)
r   r   r   r   r+   r1   r:   r>   ry   r   r   r   r   ru   ru      s    \r   ru   c                   >    \ rS rSrS rS rS rS rS rS r	S r
S	rg
)Pod   c                     S U l         S U l        S U l        S U l        / U l        / U l        / U l        / U l        / U l        / U l	        S U l
        g r%   )ry   ra   r[   r\   rM   serversworkerscoordinatorsheter_workersrT   device_moderF   s    r   r+   Pod.__init__   sS    			r   c                 @   SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   Vs/ s H  n[        U5      PM     sn SU R                   Vs/ s H  n[        U5      PM     sn SU R                   Vs/ s H  n[        U5      PM     sn S	U R                   Vs/ s H  n[        U5      PM     sn S
U R                   Vs/ s H  n[        U5      PM     sn 3$ s  snf s  snf s  snf s  snf s  snf )Nzrank:z id:z addr:z port:z visible_accelerator:z
 trainers:z	 servers:z             workers:z heter_workers:z coordinators:)ry   ra   r[   r\   rT   rM   r.   r   r   r   r   )r*   rQ   swhcs         r   r1   Pod.__str__   s   tyykdggYfTYYKvdii[Pefjfwfwex  yC  UY  Ub  Ub  Dc  Ub  PQ  EH  IJ  EK  Ub  Dc  Cd  dm  C  K  K  nL  K  z{  or  st  ou  K  nL  mM M&*ll3lc!fl34OUYUgUgDhUgPQSVUgDhCiiw  JN  J[  J[  y\  J[  EFy|}~y  J[  y\  x]^ 	^  Dc  nL3Dh  y\s   D8DD"D&-D*c                    U R                   UR                   :w  dN  U R                  UR                  :w  d4  U R                  UR                  :w  d  U R                  UR                  :w  a  [        R                  SU  SU 35        g[        U R                  5      [        UR                  5      :w  a0  [        R                  SU R                   SUR                   35        g[        [        U R                  5      5       HZ  nU R                  U   UR                  U   :w  d  M%  [        R                  SU R                  U    SUR                  U    35          g   [        U R                  5      [        UR                  5      :w  a0  [        R                  SU R                   SUR                   35        g[        [        U R                  5      5       HZ  nU R                  U   UR                  U   :w  d  M%  [        R                  SU R                  U    SUR                  U    35          g   [        U R                  5      [        UR                  5      :w  a0  [        R                  SU R                   SUR                   35        g[        [        U R                  5      5       HZ  nU R                  U   UR                  U   :w  d  M%  [        R                  SU R                  U    SUR                  U    35          g   g)	Nzpod z != Fz	trainers ztrainer zservers zworkers T)ry   ra   r[   r\   loggerdebugr5   rM   ranger   r   )r*   r0   is      r   r:   
Pod.__eq__   s=   II!ww#&& yyCHH$yyCHH$LL4vT#/0t}}S\\!22LL9T]]O4~FGs4==)*A}}Q3<<?2xa(8'9cll1o=NOP +
 t||CKK 00LL8DLL>ckk]CDs4<<()A||A#++a.0xQ'8S[[^<LMN *
 t||CKK 00LL8DLL>ckk]CDs4<<()A||A#++a.0xQ'8S[[^<LMN *
 r   c                     X:X  + $ r%   r   r/   s     r   r>   
Pod.__ne__   s
    r   c                     g r%   r   )r*   res_podss     r   parse_responsePod.parse_response   s    r   c                     U R                   $ r%   r   rF   s    r   ry   Pod.rank   r   r   c                 j    SnU R                    H
  nX S3-  nM     US:w  d   SU  S35       eUS S nU$ )N ,z	this pod z can't see any acceleratorsr   )rT   )r*   rP   gs      r   get_visible_acceleratorsPod.get_visible_accelerators   sQ    ""A3aLA # BwE)D6)DEEwcrFr   )rT   r[   r   r   r   ra   r\   ry   r   rM   r   N)r   r   r   r   r+   r1   r:   r>   r   ry   r   r   r   r   r   r   r      s'     ^%Nr   r   c                     [         R                  " U5      nUR                  U 5        [         R                  " 5       n[         R                  " S5      nUR                  U5        UR                  U5        U$ )Nz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)logging	getLoggersetLevelStreamHandler	FormattersetFormatter
addHandler)	log_levelnamer   log_handler
log_formats        r   
get_loggerr      sb    t$F
OOI'')K""HJ Z(
k"Mr   c                 @   [        U5      [        L d   S5       e[        S S9nSn[        U 5       GH  u  px[	        5       n	Xyl        Xl        X9l        X'   n
[        U
5      [        U5      :  d   S5       e[        [        U5      5       GHQ  n[        5       nU[        R                  :X  a  [        XK   [        [        45      (       a;  UR                  R!                  XK   5        U	R                  R!                  XK   5        OUR                  R#                  XK   5        U	R                  R#                  XK   5        OlU[        R$                  :X  aX  [        XK   [        [        45      (       a  UR                  R!                  XK   5        OUR                  R#                  XK   5        X    Ul        Xll        US-  nU	R(                  R#                  U5        GMT     UR*                  R#                  U	5        GM     U R-                  U5      nXUR*                  U   4$ )Ntrainer_endpoints must be listr(   r   zNcurrent trainer_endpoints size should be greater equal than accelerators size.r
   )typelistr"   	enumerater   ry   r[   r   r5   r   ru   r   r   
isinstancetuplerT   extendrN   r    rO   rM   r'   index)node_ipsnode_iptrainer_endpointsr   devices_per_procr7   trainer_rank	node_rankipr0   cur_node_endpointsr   trainerpod_ranks                 r   get_clusterr     s    !"d*L,LL*4 GL"8,	e%.9%&#.>*?? 	
\	
? s+,-AiGjnn,.1D%=AA((//0@0CD$$++,<,?@((//0@0CD$$++,<,?@
..1D%=AA((//0@0CD((//0@0CD"4"7!8G'LALLL(% .& 	C = -@ ~~g&HLL***r   c                    [         R                  S:w  a  U  H  nUR                  R                  5       b  M   [         R                  " [         R
                  " UR                  R                  5      [        R                  5        UR                  (       a  UR                  R                  5         [        R                  SUR                  R                   35        M     [        R                  " S5        U  H  nUR                  R                  5       b  M   UR                  R                  5         UR                  (       a  UR                  R                  5         [        R!                  SUR                  R                   35        M     [        R                  " S5        [#        SS5       H  nSnU  H[  nUR                  R                  5       b  M   [         R$                  " UR                  R                  [        R&                  5        S	nM]     U(       d  [        R                  S
5          g [        R                  " S5        M     [        R)                  S5        [*        R,                  " S5        g )Nntzterminate process group gid:r
   zterminate process id:   r   2   FTzterminate all the procszcan't kill all process and exit)osr   procpollkillpggetpgidpidsignalSIGTERMlog_fncloser   infotimesleep	terminater   r   killSIGKILLfatalsysexit)procspstepalives       r   terminate_local_procsr   1  sx   	ww$Avv{{}$		"**QVVZZ0&..A88HHNN$:166::,GH  	

166;;= FFxx LL0=>  	JJqMaAvv{{}$

FNN3 
 KK12

1  LL23HHQKr   c                  n     [         R                  " 5       n [         R                  " U 5      nX4$ !    g = fr%   )socketgethostnamegethostbyname)	host_namehost_ips     r   get_host_name_ipr   W  s8    &&(	&&y1!!s   -0 4c                 b    U[         :X  a  [        OUnUR                  " SU -   4UUUS-   S.UD6  g)aD  Add argparse's argument.

Examples:
    .. code-block:: python

        >>> import argparse
        >>> from paddle.distributed.fleet.launch_utils import add_arguments
        >>> parser = argparse.ArgumentParser()
        >>> add_arguments("name", str, "Jonh", "User name.", parser)
        >>> args = parser.parse_args()

z--z Default: %(default)s.)defaultr   helpN)boolr   add_argument)argnamer   r   r   	argparserkwargss         r   add_argumentsr   `  sD     9$Dw,,	
 r   c                     S n[        5       nSn U" 5       nXB;  a  UR                  U5        [        U5      U :  a  U$ US-  nUS:  a  [        S5        g MG  )Nc            
         [        [        R                  " [        R                  [        R                  5      5       n U R	                  [        R
                  [        R                  [        R                  " SSS5      5        U R                  S5        U R                  5       S   sS S S 5        $ ! , (       d  f       g = f)Niir
   r   )r   r   )r   r   AF_INETSOCK_STREAM
setsockopt
SOL_SOCKET	SO_LINGERstructpackbindgetsockname)r   s    r   __free_port$find_free_ports.<locals>.__free_portx  sw    V]]6>>63E3EFG1 LL!!6#3#3V[[q!5L FF7O==?1% HGGs   A)B00
B>r   r
   i  z@can't find available port and use the specified static port now!)setaddr5   print)numr  port_setr   r\   s        r   find_free_portsr  w  sg    & uHD
}LLx=CO	#:R  r   c                     [         R                  R                  S5      c  [        U 5      nUb  [	        U5      nU$ [        [         R                  R                  S5      5      n[        X1-   X1-   U -   S5      nU$ )NFLAGS_START_PORTr
   )r   environgetr  r   intr   )r	  offsetports
start_ports       r   	get_portsr    sn    	zz~~()1$KE L (:;<
j):+>+DaHLr   c           	         SnSnSnU R                  5        H  u  pV[        U[        U5      5      nM     SSR                  USU-  U5      -   nSSU SU S	3-   nX4-   U-   n	S
SR	                  S/U	-  5      -   S-   n
S
SR	                  S/U	-  5      -   S-   nSnXS-   -  nU(       a  XR                  US   US   5      -  nOXR                  SS5      -  nXS-   -  nU R                  5        HU  u  pV[        U[        5      (       a  [        U5      U:  a	  SUSS  -   nOUnXR                  USU-  [        U5      5      -  nMW     X-  nSU S3nU$ )Nr   (   -   z    z|{{:>{}s}}{}{{:^{}s}}|
 z|{:>zs}{}{:^zs}|
z    +r   =+-
r   r
   zfleetrun Distributed EnvsValuez... i)itemsmaxr5   formatjoinr   r.   )envsheaderspacingmax_kmax_vkvh_formatl_formatlengthborderlinedrawsstr_v_strs                  r   pretty_print_envsr1    s   GEE

E3q6"  299sW}e H %wk%??H]W$Frwwuv~..4FRWWcUV^,,s2DE	d]EF1I66!<gFF	D[E

a#a&E/QstW$EEC'M3u:>>  
OEwb>DKr   c                       \ rS rSrS rSrg)TrainerProci  c                 X    S U l         S U l        S U l        S U l        S U l        S U l        g r%   )r   r   
log_offsetry   
local_rankcmdrF   s    r   r+   TrainerProc.__init__  s,    		r   )r7  r6  r   r5  r   ry   N)r   r   r   r   r+   r   r   r   r   r3  r3    s    r   r3  c                      [        U 5      S::  d   S[        U 5       S35       e[        U 5      S:X  a  [        U S   [        5      (       d   eU S   q[        $ )Nr
   z
len(args) z should <= 1r   )r5   r   r   _run_with_coverage)argss    r   run_with_coverager<    sV    t9>?ZD	{,??>
4yA~$q'4((((!!Wr   c                    Uc3  [         R                   " [        R                  R                  5       5      nO[         R                   " U5      nUR                  SS 5        UR                  SS 5        U R	                  5       nU Vs/ s H  nSR                  U5      PM     n	n/ n
[        UR                  5       GH  u  p[        UR                  5      [        UR                  5      [        U R                  5       5      SR                  U R                  5       5      [        U5      SR                  UR                   Vs/ s H  n[        U5      PM     sn5      SR                  U	5      S.nUR                  SS 5      b  US   US'   UR                  SS 5      b  US   US'   UR                  SS 5      b  US   US'   [        UR                  5      S	:  ae  UR                   ["        R$                  :X  aG  S
R'                  SR                  UR                   Vs/ s H  n[        U5      PM     sn5      5      US'   [        UR                  5      S	:  aG  S
R'                  SR                  UR                   Vs/ s H  n[        U5      PM     sn5      5      US'   [(        R*                  R-                  5       (       a`  [        UR                  5      S	:  aG  S
R'                  SR                  UR                   Vs/ s H  n[        U5      PM     sn5      5      US'   UR/                  U5        / n[1        5       (       d$  [        R                  R                  SS5      S:X  a  / SQn[2        R4                  S/UQUPUQn[6        R9                  SU SU 35        US	:X  a^  [6        R;                  SR'                  [        UR                  5      [=        US5      5      5        [6        R;                  SU SU S35        S n[        R>                  S:X  a  S O[        R@                  nUGb  [        RB                  " USS9  [        RD                  RG                  U S35      (       a  [        RH                  " U S35        [K        U S3S5       nURM                  S5        URM                  S R                  U R                  5       5      5        S S S 5        UR                  S5      b5  UR                  S!5      RO                  5       S":X  a  [K        U S#U 3S$5      nO[K        U S%U 3S$5      n[P        RR                  " UUUUUS&9nO[P        RR                  " UUUS'9n[U        5       nUUl+        UR                  Ul	        UUl,        UUl-        U(       a  UR]                  5       OS Ul/        UUl0        U
Rc                  U5        GM     U
$ s  snf s  snf s  snf s  snf s  snf ! , (       d  f       GN= f)(N
http_proxyhttps_proxyrZ   r   )PADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSPADDLE_RANK_IN_NODEPADDLE_LOCAL_DEVICE_IDSPADDLE_WORLD_DEVICE_IDSPADDLE_CLUSTER_TOPO_PATHPADDLE_RANK_MAPPING_PATHPADDLE_ENABLE_AUTO_MAPPINGr   z{}FLAGS_selected_gpusFLAGS_selected_acceleratorsFLAGS_selected_xpusWITH_COVERAGEOFFON)z-mcoveragerunz--branchz-p-uzstart trainer procz  env:zYLocal start {} processes. First process distributed environment info (Only For Debug): {}zDistributed Envsr  z7Details about PADDLE_TRAINER_ENDPOINTS can be found in z8/endpoints.log, and detail running logs may be found in z/workerlog.0r   Texist_okz/endpoints.logr   zPADDLE_TRAINER_ENDPOINTS: 
r  PADDLE_NEED_RANK_MAPPINGtruez/prelaunchlog.r8   /workerlog.)envstdoutstderr
preexec_fn)rY  r\  )2rA   r   r  poprW   r!  r   rM   r.   ry   rO   rG   rE   rT   r  r5   r   r   r   r   r   coreis_compiled_with_xpuupdater<  r   
executabler   r   r   r1  r   setsidmakedirspathexistsremoveopenwritelower
subprocessPopenr3  r   r6  r   tellr5  r7  rN   )r7   r0   training_scripttraining_script_argslog_dirr"  current_envidseleresr   idxrQ   rU   proc_envr   coverage_argsr7  fnpre_fnfr   tps                          r   start_local_trainersr{    s    |ii

 12iio OOL$'OOM4(

"
"
$C$'
(CS388C=CC
(ECLL)!$QVV'*1::#&w'>'>'@#A(+1K1K1M(N#&s8'*xx%&^^4^cS^4( (+xx}

 ??5t<H3>*4H/0 ??5t<H3>*4H/0 ??7>J5@,6H12 q~~"s*..'H.2kk!..9.Q#a&.9:/H*+ q~~"6:kk!..9.Q#a&.9:7H23 >>..00S5H15L.2kk!..9.Q#a&.9:/H*+ 	8$zz~~ou5=GMNN
 
 	

 "
 	)#f[MBC!8KK88>%%h0MN9 KKI)S)<)
 DbiiKK$/ww~~	899		WI^45	0#6!67		'"<"<">?@ 7  <=IOO$>?EEG WI^C593?WI[6<##RvD ##C[VLD]&&	%'	TRQ *T LY ) 5* :
 :
 :P 76s+   W WW:W=W!A W&&
W5	c                    U R                   (       a  [        U R                   R                  S5       nUR                  U R                  S5        U H#  n [
        R                  R                  U5        M%     UR                  5       U l        S S S 5        g g ! [         a;    [
        R                  R                  SU R                   R                   S35         M  f = f! , (       d  f       g = f)NrP   r   zOUnicodeEncodeError occurs at this line. Please refer to the original log file "z"
)
r   rg  r   seekr5  r   rZ  rh  UnicodeEncodeErrorrl  )rz  finr-  s      r   pull_worker_logr  _  s    	yy"))..#&#HHR]]A&JJ$$T*   HHJBM '&  * JJ$$BBD))..AQQTV '&s0   "CB4CACCCC
C-c                 n    Sn/ nSnU  Hu  nUR                   (       a  UR                  S:X  a  [        U5        UR                  R	                  5       nUc  SnMP  US:w  d  MX  SnUR                  UR                  5        Mw     U(       a!  [        U 5        [        R                  " S5        U$ ! [         a#    [        R                  S5        [        U 5         g [         a)    [        R                  SU SW S35        [        U 5        e   [        R                  SU SW S35        [        U 5         g = f)	NFr   Tr
   zKeyboardInterrupt, exitzABORT!!! Out of all z) trainers, the trainer process with rank=z# was aborted. Please check its log.)r   r6  r  r   r   rN   ry   r   r   r   KeyboardInterruptr   warning
SystemExiterror)r   nranksr  
error_rankr   r   rets          r   watch_local_trainersr  n  s-   #
AxxALLA-"&&++-C{!!!&&)  !%(HHQK& L#  01e$ "6(*ST^S_  `C  D	
 	e$"6(*ST^S_  `C  D	
 	e$s   AB- "A	B- -*D4AD4c                    U cE  [         R                  R                  5       n[        SU5       Vs/ s H  n[	        U5      PM     nnU$ [
        R                  " S5      nUb  US:X  a0  U R                  S5       Vs/ s H  o"R                  5       PM     nnU$ UR                  S5      nU R                  S5       H  nX%;   a  M
   SU SU S35       e   U R                  S5       Vs/ s H"  nUR                  UR                  5       5      PM$     nn[        R                  SU  S	U S
U 35        U$ s  snf s  snf s  snf )Nr   CUDA_VISIBLE_DEVICESr   r   zCan't find your gpus z in CUDA_VISIBLE_DEVICES[].z1Change selected_gpus into relative values. --ips: will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)r   r^  get_cuda_device_countr   r.   r   getenvsplitstripr   r   r   )gpusgpus_numxres_gpuscuda_visible_devicescuda_visible_devices_lists         r   get_gpusr    s^   |>>779$)!X$67$6qCF$672 O/  "yy)?@'+?2+E+/::c?;?a	?H;* O! )=(B(B3(G%ZZ_5 !!"#<=Q<RRTV5 % C(A *//	:(   KKCD6 J119
 ;((A'BD O3 8 <   D2;D7')D<c                    U cE  [         R                  R                  5       n[        SU5       Vs/ s H  n[	        U5      PM     nnU$ [
        R                  " S5      nUb  US:X  a0  U R                  S5       Vs/ s H  o"R                  5       PM     nnU$ UR                  S5      nU R                  S5       H  nX%;   a  M
   SU SU S35       e   U R                  S5       Vs/ s H"  nUR                  UR                  5       5      PM$     nn[        R                  SU  S	U S
U 35        U$ s  snf s  snf s  snf )Nr   XPU_VISIBLE_DEVICESr   r   zCan't find your xpus z in XPU_VISIBLE_DEVICES[r  z1Change selected_xpus into relative values. --ips:r  z' according to your XPU_VISIBLE_DEVICES:)r   r^  get_xpu_device_countr   r.   r   r  r  r  r   r   r   )xpusxpus_numr  res_xpusxpu_visible_devicesxpu_visible_devices_lists         r   get_xpusr    s^   |>>668$)!X$67$6qCF$672 O/ !ii(=>&*=*C+/::c?;?a	?H;* O! (;'@'@'E$ZZ_4 !!"#;<O;PPRT4 % C(A )..qwwy9(   KKCD6 J119
 ;''?&@B O3 8 <r  c                    U S:X  a  [         R                  R                  5       (       a=  [         R                  R                  5       S:  a  [	        S5        [
        R                  $ [         R                  R                  5       (       a=  [         R                  R                  5       S:  a  [	        S5        [
        R                  $ U S:X  a=  [         R                  R                  5       S:  a  [	        S5        [
        R                  $ U S:X  a=  [         R                  R                  5       S:  a  [	        S5        [
        R                  $ U S	:X  a  [	        S
5        [
        R                  $ [        S5      e)Nheterr   z+launch train in heter mode with GPU device.z+launch train in heter mode with XPU device.ncclzlaunch train in GPU mode!bkclzlaunch train in XPU modegloozlaunch train in CPU modezDon't supported devices)r   r^  is_compiled_with_cudar  r  r   r   r_  r  r    r   RuntimeErrorbackends    r   get_device_moder    s   'NN0022446:?@>>!NN//113359?@>>!&Y^^AACaG)*~~&Y^^@@BQF()~~&()~~
0
11r   c                    [        U R                  5      n/ nU[        R                  :X  a  [	        U R
                  5      nU R                  b  [        U5      [        U R                  5      -  S:X  d!   S[        U5       SU R                   S35       e[        [        U5      [        U R                  5      -  5      n[        S[        U5      U5       Vs/ s H	  oSXUU-    PM     nnX4$ Un X4$ U[        R                  :X  a  [        U R                  5      nU R                  b  [        U5      [        U R                  5      -  S:X  d!   S[        U5       SU R                   S35       e[        [        U5      [        U R                  5      -  5      n[        S[        U5      U5       Vs/ s H	  oVXUU-    PM     nnX4$ Un X4$ U[        R                  :X  an  [        U S5      (       a'  U R                  c  [        R                   " 5       U l        U R                  c  S/nX4$ [#        [        SU R                  5      5      n X4$ [%        SU S35      es  snf s  snf )	Nr   zgpus' number:z mod args.nproc_per_node:z
 must == 0zxpus' number:paddle_cpuonlyzCan't support device_mode:z, support only cpu|gpu|xpu now.)r  r  r   r   r  r  nproc_per_noder5   r  r   r    r  r  r   hasattrmultiprocessing	cpu_countr   AssertionError)r;  r   r   r  nr   r  s          r   get_device_proc_infor    sj   !$,,/K jnn$		"*ID$7$7 88Q> D	{*CDDWDWCXXbc> CID$7$7 889A9>q#d)Q9OP9OAQQ9OP6 **3  $2 **1 

	&		"*ID$7$7 88Q> D	{*CDDWDWCXXbc> CID$7$7 889A9>q#d)Q9OP9OAQQ9OP  **  $ ** 

	&4)**t/B/B/J"1";";"=D& !s **  $E!T-@-@$AB **	 (5TU
 	
/  Q  Qs   I/I c                     [         R                  SU R                  /U R                  Qn[        R
                  " U5      nUR                  5         g )NrR  )r   ra  rm  rn  rj  rk  wait)r;  r7  r   s      r   direct_startr    sI     	 
	"	"	C C DIIKr   c           	      (   U c   e/ nU R                  S5       Hd  nUR                  S5      S   nUR                  S5      S   n[        U5      U-   nUR                  SR                  U[	        U5      45      5        Mf     SR                  U5      nU$ )zA
origin_endpoint: ip:port
user_define_endpoint: ip:(port+offset)
r   rZ   r   r
   )r  r  rN   r!  r.   )origin_endpointsr  !paddle_user_define_endpoints_listip_portr   r\   new_portpaddle_user_define_endpointss           r   get_custom_endpointsr  +  s    
 '''(*%#))#.]]3"}}S!!$t9v%)002s8}:M1NO	 /
 $'88,M#N ''r   c                 ,   [        U5      [        L d   S5       eU[        R                  :X  d   S5       e[	        S S9n[        U 5       H  u  pg[        5       nXhl        Xxl        X8l	        X&   n	XF   n
[        U
5      S:X  d   e[        [        U
5      5       H;  n[        5       nX    Ul        X   Ul        UR                  R                  U5        M=     UR                   R                  U5        M     U R#                  U5      nXUR                   U   4$ )Nr   ,Only support get mapped cluster for gpu now.r   r
   )r   r   r   r   r"   r   r   ry   r[   r   r5   r   ru   rO   rM   rN   r'   r   )r   r   r   r   
node_ranksr7   r   r   r0   r   ranks_per_noder   r   r   s                 r   'get_mapped_cluster_without_rank_mappingr  c  s    !"d*L,LL**..( 6( 4 G"8,	e%.9 $.>"a'''s>*+AiG"4"7!8G),GLLL(	 ,
 	C  -" ~~g&HLL***r   c                 L   U[         R                  :X  d   S5       e[        R                  R	                  5       nS n[        U R                  S5       n[        R                  " U5      nS S S 5        / n/ n[        US   5       H+  u  pxUR                  US   5        UR                  U/5        M-     [        U5      S:X  a  US   n	O*U R                  (       a  U R                  n	O[        5       u  pX;   d   SU	 SU S	35       eUR                  U	5      n[        U5      [        U5      :X  d   S
5       e[        R!                  SU SU	 SU SXk    35        / n/ nU GH   nUR                  U5      n["        R$                  R'                  S5      bC  [)        ["        R*                  " SS5      5      n[-        [/        X[        Xk   5      -   5      5      nO["        R$                  R'                  S5      bK  [)        ["        R$                  R'                  S5      5      n[-        [/        X[        Xk   5      -   5      5      nO[1        [        Xk   5      5      nUR                  U Vs/ s H
  nU SU 3PM     sn5        GM#     [3        XYXU5      $ ! , (       d  f       GN-= fs  snf )Nr  rP   machinesr[   r
   r   Can't find your local ip {} in node_ips: {}+ranks length should be equal to ips length.parsed from args: node_ips:	 node_ip: node_rank: node_ranks:PADDLE_PORTr   r  rZ   )r   r   r   r^  r  rg  cluster_topo_pathjsonloadr   rN   r5   hostr   r   r   r   r   r  r  r  r  r   r   r  r  )r;  r   r  cluster_topo	json_filer   r  rt  cur_cluster_topor   _r   
free_portsr   r   r  r\   s                    r   1get_mapped_cluster_from_args_without_rank_mappingr    s|   *..( 6( ~~335H L	d$$c	*iyy+ 
+ HJ!*<
+C!D(013%  "E 8}1+99iiG)+JA 
%gY.@
"M w'Iz?c(m+ 5+ LL
%hZy	 BK|J,A+B	D JNN2&	::>>-(4RYY}b9:Jjs:3H/I"IJJ ZZ^^./;RZZ^^,>?@Jjs:3H/I"IJJ )Z-B)CDJ  Z!HZTRD$.Z!HI   3,: e 
+	*` "Is   J%J!

Jc                    [        U5      [        L d   S5       eU[        R                  :X  d   S5       eS n[	        S S9n[        U 5       H  u  p[        5       n
Xl        Xl        X:l	        X(   nXH   nXX   n[        [        U5      5       H  n[        5       nUS   [        X   5         n[        U5      S:X  d   S5       eUR                  R                  U" US   5      5        X    Ul        X   Ul        U
R"                  R                  U5        M     UR$                  R                  U
5        M     U R'                  U5      nXwR$                  U   4$ )	Nr   r  c                     [         R                  " S5      nUb  US:X  a  U $ UR                  S5      nUR                  [	        U 5      5      n[
        R                  SU  SU SU 35        U$ )Nr  r   r   zChange gpu id from z to z based on CUDA_VISIBLE_DEVICES )r   r  r  r   r.   r   r   )gpu_idr  r  relative_ids       r   get_relative_gpu_idAget_mapped_cluster_with_rank_mapping.<locals>.get_relative_gpu_id  sw    !yy)?@'+?2+EM(<(B(B3(G%399#f+FKKK%fXT+>]^w]xy r   r   ranksr
   z.Only support one process to one device mappingr   )r   r   r   r   r"   r   r   ry   r[   r   r   r5   ru   r.   rT   rN   rO   rM   r'   r   )r   r   r   r   r  node_rank_mappingsr  r7   r   r   r0   r   r  cur_node_rank_mappingr   r   local_device_idsr   s                     r   $get_mapped_cluster_with_rank_mappingr    sj    !"d*L,LL**..( 6(
 4 G"8,	e%.9 $. 2 =s>*+AiG4W=N%&  '(A- @-   ''#$4Q$78 #5"7!8G),GLLL( , 	C 1 -4 ~~g&HLL***r   c                    U[         R                  :X  d   S5       e[        R                  R	                  5       nU R
                  =(       d    [        R                  " S5      nS n[        US5       n[        R                  " U5      nS S S 5        S[        R                  S'   / n/ n/ nU H~  n	UR                  U	S   5        [        U	S   R                  5       5       V
s/ s H  n
[        U
5      PM     nn
UR!                  5         UR                  U5        UR                  U	5        M     [#        U5      S:X  a  US   nO*U R$                  (       a  U R$                  nO['        5       u  pX;   d   S	U S
U S35       eUR)                  U5      n[#        X~   5      U::  d   S5       e[#        U5      [#        U5      :X  d   S5       e[*        R-                  SU SU SU SX~    35        / n/ nU GH"  nUR)                  U5      n[        R                  R/                  S5      bD  [        [        R                  " SS5      5      n[        [1        UU[#        X~   5      -   5      5      nO[        R                  R/                  S5      bL  [        [        R                  R/                  S5      5      n[        [1        UU[#        X~   5      -   5      5      nO[3        [#        X~   5      5      nUR                  U Vs/ s H
  nU SU 3PM     sn5        GM%     [5        UUUUUU5      $ ! , (       d  f       GN= fs  sn
f s  snf )Nr  rH  rP   r   r[   r  r
   r   r  r  r  zHnumber of ranks mapped to one node should not exceed the available ones.r  r  r  r  r  r  r  rZ   )r   r   r   r^  r  rank_mapping_pathr   r  rg  r  r  r  rN   r   keysr  sortr5   r  r   r   r   r   r  r   r  r  )r;  r   r  r  rank_mappingr  r   r  r  cur_rank_mappingr   cur_node_rank_listr   r  r   r  r   r   r  r\   s                       r   .get_mapped_cluster_from_args_with_rank_mappingr    s&   *..( 6( ~~335H .. "))"3 L		%yy+ 
& .0BJJ)*HJ((01 !1'!:!?!?!AB
BqCFB 	 
 	!,-!!"23 ) 8}1+99iiG)+JA 
%gY.@
"M w'Iz$%1 R1 z?c(m+ 5+ LL
%hZy	 BK|J,A+B	D JNN2&	::>>-(4RYY}b9:Jj*s:3H/I"IJJ ZZ^^./;RZZ^^,>?@Jj*s:3H/I"IJJ )Z-B)CDJ  Z!HZTRD$.Z!HI   0 { 
&	%
b "Is   1L' L9:L>
'
L6c                   >    \ rS rSrS rS rS rS rS rS r	S r
S	rg
)ParameterServerLauncheriI  c                    Xl         X l        SU l        SU l        SU l        SU l        SU l        SU l        / U l        / U l	        SU l
        / U l        / U l        SU l        / U l        / U l        SU l        / U l        / U l        SU l        SU l        / U l        0 U l        / U l        0 U l        SU l        U R5                  U5        g )NFr   r   T)r;  distribute_modewith_coordinator
server_num
worker_numheter_worker_numcoordinator_numserver_endpointsserver_endpoints_ipsserver_endpoints_portworker_endpointsworker_endpoints_ipsworker_endpoints_portheter_worker_endpointsheter_worker_endpoints_ipsheter_worker_endpoints_portcoordinator_endpointscoordinator_endpoints_ipscoordinator_endpoints_portis_localcurrent_node_ipstage_trainer_numstage_heter_map
stage_liststage_device_map	stage_numget_role_endpoints)r*   r;  r  s      r   r+    ParameterServerLauncher.__init__J  s    	. % !  "$&!%'" "$&!%'"&(#*,'+-(%'")+&*,'!!#! "%r   c                 $   UR                   (       a  UR                   U l         UR                  (       a  [        UR                  R                  S5      5      U R                   :X  dC   SR	                  [        UR                  R                  S5      5      U R                   5      5       eUR                  U l        O[        U R                   S5      nSR                  U Vs/ s H  nS[        U5      -   PM     sn5      U l        OQUR                  S:w  d   S5       eUR                  U l        [        U R
                  R                  S5      5      U l         UR                  (       a  UR                  U l	        UR                  (       a  [        UR                  R                  S5      5      U R                  :X  dC   SR	                  [        UR                  R                  S5      5      U R                  5      5       eUR                  U l        GO[        U R                  U R                   5      nSR                  U Vs/ s H  nS[        U5      -   PM     sn5      U l        GOqUR                  S:w  d   S5       eUR                  R                  S5       Vs/ s H$  o3R                  5       R                  S	5      S   PM&     nn[        U5      U l	        UR                  R                  S5       Vs/ s H+  n[        UR                  5       R                  S	5      5      PM-     nnS
U;   a  Sn[        X`R                   -   X`R                   -   U R                  -   S
5      n/ n[        U R                  5       H2  n	UR                  S	R                  XI   [        Xy   5      45      5        M4     SR                  U5      U l        OUR                  U l        UR                  (       Ga  SU l        UR                  U l        UR"                  (       a  [        UR"                  R                  S5      5      U R                  :X  dC   SR	                  [        UR"                  R                  S5      5      U R                  5      5       eUR"                  U l        OT[        U R                  S
5      nSR                  U Vs/ s H  nS[        U5      -   PM     sn5      U l        ['        S5        U R(                  [*        R,                  :X  Ga  UR.                  S:w  d   S5       eSU R0                  S
'   UR.                  R                  S5      n
[        [        U
5      5       H  n	X   U R0                  U	S-   '   M     U R                  U R2                  S
'   UR4                  (       Ga]  UR4                  R                  S5      U l        U R6                   Vs/ s H  n[9        U5      PM     snU l        UR:                  (       Ga  [        UR:                  R                  S5      5      [        U R6                  5      :X  dL   SR	                  [        UR:                  R                  S5      5      [        U R6                  5      5      5       eUR:                  R                  S5      nSU l        [        [        U R6                  5      5       GH  n	U R<                  S:w  a  U =R<                  S-  sl        UU	   R                  S5      n[        U5      U R6                  U	   :X  d   SU	 S35       eU Vs/ s H%  nUR                  5       R                  S	5      S   PM'     nnU Vs/ s H+  n[        UR                  5       R                  S	5      5      PM-     nnS
U;   a  [        [        U5      U R                  U R                   -   U R4                  -   5      n/ n[        [        U5      5       H4  nUR                  S	R                  UU   [        UU   5      45      5        M6     SR                  U5      nOSR                  U5      nUU R2                  U	S-   '   U R>                  RA                  U	S-   /[        UR                  S5      5      -  5        U =R4                  U R6                  U	   -  sl        U =R<                  U-  sl        GM     GO[        [        U R6                  5      5       GH  n	U R6                  U	   n[        UU R                   U R                  -   U R4                  -   5      nSR                  U Vs/ s H  nS[        U5      -   PM     sn5      nUU R2                  U	S-   '   U R>                  RA                  U	S-   /[        UR                  S5      5      -  5        U =R4                  U-  sl        U R<                  S:w  a  U =R<                  S-  sl        U =R<                  U-  sl        GM     GOVUR:                  S:w  d   S5       e/ U l        UR:                  R                  S5      nSU l        [        [        U5      5       GH  n	UU	   R                  S5      nU R6                  R                  [        U5      5        U Vs/ s H$  o3R                  5       R                  S	5      S   PM&     nnU Vs/ s H+  n[        UR                  5       R                  S	5      5      PM-     nnS
U;   a  [        [        U5      U R                  U R                   -   U R4                  -   5      n/ n[        [        U5      5       H4  nUR                  S	R                  UU   [        UU   5      45      5        M6     SR                  U5      nOSR                  U5      nUU R2                  U	S-   '   U R>                  RA                  U	S-   /[        UR                  S5      5      -  5        U =R4                  U R6                  S   -  sl        U R<                  S:w  a  U =R<                  S-  sl        U =R<                  U-  sl        GM      U R                  /U R6                  QU l!        [        U RB                  5      U l"        URF                  (       a  URF                  /nO0[        S
U R                   U R                  -   U R4                  -   5      nU R
                  R                  S5      S   R                  S	5      S   nUS	-   [        US   5      -   U l#        U R
                  R                  S5       Vs/ s H$  o3R                  5       R                  S	5      S   PM&     snU l$        U R                  R                  S5       Vs/ s H$  o3R                  5       R                  S	5      S   PM&     snU l%        U R                   (       a  U R$                  R                  S5       Vs/ s H%  nUR                  5       R                  S	5      S   PM'     snU l&        U R$                  R                  S5       Vs/ s H%  nUR                  5       R                  S	5      S
   PM'     snU l'        U R
                  R                  S5       Vs/ s H$  o3R                  5       R                  S	5      S
   PM&     snU l(        U R                  R                  S5       Vs/ s H$  o3R                  5       R                  S	5      S
   PM&     snU l)        / U l*        U RH                   H0  nUU RT                  ;  d  M  U RT                  R                  U5        M2     U RJ                   H0  nUU RT                  ;  d  M  U RT                  R                  U5        M2     U R(                  [*        R,                  :X  a  U R<                  R                  S5       Vs/ s H%  nUR                  5       R                  S	5      S   PM'     snU l+        U R<                  R                  S5       Vs/ s H%  nUR                  5       R                  S	5      S
   PM'     snU l,        U RV                   H0  nUU RT                  ;  d  M  U RT                  R                  U5        M2     [        [[        U RT                  5      5      S
:X  a  SU l.        U RT                  S   U l/        OSU l.        [`        Rb                  " SS 5      nUc  [e        5       u  nU l/        OUU l/        U R(                  [*        R,                  :X  d<  U R^                  U RT                  ;   d"   SU R^                   SU RT                   S35       eU R^                  U RT                  ;   ag  U RT                  Rg                  U R^                  5      U l4        [j        Rm                  SU RT                   SU R^                   SU Rh                   35        g g s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf ) Nr   zThe server_num and servers doesn't match. Expect servers endpoints num equal to server_num, but received servers endpoint num: {} and server_num {}r   z
127.0.0.1:r   z?The setting of Parameter-Server must has server_num or servers.zThe worker_num and workers doesn't match. Expect workers endpoints num equal to worker_num, but received workers endpoint num: {} and worker_num {}z?The setting of Parameter-Server must has worker_num or workers.rZ   r
   i  TzThe coordinator_num and coordinators doesn't match. Expect coordinators endpoints num equal to coordinator_num, but received coordinator endpoint num: {} and coordinator_num {}z2>>> use default coordinator addr(only one process)zBThe setting of Parameter-Server heter mode must has heter_devices.cpu;r   zThe stage_num and heter_workers doesn't match. Expect heter_workers endpoints stage num equal to heter_worker_num stage, but received heter_workers endpoint stage num: {} and heter_worker_num stage {}zThe heter trainer num in stage z= is not equal in args.heter_worker_num and args.heter_workerszVThe setting of Parameter-Server heter mode must has heter_worker_num or heter_workers.r   FPOD_IPr  z)} in args.servers and args.workers ips: {r  r  z current_node_ip:r  )7r  r   r5   r  r   r  r  r!  r.   r  r   r  r  r   rN   r  r  r   r  r  r  r   r   heter_devicesr  r  r  stage_heter_trainer_numr  r   r  r  r   r  r  	http_portr  r  r   r  r  r  r   r  r  r  r  r  r   r  r   r   r   r   r   )r*   r;  r  r  r  worker_endpoints_lenr  r  r  r   heter_devices_listtrainer_numheter_worker_endpoints_listr  r  heter_worker_endpoints_lenr  new_heter_worker_endpointsro   ip_port_listheter_trainer_numr  http_ipr   pod_ipr  s                             r   r	  *ParameterServerLauncher.get_role_endpointsn  sn   ??"ooDO||4<<--c23tF  j  q  qDLL..s34dooF
 )-%!$//15(+49:Eq\CF*E:)% <<2% Q% %)LLD!!$"7"7"="=c"BCDO ??"ooDO||4<<--c23tF  j  q  qDLL..s34dooF )-%!$//4??C(+49:Eq\CF*E:)% <<2% Q% 261C1CC1H$1HA	$Q'1H ! $ ""67DO37<<3E3Ec3J$3JaAGGIOOC()3J ! $ ((!
(-004??B)% $& t/A$++ 4 7 #$9$< = 0 ),1A(B%(,% $(D!#'#7#7D   ))//459M9MM G  N  ND--33C894;O;OM .2->->*!$"6"6:-0XX49:Eq\CF*E:.* JK >#:#::%%+ T+ (-D!!!$!%!3!3!9!9#!>3123/A/D%%a!e, 4 '+&;&;D  #$$$/3/D/D/J/J3/O, (,'C'C0'C $'C0,
 %%%t1177<=44B   c  j  j 2 2 8 8 => < <=  372D2D2J2J32O/24D/"3t'C'C#DE66"< 773>71L2%* /   67#;;A>? >aS@}~	? &<6%; GGIOOC03%; 3 6 &<6%;  	 45%; 3 6
  :::C #$> ? $"&//!2"&"7"7!8;7 :<6%*3/I+J%K : A A$'HH,Fq,I,/0KA0N,O)*%&!" &L ,/884N+OL+.884J+KL6B,,QU3..UGc,*<*<S*A&BB --1M1Ma1PP-33|C33e Fh #3t'C'C#DE,0,H,H,K) )- OO"oo."334! (+xx<ABEq\CF2EB( 7C,,QU3..UGc,*<*<S*A&BB --1BB-66"< 773>733|C33% F( ))R/ l/ 02,.2.@.@.F.Fs.K+.0+s#>?@A-H.eCj + 007723 :P29OA	,Q/9O / 2
 "82!7A AGGIOOC01!7 / 2 666? :; OO"oo."33473 682!&s+E'F!GA6== #(B1(E(+,G,J(K%&!" "H (+xx0J'K'*xx0F'G2>D((Q/OO**Q#l&8&8&=">> ))T-I-I"-MM)22b833s:3//<?//[ A` &--&D" !!7!78DN >>(I!4??T__4t7L7LLI ''--c215;;C@C 3Yq\):: .2-B-B-H-H-M%
-MGGIOOC #-M%
! .2-B-B-H-H-M%
-MGGIOOC #-M%
!    3399#>.>A 	$Q'>.D* 3399#>/>A 	$Q'>/D+ .2-B-B-H-H-M&
-MGGIOOC #-M&
" .2-B-B-H-H-M&
-MGGIOOC #-M&
" ++B&$$R( , ++B&$$R( , >#:#:: 44::3?/?A 	$Q'?/D+ 44::3?0?A 	$Q'?0D, 55T]]*MM((, 6 s4==!"a' DM#'==#3D !DMYYx.F~*:*<'4''-$''>+B+BB++t}}< 1$2F2F1GGrsw  tA  tA  sB  BD  E< 4==0!]]001E1EFDNLL-dmm_<MdNbNbMccnoso}o}n~ 1m	 ;. ;$$X ;"0:66X C022l%
%

./
&
&
/0s    AG9 AG>+AH42AH>AHAH3,AH%2AHAH!
+AH&	2AH++AH0+AH58,AH:	,AH?+AI*+AI	,AI@0,AIc                    U R                   U R                  ;  a  g [        S S9nSnSnSnSn[        U R                  5       GHb  u  pg[	        5       nXhl        Xxl        [        [        U R                  5      5       H`  n	XpR                  U	   :X  d  M  [        5       n
U SU R                  U	    3U
l        X*l        US-  nUR                  R                  U
5        Mb     [        [        U R                  5      5       Hg  nXpR                  U   :X  d  M  [        5       nU SU R                   U    3Ul        X<l        SUl        US-  nUR$                  R                  U5        Mi     [        [        U R&                  5      5       Hg  nXpR&                  U   :X  d  M  [        5       nU SU R(                  U    3Ul        X^l        SUl        US-  nUR*                  R                  U5        Mi     [        [        U R,                  5      5       Hu  nXpR,                  U   :X  d  M  [        5       nU SU R.                  U    3Ul        UUl        U R0                  U   Ul        US-  nUR2                  R                  U5        Mw     UR4                  R                  U5        GMe     UR4                  U R6                     n[8        R:                  " 5       U l        / / / / S.U l        / / / / S.U l         / / / / S.U l!        U RE                  U RF                  U5        U RI                  U RF                  U5        U RJ                  (       a  U RM                  U RF                  U5        U RN                  [P        RR                  :X  a  U RU                  U RF                  U5        [V        RY                  SU RF                  RZ                   SU RF                  RZ                   SU RF                  RZ                   S	U RF                  RZ                   S
3	5        [        U R>                  S   5      S:  Gaw  [        U R>                  S   5       Hn  u  n	nU R>                  S   U	   R\                  R_                  5         [        U RB                  S   5      S:  d  MN  U RB                  S   U	   Ra                  5         Mp     [V        RY                  S5        [        U R>                  S   5      S:  a  [        U R>                  S   5       HP  u  n	nU RB                  S   U	   Ra                  5         U R>                  S   U	   R\                  Rc                  5         MR     [V        RY                  S5        [        U R>                  S   5      S:  a  [        U R>                  S   5       HP  u  n	nU RB                  S   U	   Ra                  5         U R>                  S   U	   R\                  Rc                  5         MR     [V        RY                  S5        [        U R>                  S   5      S:  a  [        U R>                  S   5       HP  u  n	nU RB                  S   U	   Ra                  5         U R>                  S   U	   R\                  Rc                  5         MR     [V        RY                  S5        O[        U R>                  S   5      S:  aL  [        U R>                  S   5       H0  u  n	nU R>                  S   U	   R\                  R_                  5         M2     [        U R>                  S   5      S:  aL  [        U R>                  S   5       H0  u  n	nU R>                  S   U	   R\                  R_                  5         M2     [d        Rf                  Ri                  U R<                  5      (       a!  [j        Rl                  " U R<                  5        g g )Nr   r   rZ   r
   )workercoordinatorserverheter_workerzDPlease check servers, workers, coordinator and heter_worker logs in z/workerlog.*, z/serverlog.* , z/coordinatorlog.*, and z/heterlog.*r  zDall workers exit, going to finish parameter server and heter_worker.r!  zall heter_worker are killedr   zall parameter server are killedr  zall coordinators are killed)7r  r   r"   r   r   ry   r[   r   r5   r  ru   r  rO   r   rN   r  r  rz   r   r   r  r   r  r  r  r   r'   r   tempfilemkdtempgloo_rendezvous_dirr   cmdslog_fnsstart_pod_serverr;  start_pod_workerr  start_pod_coordinatorr  r   r   start_pod_heter_workerr   r   ro  r   r  r   r   r   rd  re  shutilrmtree)r*   r7   server_rankworker_rankheter_worker_rankcoordinator_rankr   r   r0   r   r   ro   r  mr  r'  r!  r   s                     r   start_ps ParameterServerLauncher.start_ps  s   t}}4t$&t}}5MI%C HH3t889:22155$YF)+Ad.H.H.K-L&MFO"-K1$KKK&&v. ; 3t889:22155$YF)+Ad.H.H.K-L&MFO"-K#$FL1$KKK&&v. ; 3t==>?77::"))K$a ? ? BCD  ( (8$()K%$)$$$++K8 @ 3t>>?@88;;#*9L$a @ @ CDE !) ):L%)-);L&%*%%%,,\: A LL$S 6V ll4>>*#+#3#3#5  	

 	
	 	
 	dii-dii-  &&tyy#6>#:#::''		37RSWS\S\SdSdReestxt}t}  uF  uF  tG  GV  W[  W`  W`  Wh  Wh  Vi  i@  AE  AJ  AJ  AR  AR  @S  S^  _	

 tzz(#$q( %TZZ%9:4

8$Q',,113t||H-.2LL*1-335 ; KKV 4::n-.2(N)CDGAtLL0399;JJ~.q166@@B  E 9:4::h'(1,(H)=>GAtLL*1-335JJx(+00::<  ? =>4::m,-1(M)BCGAtLL/288:JJ}-a055??A  D 9:
 4::h'(1,(H)=>GAtJJx(+00557  ? 4::n-.2(N)CDGAtJJ~.q166;;=  E 77>>$2233MM$223 4r   c                 *   [         R                  R                  5       n[        R                  " U5      nUR                  SS 5        UR                  SS 5        [	        UR
                  5       GH!  u  pVU R                  [        R                  :X  a  U R                  U R                  U R                  U R                  UR                  R                  S5      S   S[        U R                   5      UR                  R                  S5      S   [        [         R"                  " SS5      5      S	U R$                  U R&                  S
.nOU R                  U R                  U R                  UR                  R                  S5      S   S[        U R                   5      UR                  R                  S5      S   [        [         R"                  " SS5      5      S	U R$                  U R&                  S.nUR)                  U5        [*        R,                  SUR.                  /UR0                  QnU R2                  S   R5                  U5        US:X  aB  [6        R9                  SR;                  [=        UR
                  5      [?        US5      5      5        UR@                  bn  [         RB                  " UR@                  SS9  [E        UR@                   SU 3S5      n	U RF                  S   R5                  U	5        [H        RJ                  " XXS9n
O[H        RJ                  " XS9n
[M        5       nXl'        URP                  Ul(        X[l)        W	Ul*        U	(       a  U	RW                  5       OS Ul,        Xl-        U R\                  S   R5                  U5        GM$     g )Nr>  r?  rZ   r
   PSERVERr   PADDLE_WITH_GLOO03)PADDLE_PSERVERS_IP_PORT_LISTrC  PADDLE_COORDINATOR_ENDPOINTS%PADDLE_ALL_HETER_TRAINER_IP_PORT_LISTr  TRAINING_ROLErB  r  r6  PADDLE_GLOO_RENDEZVOUSPADDLE_GLOO_FS_PATHPADDLE_GLOO_HTTP_ENDPOINT)r9  rC  r:  r  r<  rB  r  r6  r=  r>  r?  rR  r   z`Local server start {} processes. First process distributed environment info (Only For Debug): {}rS  TrT  z/serverlog.r   rY  rZ  r[  rY  )/r   r  rA   r]  r   r   r  r   r   r  r  r  r  rO   r  r.   r  r  r$  r  r`  r   ra  rm  rn  r%  rN   r   r   r   r5   r1  ro  rc  rg  r&  rj  rk  r3  r   ry   r6  r   rl  r5  r7  r   )r*   r;  r0   default_envrp  rt  
cur_serverru  r7  rw  r   rz  s               r   r'  (ParameterServerLauncher.start_pod_server7  s   jjoo'ii,d+t,(5OC##~'>'>>484I4I040E0E484N4N=A=X=X#-#6#6#<#<S#A!#D%.+.t+?(1177<Q?(+BII6H#,N(O.1+/+C+C15  594I4I040E0E484N4N#-#6#6#<#<S#A!#D%.+.t+?(1177<Q?(+BII6H#,N(O.1+/+C+C15 x( $$ **	C IIh&&s+ax<<BFCKK()$&C= ||'DLL48T\\N+cU;SAX&--b1!'' "''=BG ooBGMBI)+BGGIBMFJJx ''+K  6r   c           
      4
   [         R                  R                  5       n[        R                  " U5      nUR                  SS 5        UR                  SS 5        Sn/ n[        R
                  R                  5       (       a!  [        UR                  5      n[        U5      nOf[        R
                  R                  5       (       aC  [        R
                  R                  5       n[        SU5       Vs/ s H  n[        U5      PM     nn[        UR                  5       GH  u  pUS:X  a  SO[        XhU-     5      n
U R                   ["        R$                  :X  Ga7  0 SU R&                  _SU R(                  _S[        U R*                  5      _SU R,                  _S	[        U R.                  5      _S
S_S[        U R0                  5      _SS_SU R2                  S   _SU R4                  _SU R6                  S   _SS_SU	R8                  R;                  S5      S   _SU	R8                  R;                  S5      S   _S[        U	R<                  5      _S[        [         R>                  " SS5      5      _SS_U R@                  SSU
U
U RB                  S.EnO0 SU R&                  _SU R(                  _S[        U R*                  5      _SS_SU R,                  _SU	R8                  R;                  S5      S   _SU	R8                  R;                  S5      S   _S[        U	R<                  5      _S[        [         R>                  " SS5      5      _SS_SU R@                  _SS_S S_S!U
_S"U
_S#U RB                  _nURE                  U5        [F        RH                  S$URJ                  /URL                  QnU RN                  S%   RQ                  U5        US:X  aB  [R        RU                  S&RW                  [        UR                  5      [Y        US'5      5      5        URZ                  bn  [         R\                  " URZ                  S(S)9  [_        URZ                   S*U 3S+5      nU R`                  S%   RQ                  U5        [b        Rd                  " XXS,9nO[b        Rd                  " XS-9n[g        5       nXl4        U	R<                  Ul        Xl5        WUl6        U(       a  URo                  5       OS Ul8        Xl9        U Rt                  S%   RQ                  U5        GM     g s  snf ).Nr>  r?  r   r7  r9  rC  rB  r:  PADDLE_STAGE_TRAINERS_NUMSTAGE_ID1	STAGE_NUM*PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LISTr   &PADDLE_NEXT_HETER_TRAINER_IP_PORT_LISTr   r;  HETER_DEVICE_TYPEr
   r<  TRAINERr  rZ   r  r@  r6  r=  r8  )r>  rJ  rL  r  r  r?  r>  rJ  rL  r  r  r?  rR  r  z`Local worker start {} processes. First process distributed environment info (Only For Debug): {}rS  TrT  rX  r   r@  rA  );r   r  rA   r]  r   r^  r  r  r  r5   r_  r  r   r.   r   r   r  r   r   r  r  r  r  r  r  r  r  r  rO   r  ry   r  r$  r  r`  r   ra  rm  rn  r%  rN   r   r   r   r1  ro  rc  rg  r&  rj  rk  r3  r   r6  r   rl  r5  r7  r   )r*   r;  r0   rB  rp  heter_device_numdevice_listr  rt  
cur_worker	device_idru  r7  rw  r   rz  s                   r   r(  (ParameterServerLauncher.start_pod_worker  s   jjoo'ii,d+t,>>//11"499-K";/^^0022(~~BBD+04D+EF+Ea3q6+EKF(5OC $q( -=%=>? 
 ##~'>'>>2D4I4I.0E0E *3t+? 3D4N4N	
 0T5K5K1L   T^^!4 A" =d>R>R? <T=X=X ()>)>q)A $Y j1177<Q?  ":#6#6#<#<S#A!#D!" (Z__)=#$ 'BII6H#,N(O%& -c'( ,0+C+C+.+.,5+415382D4I4I.0E0E *3t+? $Y	
 3D4N4N j1177<Q? ":#6#6#<#<S#A!#D (Z__)= 'BII6H#,N(O -c *4+C+C *3 *3 +I *9  0!& x($$ **	C IIh&&s+ax<<BFCKK()$&C= ||'DLL48T\\N+cU;SAX&--b1!'' "''=BG ooBGMBI)+BGGIBMFJJx ''+y  6 Gs   0Tc           
      8   [        S5        [        R                  R                  5       n[        R                  " U5      nUR	                  SS 5        UR	                  SS 5        [        UR                  5       GH  u  pVSn0 SU R                  _SU R                  _S[        U R                  5      _SU R                  _S	[        U R                  5      _S
S_SUR                  R                  S5      S   _SUR                  R                  S5      S   _S[        UR                  5      _S[        [        R                   " SS5      5      _SS_SU R"                  _SS_SS_SU_SU_SU R$                  _nUR'                  U5        [(        R*                  SUR,                  /UR.                  Qn	U R0                  S   R3                  U	5        US:X  aB  [4        R7                  SR9                  [;        UR                  5      [=        US5      5      5        UR>                  bn  [        R@                  " UR>                  SS 9  [C        UR>                   S!U 3S"5      n
U RD                  S   R3                  U
5        [F        RH                  " XXS#9nO[F        RH                  " XS$9n[K        5       nXl&        UR                  Ul        X\l'        W
Ul(        U
(       a  U
RS                  5       OS Ul*        Xl+        U RX                  S   R3                  U5        GM     g )%Nz">>> entering start_pod_coordinatorr>  r?  r7  r9  rC  rB  r:  PADDLE_COORDINATOR_NUMr<  COORDINATORr  rZ   r   r  r
   r@  r6  r=  r8  r>  rJ  rL  r  r  r?  rR  r  zeLocal coordinator start {} processes. First process distributed environment info (Only For Debug): {}rS  TrT  z/coordinator.r   r@  rA  )-r  r   r  rA   r]  r   r   r  r  r.   r  r  r  rO   r  ry   r  r$  r  r`  r   ra  rm  rn  r%  rN   r   r   r   r5   r1  ro  rc  rg  r&  rj  rk  r3  r   r6  r   rl  r5  r7  r   )r*   r;  r0   rB  rp  rt  cur_coordinatorrQ  ru  r7  rw  r   rz  s                r   r)  -ParameterServerLauncher.start_pod_coordinator  s   23jjoo'ii,d+t,$-c.>.>$? CI.0E0E*D,A,A &s4??'; /0J0J	
 )#d.B.B*C   /2288=a@ 77==cB1E $S)=)=%> #C		2Dc(J$K )# &t'?'? &s &s '	  &y!" ,T^^#H( x($$ **	C IIm$++C0ax<<BFC,,-)$&C= ||'DLL48T\\N-u=sC]+2226!'' "''=BG%**BGMBI)+BGGIBMFJJ}%,,R0y %@r   c           
      t   [         R                  R                  5       n[        R                  " U5      nUR                  SS 5        UR                  SS 5        Sn/ n[        R
                  R                  5       (       a!  [        UR                  5      n[        U5      nOf[        R
                  R                  5       (       aC  [        R
                  R                  5       n[        SU5       Vs/ s H  n[        U5      PM     nn[        UR                  5       GH  u  pUS:X  a  SO[        XhU-     5      n
U	R                   n0 SU R"                  _SU R$                  _SXR&                  S-
  ::  a  U R(                  US-      OS	_S
U R(                  US-
     _SU R*                  _SU R,                  U   _S[        U5      _S[        U R&                  5      _SU	R.                  R1                  S5      S   _SS_S[        U R2                  5      _S[        U R4                  5      _SU	R.                  R1                  S5      S   _S[        [         R6                  " SS5      5      _SS_SU R8                  _SS_SU
U
U R:                  S.EnUR=                  U5        [>        R@                  SURB                  /URD                  QnU RF                  S   RI                  U5        US:X  aB  [J        RM                  SRO                  [        UR                  5      [Q        US5      5      5        URR                  bn  [         RT                  " URR                  S S!9  [W        URR                   S"U 3S#5      nU RX                  S   RI                  U5        [Z        R\                  " XXS$9nO[Z        R\                  " XS%9n[_        5       nUUl0        U	Rb                  Ul1        UUl2        WUl3        U(       a  URi                  5       OS Ul5        UUl6        U Rn                  S   RI                  U5        GM     g s  snf )&Nr>  r?  r   r7  r9  rC  rK  r
   r   rJ  r;  rL  rG  rI  r  rZ   r<  HETER_TRAINERrB  rF  r  r6  r=  r8  r>  rJ  )rL  r  r  r?  rR  r!  zfLocal heter_worker start {} processes. First process distributed environment info (Only For Debug): {}rS  TrT  z
/heterlog.r   r@  rA  )8r   r  rA   r]  r   r^  r  r  r  r5   r_  r  r   r.   r   r   rz   r  r  r  r  r  r  rO   r  r  r  r  r$  r  r`  r   ra  rm  rn  r%  rN   r   r   r   r1  ro  rc  rg  r&  rj  rk  r3  r   ry   r6  r   rl  r5  r7  r   )r*   r;  r0   rB  rp  rN  rO  r  rt  cur_heter_workerrQ  stage_idru  r7  rw  r   rz  s                    r   r*  .ParameterServerLauncher.start_pod_heter_worker5  s   jjoo'ii,d+t,>>//11"499-K";/^^0022(~~BBD+04D+EF+Ea3q6+EKF%.s/@/@%A!C $q( -=%=>? 
 (--H.0E0E*D,A,A 9>>A#55 ((A6 =d>R>RqL? 89T9T $T%:%:8%D CM S0 /88>>sCAF   !" &s4??';#$ ,S1G1G-H%& *3399#>qA'( #C		2Dc(J$K)* )#+, &t'?'?-. &s/0 (+(1'0-1^^7H: x( $$ **	C IIn%,,S1ax<<BFC--.)$&C= ||'DLL48T\\N*SE:C@^,33B7!'' "''=BBG&++BGBMBI)+BGGIBMBFJJ~&--b1W &B Gs   0P5)"r;  r%  r  r   r  r  r  r  r$  r  r  r  r  r  r  r&  r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  N)r   r   r   r   r+   r	  r2  r'  r(  r)  r*  r   r   r   r   r  r  I  s1    "&HGR
~4@J,Xk,ZC1JZ2r   r  c                 f   U S;  a  [        SU  35      eU S:X  a.  [        R                  R                  5       (       d  [        S5      eU S:X  a.  [        R                  R	                  5       (       d  [        S5      eU S:X  a/  [        R                  R                  5       (       d  [        S5      eg g )	N)r  r  r  autor  xcclflagcxzpaddle.distributed initialize error, backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', 'heter', 'xccl' but got r  zlpaddle.distributed initialize error, your paddle is not compiled with cuda but you assign 'nccl' as backend.r  zkpaddle.distributed initialize error, your paddle is not compiled with xpu but you assign 'bkcl' as backend.r`  zppaddle.distributed initialize error, your paddle is not compiled with flagcx but you assign 'flagcx' as backend.)
ValueErrorr   r^  r  r_  is_compiled_with_flagcxr  s    r   check_backendrc    s        i!
 	
 &!E!E!G!GV
 	

 &!D!D!F!FU
 	

 (9>>#I#I#K#KZ
 	
 $Lr   c                     U S:w  a  g [         R                  R                  S5      (       a  [        S5      e[         R                  (       a  [        S5      eg )Nr  darwinzDYou are going to using gloo on macos, but currently is not supportedzFYou are going to using gloo on windows, but currently is not supported)utilsOS_NAME
startswithra  
IS_WINDOWSr  s    r   block_windows_and_macosrj    sR    &}}))R
 	
 T
 	
 r   c                      [         R                  R                  5       (       a  g[         R                  R                  5       (       a  gg)Nr  r  r  )r   r^  r  r_  r   r   r   get_backend_by_compile_flagrl    s1    ~~++--~~**,,r   )   r   r%   )NN)r   )<rA   r  r   r  r   r+  r   r   r   rj  r   r"  r   
contextlibr   *paddle.utils.cpp_extension.extension_utilsrf  cpp_extensionextension_utilspaddler   paddle.utilsr   r   r   	propagater   r   r"   rf   ru   r   r   r   r   r   r   r  r  r1  r3  r:  r<  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  rc  rj  rl  r   r   r   <module>ru     sG       	      
    : :  "			6	"  	 	A AH  <J JZ'+T#L.>&R    MQ|~'&R>>2<)+X	(p+:<~5+pOdF2 F2R!
H

r   