
    x-j                     L   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmc mc mZ d dlmZ d dlmZ  ej        d          Zde_         G d d          Z G d	 d
          Z G d d          Z G d d          Z G d d          Z G d d          Zd1dZ d Z!d Z"d Z#d Z$d Z%d Z&d2dZ' G d d          Z(da)d Z*	 d3dZ+d  Z,d! Z-d" Z.d# Z/d$ Z0d% Z1d& Z2d4d'Z3d( Z4d) Z5d* Z6d+ Z7 G d, d-          Z8d. Z9d/ Z:d0 Z;dS )5    N)closing)	framework)	strtoboolrootFc                       e Zd ZdZdZdZdZdS )DistributeModez\
    There are various mode for fleetrun, each of them is designed for different model.
    r         N)__name__
__module____qualname____doc__
COLLECTIVEPSPS_HETER     e/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/fleet/launch_utils.pyr   r   &   s)          J	
BHHHr   r   c                   &    e Zd ZdZdZdZdZdZdZdS )
DeviceModez
    Training devices type
    r   r	   r
   N)	r   r   r   r   UNKNOWNCPUGPUKUNLUNXPUr   r   r   r   r   0   s3          G
C
CF
CCCr   r   c                   P    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd ZdS )Clusterc                 >    d | _         g | _        d | _        d | _        d S N)
job_serverpodshdfsjob_stage_flag)selfr#   s     r   __init__zCluster.__init__=   s%    		"r   c                 Z    d| j          dd | j        D              d| j         d| j         S )Nzjob_server:z pods:c                 ,    g | ]}t          |          S r   str).0pods     r   
<listcomp>z#Cluster.__str__.<locals>.<listcomp>D   s    4S4S4S#SXX4S4S4Sr   z job_stage_flag:z hdfs:)r!   r"   r$   r#   r%   s    r   __str__zCluster.__str__C   si     LT_  L  L4S4S4S4S4S  L  Leiex  L  L  AE  AJ  L  L  	Lr   c                     t          | j                  t          |j                  k    rdS t          | j        |j                  D ]\  }}||k    r dS | j        |j        k    rdS dS NFT)lenr"   zipr$   )r%   clusterabs       r   __eq__zCluster.__eq__F   sw    ty>>S....5	7<00 	 	DAqAvvuu  '"8885tr   c                 .    |                      |           S r    )r7   r%   r4   s     r   __ne__zCluster.__ne__S   s    ;;w''''r   c                 B    t          j         |j                  | _        d S r    )copyr"   r9   s     r   update_podszCluster.update_podsV   s    Igl++			r   c                 D    t          |                                           S r    )r2   trainers_endpointsr.   s    r   trainers_nrankszCluster.trainers_nranksY   s    4**,,---r   c                 *    t          | j                  S r    )r2   r"   r.   s    r   pods_nrankszCluster.pods_nranks\   s    49~~r   c                 f    g }| j         D ]&}|j        D ]}|                    |j                   '|S r    )r"   trainersappendendpoint)r%   rr,   ts       r   r?   zCluster.trainers_endpoints_   sJ    9 	% 	%C\ % %$$$$%r   c                 ~    g }| j         D ]2}|j        D ](}d |j        D             }|                    |           )3|S )Nc                 ,    g | ]}t          |          S r   r)   r+   accs     r   r-   z,Cluster.world_device_ids.<locals>.<listcomp>j   s    #G#G#GCHH#G#G#Gr   )r"   rD   acceleratorsrE   )r%   rG   r,   rH   str_acceleratorss        r   world_device_idszCluster.world_device_idsf   sa    9 	+ 	+C\ + +#G#G#G#G#G )****+ r   c                     g }| j         D ]C}|j         d|j         }|j        |j        J | d            |                    |           D|S )N:z not a valid endpoint)r"   addrportrE   )r%   rG   r,   eps       r   pods_endpointszCluster.pods_endpointsn   sq    9 	 	CH))sx))B8'CH,@,@,,, -A,@@ HHRLLLLr   c                 l    | j         D ]+}t          |          t          |j                  k    r|c S ,d S r    )r"   r*   id)r%   pod_idr,   s      r   get_pod_by_idzCluster.get_pod_by_idx   sA    9 	 	C6{{c#&kk))


 * tr   N)r   r   r   r&   r/   r7   r:   r=   r@   rB   r?   rO   rU   rY   r   r   r   r   r   <   s        # # #L L L  ( ( (, , ,. . .            r   r   c                   &    e Zd Zd Zd Zd Zd ZdS )	JobServerc                     d | _         d S r    rF   r.   s    r   r&   zJobServer.__init__   s    r   c                     | j          S r    r]   r.   s    r   r/   zJobServer.__str__   s    -!!r   c                 "    | j         |j         k    S r    r]   r%   js     r   r7   zJobServer.__eq__   s    }
**r   c                     | |k     S r    r   r`   s     r   r:   zJobServer.__ne__       19}r   N)r   r   r   r&   r/   r7   r:   r   r   r   r[   r[      sP          " " "+ + +    r   r[   c                   ,    e Zd Zd Zd Zd Zd Zd ZdS )Trainerc                 >    g | _         d | _        d | _        d | _        d S r    )rM   rF   rankstager.   s    r   r&   zTrainer.__init__   s#    	


r   c                 6    d| j          d| j         d| j         S )Nzaccelerator:z
 endpoint:z rank:)rM   rF   rg   r.   s    r   r/   zTrainer.__str__   s)    [d/[[4=[[PTPY[[[r   c                     t          | j                  t          |j                  k    rdS | j        |j        k    s| j        |j        k    rdS t	          | j        |j                  D ]\  }}||k    r dS dS r1   )r2   rM   rF   rg   r3   )r%   rH   r5   r6   s       r   r7   zTrainer.__eq__   s    t !!S%8%8885=AJ&&$)qv*=*=5)1>:: 	 	DAqAvvuu  tr   c                     | |k     S r    r   )r%   rH   s     r   r:   zTrainer.__ne__   rc   r   c                     | j         S r    rg   r.   s    r   rg   zTrainer.rank   
    yr   N)r   r   r   r&   r/   r7   r:   rg   r   r   r   re   re      sb          \ \ \        r   re   c                   8    e Zd Zd Zd Zd Zd Zd Zd Zd Z	dS )	Podc                     d | _         d | _        d | _        d | _        g | _        g | _        g | _        g | _        g | _        g | _	        d | _
        d S r    )rg   rW   rR   rS   rD   serversworkerscoordinatorsheter_workersrM   device_moder.   s    r   r&   zPod.__init__   sY    			r   c                 
   d| j          d| j         d| j         d| j         d| j         dd | j        D              dd	 | j        D              d
d | j        D              dd | j        D              dd | j	        D              S )Nzrank:z id:z addr:z port:z visible_accelerator:z
 trainers:c                 ,    g | ]}t          |          S r   r)   )r+   rH   s     r   r-   zPod.__str__.<locals>.<listcomp>   sO      Dc  Dc  Dc  PQ  EH  IJ  EK  EK  Dc  Dc  Dcr   z	 servers:c                 ,    g | ]}t          |          S r   r)   )r+   ss     r   r-   zPod.__str__.<locals>.<listcomp>   sO      nL  nL  nL  z{  or  st  ou  ou  nL  nL  nLr   z             workers:c                 ,    g | ]}t          |          S r   r)   )r+   ws     r   r-   zPod.__str__.<locals>.<listcomp>   s    333c!ff333r   z heter_workers:c                 ,    g | ]}t          |          S r   r)   )r+   hs     r   r-   zPod.__str__.<locals>.<listcomp>   s    DhDhDhPQSVVDhDhDhr   z coordinators:c                 ,    g | ]}t          |          S r   r)   )r+   cs     r   r-   zPod.__str__.<locals>.<listcomp>   s?      y\  y\  y\  EFy|}~yy  y\  y\  y\r   )
rg   rW   rR   rS   rM   rD   rr   rs   ru   rt   r.   s    r   r/   zPod.__str__   sW   ^ty ^ ^dg ^ ^TY ^ ^di ^ ^fjfw ^ ^  Dc  Dc  UY  Ub  Dc  Dc  Dc ^ ^  nL  nL  C  K  nL  nL  nL ^ ^33dl333^ ^DhDhUYUgDhDhDh^ ^ y\  y\  JN  J[  y\  y\  y\^ ^ 	^r   c                    | j         |j         k    s0| j        |j        k    s | j        |j        k    s| j        |j        k    r"t                              d|  d|            dS t          | j                  t          |j                  k    r,t                              d| j         d|j                    dS t          t          | j                            D ]W}| j        |         |j        |         k    r9t                              d| j        |          d|j        |                      dS Xt          | j	                  t          |j	                  k    r,t                              d| j	         d|j	                    dS t          t          | j	                            D ]W}| j	        |         |j	        |         k    r9t                              d| j	        |          d|j	        |                      dS Xt          | j
                  t          |j
                  k    r,t                              d| j
         d|j
                    dS t          t          | j
                            D ]W}| j
        |         |j
        |         k    r9t                              d| j
        |          d|j
        |                      dS XdS )	Nzpod z != Fz	trainers ztrainer zservers zworkers T)rg   rW   rR   rS   loggerdebugr2   rD   rangerr   rs   )r%   r,   is      r   r7   z
Pod.__eq__   s   I!!w#&  yCH$$yCH$$LL///#//0005t}S\!2!222LLFT]FFFFGGG5s4=))** 	 	A}Q3<?22Oa(8OOcl1oOOPPPuu 3 t|CK 0 000LLCDLCCckCCDDD5s4<(()) 	 	A|A#+a.00MQMMS[^MMNNNuu 1 t|CK 0 000LLCDLCCckCCDDD5s4<(()) 	 	A|A#+a.00MQMMS[^MMNNNuu 1 tr   c                     | |k     S r    r   )r%   r,   s     r   r:   z
Pod.__ne__   s    3;r   c                     d S r    r   )r%   res_podss     r   parse_responsezPod.parse_response   s    r   c                     | j         S r    rm   r.   s    r   rg   zPod.rank   rn   r   c                 j    d}| j         D ]
}|| dz  }|dk    sJ d|  d            |d d         }|S )N ,z	this pod z can't see any acceleratorsr   )rM   )r%   rG   gs      r   get_visible_acceleratorszPod.get_visible_accelerators   s[    " 	 	AALAABwwwEDEEEwwwcrcFr   N)
r   r   r   r&   r/   r7   r:   r   rg   r   r   r   r   rp   rp      s             ^ ^ ^% % %N          r   rp      c                     t          j        |          }|                    |            t          j                    }t          j        d          }|                    |           |                    |           |S )Nz>%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s)logging	getLoggersetLevelStreamHandler	FormattersetFormatter
addHandler)	log_levelnamer   log_handler
log_formats        r   
get_loggerr      sv    t$$F
OOI'))K"H J Z(((
k"""Mr   c                    t          |          t          u s
J d            t          d           }d}t          |           D ]\  }}t	                      }	||	_        ||	_        ||	_        ||         }
t          |
          t          |          k    s
J d            t          t          |                    D ]l}t                      }|t          j        k    rt          ||         t          t          f          rA|j                            ||                    |	j                            ||                    n|j                            ||                    |	j                            ||                    ns|t          j        k    rct          ||         t          t          f          r!|j                            ||                    n |j                            ||                    |
|          |_        ||_        |dz  }|	j                            |           n|j                            |	           |                     |          }||j        |         fS )Ntrainer_endpoints must be listr#   r   zNcurrent trainer_endpoints size should be greater equal than accelerators size.r	   )typelistr   	enumeraterp   rg   rR   rv   r2   r   re   r   r   
isinstancetuplerM   extendrE   r   rF   rD   r"   index)node_ipsnode_iptrainer_endpointsrv   devices_per_procr4   trainer_rank	node_rankipr,   cur_node_endpointsr   trainerpod_ranks                 r   get_clusterr     s[    !""d***,L***4   GL"8,, ! !	2ee%.y9%&&#.>*?*????\ @?? s+,,-- 	) 	)AiiGjn,,.q1D%=AA A(//0@0CDDD$++,<Q,?@@@@(//0@0CDDD$++,<Q,?@@@@
...q1D%=AA E(//0@0CDDDD(//0@0CDDD"4Q"79G'GLALL((((C    ~~g&&HGL***r   c                 0   t           j        dk    r| D ]}|j                                        t          j        t          j        |j        j                  t          j                   |j	        r|j	        
                                 t                              d|j        j                    t          j        d           | D ]{}|j                                        `|j                                         |j	        r|j	        
                                 t                              d|j        j                    |t          j        d           t#          dd          D ]}d}| D ]F}|j                                        +t          j        |j        j        t          j                   d	}G|st                              d
            d S t          j        d           t                              d           t+          j        d           d S )Nntzterminate process group gid:r	   zterminate process id:   r   2   FTzterminate all the procszcan't kill all process and exit)osr   procpollkillpggetpgidpidsignalSIGTERMlog_fncloser   infotimesleep	terminater   r   killSIGKILLfatalsysexit)procspstepalives       r   terminate_local_procsr   1  s   	w$ 	I 	IAv{{}}$	"*QVZ00&.AAA8 %HNN$$$G16:GGHHH
1 ? ?6;;== Fx !   LL===>>> 	JqMMMa   	 	Av{{}}$
FN333 	KK1222FF
1
LL2333HQKKKKKr   c                  j    	 t          j                    } t          j        |           }| |fS #  Y d S xY wr    )socketgethostnamegethostbyname)	host_namehost_ips     r   get_host_name_ipr   W  s@    &((	&y11'!!tts   *- 2c                 ^    |t           k    rt          n|} |j        d| z   f|||dz   d| dS )ad  Add argparse's argument.

    Examples:
        .. code-block:: python

            >>> import argparse
            >>> from paddle.distributed.fleet.launch_utils import add_arguments
            >>> parser = argparse.ArgumentParser()
            >>> add_arguments("name", str, "Jonh", "User name.", parser)
            >>> args = parser.parse_args()

    z--z Default: %(default)s.)defaultr   helpN)boolr   add_argument)argnamer   r   r   	argparserkwargss         r   add_argumentsr   `  s`     99$DIw,,	 
     r   c                     d }t                      }d}	  |            }||vr|                    |           t          |          | k    r|S |dz  }|dk    rt          d           d S U)Nc            
      ~   t          t          j        t          j        t          j                            5 } |                     t          j        t          j        t          j        ddd                     | 	                    d           | 
                                d         cd d d            S # 1 swxY w Y   d S )Niir	   r   )r   r   )r   r   AF_INETSOCK_STREAM
setsockopt
SOL_SOCKET	SO_LINGERstructpackbindgetsockname)rz   s    r   __free_portz$find_free_ports.<locals>.__free_portx  s    V]6>63EFFGG 	&1 LL!6#3V[q!5L5L   FF7OOO==??1%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&s   A.B22B69B6r   Tr	   i  z@can't find available port and use the specified static port now!)setaddr2   print)numr   port_setr   rS   s        r   find_free_portsr   w  s    & & & uuHD{}}xLLx==CO	#::R   4r   c                    t           j                            d          !t          |           }|t	          |          }nFt          t           j                            d                    }t          ||z   ||z   | z   d          }|S )NFLAGS_START_PORTr	   )r   environgetr   r   intr   )r   offsetports
start_ports       r   	get_portsr     sw    	z~~())1$$KKE(:;;<<
j6):+>+DaHHLr   c           	         d}d}d}|                                  D ]"\  }}t          |t          |                    }#dd                    |d|z  |          z   }dd| d| d	z   }||z   |z   }	d
d                    dg|	z            z   dz   }
d
d                    dg|	z            z   dz   }d}||
dz   z  }|r&||                    |d         |d                   z  }n||                    dd          z  }||dz   z  }|                                  D ]g\  }}t          |t                    r!t          |          |k    rd|dd          z   }n|}||                    |d|z  t          |                    z  }h||
z  }d| d}|S )Nr
   (   -   z    z|{{:>{}s}}{}{{:^{}s}}|
 z|{:>zs}{}{:^zs}|
z    +r   =+-
r   r	   zfleetrun Distributed EnvsValuez... i)itemsmaxr2   formatjoinr   r*   )envsheaderspacingmax_kmax_vkvh_formatl_formatlengthborderlinedrawsstr_v_strs                  r   pretty_print_envsr    s   GEE

 # #1E3q66""299sW}e  H ???%????HU]W$Frwwuv~...4FRWWcUV^,,,s2DE	Vd]E GF1I666!<gFFF	TD[E

 ? ?1a 	#a&&E//QsttW$EEEC'M3u::>>>	VOE>>>DKr   c                       e Zd Zd ZdS )TrainerProcc                 Z    d | _         d | _        d | _        d | _        d | _        d | _        d S r    )r   r   
log_offsetrg   
local_rankcmdr.   s    r   r&   zTrainerProc.__init__  s0    		r   N)r   r   r   r&   r   r   r   r  r    s#            r   r  c                      t          |           dk    sJ dt          |            d            t          |           dk    r%t          | d         t                    sJ | d         at          S )Nr	   z
len(args) z should <= 1r   )r2   r   r   _run_with_coverage)argss    r   run_with_coverager!    sf    t99>>>?D		???>>>
4yyA~~$q'4(((((!!Wr   c           
         |1t          j         t          j                                                   }nt          j         |          }|                    dd            |                    dd            |                                 }d |D             }g }	t          |j                  D ],\  }
}t          |j                  t          |j	                  t          | 
                                          d                    |                                           t          |
          d                    d |j        D                       d                    |          d}|                    dd           |d         |d<   |                    dd           |d         |d<   |                    d	d           |d	         |d	<   t          |j                  d
k    rO|j        t"          j        k    r:d                    d                    d |j        D                                 |d<   t          |j                  d
k    r:d                    d                    d |j        D                                 |d<   t(          j                                        rRt          |j                  d
k    r:d                    d                    d |j        D                                 |d<   |                    |           g }t1                      s$t          j                            dd          dk    rg d}t2          j        dg|||}t6                              d| d|            |
d
k    rot6                              d                    t          |j                  t=          |d                               t6                              d| d| d           d }t          j        dk    rd nt          j         }|Nt          j!        |d            t          j"        #                    | d!          rt          j$        | d!           tK          | d!d"          5 }|&                    d#           |&                    d$                    |                                                      d d d            n# 1 swxY w Y   |                    d	          A|                    d%          '                                d&k    rtK          | d'|
 d(          }ntK          | d)|
 d(          }tQ          j)        |||||*          }ntQ          j)        |||+          }tU                      }||_+        |j        |_        |
|_,        ||_-        |r|.                                nd |_/        ||_0        |	1                    |           .|	S ),N
http_proxyhttps_proxyc                 8    g | ]}d                      |          S rQ   )r  )r+   eles     r   r-   z(start_local_trainers.<locals>.<listcomp>  s"    
(
(
(S388C==
(
(
(r   r   c                 ,    g | ]}t          |          S r   r)   rK   s     r   r-   z(start_local_trainers.<locals>.<listcomp>  s    444cS444r   )PADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSPADDLE_RANK_IN_NODEPADDLE_LOCAL_DEVICE_IDSPADDLE_WORLD_DEVICE_IDSPADDLE_CLUSTER_TOPO_PATHPADDLE_RANK_MAPPING_PATHPADDLE_ENABLE_AUTO_MAPPINGr   z{}c                 ,    g | ]}t          |          S r   r)   r+   r   s     r   r-   z(start_local_trainers.<locals>.<listcomp>      999Q#a&&999r   FLAGS_selected_gpusc                 ,    g | ]}t          |          S r   r)   r4  s     r   r-   z(start_local_trainers.<locals>.<listcomp>  r5  r   FLAGS_selected_acceleratorsc                 ,    g | ]}t          |          S r   r)   r4  s     r   r-   z(start_local_trainers.<locals>.<listcomp>  r5  r   FLAGS_selected_xpusWITH_COVERAGEOFFON)z-mcoveragerunz--branchz-p-uzstart trainer procz  env:zYLocal start {} processes. First process distributed environment info (Only For Debug): {}zDistributed Envsr  z7Details about PADDLE_TRAINER_ENDPOINTS can be found in z8/endpoints.log, and detail running logs may be found in z/workerlog.0r   Texist_okz/endpoints.logr|   zPADDLE_TRAINER_ENDPOINTS: 
r  PADDLE_NEED_RANK_MAPPINGtruez/prelaunchlog.r5   /workerlog.)envstdoutstderr
preexec_fn)rG  rJ  )2r<   r   r   poprO   r   rD   r*   rg   rF   r@   r  r?   rM   r   r2   rv   r   r   r  r   coreis_compiled_with_xpuupdater!  r   
executabler   r   r   r  r   setsidmakedirspathexistsremoveopenwritelower
subprocessPopenr  r   r  r   tellr  r  rE   )r4   r,   training_scripttraining_script_argslog_dirr  current_envidsresr   idxrH   proc_envcoverage_argsr  fnpre_fnfr   tps                       r   start_local_trainersrh    s2    |i
 1 122ioo OOL$'''OOM4(((

"
"
$
$C
(
(C
(
(
(CECL)) h hQ!$QV'*1:#&w'>'>'@'@#A#A(+1K1K1M1M(N(N#&s88'*xx44Q^444( ( (+xx}}

 

 ??5t<<H3>*4H/0 ??5t<<H3>*4H/0 ??7>>J5@,6H12 q~""s*.'H'H.2kk99!.999::/ /H*+ q~""6:kk99!.999::7 7H23 >..00 	S5H5H15L5L.2kk99!.999::/ /H*+ 	8$$$	Hz~~ou55==GGGMN
 
 	

 "
 	B#BB[BBCCC!88KK88>%%%h0MNN9 9   KK)) )) ) )  
 DbiK$////w~~88899 6	W444555000#66 A!6777		'"<"<">">??@@@A A A A A A A A A A A A A A A  <==IOO$>??EEGG  W99C993??W6666<<#Rv  DD #C[VLLLD]]&	%'1			TRLs   +ASS	S	c                    | j         rt          | j         j        d          5 }|                    | j        d           |D ]_}	 t
          j                            |           ## t          $ r0 t
          j                            d| j         j         d           Y \w xY w|	                                | _        d d d            d S # 1 swxY w Y   d S d S )NrG   r   zOUnicodeEncodeError occurs at this line. Please refer to the original log file "z"
)
r   rU  r   seekr  r   rH  rV  UnicodeEncodeErrorrZ  )rg  finr  s      r   pull_worker_logrm  _  s=   	y '").#&& 
	'#HHR]A&&&  J$$T****)   J$$VBD).V V V    
  HHJJBM
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	'' 's5    CA#"C#7BCBCC
C
c                 z   	 d}g }d}| D ]c}|j         r|j        dk    rt          |           |j                                        }|d}A|dk    rd}|                    |j                   d|r#t          |            t          j	        d           n# t          $ r- t                              d           t          |            Y d S t          $ r2 t                              d| d| d           t          |              t                              d| d| d           t          |            Y d S xY w|S )	NFr   Tr	   zKeyboardInterrupt, exitzABORT!!! Out of all z) trainers, the trainer process with rank=z# was aborted. Please check its log.)r   r  rm  r   r   rE   rg   r   r   r   KeyboardInterruptr   warning
SystemExiterror)r   nranksrr  
error_rankr   r   rets          r   watch_local_trainersrv  n  s   #
 		* 		*Ax #ALA--"""&++--C{!!!&))) 	!%(((HQKKK   0111e$$$    D6  D  DT^  D  D  D	
 	
 	
 	e$$$ D6  D  DT^  D  D  D	
 	
 	
 	e$$$Ls   BB 3D8
A+D8c                    | 9t           j                                        }d t          d|          D             }nt	          j        d          }||dk    r d |                     d          D             }n|                    d          |                     d          D ]}|v sJ d| d| d	            fd
|                     d          D             }t                              d|  d| d            |S )Nc                 ,    g | ]}t          |          S r   r)   r+   xs     r   r-   zget_gpus.<locals>.<listcomp>      777qCFF777r   r   CUDA_VISIBLE_DEVICESr   c                 6    g | ]}|                                 S r   stripry  s     r   r-   zget_gpus.<locals>.<listcomp>       ;;;a		;;;r   r   zCan't find your gpus z in CUDA_VISIBLE_DEVICES[].c                 ^    g | ])}                     |                                          *S r   r   r  )r+   rz  cuda_visible_devices_lists     r   r-   zget_gpus.<locals>.<listcomp>  s?        *//		::  r   z1Change selected_gpus into relative values. --ips: will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)	r   rL  get_cuda_device_countr   r   getenvsplitr   r   )gpusgpus_numres_gpuscuda_visible_devicesrz  r  s        @r   get_gpusr    st   |>779977E!X$6$6777!y)?@@'+?2+E+E;;4::c??;;;HH
 )=(B(B3(G(G%ZZ__  5555V!"V V=QV V V 6555   C  H KKDD D D19D D(AD D   Or   c                    | 9t           j                                        }d t          d|          D             }nt	          j        d          }||dk    r d |                     d          D             }n|                    d          |                     d          D ]}|v sJ d| d| d	            fd
|                     d          D             }t                              d|  d| d            |S )Nc                 ,    g | ]}t          |          S r   r)   ry  s     r   r-   zget_xpus.<locals>.<listcomp>  r{  r   r   XPU_VISIBLE_DEVICESr   c                 6    g | ]}|                                 S r   r~  ry  s     r   r-   zget_xpus.<locals>.<listcomp>  r  r   r   zCan't find your xpus z in XPU_VISIBLE_DEVICES[r  c                 ^    g | ])}                     |                                          *S r   r  )r+   rz  xpu_visible_devices_lists     r   r-   zget_xpus.<locals>.<listcomp>  s?        )..qwwyy99  r   z1Change selected_xpus into relative values. --ips:r  z' according to your XPU_VISIBLE_DEVICES:)	r   rL  get_xpu_device_countr   r   r  r  r   r   )xpusxpus_numres_xpusxpu_visible_devicesrz  r  s        @r   get_xpusr    st   |>668877E!X$6$6777 i(=>>&*=*C*C;;4::c??;;;HH
 (;'@'@'E'E$ZZ__  4444T!"T T<OT T T 5444   C  H KKBD B B19B B'?B B   Or   c                    | dk    rt           j                                        r=t           j                                        dk    rt	          d           t
          j        S t           j                                        r=t           j                                        dk    rt	          d           t
          j	        S | dk    r=t           j                                        dk    rt	          d           t
          j        S | dk    r=t           j                                        dk    rt	          d           t
          j	        S | d	k    rt	          d
           t
          j
        S t          d          )Nheterr   z+launch train in heter mode with GPU device.z+launch train in heter mode with XPU device.ncclzlaunch train in GPU mode!bkclzlaunch train in XPU modegloozlaunch train in CPU modezDon't supported devices)r   rL  is_compiled_with_cudar  r   r   r   rM  r  r   r   RuntimeErrorbackends    r   get_device_moder    s>   'N0022	"4466::?@@@>!N//11	"335599?@@@>!&Y^AACCaGG)***~&Y^@@BBQFF()))~&()))~
0
1
11r   c                    t          | j                  }g }|t          j        k    rt	          | j                  | j        t                    t          | j                  z  dk    s#J dt                     d| j         d            t          t                    t          | j                  z            fdt          dt                              D             }nZ}nV|t          j
        k    rt          | j                  | j        t                    t          | j                  z  dk    s#J dt                     d| j         d            t          t                    t          | j                  z            fdt          dt                              D             }n}n|t          j        k    r]t          | d          r| j        t          j                    | _        | j        dg}n6t#          t          d| j                            }nt%          d	| d
          ||fS )Nr   zgpus' number:z mod args.nproc_per_node:z
 must == 0c                 *    g | ]}||z            S r   r   )r+   r   r  ns     r   r-   z(get_device_proc_info.<locals>.<listcomp>  %    PPPAQQYPPPr   zxpus' number:c                 *    g | ]}||z            S r   r   )r+   r   r  r  s     r   r-   z(get_device_proc_info.<locals>.<listcomp>  r  r   paddle_cpuonlyzCan't support device_mode:z, support only cpu|gpu|xpu now.)r  r  r   r   r  r  nproc_per_noder2   r   r   r   r  r  r   hasattrmultiprocessing	cpu_countr   AssertionError)r   rv   r   r  r  r  s      @@@r   get_device_proc_infor    s[   !$,//K jn$$	""*IID$7 8 88Q>>>cD		ccDDWccc ?>> CIID$7 8 8899APPPPPq#d))Q9O9OPPP#	
	&	&	""*IID$7 8 88Q>>>cD		ccDDWccc ?>> CIID$7 8 8899APPPPPq#d))Q9O9OPPP#	
	&	&4)** 	>t/B/J"1";"="=D& !s#E!T-@$A$ABBUUUU
 
 	
 )**r   c                     t           j        d| j        g| j        }t	          j        |          }|                                 d S )Nr@  )r   rO  r[  r\  rX  rY  wait)r   r  r   s      r   direct_startr    sI     	 
	"	C C  DIIKKKKKr   c           	      j   | J g }|                      d          D ]}|                     d          d         }|                     d          d         }t          |          |z   }|                    d                    |t	          |          f                     d                    |          }|S )zM
    origin_endpoint: ip:port
    user_define_endpoint: ip:(port+offset)
    Nr   rQ   r   r	   )r  r   rE   r  r*   )origin_endpointsr   !paddle_user_define_endpoints_listip_portr   rS   new_portpaddle_user_define_endpointss           r   get_custom_endpointsr  +  s    
 '''(*%#))#.. P P]]3"}}S!!!$t99v%)002s8}}:M1N1NOOOO#&88,M#N#N ''r   c                    t          |          t          u s
J d            |t          j        k    s
J d            t	          d           }t          |           D ]\  }}t                      }||_        ||_        ||_	        ||         }	||         }
t          |
          dk    sJ t          t          |
                    D ]E}t                      }|	|          |_        |
|         |_        |j                            |           F|j                            |           |                     |          }||j        |         fS )Nr   ,Only support get mapped cluster for gpu now.r   r	   )r   r   r   r   r   r   rp   rg   rR   rv   r2   r   re   rF   rD   rE   r"   r   )r   r   r   rv   
node_ranksr4   r   r   r,   r   ranks_per_noder   r   r   s                 r   'get_mapped_cluster_without_rank_mappingr  c  sY    !""d***,L****.(((6 )(( 4   G"8,, ! !	2ee%.y9 $I.>""a''''s>**++ 	) 	)AiiG"4Q"79G)!,GLL((((C    ~~g&&HGL***r   c                    |t           j        k    s
J d            t          j                                        }d }t          | j        d          5 }t          j        |          }d d d            n# 1 swxY w Y   g }g }t          |d                   D ]6\  }}|
                    |d                    |
                    |g           7t          |          dk    r	|d         }	n | j        r| j        }	nt                      \  }
}	|	|v sJ d|	 d| d	            |                    |	          }t          |          t          |          k    s
J d
            t                              d| d|	 d| d||                     g }g }|D ]O|                              }t"          j                            d          Vt)          t#          j        dd                    }t-          t/          ||t          ||                   z                       }nt"          j                            d          `t)          t"          j                            d                    }t-          t/          ||t          ||                   z                       }n"t1          t          ||                             }|
                    fd|D                        Qt3          ||	|||          S )Nr  rG   machinesrR   r	   r   Can't find your local ip {} in node_ips: {}+ranks length should be equal to ips length.parsed from args: node_ips:	 node_ip: node_rank: node_ranks:PADDLE_PORTr   r   c                     g | ]	} d | 
S r&  r   r+   rS   r   s     r   r-   zEget_mapped_cluster_from_args_without_rank_mapping.<locals>.<listcomp>  #    !H!H!HTR..$..!H!H!Hr   )r   r   r   rL  r  rU  cluster_topo_pathjsonloadr   rE   r2   hostr   r   r   r   r   r   r   r   r  r   r   r   r  )r   rv   r  cluster_topo	json_filer   r  ra  cur_cluster_topor   _r   
free_portsr   r   r   s                  @r   1get_mapped_cluster_from_args_without_rank_mappingr    sj   *.(((6 )(( ~3355H L	d$c	*	* ,iy++, , , , , , , , , , , , , , , HJ!*<
+C!D!D ! !(01113%    
8}}1+9 	,iGG)++JAwhMgMMMMM  w''Iz??c(mm+++5 ,++ LL	Dh 	D 	D 	D 	D	D 	D,6y,A	D 	D   J J JNN2&&	:>>-((4RY}b99::Jj*s:i3H/I/I"IJJ JJ Z^^.//;RZ^^,>??@@Jj*s:i3H/I/I"IJJ JJ )Z	-B)C)CDDJ  !H!H!H!HZ!H!H!HIIII2',k:  s   A22A69A6c                 N   t          |          t          u s
J d            |t          j        k    s
J d            d }t	          d           }t          |           D ]$\  }}	t                      }
||
_        |	|
_        ||
_	        ||         }||         }||         }t          t          |                    D ]}t                      }|d         t          ||                            }t          |          dk    s
J d            |j                             ||d                              ||          |_        ||         |_        |
j                            |           |j                            |
           &|                     |          }||j        |         fS )	Nr   r  c                     t          j        d          }||dk    r| S |                    d          }|                    t	          |                     }t
                              d|  d| d|            |S )Nr|  r   r   zChange gpu id from z to z based on CUDA_VISIBLE_DEVICES )r   r  r  r   r*   r   r   )gpu_idr  r  relative_ids       r   get_relative_gpu_idzAget_mapped_cluster_with_rank_mapping.<locals>.get_relative_gpu_id  s    !y)?@@'+?2+E+EM(<(B(B3(G(G%399#f++FFKKKyfyy+yy^wyy   r   r   ranksr	   z.Only support one process to one device mappingr   )r   r   r   r   r   r   rp   rg   rR   rv   r   r2   re   r*   rM   rE   rF   rD   r"   r   )r   r   r   rv   r  node_rank_mappingsr  r4   r   r   r,   r   r  cur_node_rank_mappingr   r   local_device_idsr   s                     r   $get_mapped_cluster_with_rank_mappingr    s    !""d***,L****.(((6 )((
 
 
 4   G"8,, ! !	2ee%.y9 $I. 29 =s>**++ 	) 	)AiiG4W=N1%&&  '((A---@ .--  ''##$4Q$788   #5Q"79G)!,GLL((((C    ~~g&&HGL***r   c                    |t           j        k    s
J d            t          j                                        }| j        pt          j        d          }d }t          |d          5 }t          j
        |          }d d d            n# 1 swxY w Y   dt          j        d<   g }g }g }|D ]}	|                    |	d                    d t          |	d                                                   D             }
|
                                 |                    |
           |                    |	           t!          |          dk    r	|d	         }n | j        r| j        }nt%                      \  }}||v sJ d
| d| d            |                    |          }t!          ||                   |k    s
J d            t!          |          t!          |          k    s
J d            t(                              d| d| d| d||                     g }g }|D ]O|                              }t          j                            d          Vt/          t          j        dd                    }t          t1          ||t!          ||                   z                       }nt          j                            d          `t/          t          j                            d                    }t          t1          ||t!          ||                   z                       }n"t3          t!          ||                             }|                    fd|D                        Qt5          ||||||          S )Nr  r1  rG   r   rR   c                 ,    g | ]}t          |          S r   r   )r+   r   s     r   r-   zBget_mapped_cluster_from_args_with_rank_mapping.<locals>.<listcomp>  s+     
 
 
CFF
 
 
r   r  r	   r   r  r  r  zHnumber of ranks mapped to one node should not exceed the available ones.r  r  r  r  r  r  r   c                     g | ]	} d | 
S r&  r   r  s     r   r-   zBget_mapped_cluster_from_args_with_rank_mapping.<locals>.<listcomp>=  r  r   )r   r   r   rL  r  rank_mapping_pathr   r  rU  r  r  r   rE   r   keyssortr2   r  r   r   r   r   r   r   r   r   r  )r   rv   r  r  rank_mappingr  r   r  r  cur_rank_mappingcur_node_rank_listr   r  r   r  r   r   r   s                    @r   .get_mapped_cluster_from_args_with_rank_mappingr    s   *.(((6 )(( ~3355H . ")"3 3 L		%	% ,y++, , , , , , , , , , , , , , , .0BJ)*HJ( 4 4(0111
 
 !1'!:!?!?!A!ABB
 
 
 	!!!,---!!"23333
8}}1+9 	,iGG)++JAwhMgMMMMM  w''Iz)$%%111R 211 z??c(mm+++5 ,++ LL	Dh 	D 	D 	D 	D	D 	D,6y,A	D 	D   J J JNN2&&	:>>-((4RY}b99::Jj*s:i3H/I/I"IJJ JJ Z^^.//;RZ^^,>??@@Jj*s:i3H/I/I"IJJ JJ )Z	-B)C)CDDJ  !H!H!H!HZ!H!H!HIIII/  s   'BBBc                   8    e Zd Zd Zd Zd Zd Zd Zd Zd Z	dS )	ParameterServerLauncherc                    || _         || _        d| _        d| _        d| _        d| _        d| _        d| _        g | _        g | _	        d| _
        g | _        g | _        d| _        g | _        g | _        d| _        g | _        g | _        d| _        d| _        g | _        i | _        g | _        i | _        d| _        |                     |           d S )NFr   r   T)r   distribute_modewith_coordinator
server_num
worker_numheter_worker_numcoordinator_numserver_endpointsserver_endpoints_ipsserver_endpoints_portworker_endpointsworker_endpoints_ipsworker_endpoints_portheter_worker_endpointsheter_worker_endpoints_ipsheter_worker_endpoints_portcoordinator_endpointscoordinator_endpoints_ipscoordinator_endpoints_portis_localcurrent_node_ipstage_trainer_numstage_heter_map
stage_liststage_device_map	stage_numget_role_endpoints)r%   r   r  s      r   r&   z ParameterServerLauncher.__init__J  s    	. % !  "$&!%'" "$&!%'"&(#*,'+-(%'")+&*,'!!#! "%%%%%r   c                    |j         r|j         | _         |j        rt          |j                            d                    | j         k    sHJ d                    t          |j                            d                    | j                               |j        | _        nt          | j         d          }d                    d |D                       | _        nM|j        dk    s
J d            |j        | _        t          | j                            d                    | _         |j        r|j        | _        |j	        rt          |j	                            d                    | j        k    sHJ d                    t          |j	                            d                    | j                              |j	        | _
        nct          | j        | j                   }d                    d |D                       | _
        n#|j	        dk    s
J d	            d
 |j	                            d          D             }t          |          | _        d |j	                            d          D             }d|v rd}t          || j         z   || j         z   | j        z   d          }g }t          | j                  D ]E}|                    d                    ||         t          ||                   f                     Fd                    |          | _
        n|j	        | _
        |j        rd| _        |j        | _        |j        rt          |j                            d                    | j        k    sHJ d                    t          |j                            d                    | j                              |j        | _        nHt          | j        d          }d                    d |D                       | _        t%          d           | j        t(          j        k    r|j        dk    s
J d            d| j        d<   |j                            d          }	t          t          |	                    D ]}|	|         | j        |dz   <   | j
        | j        d<   |j        r|j                            d          | _        d | j        D             | _        |j        rt          |j                            d                    t          | j                  k    sUJ d                    t          |j                            d                    t          | j                                        |j                            d          }
d| _        t          t          | j                            D ]}| j        dk    r| xj        dz  c_        |
|                             d          }t          |          | j        |         k    sJ d| d            d |D             }d |D             }d|v rt          t          |          | j        | j         z   | j        z             }g }t          t          |                    D ]E}|                    d                    ||         t          ||                   f                     Fd                    |          }nd                    |          }|| j        |dz   <   | j                            |dz   gt          |                    d                    z             | xj        | j        |         z  c_        | xj        |z  c_        nt          t          | j                            D ]}| j        |         }t          || j         | j        z   | j        z             }d                    d |D                       }|| j        |dz   <   | j                            |dz   gt          |                    d                    z             | xj        |z  c_        | j        dk    r| xj        dz  c_        | xj        |z  c_        ݐn|j        dk    s
J d            g | _        |j                            d          }
d| _        t          t          |
                    D ]}|
|                             d          }| j                            t          |                     d |D             }d  |D             }d|v rt          t          |          | j        | j         z   | j        z             }g }t          t          |                    D ]E}|                    d                    ||         t          ||                   f                     Fd                    |          }nd                    |          }|| j        |dz   <   | j                            |dz   gt          |                    d                    z             | xj        | j        d!         z  c_        | j        dk    r| xj        dz  c_        | xj        |z  c_        | j        g| j        | _        t          | j                  | _         |j!        r	|j!        g}n%t          d| j         | j        z   | j        z             }| j                            d          d                             d          d         }|dz   t          |d                   z   | _!        d" | j                            d          D             | _"        d# | j
                            d          D             | _#        | j        rRd$ | j                            d          D             | _$        d% | j                            d          D             | _%        d& | j                            d          D             | _&        d' | j
                            d          D             | _'        g | _(        | j"        D ]%}|| j(        vr| j(                            |           &| j#        D ]%}|| j(        vr| j(                            |           &| j        t(          j        k    rd( | j                            d          D             | _)        d) | j                            d          D             | _*        | j)        D ]%}|| j(        vr| j(                            |           &t          tW          | j(                            dk    rd| _,        | j(        d         | _-        nzd*| _,        t]          j/        d+d           }|ta                      \  }| _-        n|| _-        | j        t(          j        k    s)| j-        | j(        v sJ d,| j-         d-| j(         d.            | j-        | j(        v rX| j(        1                    | j-                  | _2        tf          4                    d/| j(         d0| j-         d1| j2                    d S d S )2Nr   zThe server_num and servers doesn't match. Expect servers endpoints num equal to server_num, but received servers endpoint num: {} and server_num {}r   c                 2    g | ]}d t          |          z   S z
127.0.0.1:r)   ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>{  #    :::q\CFF*:::r   r   z?The setting of Parameter-Server must has server_num or servers.zThe worker_num and workers doesn't match. Expect workers endpoints num equal to worker_num, but received workers endpoint num: {} and worker_num {}c                 2    g | ]}d t          |          z   S r  r)   ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  r  r   z?The setting of Parameter-Server must has worker_num or workers.c                 h    g | ]/}|                                                     d           d         0S rQ   r   r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  s>     $ $ $,-		$$Q'$ $ $r   c                 v    g | ]6}t          |                                                    d                     7S r&  r2   r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  sA     $ $ $./AGGIIOOC(())$ $ $r   r	   i  rQ   TzThe coordinator_num and coordinators doesn't match. Expect coordinators endpoints num equal to coordinator_num, but received coordinator endpoint num: {} and coordinator_num {}c                 2    g | ]}d t          |          z   S r  r)   ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  r  r   z2>>> use default coordinator addr(only one process)zBThe setting of Parameter-Server heter mode must has heter_devices.cpu;r
   c                 ,    g | ]}t          |          S r   r  )r+   trainer_nums     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  s0     0 0 0# $$0 0 0r   zThe stage_num and heter_workers doesn't match. Expect heter_workers endpoints stage num equal to heter_worker_num stage, but received heter_workers endpoint stage num: {} and heter_worker_num stage {}zThe heter trainer num in stage z= is not equal in args.heter_worker_num and args.heter_workersc                 h    g | ]/}|                                                     d           d         0S r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  sA     6 6 6 ! GGIIOOC0036 6 6r   c                 v    g | ]6}t          |                                                    d                     7S r&  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  sD     6 6 6 !  		 4 4556 6 6r   c                 2    g | ]}d t          |          z   S r  r)   ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>&  s#    BBBq\CFF2BBBr   zVThe setting of Parameter-Server heter mode must has heter_worker_num or heter_workers.c                 h    g | ]/}|                                                     d           d         0S r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>>  s>     2 2 245		,,Q/2 2 2r   c                 v    g | ]6}t          |                                                    d                     7S r&  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>A  sD     2 2 2 AGGIIOOC00112 2 2r   r   c                 h    g | ]/}|                                                     d           d         0S r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>w  >     %
 %
 %
()AGGIIOOC  #%
 %
 %
r   c                 h    g | ]/}|                                                     d           d         0S r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>z  r  r   c                 h    g | ]/}|                                                     d           d         0S r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  sA     . . . 		$$Q'. . .r   c                 h    g | ]/}|                                                     d           d         0S rQ   r	   r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  A     / / / 		$$Q'/ / /r   c                 h    g | ]/}|                                                     d           d         0S r"  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  >     &
 &
 &
()AGGIIOOC  #&
 &
 &
r   c                 h    g | ]/}|                                                     d           d         0S r"  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  r%  r   c                 h    g | ]/}|                                                     d           d         0S r  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  r#  r   c                 h    g | ]/}|                                                     d           d         0S r"  r  ry  s     r   r-   z>ParameterServerLauncher.get_role_endpoints.<locals>.<listcomp>  sA     0 0 0 		$$Q'0 0 0r   FPOD_IPr  z)} in args.servers and args.workers ips: {r  r  z current_node_ip:r  )5r  rr   r2   r  r  r  r   r  r  rs   r  r   rE   r*   r  r  rt   r  r   r  r   r   heter_devicesr  r  r  stage_heter_trainer_numru   r  r  r   r  r  	http_portr  r  r  r   r  r  r   r  r  r   r  r  r   r  r   r   r   r   r   )r%   r   r   r  worker_endpoints_lenr   r  r  r   heter_devices_listheter_worker_endpoints_listr  r  heter_worker_endpoints_lenr  new_heter_worker_endpointsra   ip_port_listheter_trainer_numr,  http_ipr   pod_ipr  s                           r   r  z*ParameterServerLauncher.get_role_endpointsn  s`   ? 	D"oDO| 4<--c2233tFFF j  q  qDL..s3344do  GFF
 )-%%!$/155(+::E:::) )%% <2%%%Q &%% %)LD!!$"7"="=c"B"BCCDO ? 0	5"oDO| 4<--c2233tFFF j  q  qDL..s3344do  GFF )-%%!$/4?CC(+::E:::) )%% <2%%%Q &%%$ $151C1CC1H1H$ $ $  ""677DO$ $37<3E3Ec3J3J$ $ $  (((!
(-004?B) )% $& t//  A$++ 4Q 7 #$9!$< = =     ),1A(B(B%%(,%  	L$(D!#'#7D   L)//44559MMMM G  N  ND-33C88994;O  NMM .2->**!$"6::-0XX::E:::. .* JKKK >#:::%+++T ,++ (-D!!$!%!3!9!9#!>!>312233 E E/A!/D%a!e,,&*&;D #$ M@/3/D/J/J3/O/O,0 0'+'C0 0 0,
 % QDt177<<==4B B    c  j  j 2 8 8 = =>> <==    372D2J2J32O2O/24D/"3t'C#D#DEE 2D 2D6"<< 773>771L2%** /   677#;A>? ? ? a~~~? ? ?6 6%;6 6 626 6%;6 6 62
  ::::C #$> ? ? $"&/!2"&"7!8; ;7 :<6%*3/I+J+J%K%K " " : A A$'HH,Fq,I,/0KA0N,O,O)*%& %&!" !" !" !" ,/884N+O+OLL+.884J+K+KL6B,QU3..UGc,*<*<S*A*A&B&BB   --1Ma1PP--33|C333e2Dh #3t'C#D#DEE D D,0,H,K) )- O"o."34! ! (+xxBBEBBB( ( 7C,QU3..UGc,*<*<S*A*A&B&BB   --1BB--6"<< 773>7733|C333%D( )R///l 0// 02,.2.@.F.Fs.K.K+.0+s#>??@@ -@ -@A-H.eCjj + 077233  2 29O2 2 2.2 2!72 2 2. 6666? :;; O"o."347 73 682!&s+E'F'F!G!G  A6== #(B1(E(+,G,J(K(K%&!" !"    (+xx0J'K'K'*xx0F'G'G2>D(Q/O**Q#l&8&8&=&=">">>   ))T-I"-MM))2b8833s:33//<?/// &-&D" !!788DN > 	(II!4?T_4t7LL I '--c2215;;C@@C 3Yq\):)::%
 %
-1-B-H-H-M-M%
 %
 %
!%
 %
-1-B-H-H-M-M%
 %
 %
!   	. .399#>>. . .D*/ /399#>>/ / /D+
&
 &
-1-B-H-H-M-M&
 &
 &
"&
 &
-1-B-H-H-M-M&
 &
 &
" + 	) 	)B&&$$R(((+ 	) 	)B&&$$R(((>#:::/ /4::3??/ / /D+0 04::3??0 0 0D, 5 - -T]**M((,,,s4=!!""a'' DM#'=#3D  !DMYx..F~*:*<*<'4'''-$'>+BBB+t}<<< E$2F  E  Esw  tA  E  E  E =<< 4=00!]001EFFDNLLdmdNboso}     10r   c                    | j         | j        vrd S t          d           }d}d}d}d}t          | j                  D ]\\  }}t	                      }||_        ||_        t          t          | j	                            D ]^}	|| j	        |	         k    rKt                      }
| d| j        |	          |
_        ||
_        |dz  }|j                            |
           _t          t          | j                            D ]e}|| j        |         k    rRt                      }| d| j        |          |_        ||_        d|_        |dz  }|j                            |           ft          t          | j                            D ]e}|| j        |         k    rRt                      }| d| j        |          |_        ||_        d|_        |dz  }|j                            |           ft          t          | j                            D ]p}|| j        |         k    r]t                      }| d| j        |          |_        ||_        | j        |         |_        |dz  }|j                            |           q|j                            |           ^|j        | j                 }t9          j                    | _        g g g g d| _        g g g g d| _         g g g g d| _!        | "                    | j#        |           | $                    | j#        |           | j%        r| &                    | j#        |           | j'        tP          j)        k    r| *                    | j#        |           tV          ,                    d| j#        j-         d| j#        j-         d| j#        j-         d	| j#        j-         d
	           t          | j        d                   dk    rt          | j        d                   D ]r\  }	}| j        d         |	         j.        /                                 t          | j!        d                   dk    r%| j!        d         |	         0                                 stV          ,                    d           t          | j        d                   dk    rt          | j        d                   D ]T\  }	}| j!        d         |	         0                                 | j        d         |	         j.        1                                 UtV          ,                    d           t          | j        d                   dk    rt          | j        d                   D ]T\  }	}| j!        d         |	         0                                 | j        d         |	         j.        1                                 UtV          ,                    d           t          | j        d                   dk    rt          | j        d                   D ]T\  }	}| j!        d         |	         0                                 | j        d         |	         j.        1                                 UtV          ,                    d           nt          | j        d                   dk    rJt          | j        d                   D ]/\  }	}| j        d         |	         j.        /                                 0t          | j        d                   dk    rJt          | j        d                   D ]/\  }	}| j        d         |	         j.        /                                 0td          j3        4                    | j                  rtk          j6        | j                   d S d S )Nr   r   rQ   r	   )workercoordinatorserverheter_workerzDPlease check servers, workers, coordinator and heter_worker logs in z/workerlog.*, z/serverlog.* , z/coordinatorlog.*, and z/heterlog.*r7  zDall workers exit, going to finish parameter server and heter_worker.r:  zall heter_worker are killedr9  zall parameter server are killedr8  zall coordinators are killed)7r  r   r   r   rp   rg   rR   r   r2   r  re   r  rF   rr   rE   r  r  rh   rs   r  r   rt   r  r  r  ru   r"   r   tempfilemkdtempgloo_rendezvous_dirr   cmdslog_fnsstart_pod_serverr   start_pod_workerr  start_pod_coordinatorr  r   r   start_pod_heter_workerr   r   r]  r   r  r   r   r   rR  rS  shutilrmtree)r%   r4   server_rankworker_rankheter_worker_rankcoordinator_rankr   r   r,   r   r9  ra   r7  mr8  r  r:  r   s                     r   start_psz ParameterServerLauncher.start_ps  s   t}44Ft$$$&t}55 )	% )	%MIr%%C CHCH3t899:: / /21555$YYF)+&M&Md.H.K&M&MFO"-FK1$KK&&v...3t899:: / /21555$YYF)+&M&Md.H.K&M&MFO"-FK#$FL1$KK&&v...3t=>>?? 	9 	97:::")))KDD ? BDD  ( (8K$()K%$)$$++K8883t>??@@ 	; 	;8;;;#*99LEE @ CEE !) ):L%)-);L&%*%%,,\:::L$$$$l4>*#+#3#5#5  	
 

 	
 
	 	
 
 	di---di---  	7&&ty#666>#:::''	3777 _SWS\Sd  _  _txt}  uF  _  _  W[  W`  Wh  _  _  AE  AJ  AR  _  _  _	
 	
 	

 tz(#$$q(( %TZ%9:: 6 64
8$Q',11333t|H-..22L*1-33555KKV   4:n-..22(N)CDD C CGAtL0399;;;J~.q16@@BBBB9:::4:h'((1,,(H)=>> = =GAtL*1-33555Jx(+0::<<<<=>>>4:m,--11(M)BCC B BGAtL/288:::J}-a05??AAAA9:::
 4:h'((1,,(H)=>> 8 8GAtJx(+05577774:n-..22(N)CDD > >GAtJ~.q16;;====7>>$233 	4M$233333	4 	4r   c                    t           j                                        }t          j        |          }|                    dd            |                    dd            t	          |j                  D ]\  }}| j        t          j        k    r| j	        | j
        | j        | j        |j                            d          d         dt          | j                  |j                            d          d         t          t          j        dd                    d	| j        | j        d
}n| j	        | j
        | j        |j                            d          d         dt          | j                  |j                            d          d         t          t          j        dd                    d	| j        | j        d}|                    |           t*          j        d|j        g|j        }| j        d                             |           |dk    rNt6                              d                    t=          |j                  t?          |d                               |j         nt          j!        |j         d           tE          |j          d| d          }	| j#        d                             |	           tI          j%        |||	|	          }
ntI          j%        ||          }
tM                      }|
|_'        |j(        |_(        ||_)        |	|_*        |	r|	+                                nd |_,        ||_-        | j.        d                             |           d S )Nr#  r$  rQ   r	   PSERVERr   PADDLE_WITH_GLOO03)PADDLE_PSERVERS_IP_PORT_LISTr,  PADDLE_COORDINATOR_ENDPOINTS%PADDLE_ALL_HETER_TRAINER_IP_PORT_LISTr  TRAINING_ROLEr+  r)  rN  PADDLE_GLOO_RENDEZVOUSPADDLE_GLOO_FS_PATHPADDLE_GLOO_HTTP_ENDPOINT)rQ  r,  rR  r  rT  r+  r)  rN  rU  rV  rW  r@  r9  z`Local server start {} processes. First process distributed environment info (Only For Debug): {}rA  TrB  z/serverlog.r|   rG  rH  rI  rG  )/r   r   r<   rK  r   rr   r  r   r   r  r  r  r  rF   r  r*   r  r  r=  r,  rN  r   rO  r[  r\  r>  rE   r   r   r  r2   r  r]  rQ  rU  r?  rX  rY  r  r   rg   r  r   rZ  r  r  r   )r%   r   r,   default_envr^  ra  
cur_serverrb  r  rd  r   rg  s               r   r@  z(ParameterServerLauncher.start_pod_server7  s?   joo''i,,d+++t,,,(55 E	, E	,OC#~'>>>484I040E484N=A=X#-#6#<#<S#A#A!#D%.+.t+?+?(177<<Q?(+BI6H#,N,N(O(O.1+/+C15   594I040E484N#-#6#<#<S#A#A!#D%.+.t+?+?(177<<Q?(+BI6H#,N,N(O(O.1+/+C15  x((( $ *	C Ih&&s+++axx<<BFCK(()$&C = =   |'DL48888T\;;c;;SAAX&--b111!'[B   "'===BBG oBGBMBI)+5BGGIIIBMBFJx ''++++KE	, E	,r   c           
      	   t           j                                        }t          j        |          }|                    dd            |                    dd            d}g }t          j                                        r$t          |j                  }t          |          }nVt          j        
                                r8t          j                                        }d t          d|          D             }t          |j                  D ]\  }}|dk    rdnt          |||z                     }	| j        t"          j        k    ri d| j        d| j        dt          | j                  d	| j        d
t          | j                  dddt          | j                  ddd| j        d         d| j        d| j        d         ddd|j                            d          d         d|j                            d          d         dt          |j                  dt          t          j        dd                    dd| j         dd|	|	| j!        d}
ni d| j        d| j        dt          | j                  ddd	| j        d|j                            d          d         d|j                            d          d         dt          |j                  dt          t          j        dd                    ddd| j         d dd!dd"|	d#|	d$| j!        }
|"                    |
           tF          j$        d%|j%        g|j&        }| j'        d&         (                    |           |dk    rNtR          *                    d'+                    t          |j                  tY          |
d(                               |j-        nt          j.        |j-        d)*           t_          |j-         d+| d,          }| j0        d&         (                    |           tc          j2        ||||-          }ntc          j2        ||.          }tg                      }||_4        |j        |_        ||_5        ||_6        |r|7                                nd |_8        ||_9        | j:        d&         (                    |           d S )/Nr#  r$  r   c                 ,    g | ]}t          |          S r   r)   ry  s     r   r-   z<ParameterServerLauncher.start_pod_worker.<locals>.<listcomp>      FFFa3q66FFFr   rO  rQ  r,  r+  rR  PADDLE_STAGE_TRAINERS_NUMSTAGE_ID1	STAGE_NUM*PADDLE_PREVIOUS_HETER_TRAINER_IP_PORT_LISTr   &PADDLE_NEXT_HETER_TRAINER_IP_PORT_LISTr
   rS  HETER_DEVICE_TYPEr	   rT  TRAINERr)  rQ   r  r)  rN  rU  rP  )rV  r6  r:  r|  r  rW  rV  r6  r:  r|  r  rW  r@  r7  z`Local worker start {} processes. First process distributed environment info (Only For Debug): {}rA  TrB  rF  r|   rX  rY  );r   r   r<   rK  r   rL  r  r  r  r2   rM  r  r   r   rs   r*   r  r   r   r  r  r  r  r  r  r  r  r  rF   r  rg   r  r=  r,  rN  r   rO  r[  r\  r>  rE   r   r   r  r  r]  rQ  rU  r?  rX  rY  r  r   r  r   rZ  r  r  r   )r%   r   r,   rZ  r^  heter_device_numdevice_listra  
cur_worker	device_idrb  r  rd  r   rg  s                  r   rA  z(ParameterServerLauncher.start_pod_worker  s   joo''i,,d+++t,,,>//11 	G"49--K";//^0022 	G(~BBDDFF54D+E+EFFFK(55 \	, \	,OC $q(( c-=%=>?? 
 #~'>>>2D4I.0E *3t+?+? 3D4N	
 0T5K1L1L   T^!4!4 A" =d>R? <T=X ()>q)A $Y j177<<Q?  ":#6#<#<S#A#A!#D!" (Z_)=)=#$ 'BI6H#,N,N(O(O%& -c'( ,0+C+.+.,5+4153  82D4I.0E *3t+?+? $Y	
 3D4N j177<<Q? ":#6#<#<S#A#A!#D (Z_)=)= 'BI6H#,N,N(O(O -c *4+C *3 *3 +I *9  0!& x((($ *	C Ih&&s+++axx<<BFCK(()$&C = =   |'DL48888T\;;c;;SAAX&--b111!'[B   "'===BBG oBGBMBI)+5BGGIIIBMBFJx ''++++y\	, \	,r   c           
          t          d           t          j                                        }t          j        |          }|                    dd            |                    dd            t          |j                  D ]\  }}d}i d| j        d| j        dt          | j
                  d| j        d	t          | j                  d
dd|j                            d          d         d|j                            d          d         dt          |j                  dt          t          j        dd                    ddd| j        ddddd|d|d| j        }|                    |           t(          j        d|j        g|j        }	| j        d                             |	           |dk    rNt4                              d                    t;          |j                  t=          |d                               |j        nt          j         |j        d            tC          |j         d!| d"          }
| j"        d                             |
           tG          j$        |	||
|
#          }ntG          j$        |	|$          }tK                      }||_&        |j        |_        ||_'        |
|_(        |
r|
)                                nd |_*        |	|_+        | j,        d                             |           d S )%Nz">>> entering start_pod_coordinatorr#  r$  rO  rQ  r,  r+  rR  PADDLE_COORDINATOR_NUMrT  COORDINATORr)  rQ   r   r  r	   r)  rN  rU  rP  rV  r6  r:  r|  r  rW  r@  r8  zeLocal coordinator start {} processes. First process distributed environment info (Only For Debug): {}rA  TrB  z/coordinator.r|   rX  rY  )-r   r   r   r<   rK  r   rt   r  r  r*   r  r  r  rF   r  rg   r  r=  r,  rN  r   rO  r[  r\  r>  rE   r   r   r  r2   r  r]  rQ  rU  r?  rX  rY  r  r   r  r   rZ  r  r  r   )r%   r   r,   rZ  r^  ra  cur_coordinatorrj  rb  r  rd  r   rg  s                r   rB  z-ParameterServerLauncher.start_pod_coordinator  sh   2333joo''i,,d+++t,,,$-c.>$?$? <	1 <	1 CI.0E*D,A &s4?';'; /0J	
 )#d.B*C*C   /288==a@ 7==cBB1E $S)=%>%> #C	2Dc(J(J$K$K )# &t'? &s &s '	  &y!" ,T^#H( x((($ *	C Im$++C000axx<<BFC,--)$&C = =   |'DL48888T\====sCC]+222666!'[B   "'===BBG%*BGBMBI)+5BGGIIIBMBFJ}%,,R0000y<	1 <	1r   c           
         t           j                                        }t          j        |          }|                    dd            |                    dd            d}g }t          j                                        r$t          |j                  }t          |          }nVt          j        
                                r8t          j                                        }d t          d|          D             }t          |j                  D ]\  }}|dk    rdnt          |||z                     }	|j        }
i d| j        d| j        d|
| j        d	z
  k    r| j        |
d	z            nd
d| j        |
d	z
           d| j        d| j        |
         dt          |
          dt          | j                  d|j                            d          d	         dddt          | j                  dt          | j                  d|j                            d          d         dt          t          j        dd                    ddd| j        ddd|	|	| j        d}|                    |           t>          j         d|j!        g|j"        }| j#        d         $                    |           |dk    rNtJ          &                    d'                    t          |j                  tQ          |d                                |j)        nt          j*        |j)        d!"           tW          |j)         d#| d$          }| j,        d         $                    |           t[          j.        ||||%          }nt[          j.        ||&          }t_                      }||_0        |j1        |_1        ||_2        ||_3        |r|4                                nd |_5        ||_6        | j7        d         $                    |           d S )'Nr#  r$  r   c                 ,    g | ]}t          |          S r   r)   ry  s     r   r-   zBParameterServerLauncher.start_pod_heter_worker.<locals>.<listcomp>B  r^  r   rO  rQ  r,  rd  r	   r   rc  rS  re  r`  rb  r  rQ   rT  HETER_TRAINERr+  r_  r)  rN  rU  rP  rV  r6  )r:  r|  r  rW  r@  r:  zfLocal heter_worker start {} processes. First process distributed environment info (Only For Debug): {}rA  TrB  z
/heterlog.r|   rX  rY  )8r   r   r<   rK  r   rL  r  r  r  r2   rM  r  r   r   ru   r*   rh   r  r  r  r  r  r  rF   r  r  r  r  r=  r,  rN  r   rO  r[  r\  r>  rE   r   r   r  r  r]  rQ  rU  r?  rX  rY  r  r   rg   r  r   rZ  r  r  r   )r%   r   r,   rZ  r^  rg  rh  ra  cur_heter_workerrj  stage_idrb  r  rd  r   rg  s                   r   rC  z.ParameterServerLauncher.start_pod_heter_worker5  s   joo''i,,d+++t,,,>//11 	G"49--K";//^0022 	G(~BBDDFF54D+E+EFFFK%.s/@%A%A K	2 K	2!C! $q(( c-=%=>?? 
 (-H.0E*D,A 94>A#555 (A66 =d>RqL? 89T $T%:8%D CMM S00 /8>>sCCAF   !" &s4?';';#$ ,S1G-H-H%& *399#>>qA'( #C	2Dc(J(J$K$K)* )#+, &t'?-. &s/0 (+(1'0-1^7  H: x((( $ *	C In%,,S111axx<<BFC-..)$&C = =   |'DL48888T\::S::C@@^,33B777!'[B   "'===BBG&+BGBMBI)+5BGGIIIBMBFJ~&--b1111WK	2 K	2r   N)
r   r   r   r&   r  rK  r@  rA  rB  rC  r   r   r   r  r  I  s        "& "& "&HG G GR
~4 ~4 ~4@J, J, J,Xk, k, k,ZC1 C1 C1JZ2 Z2 Z2 Z2 Z2r   r  c                 h   | dvrt          d|            | dk    r-t          j                                        st          d          | dk    r-t          j                                        st          d          | dk    r-t          j                                        st          d          d S d S )	N)r  r  r  autor  xcclflagcxzpaddle.distributed initialize error, backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', 'heter', 'xccl' but got r  zlpaddle.distributed initialize error, your paddle is not compiled with cuda but you assign 'nccl' as backend.r  zkpaddle.distributed initialize error, your paddle is not compiled with xpu but you assign 'bkcl' as backend.rw  zppaddle.distributed initialize error, your paddle is not compiled with flagcx but you assign 'flagcx' as backend.)
ValueErrorr   rL  r  rM  is_compiled_with_flagcxr  s    r   check_backendrz    s        ! ! !
 
 	
 &!E!E!G!GV
 
 	

 &!D!D!F!FU
 
 	

 (9>#I#I#K#KZ
 
 	
 r   c                     | dk    rd S t           j                            d          rt          d          t           j        rt          d          d S )Nr  darwinzDYou are going to using gloo on macos, but currently is not supportedzFYou are going to using gloo on windows, but currently is not supported)utilsOS_NAME
startswithrx  
IS_WINDOWSr  s    r   block_windows_and_macosr    sk    &})) 
R
 
 	
  
T
 
 	

 
r   c                      t           j                                        rdS t           j                                        rdS dS )Nr  r  r  )r   rL  r  rM  r   r   r   get_backend_by_compile_flagr    s=    ~++-- v~**,, v6r   )r   r   r    )NN)r   )<r<   r  r   r  r   rD  r   r   r   rX  r   r;  r   
contextlibr   *paddle.utils.cpp_extension.extension_utilsr}  cpp_extensionextension_utilspaddler   paddle.utilsr   r   r   	propagater   r   r   r[   re   rp   r   r   r   r   r   r   r   r  r  r  r!  rh  rm  rv  r  r  r  r  r  r  r  r  r  r  r  rz  r  r  r   r   r   <module>r     s          				         



         : : : : : : : : : : : :       " " " " " "		6	"	"        	 	 	 	 	 	 	 	A A A A A A A AH              <J J J J J J J JZ   '+ '+ '+T# # #L    .  >  & & & &R            MQ| | | |~' ' '& & &R  >  >2 2 2<)+ )+ )+X	 	 	( ( ( (p+ + +:< < <~5+ 5+ 5+pO O OdF2 F2 F2 F2 F2 F2 F2 F2R!
 !
 !
H

 

 

    r   