
    Αi\@                     H   S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKJr  S SK	J
r
  S SKJr  SSKJr  \" SS5      rS	 rS
 r " S S5      r " S S5      r " S S5      r " S S5      r " S S5      rS rS rS rS rS rS!S jr " S S5      r S!S jrS rS r S  r!g)"    N)closing)get_backend_by_compile_flag)	strtobool   )
get_loggerINFOrootc           	         U R                   R                  S5       Vs/ s H  o"R                  5       PM     nnU R                  nUR	                  U5      n[
        R                  SU SU SU 35        S nU R                  (       d?  [        U5      S::  a0  U R                  c#  [        [        U5      5      nUb  [        U5      nO;SnU R                  b  U R                  n[        [        Xw[        U5      -   5      5      n/ nU H)  n	UR                  U V
s/ s H	  o SU
 3PM     sn
5        M+     [        X4X5      $ s  snf s  sn
f )N,zparsed from args:node_ips:z	 node_ip:z node_rank:   i  :)cluster_node_ipssplitstripnode_ipindexloggerdebuguse_paddlecloudlenstarted_portfind_free_portslistrangeappendget_cluster)argsselected_gpusxnode_ipsr   	node_rank
free_portsr   trainer_endpointsipports              e/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/utils/launch_utils.pyget_cluster_from_argsr'       s7   #'#8#8#>#>s#CD#Ca	#CHDllGw'I
LL
$XJiyI;W J  MQ%$S%78
!j)J(,,L,s=/A AB

   Z!HZTD$.Z!HI x*;KK; E8 "Is   D>E
c                 r   U c=  SSK Jn  UR                  5       n[        SU5       Vs/ s H  n[	        U5      PM     nnU$ [
        R                  " S5      nUb  US:X  a0  U R                  S5       Vs/ s H  o3R                  5       PM     nnU$ UR                  S5      nU R                  S5       H  nX6;   a  M
   SU SU S35       e   U R                  S5       Vs/ s H"  nUR                  UR                  5       5      PM$     nn[        R                  S	U  S
U SU 35        U$ s  snf s  snf s  snf )Nr   coreCUDA_VISIBLE_DEVICES r   zCan't find your selected_gpus z in CUDA_VISIBLE_DEVICES[z].z1Change selected_gpus into relative values. --ips:z will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)paddle.frameworkr*   get_cuda_device_countr   strosgetenvr   r   r   r   info)r   r*   gpus_numr   gpuscuda_visible_devicescuda_visible_devices_lists          r&   get_gpusr7   A   sj   )--/ %a 23 21A 232 K/  "yy)?@'+?2+E'4':':3'?@'?!GGI'?D@* K! )=(B(B3(G%"((-5 **+,EFZE[[]_5 . ',,S11A *//	:1   KKCM? S115 7((A'BD K3 4 As   D*3D/)D4c                   2    \ rS rSrS rS rS rS rS rSr	g)	Hdfsb   c                 .    S U l         S U l        S U l        g Nhdfs_ugi	hdfs_name	hdfs_pathselfs    r&   __init__Hdfs.__init__c   s        c                 r    U R                   S L=(       a#    U R                  S L=(       a    U R                  S L$ r<   r=   rA   s    r&   is_validHdfs.is_validh   s5    MM% +d*+d*	
rE   c                 T    SU R                    SU R                   SU R                   3$ )Nz	hdfs_ugi:z hdfs_name:z
 hdfs_pathr=   rA   s    r&   __str__Hdfs.__str__o   s)    4==/T^^4DJt~~N^__rE   c                     U R                   UR                   :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ r<   r=   rB   ns     r&   __eq__Hdfs.__eq__r   sA    MMQZZ' .!++-.!++-	
rE   c                     X:X  + $ r<    rM   s     r&   __ne__Hdfs.__ne__y   
    }rE   )r?   r@   r>   N)
__name__
__module____qualname____firstlineno__rC   rG   rJ   rO   rS   __static_attributes__rR   rE   r&   r9   r9   b   s    

`
rE   r9   c                   P    \ rS rSrS rS rS rS rS rS r	S r
S	 rS
 rS rSrg)Cluster}   c                 <    S U l         / U l        S U l        S U l        g r<   )
job_serverpodshdfsjob_stage_flag)rB   ra   s     r&   rC   Cluster.__init__~   s    		"rE   c           	          SU R                    SU R                   Vs/ s H  n[        U5      PM     sn SU R                   SU R                   3$ s  snf )Nzjob_server:z pods:z job_stage_flag:z hdfs:)r_   r`   r/   rb   ra   rB   pods     r&   rJ   Cluster.__str__   si    T__-V4S#SX4S3TTdeiexexdyy  AE  AJ  AJ  @K  L  	L4Ss   A
c                     [        U R                  5      [        UR                  5      :w  a  g[        U R                  UR                  5       H  u  p#X#:w  d  M    g   U R                  UR                  :w  a  ggNFT)r   r`   ziprb   )rB   clusterabs       r&   rO   Cluster.__eq__   s]    tyy>S..		7<<0DAv 1 '"8"88rE   c                 .    U R                  U5      (       + $ r<   )rO   rB   rk   s     r&   rS   Cluster.__ne__   s    ;;w'''rE   c                 N    [         R                   " UR                  5      U l        g r<   )copyr`   rp   s     r&   update_podsCluster.update_pods   s    IIgll+	rE   c                 4    [        U R                  5       5      $ r<   )r   trainers_endpointsrA   s    r&   trainers_nranksCluster.trainers_nranks   s    4**,--rE   c                 ,    [        U R                  5      $ r<   )r   r`   rA   s    r&   pods_nranksCluster.pods_nranks   s    499~rE   c                     / nU R                    H1  nUR                   H  nUR                  UR                  5        M      M3     U$ r<   )r`   trainersr   endpoint)rB   rrf   ts       r&   rw   Cluster.trainers_endpoints   s:    99C\\$ "  rE   c                     / nU R                    HS  nUR                   SUR                   3nUR                  b  UR                  c
   U S35       eUR                  U5        MU     U$ )Nr   z not a valid endpoint)r`   addrr%   r   )rB   r   rf   eps       r&   pods_endpointsCluster.pods_endpoints   sl    99CHH:Qsxxj)B88'CHH,@ $+,@ HHRL  rE   c                 v    U R                    H)  n[        U5      [        UR                  5      :X  d  M'  Us  $    g r<   )r`   r/   id)rB   pod_idrf   s      r&   get_pod_by_idCluster.get_pod_by_id   s.    99C6{c#&&k)
  rE   )ra   r_   rb   r`   N)rV   rW   rX   rY   rC   rJ   rO   rS   rt   rx   r{   rw   r   r   rZ   rR   rE   r&   r\   r\   }   s5    #L(,.	rE   r\   c                   ,    \ rS rSrS rS rS rS rSrg)	JobServer   c                     S U l         g r<   r   rA   s    r&   rC   JobServer.__init__   s	    rE   c                     U R                    $ r<   r   rA   s    r&   rJ   JobServer.__str__   s    --!rE   c                 4    U R                   UR                   :H  $ r<   r   rB   js     r&   rO   JobServer.__eq__   s    }}

**rE   c                     X:X  + $ r<   rR   r   s     r&   rS   JobServer.__ne__   rU   rE   r   N)	rV   rW   rX   rY   rC   rJ   rO   rS   rZ   rR   rE   r&   r   r      s    "+rE   r   c                   2    \ rS rSrS rS rS rS rS rSr	g)	Trainer   c                 .    / U l         S U l        S U l        g r<   r4   r   rankrA   s    r&   rC   Trainer.__init__   s    		rE   c                 T    SU R                    SU R                   SU R                   3$ )Nzgpu:z
 endpoint:z rank:r   rA   s    r&   rJ   Trainer.__str__   s'    dii[
4==/		{KKrE   c                 (   [        U R                  5      [        UR                  5      :w  a  gU R                  UR                  :w  d  U R                  UR                  :w  a  g[	        U R                  UR                  5       H  u  p#X#:w  d  M    g   gri   )r   r4   r   r   rj   )rB   r   rl   rm   s       r&   rO   Trainer.__eq__   sg    tyy>S[(==AJJ&$))qvv*=		166*DAv + rE   c                     X:X  + $ r<   rR   rB   r   s     r&   rS   Trainer.__ne__   rU   rE   c                     U R                   $ r<   )r   rA   s    r&   get_rankTrainer.get_rank   s    yyrE   )r   r4   r   N)
rV   rW   rX   rY   rC   rJ   rO   rS   r   rZ   rR   rE   r&   r   r      s    
LrE   r   c                   8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
Pod   c                 X    S U l         S U l        S U l        S U l        / U l        / U l        g r<   )r   r   r   r%   r~   r4   rA   s    r&   rC   Pod.__init__   s,    				rE   c                     SU R                    SU R                   SU R                   SU R                   SU R                   SU R
                   Vs/ s H  n[        U5      PM     sn 3$ s  snf )Nzrank:z id:z addr:z port:z visible_gpu:z
 trainers:)r   r   r   r%   r4   r~   r/   r   s     r&   rJ   Pod.__str__   s    tyykdggYfTYYKvdii[P]^b^g^g]hhr  EI  ER  ER  tS  ER  @Atwxytz  ER  tS  sT  U  	U  tSs   A+c                    U R                   UR                   :w  dN  U R                  UR                  :w  d4  U R                  UR                  :w  d  U R                  UR                  :w  a  [        R                  SU  SU 35        g[        U R                  5      [        UR                  5      :w  a0  [        R                  SU R                   SUR                   35        g[        [        U R                  5      5       HZ  nU R                  U   UR                  U   :w  d  M%  [        R                  SU R                  U    SUR                  U    35          g   g)Nzpod z != Fz	trainers ztrainer T)	r   r   r   r%   r   r   r   r~   r   )rB   rf   is      r&   rO   
Pod.__eq__   s   II!ww#&& yyCHH$yyCHH$LL4vT#/0t}}S\\!22LL9T]]O4~FGs4==)*A}}Q3<<?2xa(8'9cll1o=NOP +
 rE   c                     X:X  + $ r<   rR   re   s     r&   rS   
Pod.__ne__  s
    rE   c                     g r<   rR   )rB   res_podss     r&   parse_responsePod.parse_response	  s    rE   c                 j    SnU R                    H
  nX S3-  nM     US:w  d   SU  S35       eUS S nU$ )Nr,   r   z	this pod z can't see any gpus)r4   )rB   r   gs      r&   get_visible_gpusPod.get_visible_gpus  sO    A3aLA  Bw=)D6)<==wcrFrE   )r   r4   r   r%   r   r~   N)rV   rW   rX   rY   rC   rJ   rO   rS   r   r   rZ   rR   rE   r&   r   r      s!    U*rE   r   c                 @   [        U5      [        L d   S5       e[        S S9nSn[        U 5       H  u  pg[	        5       nXhl        Xxl        X&   n	[        U	5      [        U5      :  d   S5       e[        [        U5      5       HZ  n
[        5       nUR                  R                  X:   5        X    Ul        X[l        US-  nUR                  R                  U5        M\     UR                  R                  U5        M     U R                  U5      nXDR                  U   4$ )Nztrainer_endpoints must be list)ra   r   zOcurrent trainer_endpoints size should be greater equal than selected_gpus size.r   )typer   r\   	enumerater   r   r   r   r   r   r4   r   r   r~   r`   r   )r    r   r#   r   rk   trainer_rankr!   r$   rf   cur_node_endpointsr   trainerpod_ranks                r&   r   r     s   !"d*L,LL*4 GL"8,	e.9%&#m*<< 	
]	
< s=)*AiGLL 01"4"7!8G'LALLL( + 	C # -& ~~g&HLL***rE   c                    U  H  nUR                   R                  5       b  M   UR                   R                  5         UR                  (       a  UR                  R	                  5         [
        R                  SUR                   R                   35        M     [        R                  " S5        [        SS5       H  nSnU  H[  nUR                   R                  5       b  M   [        R                  " UR                   R                  [        R                  5        SnM]     U(       d  [
        R                  S5          g [        R                  " S5        M     [
        R!                  S5        ["        R$                  " S	5        g )
Nzterminate process id:   r   2   FTzterminate all the procszcan't kill all process and exitr   )procpoll	terminatelog_fncloser   r   pidtimesleepr   r0   killsignalSIGKILLr2   fatalsysexit)procspstepalives       r&   terminate_local_procsr   2  s    66;;= FFxx LL0=>  	JJqMaAvv{{}$

FNN3 
 KK12

1  LL23HHQKrE   c                  n     [         R                  " 5       n [         R                  " U 5      nX4$ !    g = fr<   )socketgethostnamegethostbyname)	host_namehost_ips     r&   get_host_name_ipr   M  s8    &&(	&&y1!!s   -0 4c                 b    U[         :X  a  [        OUnUR                  " SU -   4UUUS-   S.UD6  g)aB  Add argparse's argument.
Examples:
    .. code-block:: python

        >>> import argparse
        >>> from paddle.distributed.utils import launch_utils
        >>> parser = argparse.ArgumentParser()
        >>> launch_utils.add_arguments("name", str, "Jonh", "User name.", parser)
        >>> args = parser.parse_args()

z--z Default: %(default)s.)defaultr   helpN)boolr   add_argument)argnamer   r   r   	argparserkwargss         r&   add_argumentsr   V  sD     9$Dw,,	
 rE   c                     S n[        5       nSn U" 5       nXB;  a  UR                  U5        [        U5      U :  a  U$ US-  nUS:  a  [        S5        g MG  )Nc                      [        [        R                  " [        R                  [        R                  5      5       n U R	                  S5        U R                  5       S   sS S S 5        $ ! , (       d  f       g = f)N)r,   r   r   )r   r   AF_INETSOCK_STREAMbindgetsockname)ss    r&   __free_port$find_free_ports.<locals>.__free_portm  sG    V]]6>>63E3EFG1FF7O==?1% HGGs   $A++
A9r   r   d   z@can't find available port and use the specified static port now!)setaddr   print)numr   port_setr   r%   s        r&   r   r   l  sg    &
 uHD
}LLx=CO	#:R  rE   c                 n   Uc
  [        5       nUS:X  a  SR                  SR                  UR                   Vs/ s H  n[	        U5      PM     sn5      5      [	        UR
                  5      [	        UR                  5      [	        U R                  5       5      SR                  U R                  5       5      S.nU$ US:X  a  SR                  SR                  UR                   Vs/ s H  n[	        U5      PM     sn5      5      [	        UR
                  5      [	        UR                  5      [	        U R                  5       5      SR                  U R                  5       5      S.nU$ US:X  ad  [	        UR
                  5      [	        UR                  5      [	        U R                  5       5      SR                  U R                  5       5      US.nU$ US	:X  a  S
SK	J
n  UR                  5       S
   nSU S3SR                  SR                  UR                   Vs/ s H  n[	        U5      PM     sn5      5      S[	        UR
                  5      S[	        UR                  5      S[	        U R                  5       5      SSR                  U R                  5       5      0nU$ [        S5      es  snf s  snf s  snf )Nbkclz{}r   )FLAGS_selected_xpusPADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSnccl)FLAGS_selected_gpusr   r  r  r  gloo)r   r  r  r  PADDLE_DISTRI_BACKENDxcclr   r)   FLAGS_selected_r   r   r  r  r  z)backend must be one of 'gloo, nccl, bkcl')r   formatjoinr4   r/   r   r   rx   rw   r-   r*   get_all_custom_device_type
ValueError)rk   r   backendr   proc_envr*   custom_device_names          r&   _prepare_trainer_envr    sR   -/&#';;',,7,Q#a&,78$ "%W\\!2'*7+;+;'<#&w'>'>'@#A(+1K1K1M(N
X OG 
F	#';;',,7,Q#a&,78$ "%W\\!2'*7+;+;'<#&w'>'>'@#A(+1K1K1M(N
D O3 
F	 "%W\\!2'*7+;+;'<#&w'>'>'@#A(+1K1K1M(N%,
. O! 
F	)!<<>qA013T[[',,7,Q#a&,786  W\\!2%s7+;+;'<!3w'>'>'@#A&1K1K1M(N
 O DEEQ 8 8, 8s   J($J-J2c                       \ rS rSrS rSrg)TrainerProci  c                 X    S U l         S U l        S U l        S U l        S U l        S U l        g r<   )r   r   
log_offsetr   
local_rankcmdrA   s    r&   rC   TrainerProc.__init__  s,    		rE   )r  r  r   r  r   r   N)rV   rW   rX   rY   rC   rZ   rR   rE   r&   r  r    s    rE   r  c                 ,   [         R                   " [        R                  R                  5       5      nUR                  SS 5        UR                  SS 5        / n[	        UR
                  5       GH!  u  px[        X5      n	UR                  U	5        [        R                  SU 35        [        R                  SU/UQn
[        R                  SU
 SU	 35        S nUb<  [        R                  " USS9  [        U S	U 3S
5      n[        R                   " XXS9nO[        R                   " XS9n[#        5       nXl        UR&                  Ul        X}l        Xl        U(       a  UR-                  5       OS Ul        Xl        UR3                  U5        GM$     U$ )N
http_proxyhttps_proxyztrainer proc env:z-uzstart trainer proc:z env:T)exist_okz/workerlog.rl   )envstdoutstderr)r  )rs   r0   environpopr   r~   r  updater   r   r   
executabler2   makedirsopen
subprocessPopenr  r   r   r  r   tellr  r  r   )rk   rf   training_scripttraining_script_argslog_dircurrent_envr   idxr   r  r  fnr   tps                 r&   start_local_trainersr0    sM    ))BJJOO-.K
 OOL$'OOM4(ECLL)'38$(67~~t_L7KL)#eH:>?KK$/	SE2C8B##COD##C9D]&&	%'	TR5 *8 LrE   c                    U R                   (       a  [        U R                   R                  S5       nUR                  U R                  S5        U H#  n [
        R                  R                  U5        M%     UR                  5       U l        S S S 5        g g ! [         a;    [
        R                  R                  SU R                   R                   S35         M  f = f! , (       d  f       g = f)Nr   r   zOUnicodeEncodeError occurs at this line. Please refer to the original log file "z"
)
r   r%  nameseekr  r   r  writeUnicodeEncodeErrorr(  )r/  finlines      r&   pull_worker_logr8    s    	yy"))..#&#HHR]]A&JJ$$T*   HHJBM '&  * JJ$$BBD))..AQQTV '&s0   "CB4CACCCC
C-c                 j    Sn/ nSnU  Hu  nUR                   (       a  UR                  S:X  a  [        U5        UR                  R	                  5       nUc  SnMP  US:w  d  MX  SnUR                  UR                  5        Mw     U(       a!  [        U 5        [        R                  " S5        U$ ! [         a"    [        R                  S5        [        U 5        e [         a)    [        R                  SU SW S35        [        U 5        e   [        R                  SU SW S35        [        U 5        e = f)	NFr   Tr   zKeyboardInterrupt, exitzABORT!!! Out of all z) trainers, the trainer process with rank=z# was aborted. Please check its log.)r   r  r8  r   r   r   r   r   r   r   KeyboardInterruptr   warning
SystemExiterror)r   nranksr=  
error_rankr   r   rets          r&   watch_local_trainersrA    s-   #
AxxALLA-"&&++-C{!!!&&)  !%(HHQK& L#  01e$ "6(*ST^S_  `C  D	
 	e$"6(*ST^S_  `C  D	
 	e$s   AB- "A	B- -BD2c                     [        S5        [        [        U 5      R                  5       5       H  u  p[        U SU 35        M     [        S5        g )Nz0-----------  Configuration Arguments -----------z: z0------------------------------------------------)r   sortedvarsitems)r   argvalues      r&   _print_argumentsrH  &  sE    	
<=T$Z--/0
Rw  1	
<=rE   r<   )"rs   r0   r   r   r&  r   r   
contextlibr   %paddle.distributed.fleet.launch_utilsr   paddle.utilsr   utils.log_utilsr   r   r'   r7   r9   r\   r   r   r   r   r   r   r   r   r  r  r0  r8  rA  rH  rR   rE   r&   <module>rM     s     	    
   M " (	FF	#LBB 6: :z  :/ /d+66,40f  BF(V'&R>rE   