
    x-j\@                     p   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ ddlmZ  edd          Zd	 Zd
 Z G d d          Z G d d          Z G d d          Z G d d          Z G d d          Zd Zd Zd Zd Zd Zd!dZ G d d          Z	 d!dZd Zd Z d  Z!dS )"    N)closing)get_backend_by_compile_flag)	strtobool   )
get_loggerINFOrootc           	      Z   d | j                             d          D             }| j        }|                    |          }t                              d| d| d|            d }| j        sHt          |          dk    r5| j        .t          t          |                    }|t          |          }n=d}| j        | j        }t          t          ||t          |          z                       }g }|D ]#|                    fd|D                        $t          ||||          S )	Nc                 6    g | ]}|                                 S  strip.0xs     e/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/utils/launch_utils.py
<listcomp>z)get_cluster_from_args.<locals>.<listcomp>!   s     DDDa		DDD    ,zparsed from args:node_ips:z	 node_ip:z node_rank:   i  c                     g | ]	} d | 
S ):r   )r   portips     r   r   z)get_cluster_from_args.<locals>.<listcomp>=   s#    !H!H!HTR..$..!H!H!Hr   )cluster_node_ipssplitnode_ipindexloggerdebuguse_paddlecloudlenstarted_portfind_free_portslistrangeappendget_cluster)	argsselected_gpusnode_ipsr   	node_rank
free_portsr#   trainer_endpointsr   s	           @r   get_cluster_from_argsr/       s^   DD4#8#>#>s#C#CDDDHlGw''I
LLWXWWWWIWW   J 
MMQ%$S%7%788
!j))J(,L,s=/A/A ABB
 

  J J  !H!H!H!HZ!H!H!HIIIIx*;]KKKr   c                    | 5ddl m} |                                }d t          d|          D             }nt	          j        d          }||dk    r d |                     d          D             }n|                    d          |                     d          D ]}|v sJ d| d	| d
            fd|                     d          D             }t                              d|  d| d            |S )Nr   corec                 ,    g | ]}t          |          S r   strr   s     r   r   zget_gpus.<locals>.<listcomp>F   s    3331A333r   CUDA_VISIBLE_DEVICES c                 6    g | ]}|                                 S r   r   r   s     r   r   zget_gpus.<locals>.<listcomp>J   s     @@@!AGGII@@@r   r   zCan't find your selected_gpus z in CUDA_VISIBLE_DEVICES[z].c                 ^    g | ])}                     |                                          *S r   )r   r   )r   r   cuda_visible_devices_lists     r   r   zget_gpus.<locals>.<listcomp>U   s?        *//		::  r   z1Change selected_gpus into relative values. --ips:z will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)	paddle.frameworkr2   get_cuda_device_countr&   osgetenvr   r   info)r*   r2   gpus_numgpuscuda_visible_devicesr   r:   s         @r   get_gpusrC   A   s   ))))))--//33a 2 2333!y)?@@'+?2+E+E@@}':':3'?'?@@@DD
 )=(B(B3(G(G%"((--  5555_*+_ _FZ_ _ _ 6555   &,,S11  D KKDM D D15D D(AD D   Kr   c                   ,    e Zd Zd Zd Zd Zd Zd ZdS )Hdfsc                 0    d | _         d | _        d | _        d S Nhdfs_ugi	hdfs_name	hdfs_pathselfs    r   __init__zHdfs.__init__c   s    r   c                 8    | j         d uo| j        d uo| j        d uS rG   rH   rL   s    r   is_validzHdfs.is_validh   s/    M% +d*+d*	
r   c                 6    d| j          d| j         d| j         S )Nz	hdfs_ugi:z hdfs_name:z
 hdfs_pathrH   rL   s    r   __str__zHdfs.__str__o   s&    _4=__T^__t~___r   c                 b    | j         |j         k    o| j        |j        k    o| j        |j        k    S rG   rH   rM   ns     r   __eq__zHdfs.__eq__r   s5    MQZ' .!+-.!+-	
r   c                     | |k     S rG   r   rT   s     r   __ne__zHdfs.__ne__y       19}r   N)__name__
__module____qualname__rN   rP   rR   rV   rX   r   r   r   rE   rE   b   sb          

 
 
` ` `
 
 
    r   rE   c                   J    e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 ZdS )Clusterc                 >    d | _         g | _        d | _        d | _        d S rG   )
job_serverpodshdfsjob_stage_flag)rM   rb   s     r   rN   zCluster.__init__~   s%    		"r   c                 Z    d| j          dd | j        D              d| j         d| j         S )Nzjob_server:z pods:c                 ,    g | ]}t          |          S r   r4   )r   pods     r   r   z#Cluster.__str__.<locals>.<listcomp>   s    4S4S4S#SXX4S4S4Sr   z job_stage_flag:z hdfs:)r`   ra   rc   rb   rL   s    r   rR   zCluster.__str__   si     LT_  L  L4S4S4S4S4S  L  Leiex  L  L  AE  AJ  L  L  	Lr   c                     t          | j                  t          |j                  k    rdS t          | j        |j                  D ]\  }}||k    r dS | j        |j        k    rdS dS NFT)r"   ra   ziprc   )rM   clusterabs       r   rV   zCluster.__eq__   sw    ty>>S....5	7<00 	 	DAqAvvuu  '"8885tr   c                 .    |                      |           S rG   )rV   rM   rj   s     r   rX   zCluster.__ne__   s    ;;w''''r   c                 B    t          j         |j                  | _        d S rG   )copyra   rn   s     r   update_podszCluster.update_pods   s    Igl++			r   c                 D    t          |                                           S rG   )r"   trainers_endpointsrL   s    r   trainers_nrankszCluster.trainers_nranks   s    4**,,---r   c                 *    t          | j                  S rG   )r"   ra   rL   s    r   pods_nrankszCluster.pods_nranks   s    49~~r   c                 f    g }| j         D ]&}|j        D ]}|                    |j                   '|S rG   )ra   trainersr'   endpoint)rM   rrf   ts       r   rs   zCluster.trainers_endpoints   sJ    9 	% 	%C\ % %$$$$%r   c                     g }| j         D ]C}|j         d|j         }|j        |j        J | d            |                    |           D|S )Nr   z not a valid endpoint)ra   addrr   r'   )rM   rz   rf   eps       r   pods_endpointszCluster.pods_endpoints   sq    9 	 	CH))sx))B8'CH,@,@,,, -A,@@ HHRLLLLr   c                 l    | j         D ]+}t          |          t          |j                  k    r|c S ,d S rG   )ra   r5   id)rM   pod_idrf   s      r   get_pod_by_idzCluster.get_pod_by_id   sA    9 	 	C6{{c#&kk))


 * tr   N)rZ   r[   r\   rN   rR   rV   rX   rq   rt   rv   rs   r   r   r   r   r   r^   r^   }   s        # # #L L L  ( ( (, , ,. . .    	 	 	    r   r^   c                   &    e Zd Zd Zd Zd Zd ZdS )	JobServerc                     d | _         d S rG   ry   rL   s    r   rN   zJobServer.__init__   s    r   c                     | j          S rG   r   rL   s    r   rR   zJobServer.__str__   s    -!!r   c                 "    | j         |j         k    S rG   r   rM   js     r   rV   zJobServer.__eq__   s    }
**r   c                     | |k     S rG   r   r   s     r   rX   zJobServer.__ne__   rY   r   N)rZ   r[   r\   rN   rR   rV   rX   r   r   r   r   r      sP          " " "+ + +    r   r   c                   ,    e Zd Zd Zd Zd Zd Zd ZdS )Trainerc                 0    g | _         d | _        d | _        d S rG   rA   ry   rankrL   s    r   rN   zTrainer.__init__   s    				r   c                 6    d| j          d| j         d| j         S )Nzgpu:z
 endpoint:z rank:r   rL   s    r   rR   zTrainer.__str__   s&    KdiKK4=KK	KKKr   c                     t          | j                  t          |j                  k    rdS | j        |j        k    s| j        |j        k    rdS t	          | j        |j                  D ]\  }}||k    r dS dS rh   )r"   rA   ry   r   ri   )rM   r{   rk   rl   s       r   rV   zTrainer.__eq__   s    ty>>S[[((5=AJ&&$)qv*=*=5	16** 	 	DAqAvvuu  tr   c                     | |k     S rG   r   )rM   r{   s     r   rX   zTrainer.__ne__   rY   r   c                     | j         S rG   )r   rL   s    r   get_rankzTrainer.get_rank   s
    yr   N)rZ   r[   r\   rN   rR   rV   rX   r   r   r   r   r   r      sb          
L L L        r   r   c                   2    e Zd Zd Zd Zd Zd Zd Zd ZdS )Podc                 Z    d | _         d | _        d | _        d | _        g | _        g | _        d S rG   )r   r   r}   r   rx   rA   rL   s    r   rN   zPod.__init__   s0    						r   c                 z    d| j          d| j         d| j         d| j         d| j         dd | j        D              S )Nzrank:z id:z addr:z port:z visible_gpu:z
 trainers:c                 ,    g | ]}t          |          S r   r4   )r   r{   s     r   r   zPod.__str__.<locals>.<listcomp>   s?      tS  tS  tS  @Atwxytztz  tS  tS  tSr   )r   r   r}   r   rA   rx   rL   s    r   rR   zPod.__str__   s     Uty  U  Udg  U  UTY  U  Udi  U  U^b^g  U  U  tS  tS  EI  ER  tS  tS  tS  U  U  	Ur   c                 h   | j         |j         k    s0| j        |j        k    s | j        |j        k    s| j        |j        k    r"t                              d|  d|            dS t          | j                  t          |j                  k    r,t                              d| j         d|j                    dS t          t          | j                            D ]W}| j        |         |j        |         k    r9t                              d| j        |          d|j        |                      dS XdS )Nzpod z != Fz	trainers ztrainer T)	r   r   r}   r   r   r    r"   rx   r&   )rM   rf   is      r   rV   z
Pod.__eq__   s-   I!!w#&  yCH$$yCH$$LL///#//0005t}S\!2!222LLFT]FFFFGGG5s4=))** 	 	A}Q3<?22Oa(8OOcl1oOOPPPuu 3 tr   c                     | |k     S rG   r   )rM   rf   s     r   rX   z
Pod.__ne__  s    3;r   c                     d S rG   r   )rM   res_podss     r   parse_responsezPod.parse_response	  s    r   c                 j    d}| j         D ]
}|| dz  }|dk    sJ d|  d            |d d         }|S )Nr7   r   z	this pod z can't see any gpus)rA   )rM   rz   gs      r   get_visible_gpuszPod.get_visible_gpus  sZ     	 	AALAABwww=D===wwwcrcFr   N)	rZ   r[   r\   rN   rR   rV   rX   r   r   r   r   r   r   r      sq          U U U  *        r   r   c                    t          |          t          u s
J d            t          d           }d}t          |           D ]\  }}t	                      }||_        ||_        ||         }	t          |	          t          |          k    s
J d            t          t          |                    D ]d}
t                      }|j
                            ||
                    |	|
          |_        ||_        |dz  }|j                            |           e|j                            |           |                     |          }||j        |         fS )Nztrainer_endpoints must be list)rb   r   zOcurrent trainer_endpoints size should be greater equal than selected_gpus size.r   )typer%   r^   	enumerater   r   r}   r"   r&   r   rA   r'   ry   rx   ra   r   )r+   r   r.   r*   rj   trainer_rankr,   r   rf   cur_node_endpointsr   trainerpod_ranks                r   r(   r(     s_   !""d***,L***4   GL"8,, ! !	2ee.y9%&&#m*<*<<<<] =<< s=))** 	) 	)AiiGLa 0111"4Q"79G'GLALL((((C    ~~g&&HGL***r   c                    | D ]{}|j                                         `|j                                          |j        r|j                                         t
                              d|j         j                    |t          j	        d           t          dd          D ]}d}| D ]F}|j                                         +t          j        |j         j        t          j                   d}G|st
                              d            d S t          j	        d           t
                              d           t#          j        d	           d S )
Nzterminate process id:   r   2   FTzterminate all the procszcan't kill all process and exitr   )procpoll	terminatelog_fncloser   r    pidtimesleepr&   r=   killsignalSIGKILLr?   fatalsysexit)procspstepalives       r   terminate_local_procsr   2  s.    ? ?6;;== Fx !   LL===>>> 	JqMMMa   	 	Av{{}}$
FN333 	KK1222FF
1
LL2333HQKKKKKr   c                  j    	 t          j                    } t          j        |           }| |fS #  Y d S xY wrG   )socketgethostnamegethostbyname)	host_namehost_ips     r   get_host_name_ipr   M  s@    &((	&y11'!!tts   *- 2c                 ^    |t           k    rt          n|} |j        d| z   f|||dz   d| dS )ab  Add argparse's argument.
    Examples:
        .. code-block:: python

            >>> import argparse
            >>> from paddle.distributed.utils import launch_utils
            >>> parser = argparse.ArgumentParser()
            >>> launch_utils.add_arguments("name", str, "Jonh", "User name.", parser)
            >>> args = parser.parse_args()

    z--z Default: %(default)s.)defaultr   helpN)boolr   add_argument)argnamer   r   r   	argparserkwargss         r   add_argumentsr   V  s`     99$DIw,,	 
     r   c                     d }t                      }d}	  |            }||vr|                    |           t          |          | k    r|S |dz  }|dk    rt          d           d S U)Nc                      t          t          j        t          j        t          j                            5 } |                     d           |                                 d         cd d d            S # 1 swxY w Y   d S )N)r7   r   r   )r   r   AF_INETSOCK_STREAMbindgetsockname)ss    r   __free_portz$find_free_ports.<locals>.__free_portm  s    V]6>63EFFGG 	&1FF7OOO==??1%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&s   /A33A7:A7r   Tr   d   z@can't find available port and use the specified static port now!)setaddr"   print)numr   port_setr   r   s        r   r$   r$   l  s    & & &
 uuHD{}}xLLx==CO	#::R   4r   c                    |t                      }|dk    rd                    d                    d |j        D                                 t	          |j                  t	          |j                  t	          |                                           d                    |                                           d}n|dk    rd                    d                    d |j        D                                 t	          |j                  t	          |j                  t	          |                                           d                    |                                           d}nZ|d	k    rqt	          |j                  t	          |j                  t	          |                                           d                    |                                           |d
}n|dk    rddl	m
} |                                d         }d| dd                    d                    d |j        D                                 dt	          |j                  dt	          |j                  dt	          |                                           dd                    |                                           i}nt          d          |S )Nbkclz{}r   c                 ,    g | ]}t          |          S r   r4   r   r   s     r   r   z(_prepare_trainer_env.<locals>.<listcomp>      777Q#a&&777r   )FLAGS_selected_xpusPADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSncclc                 ,    g | ]}t          |          S r   r4   r   s     r   r   z(_prepare_trainer_env.<locals>.<listcomp>  r   r   )FLAGS_selected_gpusr   r   r   r   gloo)r   r   r   r   PADDLE_DISTRI_BACKENDxcclr   r1   FLAGS_selected_r   c                 ,    g | ]}t          |          S r   r4   r   s     r   r   z(_prepare_trainer_env.<locals>.<listcomp>  r   r   r   r   r   r   z)backend must be one of 'gloo, nccl, bkcl')r   formatjoinrA   r5   r   ry   rt   rs   r;   r2   get_all_custom_device_type
ValueError)rj   r   backendproc_envr2   custom_device_names         r   _prepare_trainer_envr     s   -//&#';;77',77788$ $ "%W\!2!2'*7+;'<'<#&w'>'>'@'@#A#A(+1K1K1M1M(N(N
 
 
F		#';;77',77788$ $ "%W\!2!2'*7+;'<'<#&w'>'>'@'@#A#A(+1K1K1M1M(N(N
 
 
F		 "%W\!2!2'*7+;'<'<#&w'>'>'@'@#A#A(+1K1K1M1M(N(N%,
 
 
F		))))))!<<>>qA30333T[[77',777886 6  W\!2!2%s7+;'<'<!3w'>'>'@'@#A#A&1K1K1M1M(N(N
 DEEEOr   c                       e Zd Zd ZdS )TrainerProcc                 Z    d | _         d | _        d | _        d | _        d | _        d | _        d S rG   )r   r   
log_offsetr   
local_rankcmdrL   s    r   rN   zTrainerProc.__init__  s0    		r   N)rZ   r[   r\   rN   r   r   r   r   r     s#            r   r   c                 l   t          j         t          j                                                   }|                    dd            |                    dd            g }t	          |j                  D ]?\  }}t          | |          }	|                    |	           t          	                    d|            t          j        d|g|}
t                              d|
 d|	            d }|Dt          j        |d           t          | d	| d
          }t          j        |
|||          }nt          j        |
|          }t#                      }||_        |j        |_        ||_        ||_        |r|                                nd |_        |
|_        |                    |           A|S )N
http_proxyhttps_proxyztrainer proc env:z-uzstart trainer proc:z env:T)exist_okz/workerlog.rk   )envstdoutstderr)r  )rp   r=   environpopr   rx   r   updater   r    r   
executabler?   makedirsopen
subprocessPopenr   r   r   r   r   tellr   r   r'   )rj   rf   training_scripttraining_script_argslog_dircurrent_envr   idxr{   r   r   fnr   tps                 r   start_local_trainersr    s    )BJOO--..K
 OOL$'''OOM4(((ECL))  Q'338$$$666777~t_L7KL>#>>H>>???K$////22S22C88B#C[BOOODD#C[999D]]&	%'1			TRLr   c                    | j         rt          | j         j        d          5 }|                    | j        d           |D ]_}	 t
          j                            |           ## t          $ r0 t
          j                            d| j         j         d           Y \w xY w|	                                | _        d d d            d S # 1 swxY w Y   d S d S )Nrz   r   zOUnicodeEncodeError occurs at this line. Please refer to the original log file "z"
)
r   r  nameseekr   r   r  writeUnicodeEncodeErrorr  )r  finlines      r   pull_worker_logr     s=   	y '").#&& 
	'#HHR]A&&&  J$$T****)   J$$VBD).V V V    
  HHJJBM
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	' 
	'' 's5    CA#"C#7BCBCC
C
c                 r   	 d}g }d}| D ]c}|j         r|j        dk    rt          |           |j                                        }|d}A|dk    rd}|                    |j                   d|r#t          |            t          j	        d           n# t          $ r+ t                              d           t          |             t          $ r2 t                              d| d| d           t          |              t                              d| d| d           t          |             xY w|S )	NFr   Tr   zKeyboardInterrupt, exitzABORT!!! Out of all z) trainers, the trainer process with rank=z# was aborted. Please check its log.)r   r   r   r   r   r'   r   r   r   r   KeyboardInterruptr   warning
SystemExiterror)r   nranksr%  
error_rankr   r   rets          r   watch_local_trainersr)    s   #
 		* 		*Ax #ALA--"""&++--C{!!!&))) 	!%(((HQKKK   0111e$$$    D6  D  DT^  D  D  D	
 	
 	
 	e$$$ D6  D  DT^  D  D  D	
 	
 	
 	e$$$Ls   BB B D4c                     t          d           t          t          |                                                     D ]\  }}t          | d|            t          d           d S )Nz0-----------  Configuration Arguments -----------z: z0------------------------------------------------)r   sortedvarsitems)r)   argvalues      r   _print_argumentsr0  &  sr    	
<===T$ZZ--//00 ! !
U    	
<=====r   rG   )"rp   r=   r   r   r  r   r   
contextlibr   %paddle.distributed.fleet.launch_utilsr   paddle.utilsr   utils.log_utilsr   r   r/   rC   rE   r^   r   r   r   r(   r   r   r   r$   r   r   r  r   r)  r0  r   r   r   <module>r5     sk    				       



        M M M M M M " " " " " " ( ( ( ( ( (	FF	#	#L L LB  B       6: : : : : : : :z              :/ / / / / / / /d+ + +6  6    ,  40 0 0 0f        BF( ( ( (V' ' '& & &R> > > > >r   