
    Αi                     N    S SK r S SKrS SKrS SKJr  SSKJrJr   " S S\5      rg)    N)	Container   )CollectiveControllerControllerModec                   P    \ rS rSr\S 5       rS rS rS rS0 SSS4S jr	S	 r
S
rg)IPUController   c                     UR                   R                  S:X  aH  UR                  R                  U R                   S35        [
        R                  UR                   l        gg)Nipuz enabledTF)argstraining_scriptloggerdebug__name__r   IPUrun_mode)clsctxs     t/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/launch/controllers/ipu_controller.pyenableIPUController.enable   sI    88##u,JJ~X67 . 2 2CHH    c                    [         R                  " 5       nUR                  S[        SS9  UR                  S[        SS9  UR                  S[        SS9  UR                  S[        S	S9  UR                  S
[        SS9  UR                  S[        SS9  UR                  S[         R
                  S9  UR                  U5      $ )Nz--hostsz'The hosts for IPU distributed training.)typehelpz--nproc_per_hostz*The number of processes launched per host.z--ipus_per_replicaz)The number of IPUs requested per replica.z--ipu_partitionz"The partition name of IPU devices.z--vipu_serverz!The ip of the IPU device manager.r   zoThe full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``.training_script_args)nargs)argparseArgumentParseradd_argumentstrint	REMAINDER
parse_args)self	args_listparsers      r   parse_ipu_argsIPUController.parse_ipu_args"   s    ((*C&O 	 	
 	= 	 	

 	 < 	 	

 	5 	 	

 	#,O 	 	
 	 C 	 	

 	2(:L:LM  ++r   c                    SU R                   R                  l        U R                  U R                   R                  R                  5      n[        U R                   R                  R                  5      nX!R                  -  S:X  d   SU SUR                   S35       eX!R                  -  nU R                   R                  R                  SU S35        [        UR                  R                  S5      5      nXAR                  -  nU R                   R                  R                  S	U S35        X5-  S:X  d   S
U SU S35       eUR                  R                  SS5      R                  S5      nU Vs/ s H  owS-   PM	     nn/ n	U	R                  SU 35        U	R                  SU 35        U	R                  SUR                   35        U	R                  SR!                  SR#                  U5      5      5        U	R                  SUR$                   35        U	R                  SUR&                   35        U	R)                  / SQ5        Sn
[*        R,                  " SS 5      nU(       a	  U
SU S3-  n
U
SR!                  USR#                  U5      5      -  n
U
S-  n
U	R                  U
5        [/        U5       H@  nXUR                  -     nXR                  -  nU	R                  SU SU SU SU S3	5        MB     U	R                  [0        R2                  5        U	R                  UR                  5        U	R)                  UR                  5        [5        S 5        [5        S!5        [/        [        U	5      S"-
  5       H  n[5        X    S#35        M     [5        U	[        U	5      S"-
      5        [5        S$5        XR                   R                  l        g s  snf )%Npoprunr   zThe number of IPUs:z$ mod the number of IPUs per replica:z
 must == 0z The number of total replicas is .,z!The number of total processes is zThe number of replicas:z mod the number of processes:  z:8090z--num-instances=z--num-replicas=z--ipus-per-replica=z	--host={}z--vipu-partition=z--vipu-server-host=)z--update-partition=noz--vipu-server-timeout=120z--print-topology=yesz--numa-aware=yesz--mpi-local-args='POPART_LOG_LEVELz-x POPART_LOG_LEVEL=z8-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'z--instance-mpi-local-args=z:"-x PADDLE_TRAINER_ID=z -x PADDLE_CURRENT_ENDPOINT=z -x PADDLE_RANK_IN_NODE="z'-----------  PopRun Command -----------zpoprun \r   z \z'---------------------------------------)r   r   r   r(   r   r"   devicesipus_per_replicar   infolenhostssplitnproc_per_hostreplaceappendformatjoinipu_partitionvipu_serverextendosgetenvrangesys
executableprint)r%   poprun_argsnum_ipusnum_replicas	num_nodes	num_procsr7   x	endpointspoprun_commandglobal_envs	log_levelidxcur_endpointrank_in_nodeis                   r   replace_training_script%IPUController.replace_training_scriptA   s   (0%))$((--*L*LMtxx}},,-777A= 	
!(+OP[PlPlOmmwx	
=  #?#???~QOP ))//45	 : ::	@1MN(Q. 	
%l^3PQZP[[ef	
.
 !!))#r288=*/0%Q[%	0  0<=~>?!+">">!?@	
 	k00%AB 1+2K2K1LMN 3K4K4K3LMN	
 ,II0$7	1)A>>KFMM388I.	

 	tk* #C$K,F,F%FGL!;!;;L!!,SE1HMijviw  xP  Q]  P^  ^_  ` $ 	cnn- 	k99:k>>? 	78ks>*Q./A^&'s+, 0N 3a 789;78 .<*y 1s   5Oc                     U R                   R                  R                  /nUR                  U R                   R                  R                  5        SR                  U5      /nU$ )Nr.   )r   r   r   r@   r   r=   )r%   
entrypoints     r   _get_entrypointIPUController._get_entrypoint   sK    hhmm334
$((--<<=hhz*+
r   NTc                     [        U=(       d    U R                  5       U(       a  U R                  R                  5       O0 S9nU R	                  XE5      u  Ul        Ul        UR                  U5        SUl        U$ )N)rX   envT)	r   rY   r   get_envs_get_out_err_fileoutfileerrfile
update_envshell)r%   rX   envsuse_ctx_envouterrcs          r   new_containerIPUController.new_container   sf     "<d&:&:&<(3""$
  $55c?	19	Tr   c                     U R                  5         U R                  5         U R                  5         U R                  5         U R	                  5         g )N)rU   	build_job	build_pod
deploy_podwatch)r%   s    r   runIPUController.run   s5    $$&

r    )r   
__module____qualname____firstlineno__classmethodr   r(   rU   rY   rh   ro   __static_attributes__rq   r   r   r   r      s<     ,>S<j BDd	r   r   )	r   rA   rD   'paddle.distributed.launch.job.containerr   
collectiver   r   r   rq   r   r   <module>ry      s%     	 
 = <Z( Zr   