
    Αi(                         S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	  SSK
Jr  SSKJr   " S S	5      r " S
 S5      r " S S\5      rg)    N)	Container)Job)Pod   )Master)Watcherc                   $    \ rS rSrSrSrSrSrSrg)ControllerMode   
collectivepsipurpc N)	__name__
__module____qualname____firstlineno__
COLLECTIVEPSIPURPC__static_attributes__r       p/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/launch/controllers/controller.pyr
   r
      s    J	B
C
Cr   r
   c                   T    \ rS rSrS rS rS rS\4S jrSS jr	SS	 jr
S
 rS rSrg)ControllerBase#   c                 "   [         R                   " [         R                  U R                  5        [         R                   " [         R                  U R                  5        [         R                   " [         R                  U R                  5        UR                  5       (       aw  UR                  (       dP  [         R                   " [         R                  U R                  5        [         R                  " UR                  5        O[         R                  " S5        Xl        [        R                  " U R                  5      U l        [        U R                  5      U l        [#        U R                  R$                  R&                  U R                  R$                  R(                  U R                  R$                  R*                  S9U l        [/        5       U l        U R                  R3                  SU R0                  R4                  05        S U l        g )Nr   )nnodesmodejidPOD_NAME)signalSIGTERMsignal_handlerSIGABRTSIGINTis_auto_tuner_moderun_bestSIGALRMnot_exit_signal_handleralarmmax_time_per_taskctxr   factorymasterr   watcherr   argsr    run_modejob_idjobr   podset_envsnamejoin_server)selfr/   s     r   __init__ControllerBase.__init__$   s/   fnnd&9&9:fnnd&9&9:fmmT%8%89!!##<<fnnd.J.JKS223QnnTXX.txx(88==''''$$

 5:txx}}56r   c                    [        U R                  R                  5      [        U R                  R                  5      -   S:  d   S5       eU R                  R
                  R                  SU R                   35        [        U R                  R                  5      S:  a<  U R                  R
                  R                  U R                  R                  S   5        [        U R                  R                  5      S:  a<  U R                  R
                  R                  U R                  R                  S   5        U R                  5         U R                  R                  R                  5         U R                  R                  5         g )Nr   No container in the podzRun )lenr7   
containersinit_containersr/   loggerinfodebugsave_pod_envstatusrundeployr;   s    r   
deploy_podControllerBase.deploy_pod@   s   488&&'#dhh.F.F*GG!K 	
%	
K 	tDHH:./txx''(1,HHOO!!$((":":1"=>txx""#a'HHOO!!$(("5"5a"89r   c                     U R                  5         U R                  5         U R                  5         U R                  5         g N)	build_job	build_podrK   watchrJ   s    r   rH   ControllerBase.runO   s)    

r   returnc                    U R                   R                  R                  SU R                   35        U R                   R                  R                  5       (       Gdc  U R                  R                  SS9nU R                  R                  5         XR                   R                  R                  :X  a  U R                   R                  R                  5         U R                  R                  U5        U R                  R                  5       (       a"   U R                  R                  5       (       a  M"  U R                   R                  R                  SU 35        gXR                   R                  R                  :X  GaV  U R                   R                  R                  5         U R                  R                  U5        U R                  R                  5         U R                  R                  5       nU R                   R                  R                  SU 35        U R                   R                  R!                  SUS    35        U R                   R                  R                  S5        US   R#                  5         U R                   R$                  R&                  S::  a  U R                  R)                  S	S9  gU R                  R)                  S
S9  gU R                   R                  R+                  5       (       a  U R                  R-                  5       U R                   R                  R                  :w  aX  U R                   R$                  R&                  S:X  a  U R                  R)                  S	S9  gU R                  R)                  S
S9  gU R                   R                  R                  5       (       d  GMb  gg)z1
watch self and peer status, return true to exit
z	Watching    timeoutzPod TzContainer failed !!!
r   zD------------------------- ERROR LOG DETAIL -------------------------      FN)r/   rC   rD   r7   rG   is_donerQ   logs	COMPLETEDcompleter1   
set_statusFAILEDfailrestart_peerfailed_containererrortailr3   elastic_levelstopis_restarting
get_status)r;   rG   fcs      r   rQ   ControllerBase.watchW   s{    	y
34((//))++XX^^A^.F HHMMO 222((*&&v.hhmmoo hhmmoo $$tF8_5 88??111$$&&&v.((*XX..0$$tF8_5%%(>r!ug&FG$$Z 1

88==..!3HHMM!M,HHMM"M-  --//KK**,0I0II 88==.."4HHMM!M,b)i ((//))++r   Nc                     U R                   R                  R                  S5        U R                  R	                  5         U R
                  R	                  5         U R                  R	                  SS9  g )NzController stoprY   rV   )r/   rC   rE   r2   rg   r1   r7   )r;   sigints     r   rg   ControllerBase.stop   sK    /0b!r   c                 H   U R                   R                  5         U R                  R                  5         U R                  R
                  R                  SU R                   R                   35        U(       a+  [        R                  " U R                   R                  5        g g )Nz
Exit code )
r7   joinr1   rg   r/   rC   rD   	exit_codesysexit)r;   rs   s     r   finalizeControllerBase.finalize   se    z$((*<*<)=>?HHTXX''( r   c                    [        U S5      (       aT  U R                  R                  R                  S5        U R                  R                  SS9  [        R                  " U5        U R                  R                  R                  SU 35        Xl        U R                  R                  R                  5         U R                  US9  U R                  R                  R                  SU 35        [        R                  " U5        g Nrm   zForce quit in 10 seconds...
   rV   zTerminating with signal )rm   zExit with signal )hasattrr/   rC   rD   r7   rg   rr   rs   rm   rG   doner;   rm   frames      r   r&   ControllerBase.signal_handler   s    4""HHOO  !>?HHMM"M%HHV7x@A			 09:r   c                    [        U S5      (       a>  U R                  R                  R                  S5        U R                  R                  SS9  U R                  R                  R                  SU 35        Xl        U R                  R                  R                  5         U R                  US9  U R                  R                  R                  SU 35        g rw   )	ry   r/   rC   rD   r7   rg   rm   rG   rz   r{   s      r   r,   &ControllerBase.not_exit_signal_handler   s    4""HHOO  !>?HHMM"M%7x@A			 09:r   )r/   r6   r:   r1   r7   rm   r2   rN   )T)r   r   r   r   r<   rK   rH   boolrQ   rg   rt   r&   r,   r   r   r   r   r   r   #   s2     8<t <|")
;r   r   c                       \ rS rSrSrS rS\4S jrS rSS jr	S0 S	SS4S
 jr
SS0 SS4S jrS rS rS rSS jrSrg)
Controller   z"
Controller API for customization
c                 b    U R                   R                  R                  U R                  5        g)z
build job fill the job info.
N)r/   rC   rD   r6   rJ   s    r   rO   Controller.build_job   s     	TXX&r   rS   c                     [         e)zE
build pod includes creating containers etc.

Return True if succeed
)NotImplementedErrorrJ   s    r   rP   Controller.build_pod   s
     "!r   c                    U R                   R                  R                  R                  S5      (       a  [        R
                  R                  S5      S:X  a7  [        R                  SSSSSS	U R                   R                  R                  /nO[        R                  SU R                   R                  R                  /nOU R                   R                  R                  R                  S
5      (       a1  [        R                  U R                   R                  R                  /nO!U R                   R                  R                  /nUR                  U R                   R                  R                  5        U$ )Nz.pyWITH_COVERAGEONz-uz-mcoveragerH   z--branchz-pz.pyxes)r/   r3   training_scriptendswithosenvirongetrr   
executableextendtraining_script_args)r;   
entrypoints     r   _get_entrypointController._get_entrypoint   s    88==((11%88zz~~o.$6NNHHMM11	
 NNHHMM11

 XX]]**33H==..$((--*G*GHJ((--778J$((--<<=r   Nc                    U(       ab  U R                   R                  R                  S:w  a>  [        R                  R                  U R                   R                  R                  U5      nU(       ab  U R                   R                  R                  S:w  a>  [        R                  R                  U R                   R                  R                  U5      nX=(       d    U4$ )N )r/   r3   log_dirr   pathrp   )r;   outerrs      r   _get_out_err_fileController._get_out_err_file   s    488==((B.'',,txx}}44c:C488==((B.'',,txx}}44c:CZC  r   Tc                 &   [        U=(       d    U R                  5       U(       a  U R                  R                  5       O0 U R                  R                  R
                  S9nU R                  XE5      u  Ul        Ul        UR                  U5        U$ )N)r   envoverwrite_log)
r   r   r/   get_envsr3   log_overwriter   outfileerrfile
update_env)r;   r   envsuse_ctx_envr   r   cs          r   new_containerController.new_container   sp     "<d&:&:&<(3""$((--55

  $55c?	19	Tr   Fc                 l   U(       do  [         R                  " U5      n[        [        R                  R                  U R                  R                  R                  5      5      US'   U R                  X#XDS9nU(       a  U R                  R                  U5        g U R                  R                  U5        g )NPADDLE_LOG_DIR)r   r   r   r   )copydeepcopystrr   r   abspathr/   r3   r   r   r7   add_init_containeradd_container)r;   	containerr   r   log_fileis_inits         r   r   Controller.add_container  s     ==&D%(9N9N)O%PD!"**%h + I HH''	2HH""9-r   c                    U R                   R                  R                  (       a)  [        U R                   R                  R                  5      $ U R                   R                  R                  (       a8  [        U R                   R                  R                  R                  S5      5      $ U R                   R                  R                  R                  $ )z1
how many process/container should be run in pod
,)
r/   r3   nproc_per_nodeintdevicesr@   splitnodedevicecountrJ   s    r   pod_replicasController.pod_replicas  s}    
 88==''txx}}3344XX]]""txx}},,22378888==''---r   c                 H   U R                   R                  R                  (       d  g[        R                  R                  U R                   R                  R                  U R                  R                   SU R                  R                   S35      n [        R                  " [        R                  R                  U5      SS9  [        US5       nUR                  5       S:X  a9  UR                  [        [        R                   5      5        UR                  S5        UR                  [        U5      5        UR                  S5        SSS5        g! , (       d  f       g= f! ["         a2  nU R                   R$                  R'                  S	U 35         SnAgSnAff = f)
z8
save_pod_log append *info* to the log file of pod.name
N.z.logTexist_okza+r   
zsave log failed because )r/   r3   r   r   r   rp   r6   idr7   r9   makedirsdirnameopentellwriter   r   	ExceptionrC   rd   )r;   rD   ffdes        r   save_pod_logController.save_pod_log!  s    xx}}$$GGLLHHMM!!xx{{m1TXX]]O40
		BKK*T:a"779>HHS_-HHTNT#   	BHHOO!!$<QC"@AA	Bs7   >E% A9EE% 
E"E% "E% %
F!/(FF!c                    [        U R                  R                  5      [        U R                  R                  5      -   S:  d   S5       eU R                  R
                  R                  (       d  g U R                  R                   H  nU R                  USS9  M     U R                  R                   H  nU R                  U5        M     g )Nr   r?   T)r   )r@   r7   rA   rB   r/   r3   r   _save_container_env)r;   r   s     r   rF   Controller.save_pod_env7  s    488&&'#dhh.F.F*GG!K 	
%	
K xx}}$$))A$$Q$5 * $$A$$Q' %r   c                 z   [         R                  R                  U R                  R                  R
                  U(       a  SUR                   3OSUR                   35      n [         R                  " [         R                  R                  U5      SS9  [        X1R                  5       nUR                  S [        UR                  R                  5       5       5       5        S S S 5        g ! , (       d  f       g = f! [         a2  nU R                  R                   R#                  SU 35         S nAg S nAff = f)Nzenvlog.init.zenvlog.Tr   c              3   6   #    U  H  u  pU S U S3v   M     g7f)=r   Nr   ).0kvs      r   	<genexpr>1Controller._save_container_env.<locals>.<genexpr>Q  s"      -JTQqc1#RL-Js   z save pod env log failed because )r   r   rp   r/   r3   r   rankr   r   r   log_mode
writelinessortedr   itemsr   rC   rd   )r;   r   r   r   r   r   s         r   r   Controller._save_container_envE  s    GGLLHHMM!!  y~~./y~~./
	JKK*T:a++, -3IMM4G4G4I-J  -,,  	JHHOO!!$DQC"HII	Js7   #AC> *:C-$C> -
C;7C> ;C> >
D:(D55D:r   )NN)F)r   r   r   r   __doc__rO   r   rP   r   r   r   r   r   r   rF   r   r   r   r   r   r   r      s`    '"4 "6! BDd
 .(
.B,(Jr   r   )r   r   r$   rr   'paddle.distributed.launch.job.containerr   !paddle.distributed.launch.job.jobr   !paddle.distributed.launch.job.podr   r1   r   r2   r   r
   r   r   r   r   r   <module>r      sI     	  
 = 1 1   Z; Z;zUJ UJr   