
    Αiv.                     \    S SK r S SKrSSKJr  SSKJrJr   " S S\5      r " S S	\5      rg)
    N   )
DeviceType   )
ControllerControllerModec                   T   ^  \ rS rSrU 4S jr\S 5       rS rS rS r	S	S jr
SrU =r$ )
CollectiveController   c                 2   > S U l         [        TU ]	  U5        g )N)_tuner_run_modesuper__init__)selfctx	__class__s     p/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/launch/controllers/collective.pyr   CollectiveController.__init__   s    #    c                     U(       aH  UR                   R                  U R                   S35        [        R                  UR
                  l        gg)N enabledTF)loggerdebug__name__r   
COLLECTIVEargsrun_modeclsr   s     r   enableCollectiveController.enable   s<     JJ~X67 . 9 9CHHr   c                    U R                  5       nU(       a  g U R                  R                  R                  cZ  U R                  R                  R                  (       a5  U R                  R                  R
                  (       a  U R                  5       $ U R                  R                  R                  c  SnU R                  U5      $ )NT)	_build_pod_with_tunerr   r   master
start_portips_build_pod_with_argsauto_parallel_config_build_pod_with_master)r   skip_runs     r   	build_podCollectiveController.build_pod%   s    --/HHMM  (((!!,,..xx}}119..x88r   c                    U R                   R                  R                  nUGb  [        R                  R                  U5      (       d%  U R                   R                  R                  S5        UR                  S5      (       d%  U R                   R                  R                  S5        [        US5       n[        R                  " UR                  5       5      nUR                  SS5      U l        S S S 5        U R                   R                  R                  SU R                   35        SU R                   R                   R#                  5        3nU R%                  5       nU R                  S	;   ay  U R                   R                  R                  S
USUSSU['        U R                   R                  R(                  5      -   U S.	nSnU R+                  XgSS9  U R                  S:X  a  gg! , (       d  f       GN= f)Nzauto_parallel_conf not exists!z.jsonz2auto_parallel_config should be a json format file!rtuner_run_modetuner_and_runztuner_run_mode is: z
127.0.0.1:)
tuner_onlyr/   10tuner)	PADDLE_AUTO_PARALLEL_CONFIGPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSPADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTFLAGS_selected_gpusPADDLE_AUTO_PARALLEL_STAGEPADDLE_GLOBAL_SIZEPADDLE_LOCAL_SIZEz	tuner.logT)envslog_fileis_initr0   F)r   r   r'   ospathexistsr   warningendswithopenjsonloadsreadgetr   infonodeget_free_portpod_replicasintnnodesadd_container)r   r'   robjauto_parallel_dataendpointrM   er>   s           r   r"   *CollectiveController._build_pod_with_tuner5   s   #xx}}AA+77>>"677''(HI'0099''H *C0D%)ZZ		%<"'9'='=$o($ 1 HHOO  #6t7K7K6L!MN#DHHMM$?$?$A#BCH,,.L##'FF3788==3U3U+.08),/7+.29-9C@T@T<U-U,V,8>
 '""d"K''<75 10s   5<G,,
G;c                    U R                  5       U R                  l        [        U R                  R
                  R                  5      nU R                  R
                  R                  R                  S5      nU VVs/ s H3  n[        U R                  R                  5        H  nU SXA-    3PM     M5     nnnU R                  R                  R                  SU 35        U R                  R                  R                  SUS    S35        US   U R                  R
                  l        U R                  R                  R                  U;   aF  UR!                  U R                  R                  R                  5      U R                  R                  -  OSnU R#                  U5        U R                  R                  R$                  R'                  5       nU R                  R                  R$                  R)                  U R                  R
                  R*                  5      n[        U R                  R                  5       GH  n	U R                  R
                  R                  [-        U5       U R                  R                   X-    U	 [-        U5       XYU-      X-    [-        U5       [/        U	5      [/        U R                  R
                  R0                  5      S.n
U
R3                  SSR5                  U5      05        U R6                  b2  U
R3                  U R                  R
                  R8                  S	S
.5        [-        U5      S:  a  U R                  R                  R$                  R:                  [<        R>                  :X  a=  U
R3                  U R                  R                  R$                  RA                  5       5        U R                  R                  S:X  a#  U
R3                  USR5                  U5      05        O)U
R3                  XxU	   05        OU
R3                  SS05        SU	 3nU RC                  XS9  GM     gs  snnf )N,:zjob endpoints: z1master is set by args, it will be overwritten by r   .PADDLE_MASTERr;   r<   PADDLE_GLOBAL_RANKPADDLE_LOCAL_RANKPADDLE_NNODESr8   r7   r5   PADDLE_RANK_IN_NODEPADDLE_AUTO_CLUSTERr6   runr4   r:   r   PADDLE_DISTRI_BACKENDgloo
workerlog.r=   r>   T)"rM   podreplicasrN   r   r   r$   r%   splitranger   r   rC   r#   rK   ipindexsave_pod_logdeviceget_selected_device_keyget_selected_devicesdeviceslenstrauto_cluster_configupdatejoinr   r'   dtyper   CUSTOM_DEVICEget_custom_device_envsrP   )r   r$   r%   hpjob_endpointsrank_offsetselected_dev_keyselected_dev_listirT   r>   s               r   r&   )CollectiveController._build_pod_with_args[   s`    --/112
hhmm%%c* 
488,,- c1>"#- $ 	 
 	?@?a@P?QQRS	
  -Q/ xx}}3& IIdhhmm&&'$((*;*;; 	 	-(88==//GGI HHMM00EEHHMM!!
 txx(()A!%!5!5),]);(<(,(9(9':)*(9()s$'H:+8[+I()'8*-m*<)='*1v'*488==+L+L'MA HH0#((=2IJK##/7;xx}}7Y7Y6; $%)88==''--1I1IIHHTXX]]11HHJK88$$)HH.9J0KLMHH.!0DEF16:;#A3'HA9G *J A
s   =:Qc                    U R                  5       U R                  l        [        U R                  R
                  R                  5      U R                  l        U R                  R                  R                  5       nU R                  R                  R                  U R                  R                  U R                  R                  5       Vs/ s H(  nU R                  R                  R                   SU 3PM*     nn[        R                  " U R                  R                  U R                  R                  U R                  R                  U R                  R                  R                  R                  U R                  R                  R                   SU 3SR!                  U5      S.5      nU R"                  R%                  SU R&                  R(                   S3U R                  R                  UU R&                  R                  U R                  R                  5      u  pgXpR                  l        [+        U5      S:  a  gU Vs/ s H  n[        R,                  " U5      PM     nnU R                  R.                  R1                  SU 35        U R3                  U5        [5        U Vs/ s H  oS	   PM	     sn5      n	[5        US U  Vs/ s H  oS	   PM	     sn5      n
 US
   S   nUR7                  S5      S
   R9                  5       nU[:        R<                  S'   U Vs/ s H  oS   PM	     nnU(       a  U R                  R?                  5         U R                  R                  R                  RA                  5       nU R                  R                  R                  RC                  U R                  R
                  RD                  5      n[G        U R                  R                  5       GH  nUU	 U R                  R                   X-    U U R&                  R                   XH   X-    U	 [I        U5      [I        U R                  R
                  RJ                  5      S.nURM                  SSR!                  U5      05        U RN                  b2  URM                  U R                  R
                  RP                  SS.5        [+        U5      S
:  a  U R                  R                  R                  R                  [R        RT                  :X  a=  URM                  U R                  R                  R                  RW                  5       5        U R                  R                  S:X  a#  URM                  USR!                  U5      05        O)URM                  XU   05        OURM                  SS05        SU 3nU RY                  UUS9  GM     gs  snf s  snf s  snf s  snf s  snf )NrX   rW   )namerankrh   rw   	candidate	endpoints/z/infor   Fzsync peers done rh   r   r   COLLECTIVE_MASTER_IPr   rZ   r6   ra   rb   rc   rd   re   rf   T)-rM   rg   rh   rN   r   r   r   rK   rL   get_free_portsrk   rF   dumpsr   rn   rw   rv   r#   
sync_peersjobidrr   rG   r   r   rm   sumri   stripr@   environresetro   rp   rq   rj   rs   rt   ru   r   r'   r   rx   ry   rP   )r   	reset_podportr{   r   data	peer_listr   r   global_sizer}   collective_mastercollective_master_ipr|   r~   r   rT   r>   s                     r   r(   +CollectiveController._build_pod_with_master   s    --/ DHHMM../xx}}**,
 XX]]11!!488==
 xx}} !% 	 
 zz HH----33 $ 0 014&9 XXi0	
 ++00}E"HHMMHHHHMM
	 y>A,56IqTZZ]I	6 0<=)$)<)QZ=)<=)ET2BC2BQZ=2BCD	 &aL5  166s;A>DDF-A

)*1:;A;;HHNN88==//GGI HHMM00EEHHMM!!
 txx(()A!2)4(,(9(9':)*(9()s$(HH$5$5#6+4<()'8*5'*1v'*488==+L+L'MA HH0#((=2IJK##/7;xx}}7Y7Y6; $%)88==''--1I1IIHHTXX]]11HHJK88$$)HH.9J0KLMHH.!0DEF16:; $A3'HA9I *L y
< 7
 =C <s   /W	 W-WW %W%)r   )T)r   
__module____qualname____firstlineno__r   classmethodr   r*   r"   r&   r(   __static_attributes____classcell__)r   s   @r   r	   r	      s:      9 $LFPe er   r	   c                   0    \ rS rSr\S 5       rS rS rSrg)CollectiveElasticControlleri  c                    UR                   R                  (       ar  UR                   R                  R                  S5      (       aH  UR                  R	                  U R
                   S35        [        R                  UR                   l        gg)Nzetcd://r   TF)	r   r#   
startswithr   r   r   r   r   r   r   s     r   r   "CollectiveElasticController.enable  sZ    88??sxx99)DDJJ~X67 . 9 9CHHr   c                 
   U R                   R                  S:X  a%  U R                  R                  R	                  S5        U R
                  R                  U R                   R                  U R                  R                  5        g )Ndefaultz?Using default job name may cause conflict, add --job_id in args)	r   r   r   r   rC   r#   register_heartbeatrg   r   )r   s    r   register$CollectiveElasticController.register  sQ    88;;)#HHOO##Q 	&&txx{{DHHMMBr   c                    [        U R                  R                  R                  5      nU R                  R
                  (       a  UOUS-  nU R                  5         U R                  R                  U R                  R                  R                  ::  Ga  U R                  5         U R                  R                  R                  S5        U R                  R                  U R                  R                  U R                  R                   U5      u  p#U(       a  X0R                  l        OU R                  R                  R%                  SU R                   35        U R                  R'                  5       (       a:  U R                  R                  R                  S5        SS KnUR+                  S5        OU R                  R                  R-                  SU R                   35        U R/                  5       (       d  GM  U R                  R1                  U R                  R2                  R4                  5        U R7                  5         U R9                  5       (       a  O;U R                  R                  U R                  R                  R                  ::  a  GM  U R                  R                  R-                  SU R                   35        g )	N
   zWaiting peer ready...zpeer not ready z&Failed to start peer, auto tuner exit.r   zRun z	Job done )rN   r   r   elastic_timeoutr   elasticr   rg   restartmax_restart	build_jobr   rJ   r#   wait_peer_readyreplicas_minreplicas_maxrh   rC   is_auto_tuner_modesysexitr   r*   
set_statusstatusRUNNING
deploy_podwatch)r   timeoutokrh   r   s        r   ra   CollectiveElasticController.run  s   dhhmm334!XX--'7R<hh$((--";";;NNHHOO  !89;;66%%txx'<'<gLB $,!''/$(((DE88..00HHOO((@ HHRLHHOO!!D
"34>>##KK""488??#:#:;OOzz||? hh$((--";";;B 		$((45r    N)	r   r   r   r   r   r   r   ra   r   r   r   r   r   r     s     C&6r   r   )	rF   r@   context.devicer   
controllerr   r   r	   r   r   r   r   <module>r      s/     	 ' 2r: rj86"6 86r   