
    ΑiZ~                        S SK r S SKrS SKrS SKr\R                  " S5      r/ r/ rS r/ 4S jr	/ 4S jr
S rS r\/ 4S j5       r\/ 4S	 j5       r\S
 5       r\/ 4S j5       r\/ / 4S j5       r\/ 4S j5       r\/ / 4S j5       r\/ 4S j5       r\/ / 4S j5       r\/ 4S j5       rS r\/ / 4S j5       r\/ 4S j5       r\/ 4S j5       r\/ / 4S j5       rS r\/ 4S j5       r\/ 4S j5       r\/ / 4S j5       r\/ / 4S j5       r g)    N
auto_tunerc                 D   SR                  U S   U S   U S   U S   U S   U S   U S   U S	   U S
   5	      nSU;   aK  US    HB  nSR                  S UR                  S5       5       5      nU[        X   5      -  nUS-   U-   nMD     SU;   aK  US    HB  nSR                  S UR                  S5       5       5      nU[        X   5      -  nUS-   U-   nMD      SSKJn  UR                  R                  SU SU 35        [        R                  SU SU 35        g !    N = f)NzIDP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}	dp_degree	mp_degree	pp_degree
vpp_degreesharding_degreesharding_stagemicro_batch_sizeuse_recomputerecompute_granularityrefined_recompute c              3   @   #    U  H  oR                  5       v   M     g 7fN
capitalize.0is     c/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_tuner/prune.py	<genexpr>"log_pruned_info.<locals>.<genexpr>'        F~!||~~~   _custom_search_dimc              3   @   #    U  H  oR                  5       v   M     g 7fr   r   r   s     r   r   r   -   r   r   r   )ctxz	Strategy z has been pruned that )formatjoinsplitstrpaddle.distributed.launch.mainr   loggerinfo)cur_cfgpruned_reason	tuner_cfgpruned_strategykeystrategyr   s          r   log_pruned_infor-      s^   ahh!" !"# '(
O i'01CwwFsyy~FFHGL))H-3h>O 2
 i'01CwwFsyy~FFHGL))H-3h>O 2
6

((>}oN	

 KK
O$$:=/Js   'D Dc                     / nSnU HJ  nU H%  nX`;   a  M
  Xe;  d  XV   X   :w  d  M  US;  d  M#  Sn  O   U(       a  UR                  U5        MH  SnML     U$ )z
Compare the current configuration with the history configuration,
and obtain the same configurations as the current configuration except for the given attr.
T)estimated_memory_usageF)append)attrsr'   history_cfgsresultssamecfgr+   s          r   same_cfgs_besider6   >   si    
 GDC|~GL(99  NN3D  N    c                 n    S nU H,  n/ SQnSnU H  nXG   X   :w  d  M  Sn  O   U(       d  M)  Un  U$    U$ )N)r   r   r   r   r   r   r   r
   TF )r)   r'   r2   resultr5   keysr4   r+   s           r   !same_cfgs_beside_sharding_overlapr<   X   sX    F	
 Cx7<'  4FM' & Mr7   c                 >   ^  U 4S jn[         R                  U5        U$ )Nc                     > T" U 0 UD6$ r   r9   argskwargsfuncs     r   wrapperregister_prune.<locals>.wrapperq       T$V$$r7   )_PRUNE_FUNCr0   rB   rC   s   ` r   register_prunerH   p   s    % wNr7   c                 >   ^  U 4S jn[         R                  U5        U$ )Nc                     > T" U 0 UD6$ r   r9   r?   s     r   rC   'register_prune_history.<locals>.wrappery   rE   r7   )_PRUNE_HISTORY_FUNCr0   rG   s   ` r   register_prune_historyrM   x   s    % w'Nr7   c                    UR                  SS5      nU S   R                  SS5      nU S   R                  SS5      nU S   R                  SS5      nU S   R                  SS5      nU R                  SS	5      nUc  g	U(       a	  XC-  S
:w  a  gU(       a	  XS-  S
:w  a  gU(       a	  Xc-  S
:w  a  gU(       a  Xs-  S
:w  a  U(       a  gU R                  SS5      n	U	S:X  a  U S   S   n	U	(       a  X9;  a  gg	)z
Prune by mp, the rules are:
1. MP degree should be evenly divided by hidden size and vocab size
2. MP degree should be in the candidates of user defined.
3. MP degree should be less than 8 if no candidates.
r   N	model_cfghidden_size
vocab_sizenum_attention_heads
seq_lengthuse_sequence_parallelFr   Tauto
candidatesget)
r)   r'   r2   r   rP   rQ   rR   rS   rT   mp_degree_candidatess
             r   prune_by_mprZ      s    K.IK(,,]DAK;'++L$?J#K044t ;'++L$?J%MM*A5I{.!3j,12>!Cj,16K$==d;v%(6{C0r7   c                 *   UR                  SS5      nU S   R                  SS5      nSU;   a  US   OU R                  SS5      nUc  gU(       a	  XC-  S:w  a  g	U R                  SS5      nUS
:X  a  U S   S   nU(       a  X6;  a  g	 gUS:w  a  X5:  a  g	g)z
Prune by pp (pipeline-parallelism), the rules are:
1. PP degree should be evenly divided by number of layers.
2. PP degree should be in the candidates of user defined.
3. If no candidates, PP degree should be less than or equal to the number of nodes.
r   NrO   
num_layersnodes   Fr   TrU   rV   rW   )r)   r'   r2   r   r\   	num_nodespp_degree_candidatess          r   prune_by_ppra      s     K.I;'++L$?J#w.IMM'14M  !Q&$==d;v%(6{C0 1
  >i3r7   c           
         UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUb  Ub  Uc  g[        R                  " U5      nUR                  U5        [	        SS/X5      nU(       am  U Hg  nU(       a  M  US   US   -  XE-  :X  d  M  US   U:  d  M*  UR                  S5      S:X  d  MA  SU SU S	US    S
US    S3	n	[        XU 5        SUS'     g   g)Nr   r   	recomputeFmax_mem_usageOOMz
mp_degree z, pp_degree  may cause oom because z,  already oom.TrX   copydeepcopyextendr6   r-   )
r)   r'   r2   pruned_cfgsr   r   r   cfgsr5   r(   s
             r   prune_by_mp_pp_historyrn      s   K.IK.IKKT2MI-1F==.L$[+6NDC!M$s;'779;PP$y0GGO,5",YK|I;Nefijufvewwyz}  J  {K  zL  LY  !Z	B+0(  r7   c                    UR                  SS5      nUR                  SS5      nU S   R                  SS5      nUc  gUc  gU(       ai  SU;   a  US   OU S   R                  SS5      nUUS   -  US	   -  US
   -  nUS:  a	  Xs-  S:w  a  gXSU-  -  S:w  a  gUS:X  a  US:w  a  gUS::  a  US:w  a  gU R                  SS5      nUS:X  a  U S   S   nU(       a  XH;  a  gg)z
Prune by vpp (virtual pipeline parallelism), the rules are:
1. VPP degree should be evenly divided by number of layers.
2. VPP degree should be in the candidates of user defined.
r   Nr   rO   r\   Fglobal_batch_sizer   r	   r   r^   r   T   rU   rV   rW   )	r)   r'   r2   r   r   r\   rp   	acc_stepsvpp_degree_candidatess	            r   prune_by_vpprt      s6    K.I\40J;'++L$?J #g- '(;'++,?F 	 {#$()* )*+ 	 >i3q8Z/0A5>jAo>jAo%MM,=& ), 7 E2r7   c                 4   UR                  SS 5      nUc  g[        R                  " U5      nUR                  U5        [	        SX5      nU(       aH  U HB  nUS   U:  d  M  UR                  S5      S:X  d  M%  SU SUS    S3n[        XU 5        SUS'     g   g)	Nr   Frd   re   zvpp_degree rf   rg   Trh   )r)   r'   r2   rl   r   rm   r5   r(   s           r   prune_by_vpp_historyrv     s    \40J==.L$L'@DC L!J.GGO,5"-j\9PQTUaQbPccp q	B+0(  r7   c                    UR                  SS5      nSU;   a  US   OU S   R                  SS5      nUS:X  a  US   nU(       a  UUS   -  US   -  nUS:X  a  g	U R                  SS5      nUS:X  a  U S
   S   nUc  gW(       aO  XS-  S:w  a  g	XS-  nUR                  SS5      nUb  Xx:  a  g	UR                  SS5      n	U	b  U	S:  a  Ub	  Xx-  S:w  a  g	U(       a  X6;  a  g	g)a  
Prune by mbs (micro batch size), the rules are:
1. Micro batch size should be evenly divided by the local batch size.
2. Micro batch size should be in the candidates of user defined.
3. Prune if a similar configuration with a larger micro batch size resulted in a valid run.
r   Nrp   rO   rU   r   r	   r   TrV   Fr   r   r^   rW   )
r)   r'   r2   r   rp   local_batch_sizembs_candidatesrr   r   r   s
             r   prune_by_mbsrz   2  s8    {{#5t< ') 	#${#''(;TB 
 F"#$78{#$()* 	
 q ]]#5t<N"<01CD.!3$8	KKT2	 $[[t4
!j1n$(A-1r7   c                    UR                  SS 5      nUc  g[        R                  " U5      nUR                  U5        [	        SS/X5      nU(       a  U H  nUS   U:  a8  UR                  SS5      S:  a"  SU SUS    S	3n[        XU 5        US   US'     g
US   U:  d  MO  UR                  S5      S:X  d  Mf  SU SUS    S3n[        XU 5        SUS'     g
   g)Nr   Frr   timer   zmicro_batch_size  may be slower because  has been already runnable.Trd   re   rf   rg   rh   )r)   r'   r2   rl   r   rm   r5   r(   s           r   prune_by_mbs_historyr   h  s$   {{#5t<==.L$	[)7D C&'*::GGFB'!+"34D3EE\]`as]t\u  vQ  !R	B"%f+ &'*::GGO,5"34D3EE\]`as]t\u  vC  !D	B+0(# $ r7   c                    UR                  SS5      nUR                  SS5      nUR                  SS5      nU(       d  gU(       d  gU R                  SS5      nUS:X  a  U S   S   nU R                  SS5      nUS:X  a  U S   S   nU(       a  X6;  a  gU(       a  XG;  a  gU(       a  US	:w  a  US	:w  a  US	:w  a  gUS	:X  a  [        SX5      nU(       a  gg)
a  
Prune by sharding parameters, the rules are:
1. Sharding stage and sharding degree should be specified.
2. Sharding stage and degree should be in the candidates of user defined.
3. If PP (pipeline-parallelism) degree is not 1, sharding stage must be 1.
4. Prune if a similar configuration with a lower sharding stage resulted in a valid run.
5. If sharding degree is 1, sharding stage is invalid.
r
   Nr	   r   FrU   rV   Tr^   )rX   r6   )	r)   r'   r2   r
   r	   r   sharding_stage_candidatessharding_degree_candidatesrm   s	            r   prune_by_shardingr     s     [[!148Nkk"3T:OK.I ).> E F*$-l$;<L$M!!*/@$!G!V+%.|%<=N%O" :!< 	Naq ! 0'Hr7   c                    UR                  SS 5      nUc  gUR                  SS 5      nUc  g[        R                  " U5      nUR                  U5        [	        SX5      nU(       a  U H  nUS   U:  a8  UR                  SS5      S:  a"  SU SUS    S	3n[        XU 5        US   US'     g
US   U:  d  MO  UR                  S5      S:X  d  Mf  SU SUS    S3n[        XU 5        SUS'     g
   g)Nr	   Fr
   r|   r}   r   zsharding_stage r~   r   Trd   re   rf   rg   rh   )	r)   r'   r2   rl   r	   r
   rm   r5   r(   s	            r   prune_by_sharding_historyr     s+    kk"3T:O[[!148N==.L$,gDDC$%6GGFB'!+"1.1AAXY\]mYnXo  pK  !L	B"%f+ $%6GGO,5"1.1AAXY\]mYnXoo| }	B+0(% ( r7   c                 z   UR                  SS5      nUR                  SS5      n[        U5      nUc  gU S   R                  SS5      nU S   R                  SS5      nU(       a  XG;  a  gU(       a  U(       a  X6;  a  gU(       d6  US:w  a  g[        SS/X5      nU(       a  U H  n	U[        U	5      :X  d  M    g   g)a  
Prune by recompute parameters, the rules are:
1. If recompute is not used, return False directly.
2. Usage of recompute and recompute granularity should be in the candidates of user defined.
3. If recompute is not used, but recompute granularity is set, return True for pruning.
4. Prune if a similar configuration without using recompute resulted in a valid run.
5. If recompute is false, prune redundant recompute granularity
r   Nr   FrV   Tfull)rX   get_config_recompute_levelr6   )
r)   r'   r2   r   r   recompute_level recompute_granularity_candidatesuse_recompute_candidatesrm   r5   s
             r   prune_by_recomputer     s     $KK(?FKK6M09O'0'>'B'B($  )6::   8',A H F*56
 "&@&EE  r7   c                 v    SSSS.nU R                  SS 5      nU R                  SS 5      nUc  g U(       d  gX   $ )N   rq   r^   )r   	full_attn	core_attnr   r   r   rW   )r5   recompute_granularity_levelr   r   s       r   r   r     sI    +,11"MGGOT2MGG$;TB*AAr7   c                    [        U5      nUc  g[        R                  " U5      nUR                  U5        [	        SS/X5      nU(       a  U H  n[        U5      US'   US   U:  a5  UR                  SS5      S:  a  SUS    S	3n[        XU 5        US   US'     g
US   U:  d  MZ  UR                  S5      S:X  d  Mq  SUS    S3n[        XU 5        SUS'     g
   g)NFr   r   r   r|   r}   r   z$use_recompute may be slower because r   Trd   re   z$use_recompute may cause oom because rg   )r   ri   rj   rk   r6   rX   r-   )r)   r'   r2   rl   r   rm   r5   r(   s           r   prune_by_recompute_historyr   "  s    19O==.L$	12GD C%?%DC!" %&8GGFB'!+"Fs?G[F\\w x	B"%f+ %&8GGO,5"Fs?G[F\\i j	B+0(' * r7   c                     SU;   a  US   OU R                  S5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nXE-  U-  U-  U:w  a  gg)	Nnum_gpusr   r^   r   r   r	   TFrW   )r)   r'   r2   r   r   r   r   r	   s           r   prune_by_num_gpusr   K  s       	
]]:& 
 K+IK+IK+Ikk"3Q7Oy(?:hFr7   c                    U R                  SS 5      nU R                  SS 5      nU S   nUc  g[        R                  R                  U5      (       d  [	        SU 35      eUc  [	        S5      eUS   nUS   nUS	   nUS
   n	US   n
US   nUS   nUS   nUS   nSUS[        U5      S[        U5      S[        U5      S[        U	5      S[        U
5      S[        U5      S[        U5      S[        U5      S[        U5      /nUR                  SS 5      nUb  UR                  S[        U5      /5        UR                  SS 5      nUb  UR                  S[        U5      /5        UR                  SS 5      nUb  UR                  S[        U5      /5        UR                  S S 5      nUb  UR                  S![        U5      /5        UR                  S"S 5      nUb  UR                  S#[        U5      /5        UR                  S$S 5      nUb  UR                  S%[        U5      /5        [        R                  " US&S&S'9nUR                  S(:X  aa  [        [        [        UR                  5      S)5      5      nUUS*'   S+U S,U S-3nUUS.-  :  nU(       a  US/-  n[        R                  U5        U$ [	        S0UR                    35      e)1Nmemory_estimation_toolrd   rO   Fz7memory_estimation_tool should be a valid path, but got z=max_mem_usage should be set when using memory estimation toolr   r   r   r   r	   r
   r   r   r   pythonz--dp_degreez--mp_degreez--pp_degreez--vpp_degreez--sharding_degreez--sharding_stagez--use_recomputez--micro_batch_sizez--recompute_granularityrP   z--hidden_sizerR   z--num_attention_headsr\   z--num_layersmax_sequence_lengthz--max_sequence_lengthrQ   z--vocab_sizeintermediate_sizez--intermediate_sizeT)capture_outputtextr   rq   r/   z
Estimated z memory usage: z MBi   z, it will be pruned!z*memory_estimation_tool failed with error: )rX   ospathexists
ValueErrorr#   rk   
subprocessrun
returncodeintroundfloatstdoutr%   r&   stderr)r)   r'   r2   r   max_memory_usagerO   r   r   r   r   r	   r
   r   r   r   memory_estimation_cmdrP   rR   r\   r   rQ   r   r:   cur_memory_usagemsgmemory_exceededs                             r   prune_by_memory_estimationr   \  s*   &]]+CTJ }}_d;+&I%77>>011EF\E]^
 	
 K
 	

 $I$I$I&J/0O-.NO,M12#$;< 	IIIJONM!!")0 --t4K$$os;7G%HI#--(=tD&$$$c*=&>?	
 |T2J$$nc*o%FG#--(=tD&$$$c*=&>?	
 |T2J$$nc*o%FG!&94@$$$"C(9$:;	
 ^^F AuU6==%91=>,<()7)?3C2DCH*.>.EF))CC8H
 	
r7   c                 X    SU;   a$  [        XU5      nU(       d  gX@S   S      (       d  gg)z2Prune by sharding overlap for single dp estimationsharding_overlapT
metric_cfgnameF)r<   )r)   r'   r2   rl   r:   s        r   prune_by_sharding_overlapr     s9    
 W$2
 -f56r7   c           
         SSSSSSSSS	S
.	nSSSS.n0 nU H	  nXTX%   '   M     U GH  n[        U[        5      (       d   eUR                  S5      nSnU GH  n	S n
U H  nU	R                  U5      (       d  M  Un
  O   U
(       d  M2  U	[	        U
5         nU
S;   a9  US:X  a  XU
      S:w  a  US-  nM^  M`  [        U5      nXU
      U:X  a  US-  nM}  M  U
S:X  aC  US:X  a  XU
      (       a  US-  nM  M  [        [        U5      5      nXU
      U:X  a  US-  nM  M  U
S:X  a;  US:X  a  XS      S:w  a  US-  nM  M  [        U5      nXU
      U:X  a  US-  nGM  GM	  U
S:X  a/  US:X  a  US-  nGM  [        U5      nXU
      U:X  a  US-  nGM;  GM>  U
S	:X  d  GMG  US:X  a  XS      (       a  US-  nGMa  GMd  [        U5      nX;   nXU
      U:X  d  GM  US-  nGM     GM     W[	        W5      :X  a  gg)Ndpmpppvppmbsshardingstagerc   granularity)	r   r   r   r   r   r	   r
   r   r   r   r   r   )r   r^   rq   r   r   )r   r   r   r   r   *r^   r   TF)
isinstancer#   r"   
startswithlenr   bool)r'   invalid_strategymappinggranularity_mappingreversed_mappingr+   r,   dimshas_matcheddimmatchedvaluer   s                r   
is_invalidr     s   !%!$!.
G %E),&  %(C((((~~c"CG'>>#&&!G ( wCL)CC|"G#<=B'1,K$ C !$E
"G#<=F'1,K$ G +|"G#<='1,K$ > !%SZ 0"G#<=F'1,K$ G '|"J#?@AE'1,K$ F !$E
"G#<=F'1,K$ G %|#q(  #E
"G#<=F'1,K$ G -|"O#DE'1,K$ F !$E
&9&@"G#<=L'1,K$u 	 %~ c$ir7   c                     U R                  SS 5      (       a-  U S   n[        U[        5      (       d   e[        X5      (       a  gg)Nr   TF)rX   r   listr   )r)   r'   r2   r   s       r   prune_by_invalid_strategyr   +  sE    }}'..$%78*D1111g00r7   c                 :   U R                  SS 5      (       a  U R                  S5      nUS   nUS   nUS   nU Vs/ s H  oqU   PM	     nnU(       a,  U(       a%  US:w  a  UR                  S5      [        U5      :w  a  gUS:X  a  UR                  S5      [        U5      :w  a  gU S	   S
   U-  S:w  a  gU S	   S
   U-  n	XS      U	:  a  gSn
U
[        U5      :  a;  XU
      U	:  d  XU
S-
        U	:w  a  XU
      S:w  a  gU
S-  n
U
[        U5      :  a  M;  gs  snf )Nr   r   r   r   r   r   Tr^   rO   r\   F)rX   countr   )r)   r'   r2   rrr   rc   r   itemcompare	max_valuer   s              r   prune_by_refined_recomputer   6  s=   }}($//]]./K(	O,	 '(? @-/0RT4=R0$)>&)H==#s7|3>gmmA.#g,>[!,/);q@k*<89D	a5>I%#b'k!u~	)1q5	"i/GqENa4GFA #b'k ) 1s   Dc                 2   U R                  SS 5      (       Ga  [        R                  " U5      nUR                  U5        U R                  S5      n[        R                  " U5      nUR	                  S5        [        XQU5      nU GH  nU(       d  M  U H  nUS   (       d7  UR                  SS5      S:  a!  U SX    S3n	[        XU 5        US   US'       gX   X   :  aP  UR                  SS5      S:  a:  US   (       a0  US   (       a&  U SX    S	X    S
3n	[        XU 5        US   US'       gX   X   :  d  M  UR                  S5      S:X  d  M  US   (       d  M  US   (       d  M  U SX    SX    S3n	[        XU 5        SUS'       g   GM     g)Nr   r   r|   r}   r    z? may be slower because not recompute has been already runnable.Tr~   r   rd   re   rf   rg   F)rX   ri   rj   rk   r0   r6   r-   )
r)   r'   r2   rl   r   r   rm   r   r5   r(   s
             r   "prune_by_refined_recompute_historyr   T  s    }}($//}}\2K(]]./--#',?DtC/CGGFB4G!4K+/&'-@  )A'	J*-f+#	GM1GGFB/!30#O4+/&'-@WX[XaWbb}(~'	J*-f+# 	GM1GGO4=00#O44+/&'-@WX[XaWbbo(p'	J380#5   < r7   c                 b   [         R                  " U5      nU R                  SS 5      n/ n0 nUb\  UR                  5        HH  u  pxUS   (       d  M  UR	                  U5        [        US   5       VVs0 s H  u  pXx_M	     snnUW'   MJ     U H  nUR                  U5        [        XqU5      n	UR                  US 5      n
U
c    gU	(       d  M@  U	 HR  nX   nXg   U   Xg   U
   :  d  M  UR                  SS5      S:  d  M1  U U SU U
 S	3n[        XU 5        US   US'       g
   M     gs  snnf )Nr   pruner   Fr|   r}   r   r~   r   T)	ri   rj   rX   itemsr0   	enumeraterk   r6   r-   )r)   r'   r2   rl   r   prune_custom_search_dimcustom_dim_levelr+   r   rm   	cur_valuer5   	cfg_valuer(   s                 r   "prune_by_custom_search_dim_historyr     sX    ==.L!&94@ $+113JCW~~'..s3 2;5>1J)1J:5CJ1J) %	 4 'K(l;KKT*	 4H	$))4&+I67+a/'*eI;6McUS\R]]x$yM#GIF&)&kGFO  '* 3)s   3D+)!ri   loggingr   r   	getLoggerr%   rF   rL   r-   r6   r<   rH   rM   rZ   ra   rn   rt   rv   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r9   r7   r   <module>r      s     	 			<	( "J 35 4 HJ 0 13 ( (V 13  @  6 24 , ,^ :<"  0 24 2 2j :<"  B 79 0 0f %'R# #L 8: + +\B %'R% %P 79    @B f
 f
R %'R Rj ?A   @B  : %'R( (V %'R% %r7   