
    Αi[                        S SK r S SKJr  S SKrS SKJs  Js  Jr  S SK	J
r
Jr  S SKJr  S SKJr  S SKJrJrJrJrJr  S SKJrJr  S SKJr  S S	KJrJr  S S
KJr  S SKJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&  \" 5       r'/ SQr(Sq)Sq*S r+S r,S r-S r. " S S5      r/S r0S r1S r2S r3S!S jr4S r5\&" S5       " S S \$5      5       r6g)"    N)defaultdict)
check_typecheck_variable_and_dtype)OperatorDistAttr)get_world_process_group)is_backward_opis_forward_opis_optimize_op6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attr)OP_ROLE_KEYOpRole)core)default_main_programdefault_startup_program)#_keep_layer_norm_scale_bias_to_fp32)unique_name   )ProcessMesh   )AMPPass)register_pass)create_py_readercreate_double_buffer_readerwhilecastc                 Z   U R                  S5      (       aM  U R                  S5      [        R                  R                  R
                  :X  a  U R                  S[        5        U R                  S5      (       aM  U R                  S5      [        R                  R                  R
                  :X  a  U R                  S[        5        U R                  S5      (       aO  U R                  S5      [        R                  R                  R
                  :X  a  U R                  S[        5        g g g )Nin_dtype	out_dtypedtype)has_attrattrr   VarDescVarTypeFP32	_set_attr__target_dtype__)ops    l/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/passes/auto_parallel_fp16.pyset_op_dtype_to_fp16r*   :   s    
JGGJ4<<#7#7#<#<<
Z!12
K  GGK DLL$8$8$=$==
["23	{{7 0DLL4H4H4M4M M
W./ !N    c                 H   U R                  S5      S   nU R                  S5      S   nUR                  U5      nUR                  U5      nUb  Uc   SU SU S35       e[        U 5      (       a[  U R	                  SUR
                  5        UR                  R                  [        R
                  " U R                  S5      5      5        g [        U 5      (       a  UR                  US UR                  S	5       5      nUR                  US UR                  S	5       5      nU R	                  SUR
                  5        U R	                  SUR
                  5        UR                  R                  UR
                  5        UR                  R                  UR
                  5        g g )
NXr   Outzin_var z or out_var z is None of cast opr   r   @)inputoutput_find_var_recursiver	   r&   r    desc	set_dtypepaddler"   r   find)cast_opblockin_nameout_namein_varout_var	in_var_fw
out_var_fws           r)   set_auto_cast_attrr?   I   sb   mmC #G~~e$Q'H&&w/F''1G'"5 
',xj0CD5 W*fll3v||GLL,EFG		 	 --g6IS8I.JK	..x8L(--:L/MN
*ioo6+z'7'78ioo.z//0 
!r+   c                    U R                   R                  (       d  gU R                  nUS:X  a  US:g  $ US:X  a  [        5       (       a  US:g  $ US:X  a  US;  $ US:X  a  US;  $ US	;   a  US
;   $ US;   a  US;  $ US;   a  US;  $ g)NT
batch_normr-   
layer_normfused_bn_add_activation>   r-   Zresnet_unit>   r-   rD   FilterXFilterZfused_attentionfused_feedforward>   LnBiasLn1BiasLn2BiasLnScaleLn1ScaleLn2Scalebatch_norm_grad>   r-   Y@GRADlayer_norm_gradFamp_optionsenabletyper   )r(   r9   op_types      r)   _keep_fp32_inputr[   _   s    >>  ggG,#~,#F#H#H#~++j((->>>:: 
 
 	
 %%o--%%o--r+   c                     U R                   R                  (       d  gU R                  nUS;   a  US:g  $ US:X  a  [        5       (       a  US:g  $ US:X  a  US;  $ US;   a  US;   $ US	;   a  US
:g  $ US;   a  US
:g  $ g)NT)rA   rC   YrB   rE   >   r]   ConvXConvZrH   >   LnMeanLn1MeanLn2Mean
LnVarianceLn1VarianceLn2VariancerT   zX@GRADrQ   FrV   )r(   r:   rZ   s      r)   _keep_fp32_outputrf   ~   s    >>  ggG;;3,#F#H#H3-666:: 
 
 	
 %%8##%%8##r+   c                   \    \ rS rSr SS jrS rS rS rS rS r	S	 r
S
 rS rS rS rSrg)	FP16State   Nc                    Xl         X l        X@l        X0l        U R                  R                  R
                  U l        0 U l        U(       a  XPl        O/ U l        0 U l	        0 U l
        [        [        5      U l        SU l        0 U l        g )NF)programamp_listuse_fp16_guarddist_contextdist_op_contextgrad_op_id_to_op_idgrad_op_to_op_mapforward_op_to_amp_optionsinput_data_var_names_op_fp16_dictforward_non_leaf_tensorsr   listforward_input_cast_opsis_trainout_var_op_deps)selfrk   rl   rn   rm   rs   s         r)   __init__FP16State.__init__   s      ,(--AA 	 *,&(<%(*D%(*%&1'
# !r+   c                 :    U R                   R                  US 5      $ N)rt   get)rz   op_ids     r)   _is_fp16_opFP16State._is_fp16_op   s    !!%%eT22r+   c                    U R                   R                   H  nUR                   H  nUR                   Hs  nX0R                  ;  a*  UR
                  R                  5       /U R                  U'   M<  U R                  U   R                  UR
                  R                  5       /5        Mu     U R                  U5        U R                  U5        M     M     U R                   R                   H  nU R                  U5        M     U R                   R                   H  nU R                  U5        M     U R                   R                   H  nU R                  U5        M     U R                  $ )za
mark the execution mode (fp16 or fp32) for ops in all blocks
include forward ops & backward ops
)rk   blocksopsoutput_arg_namesry   r3   original_idextend_mark_amp_options_info_mark_opresolute_tensor_dtyperesolute_cast_op
cast_blockrx   )rz   r8   r(   names       r)   _build_stateFP16State._build_state   s     \\((Eii//D#7#7768gg6I6I6K5L,,T2,,T299WW0023	 0 ++B/b!   ) \\((E&&u- ) \\((E!!%( ) \\((EOOE" ) }}r+   c                    [        U5      (       a2  UR                  U R                  UR                  R	                  5       '   g[        U5      (       a  UR                  R	                  5       U R                  ;   a  U R                  UR                  R	                  5          U R                  R                  5       ;   aF  U R                  U R                  UR                  R	                  5             nUR                  U5        gggg)zA
Mark amp options info for backward ops according to forward ops
N)	r	   rW   rr   r3   r   r   rq   keysset_amp_options)rz   r(   
amp_options      r)   r    FP16State._mark_amp_options_info   s      **277+>+>+@A Bww""$(>(>>**277+>+>+@A55::<= "&!?!?..rww/B/B/DE"J &&z2= ?  r+   c                    UR                   [        ;   a  g [        U5      (       Ga  UR                   S:X  a;  SUR                  S   ;   a(  SU R                  UR
                  R                  5       '   g UR                   S:X  a  UR                  S   n[        U R                  U   5      S:  at  U R                  U R                  U   S      (       d(  SU R                  UR
                  R                  5       '   g SU R                  UR
                  R                  5       '   g UR                  R                  (       a9  [        R                  XR                  R                  U R                   5      (       a(  SU R                  UR
                  R                  5       '   O'SU R                  UR
                  R                  5       '   UR                   H*  nUR
                  R#                  5       U R$                  U'   M,     O['        U5      [)        [*        R,                  5      :X  a  UR
                  R                  5       U R.                  ;   ar  U R.                  UR
                  R                  5          nX@R                  ;   d   U 5       eU R                  U   U R                  UR
                  R                  5       '   [)        UR1                  S5      5      S:X  a  SU l        g g )	Nassignarray_r   Fr   Top_rolei  )rY   __amp_skip_ops__r	   input_arg_namesrt   r3   r   r   lenry   rW   rX   __amp_utils___need_keep_fp32rl   unsupported_listrm   idru   r   intr   Backwardrq   r"   rx   )rz   r(   r:   var_name	fwd_op_ids        r)   r   FP16State._mark_op   sM   77&&ww("x23E3Ea3H'H<A""277#6#6#89ww("..q1t++H56:--,,X6q9 EJ**277+>+>+@A  EI**277+>+>+@A>>((M,I,IMM22D4G4G- - =B""277#6#6#89<@""277#6#6#89//:<''**,--h7 0 B3v#77ww""$(>(>> 222773F3F3HI	 $6$66?2$?6<@<N<N=""277#6#6#89 rwwy!"c) DM *r+   c                 H   S n UR                  U5      nUb$  UR                  [        R
                  ;  d  SU;   a  g UR                  [        R                  :X  a   UR                  R                  [        5        g g ! [         a  nUR                  U5      n S nANS nAff = f)Nr   )var
ValueError_var_recursiverY   r   _valid_typesr    r5   float32r3   r4   r'   )rz   r   r8   r   es        r)   set_var_to_fp16FP16State.set_var_to_fp16  s    	1))H%C Kxx}9998#99&HH/0 '  	1&&x0C	1s   A< <
B!BB!c                 R   UR                    H  nUR                  S:X  d  M  UR                  S5      S   nUR                  S5      S   nUR	                  U5      nUR	                  U5      nUR                  SUR                  5        UR                  SUR                  5        M     g)zB
Deal the "cast_op" from "FP32" to "FP16" or "BF16" in the model.
r   r-   r   r.   r   r   N)r   rY   r0   r1   r2   r&   r    )rz   r8   r(   r9   r:   r;   r<   s          r)   r   FP16State.resolute_cast_op-  s     ))Bww& ((3-*99U+A.227;33H=Z6['--8 r+   c                 x   UR                    GH  nUR                  R                  (       d  UR                  S:X  a  [	        X!5        M<  [        U5      (       Ga  U R                  UR                  R                  5       5      SL d  UR                  S:X  a  UR                   H`  n[        X#5      (       a  M  UR                  U5       H6  nX@R                  ;  d  M  X@R                  ;  d  M%  U R                  XA5        M8     Mb     UR                   H>  n[!        X%5      (       a  M  UR#                  U5       H  nU R                  Xa5        M     M@     [%        U5        GMU  U R                  UR                  R                  5       5      SL a  UR&                   H  nUR(                  R+                  U5      nUb  UR                  [,        R.                  ;  a  MA  UR0                  [2        :X  d  MW  UR                  R5                  [6        R8                  R:                  R<                  5        M     GM(  GM+  [?        U5      (       d  GM>  U R                  UR                  R                  5       5      SL d  UR                  S:X  a\  UR                   H>  n[!        X%5      (       a  M  UR#                  U5       H  nU R                  Xa5        M     M@     [%        U5        GM  U R                  UR                  R                  5       5      SL d  GM  UR&                   H  nUR(                  R+                  U5      nUb  UR                  [,        R.                  ;  a  MA  UR0                  [2        :X  d  MW  UR                  R5                  [6        R8                  R:                  R<                  5        M     GM     g )Nr   TF) r   rW   rX   rY   r?   r	   r   r3   r   input_namesr[   r0   ru   rs   r   output_namesrf   r1   r*   r   varsr   r   r   r    r'   r4   r   r#   r$   r%   r   )rz   r8   r(   r9   in_var_namer:   out_var_namer<   s           r)   r   FP16State.resolute_tensor_dtype:  s   ))B>>((77f$&r1R   $$RWW%8%8%:;tCww&(#%>>+B88$+-88G+<K +3P3P P$/7P7P$P $ 4 4[ H ,= $2 %'OO,R::$,.IIh,?L 00E -@ %4
 ),%%bgg&9&9&;<E(*(;(;"'**..">#O&||=3M3MM$"==,<<#LL224<<3G3G3L3LM )< F  ##$$RWW%8%8%:;tCww&($&OO,R::$,.IIh,?L 00E -@ %4
 ),%%bgg&9&9&;<E(*(;(;"'**..">#O&||=3M3MM$"==,<<#LL224<<3G3G3L3LM )<g r+   c                    U R                   R                  nSnU[        UR                  5      :  Ga6  UR                  U   nSnUR                  [
        ;   a  US-  nMF  [        U5      (       a  U R                  UR                  R                  5       5      SL aH  U R                  UUU[        [        R                  R                  R                  U R                   5      nGOcU R                  UR                  R                  5       5      SL aF  U R                  UUU[        R                  R                  R                  [        U R                   5      nGO[!        U5      (       Ga  UR                  R                  5       UR"                  ;   a  U R                  UR                  R                  5       5      SL aH  U R%                  UUU[        [        R                  R                  R                  U R                   5      nGOBU R                  UR                  R                  5       5      SL aF  U R%                  UUU[        R                  R                  R                  [        U R                   5      nOUR                  S:X  a  UR&                  S   nUR(                  S   nUR+                  U5      nUR-                  U5      n	UR(                   HJ  nU	R.                  UR+                  U5      R.                  :X  a  M.   U	 SUR+                  U5       SU 35       e   UR                  R1                  U	R.                  5        X5S-   -  nU[        UR                  5      :  a  GM6  UR3                  5         g )Nr   r   FTsum, )rn   ro   r   r   rY   r   r	   r   r3   r   _insert_forward_cast_opsr'   r   r#   r$   r%   r   rp   _insert_backward_cast_opsr   r   r   r2   r    r4   _sync_with_cpp)
rz   r8   ro   idxr(   num_cast_opsr   r   r<   r;   s
             r)   r   FP16State.cast_blockx  s   ++;;C		N"3BLww**qr""##BGG$7$7$9:eC#'#@#@(,,11))$L %%bgg&9&9&;<D#'#@#@,,11())$L  ##77&&(O,O,OO''(;(;(=>%G'+'E'E!, LL0055 --( ))"''*=*=*?@DH'+'E'E! LL0055, --( WW%#%#6#6q#9L"$"4"4Q"7K#ii5G"66{CF')'9'9%||uyy/E/K/KK %hb;)?(@2$GK (: LL**6<<8!##Cu C		N"v 	r+   c                    SnUR                    GH~  nU[        R                  :X  a  [        X5      (       a  M*  UR	                  U5      n	U	c   eUR                  U5       GH(  n
UR                  U
5      nUb.  UR                  [        R                  ;  d  UR                  U:X  a  MH  UR                  U:X  d  MZ  UR                  S-   [        R                  U5      -   nUR                  R                  U5      nU R                  UR                   R#                  5       ==   XR                  XTU4/-  ss'   [$        R&                  " U	R)                  UR                  5      5      nUc   eUb  UR                  U:w  a  UR*                  nUR,                  nU	R.                  nUR1                  UUSUR2                  S9n[5        UUUUUS9  SnUR7                  S5      (       a  UR9                  S5      nUR;                  USS	U0S
U0SUR                  SUR                  [<        [>        R@                  0S9nURC                  SU5        [E        UUUUUS9  US-  nURG                  UR                  U5        U	RI                  X5        GM+     GM     UR7                  S5      (       a,  UR9                  S5      S:w  a  UR9                  S5      U:X  d   eU$ )Nr   z.cast_F)r   r    persistablestop_gradientchunk_id/op_namescoper   r-   r.   r   r   rY   inputsoutputsattrsr   )%r   r5   r   r[   get_op_dist_attr_for_programr0   r2   rY   r   r   r    r   _dtype_to_strr   r   rw   r3   r   copydeepcopyget_input_dist_attrprocess_meshdims_mappingr   
create_varr   r   r!   r"   _insert_op_without_syncr   r   Forwardr&   r   _rename_inputset_input_dist_attr)rz   r(   r   r8   	src_dtype	dst_dtypern   r   r9   consume_op_attrr   r;   	cast_namecast_varin_var_dist_attrref_meshref_mappingref_chunk_idr   r7   s                       r)   r   "FP16State._insert_forward_cast_ops  s    ~~GFNN*/?/L/L*GGKO"...!xx022;?N{{-*D*DD||y0<<9,"#'55i@A 
  %zz~~i8H//0C0C0EF"KKwOK F (,}}';;FKKH($ ,777'8>>Y+F $4#@#@&6&C&C'6'?'?#(#3#3!*"+(-*0*>*>	 $4 $ *($'$%1 (+;;~66+-77>+BL"'"?"?!'$'=%*H$5 *FLL +X^^ +V^^# #@ 
#  ))*L O#$'(%1 %)$$V[[)<#77!U  1 &h ;;{##(<(B77;'9444r+   c                 J   SnUR                   R                  5       nUR                  n	U	R                  U   n
UR	                  U5      nUc   eUR
                   HL  nUR                  U5      n[        XR                  5      (       a  M0  UR                  U:X  a  MB   U SU 35       e   U R                  U
    GHe  u  nnnnnUUR                  ;   a]  XR                  U5      ;   d   SU SU SU 35       eUR                  U5      nUc   eUR                  X5        UR                  UU5        US-   nUUR                   ;   d  M  [#        UR%                  U5      5      S:X  a  M  [#        UR%                  U5      5      S:X  d   SU S	U 35       eUR%                  U5      S   nUR                  U5      nUR'                  U5      nUc   U 5       eUR(                  nUR*                  nUR,                  nUUl        UR/                  [0        R2                  " S
R5                  US/5      5      UUR6                  UR8                  UR:                  UR<                  S9nUR?                  UU5        URA                  UUR                  5        URC                  UR                  U5        URE                  US-   SSUR                  /0SUR                  /0SUSU[F        [H        RJ                  0S9nUR                   RM                  U5        [O        UUUUUS9  US-  nGMh     U$ )Nr   r   zvar: z not in op's z. z@GRADr   [z], Current Op:  r   r    shaperY   r   r   r   r-   r.   r   r   r   r   )(r3   r   ro   rp   r   r   r   rf   r   r    rw   r   r0   r   r   r   r   r   r1   get_output_dist_attrr   r   r   r   r   generate_with_ignorable_keyjoinr   rY   r   r    set_tensor_dist_attr_for_program_rename_outputset_output_dist_attrr   r   r   r   r4   r   )rz   r(   r   r8   r   r   rn   r   r   ro   forward_op_idgrad_op_attrr   r<   r   src_name	slot_namesrc_var_dist_attrgrad_slot_name	grad_namegradgrad_dist_attrr   r   r   	cast_gradr7   s                              r)   r   #FP16State._insert_backward_cast_ops  sQ    gg))+&66';;KH#@@D'''//Lii-G \\22==I-H'"YK/HH-	 0 ((7
 BNN*88I#66 H:]9+RtD6 %1$D$DX$N!(444  500<MN '0N0ryy01Q6299^45: 'rd;: IIn5a8	yy+!-!B!B9!M%1Ai[A1)66,99+44*6'!,,$@@G 45 $** $ 0 0"&"4"4 - 		 ==~ !!)Y^^<11NNN
  77!G)..!12"TYYK0"I#Y#V__ 8 
 		##I.F ) !K 8N r+   )rt   rl   rn   rw   ru   rr   rq   rs   rx   ry   rk   rm   r~   )__name__
__module____qualname____firstlineno__r{   r   r   r   r   r   r   r   r   r   r   __static_attributes__ r+   r)   rh   rh      sF     ""<3@3&(!T1(9<N|>@\|^r+   rh   c           	         [         R                  R                  5       R                  5       nUR	                  5         [        U S[        [        4S5        U  H  n[        US/ SQS5        M     UR                  [        R                  " SR                  SU/5      5      S/S[        R                  R                  R                   SSS	9n[#        X6S
/[$        R&                  SS9  XS.nXS.nS[(        R*                  0n	UR-                  SUUU	S9n
[/        U
R0                  5      n[3        [$        R&                  5      Ul        SUl        SUl        [;        [$        R&                  5      S:  a  SUl        U  He  nUR?                  U5      nUc   eURA                  URB                  URD                  5        URG                  URB                  URD                  5        Mg     URI                  X5        X4$ )Nxcheck_finite_and_unscale)float16r   float64.find_infinite_scaler   boolF)r   r   r    rY   r   r   r   r   r   )r-   Scale)r.   FoundInfiniter   r   )%r5   staticr   global_blockr   r   tuplerv   r   r   r   r   r   r   r#   r$   DENSE_TENSORr   world_process_groupranksr   Optimize	append_opr   r3   r   r   impl_idxr   r   	impl_type get_tensor_dist_attr_for_programset_input_dims_mappingr   r   set_output_dims_mappingset_op_dist_attr_for_program)gradsloss_scalingr   rn   
main_blockr   	found_infr   r   r   new_opnew_op_dist_attrgg_dist_attrs                 r)   _check_and_update_gradientr  w  s   335BBDJucE4=*DE -&		
  %%44HH+T23
 c\\!!.. & 	I "':'@'@1 0F8G(E!!'	 " F (4$/0C0I0I$J! ! !
$$%)%?""CCAF&&&//FFK,,	
 	00FFK,,	
  --fGr+   c                 \   U  VVs/ s H  u  pUPM	     nnnU Vs/ s H$  o"R                   [        R                  :X  d  M"  UPM&     nnU Vs/ s H  o"R                   [        :X  d  M  UPM     nn[	        U5      [	        U5      -   [	        U5      :X  d   S5       eX4U4$ s  snnf s  snf s  snf )Nz4Data types of all grads must be either fp16 or fp32.)r    r5   r   r'   r   )params_grads_r  r  
fp32_grads
fp16_gradss         r)   _split_gradsr%    s    '(<41Q<E("@Ugg&?!UJ@"BUgg1A&A!UJBz?S_,E
: >: j(( )@Bs   B!B$ B$B)'B)c                    [        5       n[        U5      Ul        SUl        [	        U 5      (       d   eSUl        U R                   HE  nUR                  U5      nUR                  U5      nUc   eUR                  XWR                  5        MG     U R                   HE  nUR                  U5      nUR                  U5      nUc   eUR                  XWR                  5        MG     UR                  X5        g )Nr   )r   r   r   r  r
   r   r   r   r  r  r   r   r  r  )r  r  r8   rn   r  r   r   var_dist_attrs           r)   _set_op_dist_attr_with_ranksr(    s    ')$/$6! !&!!!! !**ii!$EEcJ(((//00	
	 + ++ii!$EEcJ(((0000	
	 , --fGr+   c                     [        U R                  5       H;  u  p#UR                  S:X  d  M  UR                  S   UR                  :X  d  M6  US-   s  $    [        S5      e)N
reduce_anyr   r   z=not found the correct location for memcopy for found_inf_var.)	enumerater   rY   r   r   RuntimeError)r8   found_inf_varr   r(   s       r)   _get_memcopy_idxr.    sW    UYY'GG|###A&-*<*<<7N ( G r+   c           	      D   UR                   nU R                  [        R                  " UR	                  S/5      5      UR
                  UR                  [        R                  R                  R                  SUR                  S9n[        UUUR                   Vs/ s H  nSPM     sn[        R                  SS9  US:X  a  SnO[        SU S	35      eS
U0n	U R!                  USSU/0SU/0U	S9n
[#        U
[        R                  X5        U R%                  5         U$ s  snf )Nmemcopy_Fr   r   r   r   D2Hzdirection [z] is not supported yet.dst_place_type
memcpy_d2hr-   r.   )indexrY   r   r   r   )r   r   r   r   r   r    r   r   r#   r$   r  r   r   r  r  NotImplementedErrorr   r(  r   )r8   r   src_varrn   	directionr   
output_varir2  r   r  s              r)   _insert_memcopyr:    s5   ||H!!44MM:,'
 mmmm\\!!..++ " 	J ]]#]]#!! E!)$;<
 	
 ~.E**gY% + F !#))5 
3 	$s   Dc                     [        5       n [        5       n0 nU R                   H2  nUR                  5        H  nUR                  X$R
                  '   M     M4     S nUR                  5       R                   GH
  nU" U5      (       d  M  UR                  S   nUR                  US 5      [        :X  d  M>  UR                  S5      (       d   SU S35       eUR                  5       R                  U5      nUR                  [        R                  :X  a  UR                  R!                  [        5        UR#                  S5      [$        R&                  R(                  R*                  :X  d  M  UR-                  S[        5        GM     g )Nc                     SnU R                   nUR                  U5      (       a  g[        U R                  5      S:w  a  [        U R                  5      S:w  a  gg)Nc_Fr   r   T)rY   
startswithr   r   r   )r(   comm_op_prefixrZ   s      r)   is_initialization_op2cast_startup_program.<locals>.is_initialization_op  sO    ''n--r""#q(S1C1C-D-Ir+   r   r    z>initialization op is supported to has dtype attribute but got r  )r   r   r   all_parametersr    r   r
  r   r   r   r'   r!   r   r5   r   r3   r4   r"   r   r#   r$   r%   r&   )	main_programstartup_programparam_to_dtyper8   pr@  r(   output_namer<   s	            r)   cast_startup_programrH    s4   ')L-/ON$$%%'A%&WWN66" ( %	 **,00##--a0K!!+t48HH{{7++ TUWTXXYZ+ *668<<[I==FNN2LL**+;<777#t||';';'@'@@LL*:; 1r+   auto_parallel_fp16c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )FP16Passi.  c                 "   > [         TU ]  5         g r~   )superr{   )rz   	__class__s    r)   r{   FP16Pass.__init__0  s    r+   c                    U R                  S5      U l        U R                  S5      U l        U R                  S5      nU R                  SS 5      U l        U R                  c  U R                  SS 5      S:H  U l        [        R
                  nU R                  S:X  a%  [        R                  R                  R                  nONU R                  S:X  a%  [        R                  R                  R                  nO[        S	U R                   S
35      eUq[        qU" [        U R                  S5      5      [        U R                  S5      5      S U R                  S9nU R                  S5       Vs/ s H  oR                  PM     n	n[         R"                  R%                  X5         ['        UUU R                  U R                  S5      U	5      n
U
R)                  5       n[+        5         U(       a  U R-                  U R                  5        S S S 5        W(       Ga  U R                  S:X  Ga  [         R"                  R%                  X5         U R/                  5         U R1                  5         [3        U5      u  pnU R                  S5      (       d  U R                  S5      S:w  Ga  / nU(       aP  UR5                  / 5         [7        UU R8                  SU R                  5      u  nnS S S 5        UR;                  W5        U(       aP  UR5                  / 5         [7        UU R8                  SU R                  5      u  nnS S S 5        UR;                  W5        UR5                  / 5         UR=                  5       nUR?                  [         R@                  RB                  RE                  SRG                  SS/5      5      US   RH                  S US   RJ                  US   RL                  SSS9nURO                  SSU0SU/0SS0S9n[Q        U R                  US/[R        RT                  SS 9  [W        U[R        RT                  UU R                  5        UR?                  [         R@                  RB                  RE                  SRG                  S!S/5      5      URH                  S URJ                  URL                  SSS9nURO                  S"SU0SU0S/SS#S$.S9n[Q        U R                  UURX                   Vs/ s H  nSPM     sn[R        RT                  SS 9  [W        U[R        RT                  UU R                  5        S S S 5        U R                  S5      (       aL  UR5                  / 5         U(       a  U R[                  UW5        U(       a  U R[                  UW5        S S S 5        S S S 5        U R                  S%5      nS#Ul.        U R                  (       a  SUl.        U R                  S:X  a  [_        U[         R`                  Rb                  [         R`                  Rd                  45      (       a[  UR5                  / 5         [g        WW5      n[i        UUUU R                  5      nS S S 5        URk                  S&WR                  5        g [m        US'5      (       a  URk                  S&WR                  5        g g g g s  snf ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= fs  snf ! , (       d  f       GN= f! , (       d  f       GNx= f! , (       d  f       GN= f! , (       d  f       N= f)(Nrn   r    r!  use_optimizer_fp16levelo3r  bfloat16ztarget dtype [z"] is for amp o2 not supported yet.custom_white_listcustom_black_list)r    
input_datarm   use_dynamic_loss_scalinginit_loss_scalingg      ?z@fp32z@fp16r  concattmpr   F)r   r    r   	lod_levelrY   r   r   r-   r.   axisr   r   r   r  r*  T)dimkeep_dim
reduce_allbase_optr  _set_auxiliary_var)7get_attrrn   target_dtyperQ  	amp_utilsAutoMixedPrecisionListsr   r#   r$   FP16BF16r5  r'   r   setr   r5   r	  program_guardrh   r   rH  
_cast_loss_init_amp_var_scale_lossr%  _optimized_guardr  _loss_scalingappendr
  r   utilsr   r   r   r    r\  rY   r  r   r  r  r(  r   _update_loss_scaling_multi_precision
isinstance	optimizerAdamAdamWr.  r:  rb  hasattr)rz   rC  rD  contextr!  AMPList_FP16Pass__target_dtyperl   r   rs   
fp16_staterx   r  r#  r$  
found_infsr"  found_inf_fp32found_inf_fp16r8   all_infs	concat_opr  reduce_any_opr9  ra  
insert_idxs                              r)   _apply_single_implFP16Pass._apply_single_impl6  sW    MM.9 MM'2}}^4"&--0Dd"K""*&*mmGT&Bd&JD#33	)!\\1166N*,!\\1166N% !2!2 33UV  *!123123##	
 59MM,4OP4OS4OP]]((G"!!$ %J "..0H " 1 12 H    I-]]00O&&($$&4@4N1Ez &@AA==)<=D%'
%!-!>!>r!B4N$.$($6$6$+$($5$5	5" 1> "C '--n=%!-!>!>r!B4N$.$($6$6$+$($5$5	5" 1> "C '--n=)::2>$0$=$=$?E (-'7'7%+\\%=%=%Y%Y$'HHh->$?&" '1m&9&9&**4Q-*A*A%/]%7%7,1.3 (8 
(H ).%-(+Z'8).
(;'-qk	 )8 )I . $ 1 1 (!# 3 9 9)* 9 ) 3 9 9 % $ 1 1	 ).(8(8%+\\%=%=%Y%Y$'HH.CU-K$L&" '/nn&**2*<*<%-]],1.3 )9 
)I -2OO%1(+X).	(:,-30526'"	 -< 	-M . $ 1 1 )-6__ =__ = 3 9 9)* 9 - 3 9 9 % $ 1 1	A ?N }}%?@@)::2>) $ 9 9*i P) $ 9 9*i P	 ?M PZ }}Z0H(,H%&&,1)  I-v//44f6F6F6L6LM  &66r: &6eY%G
$3!:y$:K:K%		 ; //Y^^LX';<<//Y^^L = .i #  QGG> "C!B "C!BH !>y ?>P ?>M POl ;:s   -Z;#A%[ A5\=&[#1\&[$:*\$F[;3[6?;[;:/\)3\\ %\1 
[
[!	\$
[3	.\6[;;
\
	\
\	\
\.1
\?)rn   rd  rQ  )r   r   r   r   r{   r  r   __classcell__)rN  s   @r)   rK  rK  .  s    rM rMr+   rK  )r1  )7r   collectionsr   r5   paddle.static.amp.fp16_utilsr	  amp
fp16_utilsre  paddle.common_ops_importr   r   6paddle.distributed.auto_parallel.static.dist_attributer   5paddle.distributed.auto_parallel.static.process_groupr   -paddle.distributed.auto_parallel.static.utilsr   r	   r
   r   r   /paddle.distributed.fleet.meta_optimizers.commonr   r   paddle.frameworkr   paddle.staticr   r   r   paddle.utilsr   auto_parallel.process_meshr   auto_parallel_ampr   	pass_baser   r  r   r'   r   r*   r?   r[   rf   rh   r  r%  r(  r.  r:  rH  rK  r   r+   r)   <module>r     s     #  0 0 I  P ! G M $ 4 & $-/    01,>8Z Zz7t)H.)X<D #$yMw yM %yMr+   