
    x-j[                        d dl Z d dlmZ d dlZd dlmc mc mZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZ d dlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&  e            Z'g dZ(da)da*d Z+d Z,d Z-d Z. G d d          Z/d Z0d Z1d Z2d Z3d"dZ4d Z5 e&d           G d  d!e$                      Z6dS )#    N)defaultdict)
check_typecheck_variable_and_dtype)OperatorDistAttr)get_world_process_group)is_backward_opis_forward_opis_optimize_op6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attr)OP_ROLE_KEYOpRole)core)default_main_programdefault_startup_program)#_keep_layer_norm_scale_bias_to_fp32)unique_name   )ProcessMesh   )AMPPass)register_pass)create_py_readercreate_double_buffer_readerwhilecastc                 <   |                      d          rH|                     d          t          j        j        j        k    r|                     dt                     |                      d          rH|                     d          t          j        j        j        k    r|                     dt                     |                      d          rJ|                     d          t          j        j        j        k    r|                     dt                     d S d S d S )Nin_dtype	out_dtypedtype)has_attrattrr   VarDescVarTypeFP32	_set_attr__target_dtype__)ops    l/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/passes/auto_parallel_fp16.pyset_op_dtype_to_fp16r*   :   s    
J3GGJ4<#7#<<<
Z!1222
K  4GGK  DL$8$===
["2333	{{7 0 0 0DL4H4M M M
W./////0 0 M M    c                    |                      d          d         }|                     d          d         }|                    |          }|                    |          }||J d| d| d            t          |           r\|                     d|j                   |j                            t          j        | 	                    d                               d S t          |           r|                    |d |                    d	                             }|                    |d |                    d	                             }|                     d|j                   |                     d|j                   |j                            |j                   |j                            |j                   d S d S )
NXr   Outzin_var z or out_var z is None of cast opr   r   @)inputoutput_find_var_recursiver	   r&   r    desc	set_dtypepaddler"   r   find)cast_opblockin_nameout_namein_varout_var	in_var_fw
out_var_fws           r)   set_auto_cast_attrr?   I   s   mmC  #G~~e$$Q'H&&w//F''11G'"5"5D'DDxDDD #6"55 W 	1*fl333v|GLL,E,EFFGGGGG		 	  1--g6IS8I8I6I.JKK	..x8L(--:L:L8L/MNN
*io666+z'7888io...z/000001 1r+   c                     | j         j        sdS | j        }|dk    r|dk    S |dk    rt                      r|dk    S |dk    r|dvS |dk    r|dvS |d	v r|d
v S |dv r|dvS |dv r|dvS dS )NT
batch_normr-   
layer_normfused_bn_add_activation>   r-   Zresnet_unit>   r-   rD   FilterXFilterZfused_attentionfused_feedforward>   LnBiasLn1BiasLn2BiasLnScaleLn1ScaleLn2Scalebatch_norm_grad>   Y@GRADr-   layer_norm_gradFamp_optionsenabletyper   )r(   r9   op_types      r)   _keep_fp32_inputr[   _   s    >  tgG,#~,#F#H#H#~+++j((->>>::: 
 
 	
 %%%o--%%%o--5r+   c                     | j         j        sdS | j        }|dv r|dk    S |dk    rt                      r|dk    S |dk    r|dvS |dv r|dv S |d	v r|d
k    S |dv r|d
k    S dS )NT)rA   rC   YrB   rE   >   r]   ConvXConvZrH   >   LnMeanLn1MeanLn2Mean
LnVarianceLn1VarianceLn2VariancerT   zX@GRADrQ   FrV   )r(   r:   rZ   s      r)   _keep_fp32_outputrf   ~   s    >  tgG;;;3,#F#H#H3-666::: 
 
 	
 %%%8##%%%8##5r+   c                   T    e Zd Z	 ddZd Zd Zd Zd Zd Zd Z	d	 Z
d
 Zd Zd ZdS )	FP16StateNc                    || _         || _        || _        || _        | j        j        j        | _        i | _        |r|| _        ng | _        i | _	        i | _
        t          t                    | _        d| _        i | _        d S )NF)programamp_listuse_fp16_guarddist_contextdist_op_contextgrad_op_id_to_op_idgrad_op_to_op_mapforward_op_to_amp_optionsinput_data_var_names_op_fp16_dictforward_non_leaf_tensorsr   listforward_input_cast_opsis_trainout_var_op_deps)selfrj   rk   rm   rl   rr   s         r)   __init__zFP16State.__init__   s      ,(-A 	 *,& 	+(<D%%(*D%(*%&1'
 '
# !r+   c                 8    | j                             |d           S N)rs   get)ry   op_ids     r)   _is_fp16_opzFP16State._is_fp16_op   s    !%%eT222r+   c                 J   | j         j        D ]}|j        D ]}|j        D ]f}|| j        vr#|j                                        g| j        |<   .| j        |                             |j                                        g           g|                     |           | 	                    |           | j         j        D ]}| 
                    |           | j         j        D ]}|                     |           | j         j        D ]}|                     |           | j        S )zy
        mark the execution mode (fp16 or fp32) for ops in all blocks
        include forward ops & backward ops
        )rj   blocksopsoutput_arg_namesrx   r3   original_idextend_mark_amp_options_info_mark_opresolute_tensor_dtyperesolute_cast_op
cast_blockrw   )ry   r8   r(   names       r)   _build_statezFP16State._build_state   s^    \( 
	" 
	"Ei 	" 	"/  D4#77768g6I6I6K6K5L,T22,T299W00223    ++B///b!!!!	" \( 	. 	.E&&u----\( 	) 	)E!!%(((( \( 	# 	#EOOE""""}r+   c                    t          |          r(|j        | j        |j                                        <   dS t          |          r|j                                        | j        v r| j        |j                                                 | j                                        v rJ| j        | j        |j                                                          }|                    |           dS dS dS dS )zQ
        Mark amp options info for backward ops according to forward ops
        N)	r	   rW   rq   r3   r   r   rp   keysset_amp_options)ry   r(   
amp_options      r)   r   z FP16State._mark_amp_options_info   s      	3 *27+>+>+@+@AAA B 		3w""$$(>>>*27+>+>+@+@A5::<<= = "&!?.rw/B/B/D/DE"J &&z22222		3 		3>>= =r+   c                    |j         t          v rd S t          |          r||j         dk    r2d|j        d         v r#d| j        |j                                        <   d S |j         dk    r|j        d         }t          | j	        |                   dk    rc| j        | j	        |         d                  s"d| j        |j                                        <   n!d| j        |j                                        <   d S |j
        j        r+t                              || j        j        | j                  r"d| j        |j                                        <   n!d| j        |j                                        <   |j        D ]#}|j                                        | j        |<   $nt'          |          t)          t*          j                  k    r|j                                        | j        v rd| j        |j                                                 }|| j        v sJ |             | j        |         | j        |j                                        <   t)          |                    d                    dk    r	d| _        d S d S )	Nassignarray_r   Fr   Top_rolei  )rY   __amp_skip_ops__r	   input_arg_namesrs   r3   r   r   lenrx   rW   rX   __amp_utils___need_keep_fp32rk   unsupported_listrl   idrt   r   intr   Backwardrp   r"   rw   )ry   r(   r:   var_name	fwd_op_ids        r)   r   zFP16State._mark_op   so   7&&&F !	w(""x23Ea3H'H'H<A"27#6#6#8#89w("".q1t+H566::-,X6q9 I EJ*27+>+>+@+@AADH*27+>+>+@+@AF>( AM,I,IDM2D4G- - A =B"27#6#6#8#899<@"27#6#6#8#89/ G G:<'**,,-h77G B3v#7#777w""$$(>>> 2273F3F3H3HI	 D$66662666<@<N="27#6#6#8#89 rwwy!!""c)) DMMM *)r+   c                 2   d }	 |                     |          }n,# t          $ r}|                    |          }Y d }~nd }~ww xY w||j        t          j        vsd|v rd S |j        t          j        k    r!|j	        
                    t                     d S d S )Nr   )var
ValueError_var_recursiverY   r   _valid_typesr    r5   float32r3   r4   r'   )ry   r   r8   r   es        r)   set_var_to_fp16zFP16State.set_var_to_fp16  s    	1))H%%CC 	1 	1 	1&&x00CCCCCC	1 Kx}9998##F9&&H/00000 '&s    
A>Ac                 \   |j         D ]}|j        dk    r|                    d          d         }|                    d          d         }|                    |          }|                    |          }|                    d|j                   |                    d|j                   dS )zR
        Deal the "cast_op" from "FP32" to "FP16" or "BF16" in the model.
        r   r-   r   r.   r   r   N)r   rY   r0   r1   r2   r&   r    )ry   r8   r(   r9   r:   r;   r<   s          r)   r   zFP16State.resolute_cast_op-  s     ) 	9 	9Bw&  ((3--*99U++A.227;;33H==Z666['-888	9 	9r+   c                 L   |j         D ]}|j        j        s|j        dk    rt	          ||           +t          |          r|                     |j                                                  du s|j        dk    r|j	        D ]S}t          ||          r|                    |          D ]*}|| j        vr|| j        vr|                     ||           +T|j        D ]A}t!          ||          r|                    |          D ]}|                     ||           Bt%          |           )|                     |j                                                  du rx|j        D ]p}|j                            |          }||j        t,          j        vr2|j        t2          k    r.|j                            t6          j        j        j                   qt?          |          r9|                     |j                                                  du s|j        dk    rZ|j        D ]A}t!          ||          r|                    |          D ]}|                     ||           Bt%          |           t|                     |j                                                  du rx|j        D ]p}|j                            |          }||j        t,          j        vr2|j        t2          k    r.|j                            t6          j        j        j                   qd S )Nr   TF) r   rW   rX   rY   r?   r	   r   r3   r   input_namesr[   r0   rt   rr   r   output_namesrf   r1   r*   r   varsr}   r   r   r    r'   r4   r   r#   r$   r%   r   )ry   r8   r(   r9   in_var_namer:   out_var_namer<   s           r)   r   zFP16State.resolute_tensor_dtype:  sm   ) ;	N ;	NB>( 7f$$&r5111R   5N $$RW%8%8%:%:;;tCCw&((#%> I I+B88 %$+-88G+<+< I IK +43P P P$/t7P$P$P $ 4 4[% H H HI %'O F F,R:: %$,.IIh,?,? F FL 00uEEEEF(,,,,%%bg&9&9&;&;<<EE(*(; N N"'*..">">#O&|=3MMM$"=,<<<#L224<3G3LMMM## N$$RW%8%8%:%:;;tCCw&(($&O F F,R:: %$,.IIh,?,? F FL 00uEEEEF(,,,,%%bg&9&9&;&;<<EE(*(; N N"'*..">">#O&|=3MMM$"=,<<<#L224<3G3LMMMw;	N ;	Nr+   c                 @   | j         j        }d}|t          |j                  k     r|j        |         }d}|j        t
          v r|dz  }<t          |          r|                     |j        	                                          du r:| 
                    |||t          t          j        j        j        | j                   }n'|                     |j        	                                          du r8| 
                    |||t          j        j        j        t          | j                   }nt!          |          r|j        	                                |j        v r|                     |j        	                                          du r:|                     |||t          t          j        j        j        | j                   }n'|                     |j        	                                          du r8|                     |||t          j        j        j        t          | j                   }n|j        dk    r|j        d         }|j        d         }|                    |          }|                    |          }	|j        D ]J}|	j        |                    |          j        k    s%J |	 d|                    |           d|             K|j                            |	j                   ||dz   z  }|t          |j                  k     |                                 d S )Nr   r   FTsum, )rm   rn   r   r   rY   r   r	   r   r3   r   _insert_forward_cast_opsr'   r   r#   r$   r%   r   ro   _insert_backward_cast_opsr   r   r   r2   r    r4   _sync_with_cpp)
ry   r8   rn   idxr(   num_cast_opsr   r   r<   r;   s
             r)   r   zFP16State.cast_blockx  s   +;C	NN""3BLw***qr"" 19##BG$7$7$9$9::eCC#'#@#@(,1)$ $LL %%bg&9&9&;&;<<DD#'#@#@,1()$ $L  ## 97&&((O,OOO''(;(;(=(=>>%GG'+'E'E!, L05 -( ( ))"'*=*=*?*?@@DHH'+'E'E! L05, -( ( W%%#%#6q#9L"$"4Q"7K#ii55G"66{CCF')'9  %|uyy/E/E/KKKK%GG;)?)?GG2GG  LKKK L**6<888<!##Cu C	NN""v 	r+   c                 f   d}|j         D ]\}|t          j        k    rt          ||          r$|                    |          }	|	J |                    |          D ]}
|                    |
          }||j        t          j	        vs|j
        |k    r9|j
        |k    r|j        dz   t                              |          z   }|j                            |          }| j        |j                                        xx         ||j        |||fgz  cc<   t%          j        |	                    |j                            }|J ||j
        |k    r|j        }|j        }|	j        }|                    ||d|j                  }t5          |||||           d}|                    d          r|                    d          }|                    |dd	|id
|id|j
        d|j
        t<          t>          j         i          }|!                    d|           tE          |||||           |dz  }|#                    |j        |           |	$                    ||           
^|                    d          r4|                    d          dk    r|                    d          |k    sJ |S )Nr   z.cast_F)r   r    persistablestop_gradientchunk_id/op_namescoper   r-   r.   r   r   rY   inputsoutputsattrsr   )%r   r5   r   r[   get_op_dist_attr_for_programr0   r2   rY   r   r   r    r   _dtype_to_strr   r}   rv   r3   r   copydeepcopyget_input_dist_attrprocess_meshdims_mappingr   
create_varr   r   r!   r"   _insert_op_without_syncr   r   Forwardr&   r   _rename_inputset_input_dist_attr)ry   r(   r   r8   	src_dtype	dst_dtyperm   r   r9   consume_op_attrr   r;   	cast_namecast_varin_var_dist_attrref_meshref_mappingref_chunk_idr   r7   s                       r)   r   z"FP16State._insert_forward_cast_ops  sI    ~ R	 R	GFN**/?G/L/L**GGKKO"...!xx00 L L22;??N{-*DDD|y00<9,,"#'55i@@A 
  %z~~i88H/0C0C0E0EFFF"FKIwOK FFF (,}';;FKHH( ($ ,777'8>Y+F+F $4#@&6&C'6'?#(#3#3!*"+(-*0*>	 $4 $ $ *($'$%1    (+;;~66 C+-77>+B+BL"'"?"?!'$'=%*H$5 *FL +X^ +V^# #@ 
# 
#  ))*L   O#$'(%1    %)$$V[)<<<#77!#3  UL\ ;;{## 	5(<(<(B(B77;''94444r+   c                 6   d}|j                                         }|j        }	|	j        |         }
|                    |          }|J |j        D ]G}|                    |          }t          ||j                  r-|j	        |k    sJ | d|             H| j
        |
         D ]r\  }}}}}||j        v ro||                    |          v sJ d| d| d|             |                    |          }|J |                    ||           |                    ||           |dz   }||j        v rt#          |                    |                    dk    rt#          |                    |                    dk    sJ d| d	|             |                    |          d         }|                    |          }|                    |          }|J |             |j        }|j        }|j        }||_        |                    t1          j        d
                    |dg                    ||j        |j        |j        |j                  }|                    ||           |                     ||j                   |!                    |j        |           |"                    |dz   dd|j        gid|j        gid|d|tF          tH          j%        i          }|j         &                    |           tO          |||||           |dz  }t|S )Nr   r   zvar: z not in op's z. z@GRADr   [z], Current Op:  r   r    shaperY   r   r   r   r-   r.   r   r   r   r   )(r3   r   rn   ro   r   r   r   rf   r   r    rv   r   r0   r   r   r   r   r   r1   get_output_dist_attrr   r   r   r   r   generate_with_ignorable_keyjoinr   rY   r   r    set_tensor_dist_attr_for_program_rename_outputset_output_dist_attrr   r   r   r   r4   r   )ry   r(   r   r8   r   r   rm   r   r   rn   forward_op_idgrad_op_attrr   r<   r   src_name	slot_namesrc_var_dist_attrgrad_slot_name	grad_namegradgrad_dist_attrr   r   r   	cast_gradr7   s                              r)   r   z#FP16State._insert_backward_cast_ops  s    g))++&6';KH#@@DD'''/ 	I 	ILii--G W\22 =I---'/H/HY/H/H---- (7K	" K	" 
 BN**288I#6#6666DHDD9DDDD 766 %1$D$DX$N$N!(444  955500<MNNN '0N00ryy0011Q66299^4455:::;;;r;; ;:: IIn55a8	yy++!-!B!B9!M!M%11i>111)6,9+4*6'!,,$@G 455  $* $ 0"&"4 - 	 		 ==~   !!)Y^<<<11NN  
  77!G).!12"TYK0"I#Y#V_ 8 
 
 	##I...F )    !r+   r|   )__name__
__module____qualname__rz   r   r   r   r   r   r   r   r   r   r    r+   r)   rh   rh      s         "" " " "<3 3 3  @3 3 3&(! (! (!T1 1 1(9 9 9<N <N <N|> > >@\ \ \|^ ^ ^ ^ ^r+   rh   c                    t           j                                                                        }|                                 t          | dt          t          fd           | D ]}t          |dg dd           |	                    t          j        d                    d|g                    dgdt          j        j        j        dd	          }t#          ||d
gt$          j        d           | |d}| |d}dt(          j        i}	|                    d|||	          }
t/          |
j                  }t3          t$          j                  |_        d|_        d|_        t;          t$          j                  dk    rd|_        | D ][}|                    |          }|J |                     |j!        |j"                   |#                    |j!        |j"                   \|$                    |
|           | |fS )Nxcheck_finite_and_unscale)float16r   float64.find_infinite_scaler   boolF)r   r   r    rY   r   r   r   r   r   )r-   Scale)r.   FoundInfiniter   r   )%r5   staticr   global_blockr   r   tupleru   r   r   r   r   r   r   r#   r$   DENSE_TENSORr   world_process_groupranksr   Optimize	append_opr   r3   r   r   impl_idxr   r   	impl_type get_tensor_dist_attr_for_programset_input_dims_mappingr   r   set_output_dims_mappingset_op_dist_attr_for_program)gradsloss_scalingr   rm   
main_blockr   	found_infr   r   r   new_opnew_op_dist_attrgg_dist_attrs                 r)   _check_and_update_gradientr  w  sN   3355BBDDJucE4=*DEEE 
 
 ---&		
 	
 	
 	
 %%4HH+T233
 
 c\!. & 	 	I i"':'@1    <00Fi88G(E!!'	 "  F (44$/0C0I$J$J! ! !
$%%))%?" 
 
"CCAFF&&&//FK,	
 	
 	
 	00FK,	
 	
 	
 	
 --f6FGGG)r+   c                     d | D             }d |D             }d |D             }t          |          t          |          z   t          |          k    s
J d            |||fS )Nc                     g | ]\  }}|S r   r   ).0_r  s      r)   
<listcomp>z _split_grads.<locals>.<listcomp>  s    (((41aQ(((r+   c                 <    g | ]}|j         t          j        k    |S r   )r    r5   r   r  r  s     r)   r  z _split_grads.<locals>.<listcomp>  s&    @@@ag&?&?!&?&?&?r+   c                 2    g | ]}|j         t          k    |S r   )r    r'   r  s     r)   r  z _split_grads.<locals>.<listcomp>  s%    BBBag1A&A&A!&A&A&Ar+   z4Data types of all grads must be either fp16 or fp32.)r   )params_gradsr	  
fp32_grads
fp16_gradss       r)   _split_gradsr    s}    ((<(((E@@U@@@JBBUBBBJz??S__,E

:::> ;:: *j((r+   c                     t                      }t          |          |_        d|_        t	          |           sJ d|_        | j        D ]K}|                    |          }|                    |          }|J |	                    ||j
                   L| j        D ]K}|                    |          }|                    |          }|J |                    ||j
                   L|                    | |           d S )Nr   )r   r   r   r  r
   r   r   r   r  r  r   r   r  r  )r  r   r8   rm   r  r   r   var_dist_attrs           r)   _set_op_dist_attr_with_ranksr     s4   '))$/$6$6! !&!!!!! !* 
 
ii!!$EEcJJ(((//m0	
 	
 	
 	
 + 
 
ii!!$EEcJJ(((00m0	
 	
 	
 	
 --f6FGGGGGr+   c                     t          | j                  D ]-\  }}|j        dk    r|j        d         |j        k    r|dz   c S .t          d          )N
reduce_anyr   r   z=not found the correct location for memcopy for found_inf_var.)	enumerater   rY   r   r   RuntimeError)r8   found_inf_varr   r(   s       r)   _get_memcopy_idxr&    se    UY''  RG|###A&-*<<<7NNN
G  r+   D2Hc                    |j         }|                     t          j        |                    dg                    |j        |j        t          j        j	        j
        d|j                  }t          ||d |j        D             t          j        d           |dk    rd}nt          d| d	          d
|i}|                     |dd|gid|gi|          }	t#          |	t          j        | |           |                                  |S )Nmemcopy_Fr   c                     g | ]}d S r   r   r  is     r)   r  z#_insert_memcopy.<locals>.<listcomp>  s    ######r+   r   r   r'  zdirection [z] is not supported yet.dst_place_type
memcpy_d2hr-   r.   )indexrY   r   r   r   )r   r   r   r   r   r    r   r   r#   r$   r   r   r   r   r   NotImplementedErrorr   r   r   )
r8   r   src_varrm   	directionr   
output_varr.  r   r  s
             r)   _insert_memcopyr5    sQ   |H!!4MM:,''
 
 mm\!.+ " 	 	J ##W]###!    E!<)<<<
 
 	
 ~.E**gY% +  F !#)5,   
r+   c                     t                      } t                      }i }| j        D ](}|                                D ]}|j        ||j        <   )d }|                                j        D ] } ||          r|j        d         }|	                    |d           t          k    r|                    d          sJ d| d            |                                                    |          }|j        t          j        k    r|j                            t                     |                    d          t$          j        j        j        k    r|                    dt                     d S )Nc                     d}| j         }|                    |          rdS t          | j                  dk    rt          | j                  dk    rdS dS )Nc_Fr   r   T)rY   
startswithr   r   r   )r(   comm_op_prefixrZ   s      r)   is_initialization_opz2cast_startup_program.<locals>.is_initialization_op  s^    'n-- 	5r"##q((S1C-D-D-I-I5tr+   r   r    z>initialization op is supported to has dtype attribute but got r   )r   r   r   all_parametersr    r   r   r   r   r}   r'   r!   r   r5   r   r3   r4   r"   r   r#   r$   r%   r&   )	main_programstartup_programparam_to_dtyper8   pr;  r(   output_namer<   s	            r)   cast_startup_programrB    s   '))L-//ON$ - -%%'' 	- 	-A%&WN16""	-	 	 	 **,,0 < <## 
	<-a0K!!+t448HHH{{7++  ZUWZZZ + *6688<<[II=FN22L**+;<<<777##t|';'@@@LL*:;;;< <r+   auto_parallel_fp16c                   $     e Zd Z fdZd Z xZS )FP16Passc                 H    t                                                       d S r|   )superrz   )ry   	__class__s    r)   rz   zFP16Pass.__init__0  s    r+   c                    |                      d          | _        |                      d          | _        |                      d          }|                      dd           | _        | j        |                      dd           dk    | _        t          j        }| j        dk    rt          j        j        j	        }n:| j        dk    rt          j        j        j
        }nt          d	| j         d
          |at          a |t          |                      d                    t          |                      d                    d | j                  }d |                      d          D             }t          j                            ||          5  t%          ||| j        |                      d          |          }	|	                                }
t)                       |
r|                     | j                   d d d            n# 1 swxY w Y   |
r| j        dk    rt          j                            ||          5  |                                  |                                  t1          |          \  }}}|                      d          s|                      d          dk    rg }|ra|                    g           5  t5          || j        d| j                  \  }}d d d            n# 1 swxY w Y   |                    |           |ra|                    g           5  t5          || j        d| j                  \  }}d d d            n# 1 swxY w Y   |                    |           |                    g           5  |                                }|                    t          j        j         !                    d"                    ddg                    |d         j#        d |d         j$        |d         j%        dd          }|&                    dd|id|giddi          }tO          | j        |d gtP          j)        d!           tU          |tP          j)        || j                   |                    t          j        j         !                    d"                    d"dg                    |j#        d |j$        |j%        dd          }|&                    d#d|id|idgdd$d%          }tO          | j        |d& |j+        D             tP          j)        d!           tU          |tP          j)        || j                   d d d            n# 1 swxY w Y   |                      d          r]|                    g           5  |r| ,                    ||           |r| ,                    ||           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                      d'          }d$|_-        | j        rd|_-        | j        dk    rt]          |t          j/        j0        t          j/        j1        f          rq|                    g           5  te          ||          }tg          |||| j                  }d d d            n# 1 swxY w Y   |4                    d(|j5                   d S tm          |d)          r!|4                    d(|j5                   d S d S d S d S )*Nrm   r    r  use_optimizer_fp16levelo3r   bfloat16ztarget dtype [z"] is for amp o2 not supported yet.custom_white_listcustom_black_list)r    c                     g | ]	}|j         
S r   )r   )r  r   s     r)   r  z/FP16Pass._apply_single_impl.<locals>.<listcomp>V  s    PPPSPPPr+   
input_datarl   use_dynamic_loss_scalinginit_loss_scalingg      ?z@fp32z@fp16r   concattmpr   F)r   r    r   	lod_levelrY   r   r   r-   r.   axisr   r   r   r   r"  T)dimkeep_dim
reduce_allc                     g | ]}d S r+  r   r,  s     r)   r  z/FP16Pass._apply_single_impl.<locals>.<listcomp>  s     = = = = = =r+   base_optr  _set_auxiliary_var)7get_attrrm   target_dtyperJ  	amp_utilsAutoMixedPrecisionListsr   r#   r$   FP16BF16r1  r'   r   setr5   r   program_guardrh   r   rB  
_cast_loss_init_amp_var_scale_lossr  _optimized_guardr  _loss_scalingappendr   r   utilsr   r   r   r    rV  rY   r  r   r   r   r   r   _update_loss_scaling_multi_precision
isinstance	optimizerAdamAdamWr&  r5  r]  r   hasattr)ry   r=  r>  contextr  AMPList_FP16Pass__target_dtyperk   rr   
fp16_staterw   r	  r  r  
found_infsr  found_inf_fp32found_inf_fp16r8   all_infs	concat_opr  reduce_any_opr\  
insert_idxs                            r)   _apply_single_implzFP16Pass._apply_single_impl6  s~	    MM.99 MM'22}}^44"&--0Dd"K"K"*&*mmGT&B&Bd&JD#3	))!\16NN*,,!\16NN%V!2VVV   *!71223312233#	
 
 
  QPDMM,4O4OPPP]((GG 	3 	3"!$  % J "..00H """ 3 1222	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3   A	M I--]00OO jQ jQ&&((($$&&&4@4N4N1E:z &@AA\==)<==DD%'
% >!-!>!>r!B!B " "4N$.$($6$+$($5	5" 5" 1>" " " " " " " " " " " " " " " '--n===% >!-!>!>r!B!B " "4N$.$($6$+$($5	5" 5" 1>" " " " " " " " " " " " " " " '--n===)::2>> E E$0$=$=$?$?E (-'7'7%+\%=%Y%Y$'HHh->$?$?&" &" '1m&9&**4Q-*A%/]%7,1.3 (8 
( 
(H ).%-(+Z'8).
(;'-qk	 )8 ) )I . $ 1 (!# 3 9)*    9 ) 3 9 % $ 1	   ).(8(8%+\%=%Y%Y$'HH.CU-K$L$L&" &" '/n&**2*<%-],1.3 )9 
) 
)I -2OO%1(+X).	(:,-30526'" '"	 -< 	- 	-M . $ 1 ) = =Y_ = = = 3 9)*    9 - 3 9 % $ 1	  AE E E E E E E E E E E E E E EN }}%?@@ Q)::2>> Q Q) Q $ 9 9*i P P P) Q $ 9 9*i P P P	Q Q Q Q Q Q Q Q Q Q Q Q Q Q QMjQ jQ jQ jQ jQ jQ jQ jQ jQ jQ jQ jQ jQ jQ jQZ }}Z00H(,H%& 2,1) I--v/4f6F6LM  M &66r::   &6eY%G%G
$3!:y$:K% %		               //Y^LLLLLX';<< M//Y^LLLLLCA	M A	Mh .-M Ms   A*HH	H	?BW! K/#W!/K3	3W!6K3	7/W!& MW!M	W!M	-W!GUW!U	W!U	 -W!1W
>W!
W	W!W	W!!W%(W%&(ZZ!Z)r   r   r   rz   r  __classcell__)rH  s   @r)   rE  rE  .  sU            rM rM rM rM rM rM rMr+   rE  )r'  )7r   collectionsr   r5   paddle.static.amp.fp16_utilsr   amp
fp16_utilsr`  paddle.common_ops_importr   r   6paddle.distributed.auto_parallel.static.dist_attributer   5paddle.distributed.auto_parallel.static.process_groupr   -paddle.distributed.auto_parallel.static.utilsr   r	   r
   r   r   /paddle.distributed.fleet.meta_optimizers.commonr   r   paddle.frameworkr   paddle.staticr   r   r   paddle.utilsr   auto_parallel.process_meshr   auto_parallel_ampr   	pass_baser   r   r   r'   r   r*   r?   r[   rf   rh   r  r  r   r&  r5  rB  rE  r   r+   r)   <module>r     s    # # # # # #  0 0 0 0 0 0 0 0 0 0 0 0 I I I I I I I I                        P O O O O O O O ! ! ! ! ! ! G G G G G G G G M L L L L L $ $ $ $ $ $ 4 4 4 4 4 4 & & & & & & $ $ $ $ $ $--//      0 0 01 1 1,  >  8Z Z Z Z Z Z Z Zz7 7 7t) ) )H H H.  ) ) ) )X< < <D #$$yM yM yM yM yMw yM yM %$yM yM yMr+   