
    x-j                     l   d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d d	lmZ d
dlmZ d
dlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z&  e            Z'g dZ(d Z)d Z* G d d          Z+ G d d          Z, e&d           G d de%                      Z-dS )    N)
check_typecheck_variable_and_dtype)OperatorDistAttr)get_world_process_group)6naive_set_dist_op_attr_for_program_by_mesh_and_mappingset_var_dist_attr)OP_ROLE_KEYOpRole)core)	AutoMixedPrecisionLists_is_in_black_varnames_keep_fp32_input_keep_fp32_output_rename_arg_valid_typesfind_op_indexfind_true_post_opfind_true_prev_op)unique_name   )ProcessMesh)is_backward_opis_forward_opis_loss_grad_op
is_loss_opis_optimize_op   )PassBaseregister_pass)create_py_readercreate_double_buffer_readerwhilec                 N    | t           j        k    rdS | t           j        k    rdS dS )Nfp16bf16fp32)paddlefloat16bfloat16dtypes    k/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/passes/auto_parallel_amp.py_dtype_to_strr-   =   s+    v	&/	!	!vv    c                     | dk    rt           j        j        j        S | dk    rt           j        j        j        S t           j        j        j        S )Nr(   r)   )r   VarDescVarTypeFP16BF16FP32)dstrs    r,   _str_to_dtyper6   F   sB    y|#((			|#((|#((r.   c                       e Zd Z	 	 	 	 ddZed             Zed             Zed             Zed             Zed             Z	ed	             Z
d
 Zd Zd ZdS )AMPListsNr(   c                     t          t          |          t          |          t          |          |          | _        || _        d S )Nr*   )r   set	_amp_list_dtype)self
white_list
black_listblack_varnamesr+   s        r,   __init__zAMPLists.__init__P   sD     1
OOS__c..A.A
 
 
 r.   c                     | j         j        S N)r;   r>   r=   s    r,   r>   zAMPLists.white_list\       ~((r.   c                     | j         j        S rC   )r;   r?   rD   s    r,   r?   zAMPLists.black_list`   rE   r.   c                     | j         j        S rC   )r;   	gray_listrD   s    r,   rH   zAMPLists.gray_listd   s    ~''r.   c                     | j         j        S rC   )r;   r@   rD   s    r,   r@   zAMPLists.black_varnamesh   s    ~,,r.   c                     | j         S rC   )r<   rD   s    r,   r+   zAMPLists.dtypel   s
    {r.   c                     | j         S rC   )r;   rD   s    r,   amp_listzAMPLists.amp_listp   s
    ~r.   c                 ,    t          || j                  S rC   )r   r;   )r=   ops     r,   _is_in_black_fp32_varnamesz#AMPLists._is_in_black_fp32_varnamest   s    $R888r.   c                 >    |j         j        sdS t          ||          S NT)amp_optionsenabler   )r=   rN   in_names      r,   _op_keep_fp32_inputzAMPLists._op_keep_fp32_inputw   s$    ~$ 	4G,,,r.   c                 >    |j         j        sdS t          ||          S rQ   )rR   rS   r   )r=   rN   out_names      r,   _op_keep_fp32_outputzAMPLists._op_keep_fp32_output|   s$    ~$ 	4 X...r.   )NNNr(   )__name__
__module____qualname__rA   propertyr>   r?   rH   r@   r+   rL   rO   rU   rX    r.   r,   r8   r8   O   s        
 
 
 
 ) ) X) ) ) X) ( ( X( - - X-   X   X9 9 9- - -
/ / / / /r.   r8   c                   8    e Zd Zd Zd Zd Zd Zd Zd Zd Z	dS )	AMPStatec                     || _         || _        || _        || _        |j        j        | _        i | _        i | _        i | _	        d S rC   )
programdist_context	amp_lists	amp_dtypedist_op_contextgrad_op_id_to_op_idgrad_op_to_op_map_op_fp16_dict_var_name_dictout_var_op_deps)r=   ra   rc   rd   rb   s        r,   rA   zAMPState.__init__   sP    (""(< 	
   !r.   c                 8    | j                             |d           S rC   )rh   get)r=   op_ids     r,   _is_fp16_opzAMPState._is_fp16_op   s    !%%eT222r.   c                 t   d}| j         j        D ]}|j        D ]w}|j        D ]f}|| j        vr#|j                                        g| j        |<   .| j        |                             |j                                        g           gt          |          rd}|j	        t          v rt          |          r|                     ||j        |           t          |          r|j                                        | j        v rx| j        |j                                                 }|| j        v sJ t!          |                      |                     |          | j        |j                                        <   ft%          |          r ny| j         j        D ]}|                     |           |S )NFT)ra   blocksopsoutput_arg_namesrj   descoriginal_idextendr   type__amp_skip_ops__r   _mark_black_white_opsr   rg   rh   strrn   r   _cast_block)r=   is_trainblockrN   name	fwd_op_ids         r,   build_statezAMPState.build_state   s   \( 	 	Ei  /  D4#77768g6I6I6K6K5L,T22,T299W00223    #2&& $#H7... $$ ..r59eDDDD#B'' 
w**,,0FFF$($:G//11%	  )D,>>>>B>>> ,,Y77 *27+>+>+@+@A $B'' E \( 	$ 	$EU####r.   c                 n   |j         j        s#d| j        |j                                        <   d S |j        dk    r2d|j        d         v r#d| j        |j                                        <   d S |j        dk    r|j        d         }t          | j	        |                   dk    rk| 
                    | j	        |         d                   s"d| j        |j                                        <   n!d| j        |j                                        <   d S | j        j        =| j                            |          r#d| j        |j                                        <   d S |j        | j        j        v r#d| j        |j                                        <   d S |j        | j        j        v r#d| j        |j                                        <   d S |j        | j        j        v r=d}d}|j        D ]}|r|                    |          D ]}|                    |          }	|	j        |	j        |u rt+          |||          }
|
<n|	j        }
| 
                    |
j                                                  du s|
j        | j        j        v rd}| 
                    |
j                                                  du s|
j        | j        j        v rd}̌|r#d| j        |j                                        <   d S |r#d| j        |j                                        <   d S d S d| j        |j                                        <   d S )NFassignarray_r   r   T)rR   rS   rh   rs   rt   rv   input_arg_namesrr   lenrj   rn   rc   r@   rO   r?   r>   rH   input_namesinput_var_recursiverN   r   )r=   rN   rq   r|   rW   is_black_opis_white_oprT   in_var_namein_varprev_ops              r,   rx   zAMPState._mark_black_white_ops   sW   ~$ 	8=Drw22445F 7h8r/A!/D#D#D8=Drw22445F 7h*1-H4'122Q66''(<X(Fq(IJJ E@ED&rw':':'<'<==@DD&rw':':'<'<= N)599"== 6 9>Drw22445F7dn///8=Drw2244555W1118<Drw2244555W000KK> / / /')xx'8'8 / /!&!5!5k!B!B!9,$#Y"__&7R&M&MG& (  / '-iG !,,W\-E-E-G-GHH$% %&|t~/HHH*.KK ,,W\-E-E-G-GHHDPP&|t~/HHH*.K <A"27#6#6#8#8999 <@"27#6#6#8#8999 9>Drw2244555r.   c           	      	   d}d}|t          |j                  k     r|j        |         }d}|j        t          v r|dz  }<t	          |          rJ|                     |j                                                  du rG|                     |||t          | j
                  t          j        j        j        | j                  }n|                     |j                                                  du r|                    d          rK|                    d          t$          j        k    r(|                    dt          | j
                             |                     |||t          j        j        j        t          | j
                  | j                  }nt+          |          r | j                            |          }t+          |          rFt	          |j        |dz
                     st/          |j        |dz
                     r|j        s|dz  }|j                                        | j        v rL|                     |j                                                  du rH|                     |||t          | j
                  t          j        j        j        | j        |          }n|                     |j                                                  du r|                    d          rK|                    d          t$          j        k    r(|                    dt          | j
                             |                     |||t          j        j        j        t          | j
                  | j        |          }n$|j        dk    r|j                                        d         }|j                                        d         }|                    |          }	|                    |          }
|j        D ]J}|
j        |                    |          j        k    s%J |
 d|                    |           d|             K|	j                             |
j                   n?tC          |                    d                    d	k    rntE          d
|j         d          ||dz   z  }|t          |j                  k     |#                                 d S )Nr   r   FTr+   sumz, op_rolei  'z/' op is not supported in the complete amp pass.)$r   rq   rv   rw   r   rn   rs   rt   _insert_cast_op_forwardr6   rd   r   r0   r1   r4   rb   has_attrattrr'   float32	_set_attrr   get_op_dist_attr_for_programr   is_recomputerg   _insert_cast_op_backwardrr   r   var_find_var_recursiver+   	set_dtypeint
ValueError_sync_with_cpp)r=   r|   idxappended_grad_timesrN   num_cast_opsop_dist_attrout_var_namer   out_varr   s              r,   rz   zAMPState._cast_block  s   C	NN""3BLw***qr"" Q##BG$7$7$9$9::eCC#'#?#?%dn55,1)$ $LL %%bg&9&9&;&;<<DD G,,MGGG,,>>WmDN.K.KLLL#'#?#?,1%dn55)$ $L  ## 8  $0MM    ""%% 1!%)C!G"4551!%)C!G"4551 (4 1+q0+7&&((D,BBB''(;(;(=(=>>%GG'+'D'D!)$.99 L05 -/( ( ))"'*=*=*?*?@@DHH KK00Q " 0 0FN B BLL-2O2OPPP'+'D'D! L05)$.99 -/( ( W%%#%7#;#;#=#=a#@L"$'"9"9";";A">K#ii55G"66{CCF')'9  %|uyy/E/E/KKKK%GG;)?)?GG2GG  LKKK L**6<8888++,,33$TBGTTT   <!##Cu C	NN""v 	r.   c                    d}i }|j         dk    r|                    |                    d          d                   }	|                    |                    d          d                   }
|                    d|	j                   |
j                            t          j        |	                    d                               |S |j
        D ]e}|t          j        k    r| j                            ||          r/|                    |          D ]}|                    |          }	|	j         t          vs|	j        |k    r2|	j        |k    r|	j        dz   t!          |          z   }|j                            |          }|||	j        <   |                    |          }|J ||j        |k    r	|                    |	j                  }|J |j        }|j        }|j        }||_        |                    ||           |                    ||d	|	j        
          }t7          |||||           d}|                    d          r|	                    d          }|                    |dd|	id|i|	j        |j        d          }|                    d|           t=          |||||           |dz  }n0|                    |	j                  }|                    ||           t?          ||	j        |           |                    d          r|                    d|           !g|| j         |j        !                                <   |t          j        k    r|tE          | j#                  k    r|j$        D ]}| j        %                    ||          r|                    |          D ]}|&                    |          }
|
j         t          vr&|
j        t          j        k    ri|
j                            tE          | j#                             |                    d          r(|                    dtE          | j#                             |S )zO
        only for forward cast
        modified from paddle.static.amp
        r   castXOutin_dtype	out_dtypez.cast_NF)r}   r+   persistablestop_gradientchunk_id/op_namescope)r   r   rv   inputsoutputsattrsr   )'rv   r   r   outputr   r+   rs   r   r'   r   r   r   rc   rU   r   r}   r-   varsrl   r   get_input_dist_attrprocess_meshdims_mappingr   set_input_dist_attr
create_varr   r   r   _insert_op_without_syncr   r   ri   rt   r6   rd   output_namesrX   r   )r=   r|   rN   r   	src_dtype	dst_dtyperb   r   var_name_dictr   r   rT   r   	cast_namecast_varconsume_op_attrin_var_dist_attrref_meshref_mappingref_chunk_idr   cast_oprW   r   s                           r,   r   z AMPState._insert_cast_op_forwardd  s    7f..rxx}}Q/?@@F//		%0@0@0CDDGLLV\222L""6<0D0D#E#EFFF~ T	< T	<GV^++N66r7CC , !xx00 N< N<22;??;l22fli6O6O<9,,h.y1I1II   %z~~i88H1:M&+.&2&O&O' 'O +666'8>Y+F+F ,;+N+N"K, ,(  0;;;#3#@&6&C'6'?4@(1';;%'7   $)#3#3!*"+(-*0*>	 $4 $ $ *($'$%1    (+;;~66 C+-77>+B+BL"'"?"?!'$'=%*H$5,2L-5^# # #@ 	# 	#  ))*L   O#$'(%1    %)+:+N+N"K, ,( (;;%'7    FK;;;;{{:.. <Z;;;]N<^ 6CBG//112&&9N9
 9
 ,
 ,
 O  >66r8DD $&IIh$7$7 	 	L#22<@@G|<77 }66..}T^/L/LMMM;;{33 LL +]4>-J-J  	 r.   c                    d }d }	d}
|j                                         }|j        }| j        |         }|j        dk    r6|                    d          d         }|                    d          d         }|                    |          }|                    |          }|                    |d|                    d                             }|                    |d|                    d                             }|	                    d	|j
                   |	                    d
|j
                   |j                             |j
                   |j                             |j
                   |
S |j        D ]H}|t          j        k    rQ |||          rE|                    |          D ].}|                    |          }|j
        t          j        k    sJ /d|                    |          D ]}|                    |          }|j
        |k    r|                    |          }|| j        |         v rZ| j        |         |         }|j                             ||           |                    |          }|                    ||           |j
        |k    s#J d|j         d| d| d|j
         d| 
            ϐJ|j        D ]}|t          j        k    rQ |	||          rE|                    |          D ].}|                    |          }|j
        t          j        k    sJ /d|                    |          D ]^}|                    |          }|d|                    d                   }|                    |          }|j
        |j
        k    r|j                             |j
                   |j
        |k    r|| j        |         v r|                    |          }| j        |         |         }d}d|v r||                    d          d         }|dz   |z   }|j                            |          }||j
        |k    r?|j                             ||           |                    |          }|j        }|j        } |j        }!|!|_        |                    ||           | J |                    ||j        |d|j                   }tC          ||| ||!           ||j"        |         |<   |#                    |dz   dd|id|i|j
        |j
        tH          j%        d          }"|"&                    d           |"&                    d           |"&                    d           tO          |"|| ||!           |
dz  }
Q|j
        |k    sJ `|(                    d          rK|)                    d          t          j        k    r(|	                    dtU          | j+                             |
S )zonly for backward castc                 $    | j         }|dv r|dvS dS )Nlayer_norm_grad>   r   Y@GRADFrv   )rN   rT   op_types      r,   r   z;AMPState._insert_cast_op_backward.<locals>._keep_fp32_input  s%    gG---o555r.   c                 (    | j         }|dv r|dk    S dS )Nr   X@GRADFr   )rN   rW   r   s      r,   r   z<AMPState._insert_cast_op_backward.<locals>._keep_fp32_output  s%    gG---8++5r.   r   r   r   r   N@r   r   zop [z] expect input [z] to be dtype [z] BUT got [z].  z@RENAME@GRADF)r}   shaper+   r   r   r   r   r   r   r   r   op_role_varr   with_quant_attrr+   ),rs   rt   re   rg   rv   r   r   r   findr   r+   r   r   r'   r   r   r   ri   _rename_inputr   r   r   r   rl   _rename_outputget_output_dist_attrr   r   r   set_output_dist_attrr   r   r   r   grad_var_to_var
_insert_opr
   Backward_remove_attrr   r   r   r6   rd   )#r=   r|   rN   r   r   r   rb   r   r   r   r   rt   re   r~   rT   rW   r   r   	in_var_fw
out_var_fwr   r   r   r   r   out_var_name_prefixfwd_varfwd_cast_namesuffixr   out_var_dist_attrr   r   r   r   s#                                      r,   r   z!AMPState._insert_cast_op_backward  s   	 	 		 	 	 g))++&6*;7	7fhhsmmA&Gyy''*H..w77F//99G11':MGLL<M<M:M2NOOI22-8==---. J LLY_555LLj&6777K!!)/222L"":#3444~ 	 	GFN**/?/?G/L/L*#%88G#4#4 : :K"11+>>F!<6>99999!xx00  --k::<9,,&2&O&O' 'O #d&9)&DDD %)$7	$B;$O	--k9EEE+:+N+N', ,( (;;%'7     &|y88827GT]jpjv{}  988%,  Z	6 Z	6HFN**/@/@X/N/N*$&IIh$7$7 ; ;L#22<@@G"=FN::::: "		( 3 3 S6 S6..|<<&23K\5F5Fs5K5K3K&L#../BCC=GM11L**7=999=I--*d.A).LLL
 )EEbII ( )-(;I(F/) "$$44%1 , 1 1) < < > >&F %2G$;f$D	#(:>>)#<#<#+x~/J/JG22<KKK / D D$0!" !" .
 (9'EH*;*HK+:+CL9E-6+@@ )+<   $/#:#:#:','7'7%.&-m&/,1.5.C (8 ( (H . , ( + ()5    ,9 ,; 3') ',&6&6 #a%+(+X).(80818/5'" '" '7 
' 
'G $00???#00@@@#001BCCCR ' ( + ,)5    )A-L"=I55555gS6j ;;w 	ABGGG$4$4$F$FLL-"?"?@@@r.   N)
rY   rZ   r[   rA   rn   r   rx   rz   r   r   r]   r.   r,   r_   r_      s        " " " 3 3 3% % %NE> E> E>N^ ^ ^@y y yvh h h h hr.   r_   auto_parallel_ampc                   Z     e Zd Z fdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Z xZS )AMPPassc                    t                                                       |                     dd           |                     dd            |                     dd            |                     dd            |                     dd            |                     dd            |                     dd	           |                     d
d           |                     dd           |                     dd           |                     dd           |                     dd           |                     dg            |                     dg            |                     dd           d | _        d | _        d | _        d | _        d S )Nr+   r   lossrb   custom_white_listcustom_black_listcustom_black_varnamesinit_loss_scalingg      @incr_every_n_stepsi  decr_every_n_nan_or_infr   
incr_ratiog       @
decr_ratiog?use_dynamic_loss_scalingF
input_dataparams_grads)superrA   set_attr_loss_loss_scaling_num_good_steps_num_bad_steps)r=   	__class__s    r,   rA   zAMPPass.__init__  sq   gr"""fd###nd+++)4000)4000-t444)7333*D111/333lC(((lC(((0%888lB'''nb)))gr"""
!#"r.   c                 t   |                      d          dvrdS |                      d          dk     rdS |                      d          dk     rdS |                      d          dk     rdS |                      d          dk     rdS |                      d	          dk     rdS |                      d
          dS dS )Nr+   )r(   r)   Fr   r   r   r   r   r   rb   T)get_attrrD   s    r,   _check_selfzAMPPass._check_self  s    ==!!)@@@5==,--115==-..225==233a775==&&**5==&&**5==((05tr.   c                     dS rQ   r]   )r=   
other_passs     r,   _check_conflictzAMPPass._check_conflict  s    tr.   c           	         |                      d          | _        |                      d          | _        |                      d          | _        t	          t          |                      d                    t          |                      d                    t          |                      d                    | j                  }t          j                            ||          5  t          ||| j        | j                  }|
                                }|r.|                                  |                     | j                   |r| j        dk    r|                                  |                                  |                      d          s|                      d	          d
k    r|                                 \  }}|                      d          r|                     ||           d d d            d S # 1 swxY w Y   d S )Nrb   r   r+   r   r   r   r(   r   r         ?)r  rb   r   rd   r8   r:   r'   staticprogram_guardr_   r   _update_backward_cast_ops
_cast_loss_init_amp_var_scale_loss_check_and_update_gradient_update_loss_scaling)	r=   main_programstartup_programcontextrc   	amp_stater{   grads	found_infs	            r,   _apply_single_implzAMPPass._apply_single_impl  s2    MM.99 MM.99w//122331223356677N	
 
	 ]((GG 	@ 	@ i9J I !,,..H 0..000/// 
@DNi77""$$$  """MM"<==I}}%899S@@'+'F'F'H'H$E9==!;<< @--eY???)	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@s   &DG99G= G=c           	      L   t           j                                                                        }|                                 | j        D ]B\  }}|j        }|j        t           j        k    r|j	        dk    rt          |                    d                    t          t          j                  k    r*|                    d          r|                    d           t!          |j        ||j                  }|rt'          d| d|d                    ||j        d         k    r|j                                        }|                    |j                   t           j                            ||dddd	          }|j                            |           | j                            |          }| j                            |                    |j        d                             }	|J |	J t;          ||j        |j        | j        |j         
           |j        |	_        |j        |	_        |j         |	_         tC          |j        |j                  }
|
dk    rt'          d| d          |"                    |
d           D|                                 dS )zo
        move param grad cast to the end of backward segment
        in order to enable fp16 allreduce
        r   r   r   zThe cast op zH's output should not beused by a non-optimize op, however, itis used by r   N)r|   rs   rv   r   r   r   r   zThe op z is not in programF)sync)#r'   r  default_main_programglobal_blockr   r   rN   r+   r   rv   r   r   r
   r   r   r   r   rq   r}   r   rs   	append_op	copy_fromOperatorappendrb    get_tensor_dist_attr_for_programr   rr   r   r   r   r   r   
_remove_op)r=   
main_blockpgrN   post_opsnew_op_descnew_opparam_dist_attroutput_dist_attrop_idxs              r,   r
  z!AMPPass._update_backward_cast_ops  s    ]7799FFHH
!!###% :	: :	:DAqBw&.((RW->->rwwy))**cO/ /  kk-00 OOM222,Z^RHH $4r 4 4&.qk4 4   +++ )o7799%%bg...//$$  0   %%f--- %FFqII   %FF"r':1'=>>  !
 '222'333F#0#0%,5    1@0L -0?0L -,;,D )&z@@R<<$%Er%E%E%EFFF%%f5%999!!#####r.   c                 H   t           j                                                                        }|                                 d | j        D             }t          |dt          t          fd           |D ]}t          |dg dd           |
                    t          j        d                    ddg                    dgd	t          j        j        j        d
d
          }t%          | j        |dgt(          j        d           || j        d}||d}dt.          j        i}|                    d|||          }t5          |j                  }	t9          t(          j                  |	_        d|	_        d|	_        tA          t(          j                  dk    rd|	_!        |D ]`}
| j        "                    |
          }|J |	#                    |
j$        |j%                   |	&                    |
j$        |j%                   a| j        '                    ||	           ||fS )Nc                     g | ]\  }}|S r]   r]   ).0_r$  s      r,   
<listcomp>z6AMPPass._check_and_update_gradient.<locals>.<listcomp>'  s    111tq!111r.   xcheck_finite_and_unscaler(   r   float64.find_infinite_scaletmpr   boolF)r}   r   r+   rv   r   r   r  r   r   )r   Scale)r   FoundInfiniter   r   )(r'   r  r  r  r   r   r   tuplelistr   r   r   generate_with_ignorable_keyjoinr   r0   r1   DENSE_TENSORr   rb   world_process_groupranksr   r
   Optimizer  r   rs   r   r   impl_idxr   r   	impl_typer   set_input_dims_mappingr}   r   set_output_dims_mappingset_op_dist_attr_for_program)r=   r"  r  er  r   r   r   r'  new_op_dist_attrr$  g_dist_attrs               r,   r  z"AMPPass._check_and_update_gradient#  ss   ]7799FFHH
!!###11t01115#t}.HIII 	 	A$111*	    ))8/788  #%2 * 	
 	
	 	D%	
 	
 	
 	
 t'9::)<<FO,%%+	 & 
 
 ,FK88(34G4M(N(N%$%!$%!"())A--)C& 	 	A+LLQOOK***330   440    	66v?OPPPir.   c                    t           j                            t          j        d          dg|                     d          dd          | _        t          | j        | j        dgt          j
        d	           |                     d
          rt           j                            t          j        d          dgddd          | _        t          | j        | j        dgt          j
        d	           t           j                            t          j        d          dgddd          | _        t          | j        | j        dgt          j
        d	           d S d S )Nloss_scalingr   r   r   T)r}   r   valuer+   r   r  r   r   r   num_good_stepsint32num_bad_steps)r'   r  create_global_varr   generater  r   r   rb   r?  r@  r   r   rD   s    r,   r  zAMPPass._init_amp_vara  s   #]<<%n55#-- 344 = 
 
 	D%	
 	
 	
 	
 ==344 	#)=#B#B )*:;;c  $C $ $D  !$#)    #)-"A"A )/::c  #B # #D !##)     /	 	r.   c           
         t           j                                                                        }|                                 |                     d          }|J |j        }| j                            |          }|j	        t          j        j        j        k    r!t          j        |j        dz             }|                    |t          j        j        j                  }| j                            |          }|j        }	|j        }
|
|_        | j                            ||           t-          |j        |j                  }|                    |dz   dd|gid|gi|j	        t          j        j        j        |                                t4                   d	          }|                    t4          t8          j                   t=          ||	d
 |j        D             | j        |
           d }d}tA          |j!        |d                    D ]9\  }}|j"        dk    rtG          |          r	|}|dz   } ntI          |          r n:|
J d            |                    t          j        |dz             |j        t          j        j        j        |j%                  }tM          | j        |dgtO          |j                  z  |	|
           |j(        d         }|)                    ||j                   t=          ||	dgtO          |j                  z  | j        |
           |                    ||z   dd|gid|git          j        j        j        tU          |          t8          j+        d	          }t=          ||	d |j        D             | j        |
           |}|}| ,                    d|           || _-        |                                 d S )Nr   z
.cast_fp32)r}   r+   r   r   r   r   r   r   c                     g | ]}d S r  r]   r-  is     r,   r/  z&AMPPass._cast_loss.<locals>.<listcomp>      ((((((r.   r      fill_constantThere is not loss_grad op.r   r}   r   r+   r   r  r   c                     g | ]}d S rT  r]   rU  s     r,   r/  z&AMPPass._cast_loss.<locals>.<listcomp>  rW  r.   ).r'   r  r  r  r   r  rN   rb   r   r+   r   r0   r1   r4   r   rQ  r}   r   r   r   r    set_tensor_dist_attr_for_programr   rs   r   	all_attrsr	   r   r
   Forwardr   r   	enumeraterq   rv   r   r   r   r   r   rr   r   r6   r   r   r   )r=   target_dtyper"  r   loss_oploss_op_dist_attrtmp_name	cast_lossloss_dist_attrr   r   loss_op_idxr   first_backward_opinsert_op_offsetr   rN   cast_loss_gradpre_grad_namecast_grad_ops                       r,   r  zAMPPass._cast_loss  s:   ]7799FFHH
!!###}}V$$' -JJ
 
 :-222"+DI,DEEH"--T\%9%> .  I ".OO N )5H,5L&2N#>>>  
 (
FFK ++adV}, $
!%!5!:&0022;?  , 
 
G k6>:::B((TZ(((!%    !% $Z^KLL%ABB  R7o--/"2E2E-(*%'*Qw$E!"%% E %002N000'22 )(W*<==jl*/ ,	 3  N !s4:&%    .>qAM,,]N<OPPPB!s4:&!%    &00..n-.0 $ 4 9!.|!<!<%  1 
 
L C((TZ(((!%    GDMM&$'''
!!#####r.   c           	      "	   t           j                                                                        }|                     d          }|J |j        }| j                            |          }|                     d          s|                     d          dk    rt          |j	        |j	                  }|j
        }|j        }|                    t          j        d          |j        |j        |j                  }t%          | j        |d |j        D             ||           |                    |d	z   d
|g| j        gdd|gid|                                t,                   i          }	|                    t,          t0          j                   t5          |	|d |j        D             | j        |           d }
|j        |d          D ]1}|j        dk    rt;          |          r|}
 nt=          |          r n2|

J d            |                    t          j        d          dz   |j        |j        |j                  }t%          | j        |dgt?          |j                  z  ||           |
j         d         }|
!                    ||j"                   t5          |
|dgt?          |j                  z  | j        |           |
|_        |#                                 |j	                            |dz             }|$                    d           |%                    d|j"        g           |%                    d|j"        g           |%                    d| j        j"        g           |&                    d|g           |&                    dg            |                    t,          t0          j'                   |                    dd           t           j        (                    ||          }|j        )                    |dz   |           |#                                 |j        |dz            }|j        dk    sJ t5          ||d |j        D             | j        |           n|}|| _*        |#                                 d S )Nr   r   r   r  scaled_lossr[  c                     g | ]}d S rT  r]   rU  s     r,   r/  z'AMPPass._scale_loss.<locals>.<listcomp>  rW  r.   r   r   elementwise_mul)r   Yr   r   r   c                     g | ]}d S rT  r]   rU  s     r,   r/  z'AMPPass._scale_loss.<locals>.<listcomp>,  rW  r.   rY  rZ  r   r  r   rX  elementwise_mul_gradzOut@GRADr   rq  r   r   axisc                     g | ]}d S rT  r]   rU  s     r,   r/  z'AMPPass._scale_loss.<locals>.<listcomp>p  rW  r.   )+r'   r  r  r  r  rN   rb   r   r   rs   r   r   r   r   rQ  r   r+   r   r   r   r   r^  r	   r   r
   r_  r   rq   rv   r   r   r   rr   r   r}   r   set_type	set_input
set_outputr   r  insertr   )r=   r"  r   rb  rc  rg  r   r   rn  elementwise_mul_oprh  rN   scaled_loss_gradrk  elementwise_mul_grad_op_descelementwise_mul_grad_ops                   r,   r  zAMPPass._scale_loss  s&   ]7799FFHH
}}V$$' -JJ
 

 MM455n	}}011S88'
FFK )5H,5L$// )-88jj ,	 0  K !((TZ(((%    ",!6!6a&"V4+=*>??.w0022;? "7 " " k6>:::B"((TZ(((!%    !% n[\\2  7o--/"2E2E-(*%E!"%% E %002N000)44 )-887Bjj ,	  5     ! s4:&%    .>qAM,,/4   C!s4:&!%    #4%%'''+5?+E+Ea, ,( )112HIII(22-23   )223DDD(22d(-.   )33H}oNNN(33HbAAA(22;PPP(2262>>>&,m&<&<8' '# N!!+/3JKKK%%'''&0n[1_&E#*/3IIIIIB'((TZ(((!%     K 
!!#####r.   c                 <   t           j                                                                        }|                                 t          | j        dddgd           t          |dt          t          fd           |D ]o}t          |dg dd           |j
        t           j        k    r%| j        j
        t           j        k    s
J d            P| j        j
        |j
        k    s
J d            p||| j        | j        | j        d	}|| j        | j        | j        d
}|                     d          |                     d          |                     d          |                     d          |                     d          t           j        d}|                    d|||          }t'          |j                  }	t+          t,          j                  |	_        d|	_        d|	_        t7          t,          j                  dk    rd|	_        |D ]`}
| j                            |
          }|J |	                    |
j         |j!                   |	"                    |
j         |j!                   a| j        #                    ||	           |                                 d S )Nprev_loss_scalingr   r3  update_loss_scalingr0  r2  zPThe dtype of prev_loss_scaling should be float32 when the dtype of x is float16.zAThe dtype of prev_loss_scaling should be equal to the dtype of x.)r   r9  PrevLossScalingInGoodSteps
InBadSteps)r   LossScalingOutGoodStepsOutBadStepsr   r   r   r   stop_update)r   r   r   r   r  r   r   r   r   )$r'   r  r  r  r   r   r   r   r:  r;  r+   r(   r   r   r   r  r
   rA  r  r   rs   r   r?  r@  r   rB  r   r   rC  rb   r   rD  r}   r   rE  rF  )r=   r  r  r"  rG  r   r   r   r'  rH  r$  rI  s               r,   r  zAMPPass._update_loss_scalingy  s   ]7799FFHH
!!### 	"!		
 	
 	
 	5#t}.CDDD 	 	A$3999;P   w&.(()/6>AAAf BAAA )/17:::W ;:::
 &#1/-
 
 - 0.	
 
 #'--0D"E"E'+}}5N'O'O--55--55==77
 
 %%&	 & 
 
 ,FK88(34G4M(N(N%$%!$%!"())A--)>& 	 	A+LLQOOK***330   440    	66v?OPPP!!#####r.   c                 H    | j         r| j         S |                     d          S )Nr   )r   r  rD   s    r,   get_losszAMPPass.get_loss  s(    
 : 	):==(((r.   )rY   rZ   r[   rA   r  r  r  r
  r  r  r  r  r  r  __classcell__)r   s   @r,   r   r     s        # # # # #,  "   @  @  @DF$ F$ F$P<  <  < |- - -^l$ l$ l$\y$ y$ y$vJ$ J$ J$X) ) ) ) ) ) )r.   r   ).r'   paddle.base.data_feederr   r   6paddle.distributed.auto_parallel.static.dist_attributer   5paddle.distributed.auto_parallel.static.process_groupr   -paddle.distributed.auto_parallel.static.utilsr   r   /paddle.distributed.fleet.meta_optimizers.commonr	   r
   paddle.frameworkr   paddle.static.amp.fp16_utilsr   r   r   r   r   r   r   r   r   paddle.utilsr   auto_parallel.process_meshr   auto_parallel.static.utilsr   r   r   r   r   	pass_baser   r   r?  rw   r-   r6   r8   r_   r   r]   r.   r,   <module>r     s    H H H H H H H H                  P O O O O O O O ! ! ! ! ! !
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 % $ $ $ $ $ 4 4 4 4 4 4              / . . . . . . .--//      ) ) )0/ 0/ 0/ 0/ 0/ 0/ 0/ 0/fE E E E E E E EP "##B	) B	) B	) B	) B	)h B	) B	) $#B	) B	) B	)r.   