
    ёi V                        S SK r S SKrS SKrS SKrS SKrS SKJrJrJ	r	  S SK
Jr  S SKJr  SSKJrJrJrJr  SSKJr  \" \\R,                  S	S
9r\R0                  R2                  R4                  \R0                  R2                  R6                  \R0                  R2                  R8                  /rSrS rS r S r!S r"S r#S r$\S 5       r%S r& SS jr' SS jr(SS jr)SS jr*g)    N)core	frameworkglobal_scope)
get_logger)signature_safe_contextmanager   )_rename_arg_rename_op_inputfind_true_post_opfind_true_prev_op   )AutoMixedPrecisionListsBF16z&%(asctime)s-%(levelname)s: %(message)s)fmt__use_bf16__c                     [         R                  " U 5      n [         R                  " S [         R                  /S9" U R                  5      n[         R
                  " XR                  5      $ )Nc                 f    [         R                  " S[         R                  " SU 5      5      S   S-	  $ )Nz<Iz<fr      )structunpackpack)xs    `/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/static/amp/bf16/amp_utils.py<lambda>)convert_float_to_uint16.<locals>.<lambda>3   s$    &--fkk$&:;A>"D    )otypes)npasarray	vectorizeuint16flatreshapeshape)in_listouts     r   convert_float_to_uint16r&   0   sM    jj!G
,,D		{ llC ::c==))r   c                 .    U [         R                  :X  a  gg)zh
Convert specific variable type to its corresponding string.

Args:
    dtype (VarType): Variable type.
bf16fp32)paddlebfloat16)dtypes    r   _dtype_to_strr-   9   s     r   c                    SnUR                    GHw  nU[        R                  :X  a  UR                  S;   a  US;  a  M0  UR	                  U5       GH1  nU R                  U5      nUR                  [        ;  d  UR                  U:X  a  M;  UR                  U:X  a  UR                  S-   [        U5      -   n	U R                  R                  U	5      n
U
b  U
R                  U:w  aO  U R                  U	USUR                  S9n
U R                  USSU0S	U
0UR                  U
R                  S
.S9  US-  n[        XR                  U
R                  5        GM  UR!                  S5      (       d  GM  UR#                  SU5        GM4     GMz     U[        R                  :X  Ga*  U[        R$                  :X  Ga  UR&                   GH  nUR                  S;   a  US:w  a  M  UR)                  U5       H  nU R                  U5      n
U
R                  [        ;  a  M*  U
R                  [        R                  :X  d  MJ  U
R*                  R-                  [.        R0                  R2                  R4                  5        UR!                  S5      (       d  M  UR#                  S[.        R0                  R2                  R4                  5        M     GM     U$ )a  
Insert cast op and rename args of input and output.

Args:
    block (Program): The block in which the operator is.
    op (Operator): The operator to insert cast op.
    idx (int): The index of current operator.
    src_dtype (VarType): The input variable dtype of cast op.
    dest_dtype (VarType): The output variable dtype of cast op.

Returns:
    num_cast_op (int): The number of cast ops that have been inserted.
r   )
batch_normfused_bn_add_activation
layer_norm   XZ.cast_Fnamer,   persistablestop_gradientcastr3   Outin_dtype	out_dtypetypeinputsoutputsattrsr   r=   Yr>   )input_namesr*   float32r@   inputvar_valid_typesr,   r7   r-   varsget
create_varr9   
_insert_opr	   has_attr	_set_attrr+   output_namesoutputdesc	set_dtyper   VarDescVarTypeBF16)blockopidx	src_dtype
dest_dtypenum_cast_opsin_namein_var_namein_var	cast_nameout_varout_nameout_var_names                r   _insert_cast_oprd   F   s9    L>>&277 7
 ,

 j(88G,KYY{+F{{,.&,,*2L||y("KK(2]:5NN	**..3?gmmz&A#..&($)&,&:&:	 / G $$# #V}!& 0(.)0 % 	 !A%LBW\\:;;z**LLZ8; - "L FNN"zV__'DHJKO "		( 3))L1<<|3==FNN2LL**4<<+?+?+D+DE{{;//[$,,2F2F2K2KL !4 ( r   c           	      f   SnU R                  U5      nUR                  [        ;  d  UR                  U:X  a  U$ UR                  U:X  d*   S[	        UR                  5       S[	        U5       S35       eUR
                  S-   [	        U5      -   n	U R                  R                  U	5      n
U
b  U
R                  U:w  at  U R                  U	USUR                  S9n
U R                  USS	U0S
U
0UR                  U
R                  S.S9  US-  nU
R
                  X`R                     UR
                  '   U$ )Nr   zThe real dtype(z ) is not equal to the src dtype()r5   Fr6   r:   r3   r;   r<   r?   r   )rH   r@   rI   r,   r-   r7   rJ   rK   rL   r9   rM   rY   )rW   rX   rY   rZ   r[   target_nameop_var_rename_mapr\   
target_varr`   cast_vars              r   _insert_cast_post_oprk      sF    L;'Jl*j.>.>*.Ly( 
-
(8(89::Z[hir[sZttuv( (*]:-FFIzz~~i(H8>>Z7##$22	 $ 
 	$H%)//hnnM 	 	
 	8@))$Z__5r   c                     UR                   (       d  gU R                   H  nX!R                   ;   d  M    g   U R                   H  nX1R                   ;   d  M    g   gNFT)fp32_varnamesinput_arg_namesoutput_arg_names)rX   	amp_listsr]   rb   s       r   _is_in_fp32_varnamesrr      sS    ""%%--- & ''... ( r   c                 L   U R                   U;   a  g/ nUR                  [        U R                  5      5        UR                  [        U R                  5      5        U H  nSU;   d  M    g   U(       a1  U R                  S5      (       a  [        U R                  S5      ;   a  ggg)NTlearning_rateop_namescopeF)r@   extendlistro   rp   rN   _bf16_guard_patternattr)rX   unsupported_op_listuse_bf16_guardin_out_arg_namesr7   s        r   _need_keep_fp32r}      s    	ww%%  D!3!345D!4!456 d" ! ;;~&&277>#::  r   c               #   x   #    [         R                  " [        S9   Sv   SSS5        g! , (       d  f       g= f7f)a2  
As for the pure bf16 training, if users set `use_bf16_guard` to True,
only those ops created in the context manager `bf16_guard` will be
transformed as float16 type.

Examples:
    .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> import paddle.nn.functional as F
        >>> paddle.enable_static()
        >>> data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
        >>> conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)

        >>> with paddle.static.amp.bf16.bf16_guard():
        ...     bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
        ...     pool = F.max_pool2d(bn, kernel_size=2, stride=2)
        ...     hidden = paddle.static.nn.fc(pool, size=10)
        ...     loss = paddle.mean(hidden)
)prefixN)r   
name_scoperx    r   r   
bf16_guardr      s$     . 
		%8	9 
:	9	9s   :)	:
7:c                 8    U  H  nU H  nX1;   d  M
      g   M     grm   r   )post_opskeep_fp32_opspost_oprX   s       r   are_post_ops_bf16r      s$    B"   r   c           	      "   U R                  5       R                  nU GH  n[        UR                  5      UR                  ;   d  M)  Sn/ n	/ n
UR
                   Hw  nUR                  U5       H_  nUR                  U5      n[        X7US5      nUb  UR                  [        ;  a  Sn  MQ  U	R                  U5        U
R                  U5        Ma     My     U(       d  M  [        X5      (       d  M  U
 H  nUR                  [        R                  :X  a=  UR                  R!                  ["        R$                  R&                  R(                  5        Uc  Mc  UR*                  U;   d  Mu  UR-                  UR*                  5        M     UR/                  S5      (       d  GM  UR1                  S5      ["        R$                  R&                  R2                  :X  d  GM  UR5                  S["        R$                  R&                  R(                  5        GM     g )NTFr,   )global_blockopsstrr@   bf16_initializer_listrP   rQ   rH   r   rI   appendr   r,   r*   rF   rR   rS   r   rT   rU   rV   r7   removerN   ry   FP32rO   )startup_progrq   rW   all_opsr   to_bf16_var_namesprepend_opsrX   	change_opop_post_opsop_out_varsrb   rc   ra   r   s                  r   cast_initializers_to_bf16r      s{    ++-11Krww<9:::IKKOO$&IIh$7L#ii5G/\4PG',,l*J$)	&&w/&&w/ %8 , y.{JJ*G}}6..t||/C/C/H/HI)5#LL,==)00>  + KK(((DLL,@,@,E,EELL$,,*>*>*C*CD; r   c                    Uc
  [        5       nU R                  5       n[        5       n[        5       n[        5       n/ nU R                   H  n	UR	                  U	R
                  5        M      U R                   GH
  n	U	R
                  n
U
 GH  nUR                  S:X  d  UR                  S:X  a  M&  [        XR                  U5      (       a  UR                  U5        MT  UR                   GH   nUR                  S;   a  US;  a  M  UR                  U5       H  nSn U	R                  U5      nUb  UR                  [         ;  a  M0  UR"                  [$        R&                  :X  aN  UR(                  R+                  [,        R.                  R0                  R2                  5        UR                  U5        [        R                  S
UR                   SU SUR"                   S35        M     GM     UR4                   H  nUR                  S;   a  US:w  a  M  UR7                  U5       H  nSn U	R                  U5      nUb  UR                  [         ;  a  M0  UR"                  [$        R&                  :X  a=  UR(                  R+                  [,        R.                  R0                  R2                  5        [        R                  S
UR                   SU SUR"                   S35        M     M     S Ht  nUR9                  U5      (       d  M  UR;                  U5      [$        R&                  :X  d  M@  UR=                  U[,        R.                  R0                  R2                  5        Mv     GM     Uc  GM  [?        UUUU
UU5        GM     [A        [C        U R                  5      5       Vs/ s H  n[D        RF                  " 5       PM     nnU R                   GH#  n	U	R
                  n
SnU[C        U
5      :  d  M#  U
U   nSnX;  a_  X;   aX  [I        U	UU[,        R.                  R0                  RJ                  [,        R.                  R0                  R2                  5      nUU-  nGO{[I        U	UU[,        R.                  R0                  R2                  [,        R.                  R0                  RJ                  5      nUU-  nURL                   GH  nU	RN                  RQ                  U5      nUb  UR                  [         ;  a  M8  UR"                  [$        RR                  :X  d  MX  UR(                  R+                  [,        R.                  R0                  RJ                  5        [U        XU5      nU Hk  nUU;   a  M  [W        U	UUU-   S-   [,        R.                  R0                  RJ                  [,        R.                  R0                  R2                  UU5      nUU-  nMm     GM     UUS-   -  nU[C        U
5      :  a  GM   GM&     [Y        U UX5        U$ ! [         aQ  n[        R                  SU S35        UR                  U5      nUb  [        R                  SU S	35         SnAGN3SnAff = f! [         aQ  n[        R                  SU S35        UR                  U5      nUb  [        R                  SU S	35         SnAGNSnAff = fs  snf )a  
Traverse all ops in the whole model and set their inputs and outputs
to the bf16 data type. This function will do some special processing for
the batch normalization, which will keep the batchnorm's computations in FP32.
Args:
    program (Program): The used program.
    amp_lists (AutoMixedPrecisionListsBF16): An AutoMixedPrecisionListsBF16 object.
    use_bf16_guard(bool): Determine whether to use `bf16_guard` when
                          constructing the program. Default True.
Ncreate_py_readerread>   r/   r1   r0   r2   z-- z&, try to get it in the global block --z-- var z is got in the global block --z-- op type: z, in var name: z, in var dtype: z --rD   z, out var name: z, out var dtype: )r=   r>   r,   r   r   )-r   r   setblocksrv   r   r@   r}   unsupported_listaddrE   rG   rH   
ValueError_loggerdebugrI   r,   r*   rF   rR   rS   r   rT   rU   rV   rP   rQ   rN   ry   rO   r   rangelencollectionsOrderedDictrd   r   rp   rJ   rK   r+   r   rk   r
   )programr   rq   r{   r   r   r   to_bf16_pre_cast_ops
origin_opsrW   r   rX   r]   r^   r_   erb   rc   ra   	attr_name_rh   rY   r\   in_var_cast_numpre_cast_numr   r   post_cast_nums                                r   cast_model_to_bf16r   '  s    /1	'')LEM5J%))$   iiBww,,60Ar#=#=~NN!!"%>>77   Z/#%88G#4K!F
!&;!7 ~L)H ||v~~5--dll.B.B.G.GH)--k:MM&rwwi{mK[\b\h\h[iilm+ $5 *@ OOGGNO C$&IIh$7L"G
"'))L"9 ',,l*J }}6..t||/C/C/H/HIMM&rwwi/?~M^_f_l_l^mmpq) %8 ,< @	KK	**	*fnn<LLDLL,@,@,E,EF @I V #%!]  r ,1W^^1D+E+Ea!+E   iiCHnSBL&-&5,,11,,11'O !O3L.LL((--LL((--  ,$&$7$7L#jjnn\:G',,l*J }}7..t||/C/C/H/HI#4Sl#K'/G&-7 (,@ % " #l 2Q 6 $ 4 4 9 9 $ 4 4 9 9 , 1-M )M9L (0 %8( <!##CU CHnn  ^ W/Kq & !!$JK ".!1!1+!>!-#MM")+6T U@ & !!$JK #/"2"2<"@".#MM"),7U VJs8   X-Y)'[
Y&AY!!Y&)
[3AZ??[c                    / nUR                    H"  nUR                  UR                  5       5        M$     U(       a  UO	[        5       nU(       a  UO	[	        5       nU H  nUR
                  U;   d  M  [        R                  SUR
                   S35        UR                  UR
                  5      R                  5       n	[        R                  " U	5      n
U	R                  [        U
5      U 5        M     g)a  
Traverse all parameters in the whole model and set them to the BF16 data type.
Whereas, this function will keep parameters of batchnorms in FP32.
Args:
    place(base.CPUPlace|base.CUDAPlace): `place` is used to restore the BF16 weight tensors.
    program (Program): The used program.
    scope(base.Scope, optional): `scope` is used to get the FP32 weight tensor values.
                                  Default is None.
    to_bf16_var_names(set|list, optional): The data types of vars in `to_bf16_var_names`
                                           will be set to BF16. Usually, it is the returned
                                           value of `cast_model_to_bf16` API.
z
---- cast z to bf16 dtype ----N)r   rv   all_parametersr   r   r7   r   r   find_var
get_tensorr   arrayr&   )placer   scoper   r   rW   bf16_var_names	var_scopeparamparam_tdatas              r   cast_parameters_to_bf16r     s     Ne2245   +<&NLNI::'MMJuzzl2EFG((4??AG88G$DKK/5u=  r   c                    Uc
  [        5       nU R                  5       nUR                  n[        5       n[        5       nU GH  nUR                  S:X  d  UR                  S:X  a  M&  UR
                  b#  [        Xa5      (       a  UR                  U5        MV  UR                  UR                  ;   a  UR                  U5        M  UR                  UR                  ;   a  UR                  U5        M  UR                  UR                  ;   Ga  SnSnUR                   H  n	U	(       d  M  UR                  U	5       H  n
UR                  U
5      nUR                  c  M#  UR                  UL a  [        X6U
5      nUc  MC  OUR                  nX;   d  UR                  UR                  ;   a  SnMs  X;   d  UR                  UR                  ;   d  M  SnM     M     U(       a  UR                  U5        GM  U(       a  UR                  U5        GM  GM  UR                  U5        GM     SnU[!        U5      :  GaT  X=   nSnXe;   aT  [#        UUU[$        R&                  R(                  R*                  [$        R&                  R(                  R,                  5      nOXd;   a  UR/                  S5      (       ak  UR1                  S5      [$        R&                  R(                  R,                  :X  a4  UR3                  S[$        R&                  R(                  R*                  5        [#        UUU[$        R&                  R(                  R,                  [$        R&                  R(                  R*                  5      nO XS-   -  nU[!        U5      :  a  GMS  gg)	a  
Traverse all ops in current block and insert cast op according to
which set current op belongs to.

1. When an op belongs to the fp32 list, add it to fp32 set
2. When an op belongs to the bf16 list, add it to bf16 set
3. When an op belongs to the gray list. If one
   of its inputs is the output of fp32 set op or fp32 list op,
   add it to fp32 set. If all of its previous ops are not fp32
   op and one of its inputs is the output of bf16 set op or
   bf16 list op, add it to bf16 set.
4. When an op isn't in the lists, add it to fp32 op set.
5. Add necessary cast ops to make sure that fp32 set op will be
   computed in fp32 mode, while bf16 set op will be computed in
   bf16 mode.

Args:
    main_prog (Program): The main program for training.
Nr   r   FTr   r,   r   )r   r   r   r   r@   rn   rr   r   	fp32_list	bf16_list	gray_listrE   rG   rH   rX   r   r   rd   r   rT   rU   rV   r   rN   ry   rO   )	main_progrq   rW   r   bf16_op_setfp32_op_setrX   
is_fp32_op
is_bf16_opr]   r^   r_   prev_oprY   r\   s                  r   rewrite_program_bf16r     s   ( /1	""$E
))C%K%K
 77((BGGv,="".3G4
 4
 OOB77i)))OOBWW	+++OOBWW	+++JJ>>7')xx'8!&;!7!99,$#YY"_&7&MG& (  / '-iiG $2&||y/B/BB)-J#2&||y/B/BB)-J+ (9 *2 ## OOBo r C
C.X*$$))$$))L G$$GGG$(<(<(A(AAWdll&:&:&?&?@*$$))$$))L a9 C..r   )N)NNT)NN)+r   loggingr   numpyr   r*   paddle.baser   r   r   paddle.base.log_helperr   paddle.base.wrapped_decoratorr   
fp16_utilsr	   r
   r   r   rq   r   __name__INFOr   rT   rU   DENSE_TENSORSELECTED_ROWSDENSE_TENSOR_ARRAYrI   rx   r&   r-   rd   rk   rr   r}   r   r   r   r   r   r   r   r   r   <module>r      s          5 5 - G  3
gll H
 	LL%%LL&&LL++ % *
FRD6  4 &ET @DcL>6p r   