
    Αi1                    (   S SK Jr  S SKrS SKJrJrJr  S SKrS SK	r	S SK	J
r
  S SKJr  S SKJr  SSKJr  SS	KJr  S
SKJrJrJrJr  \(       a#  S SKJr  S SKJr  S SKJr  S SKJr   " S S\5      r / r!S r"S r# " S S\5      r$          SS jr%g)    )annotationsN)TYPE_CHECKINGAny	TypedDict)	framework)PyLayer)core   )get_rng_state_tracker)utils   )check_recompute_necessarycustom_state_managerdetach_variableswitch_rng_state_tracker)Callable)NotRequired)Group)Layerc                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg)	_Ctx+   r   mp_groupzNotRequired[bool]offload	partition N)__name__
__module____qualname____firstlineno____annotations____static_attributes__r       s/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/fleet/recompute/recompute_hybrid.pyr   r   +   s    ""$$r#   r   c                    UR                   nUR                  nUS:  a  U $ [        R                  " U 5      nUS:w  d   S5       eXB-  S:X  d   SU SU S35       eU R	                  5       nXB-  nXc-  nXv-   nXWU $ )Nr
   r   zcan't recompute zero elementz The capacity of the activation (z#) cannot be divisible by mp_degree())nranksrankpaddlenumelflatten_)	tensorr   	mp_degreemp_ranktensor_numeldata	part_sizestartends	            r$   _split_activationr4   4   s    ImmG1}<<'L1<<<#q( 
*<.8[\e[ffgh(
 ??D)IE

Cc?r#   c                R   UR                   nUR                  nUS:  a  U $ [        U R                  5      nUS==   UR                   -  ss'   [        R
                  " X@R                  5      nUR                  R                  U R                  5       U5      nUR                  5         U$ )Nr
   r   )r'   r(   listshaper)   emptydtypeprocess_group
all_gathercudawait)r,   r   r-   r.   tensor_shapeouttasks          r$   _merge_activationrA   I   s    ImmG1} %LOx&O
,,|\\
2C!!,,V[[]C@DIIKJr#   c                  8    \ rS rSrSr\S 5       r\S 5       rSrg)_HPRecomputeFunctionX   ab  
Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
2. Offload support for activation
3. Support MP segmentation of activation to further reduce cuda memory
4. Adapt to the random state of MP
c                ^   Xl         Xl        [        R                  " 5       U l        [        5       R                  5       U l        [        R                  R                  5       U l        [        R                  " 5       U l        U" 5       U l        X`l        Xpl        X0l        X@l        XPl        / U l        / U l        / U l        / n
[        R.                  " 5       nS[        R.                  " 5       ;   dX  S[        R.                  " 5       ;   d?  UR1                  S5      S   [        R2                  R5                  5       ;   d   SU S35       e[6        R8                  " 5       nUR:                  [<        R>                  R@                  :X  a  SOSU l!        UR:                  [<        R>                  RD                  :X  a  S	U l#        ObUR:                  [<        R>                  RH                  [<        R>                  R@                  4;   a  S
U l#        O[K        SUR:                   35      eURL                  U l'        URQ                  5       u  U l)        U l*        [        RV                  " 5          U" U0 U	D6nS S S 5        [Y        U5       GHY  u  p[        RZ                  " U5      (       Ga  UR\                  nU(       ag  U R,                  R_                  UR`                  5        [c        URe                  5       U5      Rg                  5       nU(       a  URi                  5       OUnOU(       a  URi                  5       OUnUUl.        U
R_                  U5        U R*                  R_                  U5        U R(                  R_                  S 5        [6        Rj                  " 5       (       a  U(       a  U Rm                  U5        GM8  GM;  GM>  U R(                  R_                  U5        GM\     U Rn                  " U
6   [        RZ                  " W5      (       a  X-/-  nU$ X--  n[q        U5      $ ! , (       d  f       GN= f)Nzgpu:zxpu::r   z2Recompute with RNG is not support current device: .FTO2O1zunsupported amp level: )9run_functionkwargsr)   get_rng_statefwd_rng_stater   get_states_trackerfwd_rng_state_trackernprandom	get_statefwd_numpy_stategetstatefwd_random_statefwd_custom_statecustom_get_state_funccustom_set_state_funcr   r   r   inputstensor_indicestensor_shapes
get_devicesplitdeviceget_all_custom_device_typer   _dygraph_tracer
_amp_levelr	   AmpLevelO0is_fw_autocastrH   	amp_levelrI   
ValueError
_amp_dtype	amp_dtype_get_amp_op_listamp_white_listamp_black_listno_grad	enumerate	is_tensorstop_gradientappendr7   r4   detachclonecpuin_dynamic_modemark_non_differentiablesave_for_backwardtuple)ctxrJ   all_outputsr   r   r   rW   rX   argsrK   tensor_inputs
cur_devicetraceroutputsiargstates                    r$   forward_HPRecomputeFunction.forwarda   sF    (
 #002$9$;$N$N$P! ii113%046$9!$9!  ! 
&&(
f''))**,,$Q'}}779:	N
 @
|1M	N: **,&&$--*:*::E 	  0 00 CM4==#3#3T]]5E5E"FF CM6v7H7H6IJKK))171H1H1J.C.^^"D3F3G   oFA$$))%%,,SYY7 1

h!eg  .5)--/)C'.#'')CC$)!$$S)""))!,

!!$' ,,..5//4 49. 

!!#&; &> 	}-G$$9$KN"K>!S s   #	P
P,c           
        [         R                  R                  R                  5          [	        U R
                  5      nU R                  nU R                  n[	        U R                  5       5      n[         R                  R                  5       R                  n[        U5       H  u  pxU R                  (       aP  XW   R                  n	[        XW   U R                   5      R#                  5       R%                  XG   5      XW'   XU   l        U R&                  (       a  XW   R)                  U5      OXW   X('   M     [*        R,                  " 5       n
SU
l        [1        U R2                  U R4                  U R6                  U R8                  U R:                  U R<                  U R>                  5         U R@                  (       a  [         RB                  RE                  U R@                  U RF                  U RH                  U RJ                  U RL                  S9   [O        [Q        U5      5      nU RR                  " U0 U RT                  D6nS S S 5        O0[O        [Q        U5      5      nU RR                  " U0 U RT                  D6nS S S 5        [W        W[X        RZ                  R\                  5      (       a  U4n[_        U5      [_        U5      :X  d   e/ n/ n[a        [_        U5      5       Hk  n[W        X   [X        RZ                  R\                  5      (       d  M0  X   R                  (       a  ME  URc                  X   5        URc                  X   5        Mm     [_        U5      S:X  a  [e        S5      e[         Rf                  Ri                  X5        [Q        S W 5       5      nUsS S S 5        $ ! , (       d  f       GN:= f! , (       d  f       GND= f! , (       d  f       g = f)NT)enablecustom_white_listcustom_black_listlevelr9   r   zInone of output has stop_gradient=False, this recompute() is not necessaryc              3     #    U  H@  n[        U[        R                  R                  5      (       d  M.  UR	                  5       v   MB     g 7fN)
isinstancer	   eagerTensor
_grad_ivar).0inps     r$   	<genexpr>0_HPRecomputeFunction.backward.<locals>.<genexpr>  s7      *Cc4::#4#45 !  *s
   -A
A
)5r)   basedygraphguardr6   rY   rZ   r[   saved_tensordistributedParallelEnv	device_idrm   r   ro   rA   r   rq   reshape_r   r<   r   r`   	_has_gradr   rM   rO   rS   rU   rV   rW   rX   rd   amp	auto_castrj   rk   re   rh   r   rw   rJ   rK   r   r	   r   r   lenrangerp   RuntimeErrorautogradbackward)rx   rz   rY   rZ   r[   tensorsr   r   idxr   r}   detached_inputsr~   forward_outputs_with_gradbackward_inputsgradss                   r$   r   _HPRecomputeFunction.backward   s   [[  &&(#**%F //N--M3++-.G**668BBI#N3==#J44E)'*cllC!-"23 J
 05AJ,25++GJOOI.7:  4 ..0F#F *!!))##$$$$)))) %%--"11*-*<*<*-*<*<!mm!mm .  +:%-*H"%"2"2,#03

#  '6eFm&DO!..N3::NG/2 '4::#4#455"*w<3t9,,,(*% O3w<(wz4::+<+<==#J444-44WZ@#**473 ) ,-2"_ 
 OO$$%>P * E
 [ )(F  3 )(sK   FO*A%N61N$ 9N69BOO$A6O$
N3.N66
O	 O
Or   N)	r   r   r   r    __doc__staticmethodr   r   r"   r   r#   r$   rC   rC   X   s4     d" d"L N Nr#   rC   c           	        U R                  SS5      nUc   S5       eU R                  SS5      nU R                  SS5      n[        R                  " 5       R                  (       a  [	        U5        [
        R                  c  [
        R                  b   eSS jnSS jnO [
        R                  n[
        R                  n/ n	[        R                  " UU	UUUUU/UQ70 UD6  [        U	5      S	:X  a  U	S
   $ U	 HD  n
[        R                  " U
5      (       d  M   [        R                  " U
5      (       a  M=  SU
l        MF     [!        U	5      $ )a  
recompute intermediate activations to save the memory in hybrid parallel scene.
# NOTE(shenliang03)The current hybrid parallel recompute has limitations.
# It cannot handle the following situations:
# 1. The calculation output of recompute, there are tensors that do not require gradients.
# 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach().
# 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor

Parameters:
    ctx(dict): include 'mp_group', 'offload', and 'partition' keys. the key 'mp_group' (Group), represents the activations are splitted
               in which group. the key 'offload' (bool, optional, default=False), represents whether to offload to cpu. the key 'partition' (bool, optional, default=False),
               represents whether to split activations in the mp_group.
    function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model
          whose intermediate activations will be released to save memory in forward stage and will be recomputed
          in backward stage for gradient calculation.
    *args(Tensor): inputs(tuple) to the function.

    **kwargs(Dict): inputs(dict) to the function.

Returns:
    Output of function on args and kwargs.

r   Nz8ctx must contains mp_group and mp_group can not be None.r   Fr   c                    g r   r   xs    r$   <lambda>"recompute_hybrid.<locals>.<lambda>A      tr#   c                    g r   r   r   s    r$   r   r   B  r   r#   r   r   Tr   )getr   r`   r   r   r   rW   rX   rC   applyr   r)   rn   r   is_float_tensorro   rw   )rx   functionrz   rK   r   r   r   rW   rX   ry   outputs              r$   recompute_hybridr     s=   4 wwz4(H B ggi'GU+I  ",,!$'119#99AAA 3 3 4 J J 4 J JK
 

 
 ;11~!F''0E0Ef0M0M'+$ " [!!r#   )
rx   r   r   zLayer | Callable[..., Any]rz   r   rK   r   returnr   )&
__future__r   rQ   typingr   r   r   numpyrP   r)   r   paddle.autogradr   paddle.frameworkr	   $meta_parallel.parallel_layers.randomr   meta_parallel.pp_utilsr   	recomputer   r   r   r   collections.abcr   typing_extensionsr   &paddle.distributed.communication.groupr   	paddle.nnr   r   __all__r4   rA   rC   r   r   r#   r$   <module>r      s    #  0 0    # ! H *  (-<%y % *7 DA"	A"3A"<?A"KNA"A"r#   