
    ΑiG                         S SK r S SKJr  S SKJs  Jr  S SK Jr  S r	S r
 " S S5      r SS jrS rS	 r     SS
 jr   SS jr " S S\ R"                  R$                  5      rg)    N)_C_opsc                 $   U R                   nU R                  nUR                  R                  S5      nUR                  U   nUS:  a  [
        R                  " XS-  US9n/ n[        U5       H.  nUR                  U5        UR                  US-  S-
  U-
  5        M0     Un	U	 Vs/ s H  oU   PM	     n
n[
        R                  " XS9n[
        R                  R                  U5      X$'   [
        R                  R                  XU5      n U $ s  snf )Nsep      num_or_sectionsaxisr
   )
placementsprocess_mesh	dim_namesindexshapepaddlesplitrangeappendconcatdistributedShardreshard)tensorseq_dimr   r   cp_index	cp_degreesliced_datasindicesireorder_indices	reorderedreordered_tensors               o/var/www/html/banglarbhumi/venv/lib/python3.13/site-packages/paddle/distributed/auto_parallel/ring_attention.pyshard_seq_load_balancer$      s   ""J&&L%%++E2H""8,I1}||M
 y!ANN1NN9q=1,q01 " ".=>o!_o	>!==A%1177@
##++J
 M ?s   &Dc                 2   U R                   nU R                  nUR                  R                  S5      nUR                  U   n[
        R                  " X[
        R                  " 5       /5      n[        R                  " XeS-  US9n/ n[        U5       H.  n	UR                  U	5        UR                  US-  S-
  U	-
  5        M0     S/[        U5      -  n
[        U5       H	  u  pXU'   M     U
 V	s/ s H  oU	   PM	     nn	[        R                  " XS9$ s  sn	f )Nr   r   r   r   r   r   )r   r   r   r   r   distr   	Replicater   r   r   r   len	enumerater   )r   r   r   r   r   r   
all_tensorr   r    r   inverse_indicesidxvrestoreds                 r#   unshard_seq_load_balancer/   1   s   ""J&&L%%++E2H""8,IfT^^5E4FGJ<<ML O9q!y1}q0145  cC00OO,  -)89AQH9==00 :s   0Dc                   2    \ rS rSrS rS rS rS rS rSr	g)	RingCommunicatorF   c                    UR                  5       R                  5       UR                  5       R                  5       /U l        UR                  5       R                  5       UR                  5       R                  5       /U l        SU l        Xl        [        R                  R                  5       n[        R                  " 5       nUR                  SU5      U l        UR                  S5      U l        UR                  R                  S5      nU R
                  R                   U R                  S-   U R                  -     U l        U R
                  R                   U R                  S-
  U R                  -     U l        / U l        g )Nr   r   r   )clone
contiguous	_k_buffer	_v_buffer_next_buffer_idxgroupr&   auto_parallelget_meshget_rankget_rank_by_dim_and_process_id
group_rankget_dim_sizecp_sizer   r   ranks	send_rank	recv_rank_reqs)selfr9   	local_keylocal_valuemesh
process_idr   s          r#   __init__RingCommunicator.__init__G   s*   OO((*OO((*

 **,**,

 !"
!!**,]]_
==eZP((/>>''.))__q DLL0
 ))4??Q+>$,,*NO
    c                 @    [         R                  R                  5         g N)r   devicesynchronizerE   s    r#   waitRingCommunicator.waita   s    !!#rL   c                    UR                   U R                  U R                     R                   :w  a  U R                  U R                     S S 2S UR                   S   2S S 2S S 24   R                  U5        U R                  U R                     S S 2S UR                   S   2S S 2S S 24   R                  U5        g U R                  U R                     R                  U5        U R                  U R                     R                  U5        g )Nr   )r   r6   r8   add_r7   )rE   keyvalues      r#   add_to_buffersRingCommunicator.add_to_buffersd   s    99t'<'<=CCCNN4001!^syy|^Q2IJOO NN4001!^syy|^Q2IJOO NN400166s;NN400166u=rL   c                 f    U R                   U R                     U R                  U R                     4$ rN   )r6   r8   r7   rQ   s    r#   get_buffersRingCommunicator.get_buffersp   s0    NN4001NN4001
 	
rL   c                 P   [         R                  " [         R                  U R                  U R                     R                  5       U R                  U R                  5      n[         R                  " [         R                  U R                  U R                     R                  5       U R                  U R                  5      n[         R                  " [         R                  U R                  U R                  S-   S-     U R                  U R                  5      n[         R                  " [         R                  U R                  U R                  S-   S-     U R                  U R                  5      nU R                  S-   S-  U l        XX4/n[         R                  " U5      U l        g )Nr   r   )r&   P2POpisendr6   r8   r5   rB   r9   r7   irecvrC   batch_isend_irecvrD   )rE   	send_k_op	send_v_op	recv_k_op	recv_v_opopss         r#   	send_recvRingCommunicator.send_recvv   s8   JJJJNN4001<<>NNJJ	
	 JJJJNN4001<<>NNJJ	
	 JJJJNND11A5:;NNJJ	
	 JJJJNND11A5:;NNJJ	
	 "&!6!6!:a ?Y:++C0
rL   )	r6   r8   rD   r7   r@   r9   r>   rC   rB   N)
__name__
__module____qualname____firstlineno__rJ   rR   rX   r[   rg   __static_attributes__ rL   r#   r1   r1   F   s    4$
>
1rL   r1   c                 v   U(       a  U S S 2U R                   S   S-  S 2S S 2S S 24   nUS S 2UR                   S   S-  S 2S S 2S S 24   n[        XVX#5      u  pVXPS S 2U R                   S   S-  S 2S S 2S S 24'   XaS S 2UR                   S   S-  S 2S S 2S S 24'   X4$ [        R                  " US5      [        R                  " US5      p2[        R                  R                  SS9   X U-
  [        R                  " X1-
  5      -  -
  U[        R                  " X-
  5      -
  4sS S S 5        $ ! , (       d  f       g = f)Nr   r   float32F)enable)	r   update_out_and_lser   castamp	auto_castFsigmoidlog_sigmoid)old_outold_lse	block_out	block_lsesecond_chunk_onlysecond_chunk_outsecond_chunk_lses          r#   rr   rr      s6    "1gmmA&6!&;&=q!#CD"1gmmA&6!&;&=q!#CD-?	.
* 5E7==#q(*Aq014D7==#q(*Aq01 KK	9-KK	9-  ZZ!!!/	1QYY#6  w':;;< 0//s   %;D**
D8c                     U SU-  S-
  U -
  4$ )Nr   r   rn   )rankr@   s     r#   get_chunk_idr      s    !g+/D())rL   c                 z    [        U 5      SU-  :X  d   e[        X5      u  p4[        R                  " X   X   /SS9$ )Nr      r   )r(   r   r   r   )attn_masks_listr   r@   first_chunk_idsecond_chunk_ids        r#   concat_masksr      sH    1w;...&24&A#N==		(/*JK rL   c	                 >   U R                   n	U R                  n
[        XU5      nUR                  S   nUb  [        R
                  " XIS-  SS9nU(       a"  US S 2US-  S 2S S 2S S 24   R                  5       n[        U	5       GHF  nUR                  5       u  nnXS-
  :w  a  UR                  5         U(       d  [        R                  " UUUUUc  S O[        WX-
  U	-  U	5      USSU(       + S5
      u  nnnn[        R                  " [        R                  " U/ SQ5      SS	9  US
:X  a  UUnnGO|[        WWUU5      u  nnGOiUS
:X  a[  [        R                  " UUUUS USSU(       + S5
      u  nnnn[        R                  " [        R                  " U/ SQ5      SS	9  UUnnGOX:  ay  [        R                  " WUUUS USSU(       + S5
      u  nnnnUS S 2S S 2S
US-  24   n[        R                  " [        R                  " U/ SQ5      SS	9  [        WWUUS5      u  nnO[        R                  " UUS S 2S US-  2S S 2S S 24   US S 2S US-  2S S 2S S 24   US USSU(       + S5
      u  nnnn[        R                  " [        R                  " U/ SQ5      SS	9  [        WWUU5      u  nn[        R                   R#                  5         GMI     [        R$                  " WUR&                  5      n[        R                  " [        R(                  " WSS	9/ SQ5      nUU4$ )Nr   r   r   r   F )r   r   r   r   r   T)
world_sizer   r1   r   r   r   r5   r   r[   rg   r   
flash_attnr   
unsqueeze_
transpose_rr   rO   rP   rs   dtypesqueeze)r9   local_queryrF   rG   	attn_maskdropout	is_causalfixed_seed_offsettrainingr@   r>   comm_bufferlocal_q_seq_lenr   local_query_second_chunkstepblock_kblock_vr{   _r|   outlses                          r#   !ring_flash_attention_forward_funcr      sX    GJ"5[AK!''*O ,,{
 #.!#%q!+$

*, 	! g&224Q;!!# *0):):! !( %'**;w)F !*&Iq)Q$ f//	9EBOqy$iSS-c3	9MSqy-3->->% L.*	1i !!%%i;" %iSS"-3->->,% L.*	1i &aAA1E,F&FG	!!%%i;" .iDS .4->->A5A!55q!;<A5A!55q!;<% L.*	1i !!%%i;" .c3	9MS!!#o p ++c;,,
-C


FNN3R8)
DC8OrL   c                    U R                   nU R                  nUR                  5       nUR                  S   n[        R
                  " U5      n[        R
                  " U5      n[        R
                  " U5      n[        XU5      n[        XU5      nU	(       a[  US S 2US-  S 2S S 2S S 24   nUS S 2US-  S 2S S 2S S 24   nUS S 2S S 2US-  S 24   R                  5       nUS S 2US-  S 2S S 2S S 24   nUb  [        R                  " X{S-  SS9n[        U5       GH  nUR                  5       u  nnUUS-
  :w  a  UR                  5         U	(       dK  [        R                  " UUUUUU
Uc  S O[        WUU-
  U-  U5      UUS5
      u  nnnUR                  U5        OUS:X  a5  [        R                  " UUUUUU
S UUS5
      u  nnnUR                  U5        OUU:  aG  [        R                  " WUUWWU
S WUS5
      u  nnnUS S 2US-  S 2S S 2S S 24   R                  U5        OX[        R                  " UUS S 2S US-  2S S 2S S 24   US S 2S US-  2S S 2S S 24   UUU
S UUS5
      u  nnnUR                  U5        [        R                  R!                  5         UR#                  UR                  5       UR                  5       5        UR                  5         UR%                  5         GM     UR                  5       u  nnXU4$ )Nr   r   r   r   Fr   T)r   r   r5   r   r   
zeros_liker1   r   r   r[   rg   r   flash_attn_gradr   rU   rO   rP   rX   rR   )r9   local_out_gradr   rF   rG   	local_outr   r   r   r   r   r@   r>   r   query_grad_bufferkey_grad_buffervalue_grad_bufferkv_comm_buffergrad_comm_bufferr   local_out_second_chunklse_second_chunkout_grad_second_chunkr   r   r   r   block_q_gradblock_k_gradblock_v_grads                                 r#   "ring_flash_attention_backward_funcr   0  sh    GJ
..
C!''*O))+6''	2O))+6%eDN' 1 #.q/Q2F2H!Q/N#O !*1o.B.Da+J!Kq!_%9%;;<GGI .q/Q2F2H!Q/N O ,,{
 g)5577Q;$$&7=7M7M! !( %'*t*;w)F !84L,$ ""<0qy**#!)& 9lL "&&|4
"**0.()- 9lL "!_%9%;Q"ABGG 
 **##9_%9#91a ?@#9_%9#91a ?@!)& 9lL "&&|4!!#''##%|'>'>'@	
 	""$g h *:)E)E)G&O&/@@@rL   c                   B    \ rS rSr\     SS j5       r\S 5       rSrg)RingFlashAttentioni  Nc	                    US:  a  [        S5      e[        R                  R                  5       n	U	R                  R                  S5      n
[        R                  " 5       nU	R                  SU5      n[        R                  " 5         U	R                  S5      n[        R                  R                  R                  XR                  UR                  5      n[        R                  R                  R                  X"R                  UR                  5      n[        R                  R                  R                  X3R                  UR                  5      nUb  Sn[        UUUUUUUUU5	      u  nnU R                  XX#UUU5        Xpl        XPl        X`l        [        R                  R                  R'                  UUR                  UR                  5      nUR)                  5       $ )N        z/Dropout is not supported in ring attention yet.r   F)NotImplementedErrorr&   r:   r;   r   r   r<   r=   init_parallel_env
_get_groupapidtensor_to_localr   r   r   save_for_backwardr   r   r   dtensor_from_localr5   )ctxqueryrV   rW   r   r   r   r   r   rH   r   rI   r   r9   r   rF   rG   r   r   out_dtensors                       r#   forwardRingFlashAttention.forward  s    S=%A  !!**,>>''.]]_
225*E &((,,==%%u'7'7
 &&**;;!!3>>
	 ((,,==%%u'7'7
  I4

S 	eCS)L 1!((,,??##U%5%5
 %%''rL   c                 8   [         R                  R                  5       nUR                  R	                  S5      nU R                  5       u  pEpgpn
U R                  nU R                  nU R                  nUc9  [        R                  " SS/[        R                  " 5       [        R                  S9n[         R                  R                  R                  XUR                  UR                   5      n[         R                  R                  R                  XfR                  UR                   5      n[         R                  R                  R                  XwR                  UR                   5      n[         R                  R                  R                  XR                  UR                   5      n[#        UUUUUUU	U
UUU5      u  nnn[         R                  R                  R%                  UUR                  UR                   5      n[         R                  R                  R%                  UUR                  UR                   5      n[         R                  R                  R%                  UUR                  UR                   5      nU
b  U
R&                  (       d  UUUS 4$ UUU4$ )Nr   r   )placer   )r&   r:   r;   r   r   saved_tensorr   r   r   r   	to_tensorCPUPlaceint64r   r   r   r   r   r   stop_gradient)r   out_gradrH   r   r9   r   rV   rW   r   r   r   r   r   r   r   rF   rG   r   
query_gradkey_grad
value_gradquery_grad_dtensorkey_grad_dtensorvalue_grad_dtensors                           r#   backwardRingFlashAttention.backward  sF   !!**,>>''.8;8H8H8J5c#I11++MM	$ & 0 0Afoo/v||! ((,,==%%u'7'7
 &&**;;!!3>>
	 ((,,==%%u'7'7
 ++//@@++X-@-@
 ,N,
(
Hj "//33FF**E,<,<
  --11DDc&&
 "//33FF**E,<,<
  )@)@" "	  &'79KKKrL   rn   Nr   FNT)ri   rj   rk   rl   staticmethodr   r   rm   rn   rL   r#   r   r     s<     4( 4(l 7L 7LrL   r   )Fr   )r   FN)r   paddle.distributedr   r&   paddle.nn.functionalnn
functionalrv   r   r$   r/   r1   rr   r   r   r   r   autogradPyLayerr   rn   rL   r#   <module>r      s      !     61*N1 N1d ?D<.* r| {A|pL00 pLrL   