
    x-jG                         d dl Z d dlmZ d dlmc mZ d dl mZ d Z	d Z
 G d d          Z	 ddZd	 Zd
 Z	 	 	 	 	 ddZ	 	 	 ddZ G d de j        j                  ZdS )    N)_C_opsc                    | j         }| j        }|j                            d          }|j        |         }|dk    rt          j        | |dz  |          g }t          |          D ]5}|                    |           |                    |dz  dz
  |z
             6|}fd|D             }	t          j	        |	|          }
t
          j
                            |          ||<   t
          j
                            |
||          } | S )Nsep      num_or_sectionsaxisc                      g | ]
}|         S  r   .0isliced_datass     o/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/paddle/distributed/auto_parallel/ring_attention.py
<listcomp>z*shard_seq_load_balance.<locals>.<listcomp>'   s    >>>\!_>>>    r
   )
placementsprocess_mesh	dim_namesindexshapepaddlesplitrangeappendconcatdistributedShardreshard)tensorseq_dimr   r   cp_index	cp_degreeindicesr   reorder_indices	reorderedreordered_tensorr   s              @r   shard_seq_load_balancer*      s'   "J&L%++E22H"8,I1}}|IM
 
 
 y!! 	2 	2ANN1NN9q=1,q01111!>>>>o>>>	!=AAA%177@@
8#++lJ
 
 Mr   c                 "   | j         }| j        }|j                            d          }|j        |         }t          j        | |t          j                    g          }t          j	        ||dz  |          g }t          |          D ]5}|                    |           |                    |dz  dz
  |z
             6dgt          |          z  }	t          |          D ]
\  }
}|
|	|<   fd|	D             }t          j        ||          S )Nr   r   r   r   r   c                      g | ]
}|         S r   r   r   s     r   r   z,unshard_seq_load_balance.<locals>.<listcomp>B   s    999AQ999r   r   )r   r   r   r   r   distr!   	Replicater   r   r   r   len	enumerater   )r"   r#   r   r   r$   r%   
all_tensorr'   r   inverse_indicesidxvrestoredr   s                @r   unshard_seq_load_balancer6   1   s6   "J&L%++E22H"8,IflT^5E5E4FGGJ<IM  L O9 6 6q!!!y1}q0145555cC000OO,, ! !Q 9999999H=0000r   c                   ,    e Zd Zd Zd Zd Zd Zd ZdS )RingCommunicatorc                    |                                                                 |                                                                 g| _        |                                                                 |                                                                 g| _        d| _        || _        t          j                                        }t          j	                    }|
                    d|          | _        |                    d          | _        |j                            d          }| j        j        | j        dz   | j        z           | _        | j        j        | j        dz
  | j        z           | _        g | _        d S )Nr   r   r   )clone
contiguous	_k_buffer	_v_buffer_next_buffer_idxgroupr-   auto_parallelget_meshget_rankget_rank_by_dim_and_process_id
group_rankget_dim_sizecp_sizer   r   ranks	send_rank	recv_rank_reqs)selfr?   	local_keylocal_valuemesh
process_idr$   s          r   __init__zRingCommunicator.__init__G   s=   OO((**OO((**

 **,,**,,

 !"
!**,,]__
==eZPP((//>''..)_q DL0
 )4?Q+>$,*NO


r   c                 B    t           j                                         d S N)r   devicesynchronizerK   s    r   waitzRingCommunicator.waita   s    !!#####r   c                    |j         | j        | j                 j         k    r| j        | j                 d d d |j         d         d d d d f                             |           | j        | j                 d d d |j         d         d d d d f                             |           d S | j        | j                                     |           | j        | j                                     |           d S )Nr   )r   r<   r>   add_r=   )rK   keyvalues      r   add_to_bufferszRingCommunicator.add_to_buffersd   s    9t'<=CCCN401!!!^sy|^QQQ2IJOO   N401!!!^sy|^QQQ2IJOO     N40166s;;;N40166u=====r   c                 J    | j         | j                 | j        | j                 fS rR   )r<   r>   r=   rU   s    r   get_bufferszRingCommunicator.get_buffersp   s&    N401N401
 	
r   c                    t          j        t           j        | j        | j                                                 | j        | j                  }t          j        t           j        | j        | j                                                 | j        | j                  }t          j        t           j	        | j        | j        dz   dz           | j
        | j                  }t          j        t           j	        | j        | j        dz   dz           | j
        | j                  }| j        dz   dz  | _        ||||g}t          j        |          | _        d S )Nr   r   )r-   P2POpisendr<   r>   r;   rH   r?   r=   irecvrI   batch_isend_irecvrJ   )rK   	send_k_op	send_v_op	recv_k_op	recv_v_opopss         r   	send_recvzRingCommunicator.send_recvv   s#   JJN401<<>>NJ	
 
	 JJN401<<>>NJ	
 
	 JJND1A5:;NJ	
 
	 JJND1A5:;NJ	
 
	 "&!6!:a ?)Y	:+C00


r   N)__name__
__module____qualname__rP   rV   r[   r]   rh   r   r   r   r8   r8   F   s_          4$ $ $
> 
> 
>
 
 
1 1 1 1 1r   r8   Fc                 x   |r| d d | j         d         dz  d d d d d f         }|d d |j         d         dz  d d d d d f         }t          ||||          \  }}|| d d | j         d         dz  d d d d d f<   ||d d |j         d         dz  d d d d d f<   | |fS t          j        |d          t          j        |d          }}t          j                            d          5  | | |z
  t          j        ||z
            z  z
  |t          j        ||z
            z
  fcd d d            S # 1 swxY w Y   d S )Nr   r   float32F)enable)	r   update_out_and_lser   castamp	auto_castFsigmoidlog_sigmoid)old_outold_lse	block_out	block_lsesecond_chunk_onlysecond_chunk_outsecond_chunk_lses          r   ro   ro      s     <"111gmA&6!&;&=&=qqq!!!#CD"111gmA&6!&;&=&=qqq!!!#CD-?.	9.
 .
** 5E7=#q(**AAAqqq014D7=#q(**AAAqqq01 K	9--K	9-- 	 Z!!!// 	< 	<g	1QYG#6 6  w':;;;<	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	<s   (:D//D36D3c                     | d|z  dz
  | z
  fS )Nr   r   r   )rankrF   s     r   get_chunk_idr      s    !g+/D())r   c                     t          |           d|z  k    sJ t          ||          \  }}t          j        | |         | |         gd          S )Nr      r   )r/   r   r   r   )attn_masks_listr~   rF   first_chunk_idsecond_chunk_ids        r   concat_masksr      sa    1w;....&24&A&A#NO=		(//*JK   r           Tc	                 *   | j         }	| j        }
t          | ||          }|j        d         }|t	          j        ||	dz  d          }|r)|d d |dz  d d d d d f                                         }t          |	          D ]G}|                                \  }}||	dz
  k    r|	                                 |st          j        |||||d nt          ||
|z
  |	z  |	          |dd| d
  
        \  }}}}t	          j        t	          j        |g d          d	           |d
k    r||}}n}t          ||||          \  }}nf|d
k    rTt          j        ||||d |dd| d
  
        \  }}}}t	          j        t	          j        |g d          d	           ||}}n||
k    ryt          j        ||||d |dd| d
  
        \  }}}}|d d d d d
|dz  f         }t	          j        t	          j        |g d          d	           t          ||||d          \  }}nt          j        ||d d d |dz  d d d d f         |d d d |dz  d d d d f         |d |dd| d
  
        \  }}}}t	          j        t	          j        |g d          d	           t          ||||          \  }}t          j                                         It	          j        ||j                  }t	          j        t	          j        |d	          g d          }||fS )Nr   r   r   r   F )r   r   r   r   r   T)
world_sizer~   r8   r   r   r   r;   r   r]   rh   r   
flash_attnr   
unsqueeze_
transpose_ro   rS   rT   rp   dtypesqueeze)r?   local_queryrL   rM   	attn_maskdropout	is_causalfixed_seed_offsettrainingrF   rD   comm_bufferlocal_q_seq_lenr   local_query_second_chunkstepblock_kblock_vrx   _ry   outlses                          r   !ring_flash_attention_forward_funcr      s    GJ"5)[AAK!'*O ,w{
 
 
  #.AA!#%%qqq!!!+$

*,, 	! g W$ W$&22447Q;!!### Q	N *0):! !( D%'*t*;w)F  !* *&Iq)Q$ f/	999EEBOOOOqyy$iS-c3	9MMSSqyy-3->% L. .*	1i !%i;;"    %iS
""-3->,% L. .*	1i &aaaAA1E,F&FG	!%i;;"    .iD SS .4->AAA5A!55qqq!!!;<AAA5A!55qqq!!!;<% L. .*	1i !%i;;"    .c3	9MMS!!####
+c;,
-
-C

FN3R888)))
D
DC8Or   c                 x   | j         }| j        }|                                }|j        d         }t	          j        |          }t	          j        |          }t	          j        |          }t          | ||          }t          | ||          }|	rk|d d |dz  d d d d d f         }|d d |dz  d d d d d f         }|d d d d |dz  d f                                         }|d d |dz  d d d d d f         }|t	          j        ||dz  d          }t          |          D ]}|	                                \  }}||dz
  k    r|
                                 |	sPt          j        ||||||
|d nt          |||z
  |z  |          ||d
  
        \  }}}|                    |           n|dk    r7t          j        ||||||
d ||d
  
        \  }}}|                    |           n||k    rLt          j        ||||||
d ||d
  
        \  }}}|d d |dz  d d d d d f                             |           n`t          j        ||d d d |dz  d d d d f         |d d d |dz  d d d d f         |||
d ||d
  
        \  }}}|                    |           t          j                                         |                    |                                |                                           |
                                 |                                 |	                                \  }}|||fS )Nr   r   r   r   Fr   T)r   r~   r;   r   r   
zeros_liker8   r   r   r]   rh   r   flash_attn_gradr   rX   rS   rT   r[   rV   )r?   local_out_gradr   rL   rM   	local_outr   r   r   r   r   rF   rD   r   query_grad_bufferkey_grad_buffervalue_grad_bufferkv_comm_buffergrad_comm_bufferr   local_out_second_chunklse_second_chunkout_grad_second_chunkr   r   r   r   block_q_gradblock_k_gradblock_v_grads                                 r   "ring_flash_attention_backward_funcr   0  s[    GJ
..

C!'*O)+66'	22O)+66%eYDDN' 1   P#.qqq/Q2F2H2H!!!QQQ/N#O !*111o.B.D.Daaa+J!Kqqq!!!_%9%;%;;<GGII .qqq/Q2F2H2H!!!QQQ/N O ,w{
 
 
 g S  S )55777Q;$$&&& F	57=7M! !( D%'*t*;w)F  !8 84L,$ ""<0000qyy*#!)&  9lL "&&|4444
""*0.()-  9lL "!!!_%9%;%;QQQ"ABGG    
 *##9_%9#9111aaa ?@#9_%9#9111aaa ?@!)&  9lL "&&|444!!###''##%%|'>'>'@'@	
 	
 	
 	""$$$)9)E)E)G)G&O&o/@@@r   c                   F    e Zd Ze	 	 	 	 	 dd            Zed             ZdS )RingFlashAttentionNr   FTc	                    |dk    rt          d          t          j                                        }	|	j                            d          }
t          j                    }|	                    d|          }t          j                     |		                    d          }t          j        j
                            ||j        |j                  }t          j        j
                            ||j        |j                  }t          j        j
                            ||j        |j                  }|d}t          |||||||||	  	        \  }}|                     |||||||           || _        || _        || _        t          j        j
                            ||j        |j                  }|                                S )Nr   z/Dropout is not supported in ring attention yet.r   F)NotImplementedErrorr-   r@   rA   r   r   rB   rC   init_parallel_env
_get_groupapidtensor_to_localr   r   r   save_for_backwardr   r   r   dtensor_from_localr;   )ctxqueryrY   rZ   r   r   r   r   r   rN   r$   rO   r~   r?   r   rL   rM   r   r   out_dtensors                       r   forwardzRingFlashAttention.forward  s    S==%A   !**,,>''..]__
225*EE   &&(,==5%u'7
 
 &*;;!3>
 
	 (,==5%u'7
 
  I4

 

S 	eUCS)LLL 1!(,??#U%5
 
 %%'''r   c                 B   t           j                                        }|j                            d          }|                                 \  }}}}}}	}
| j        }| j        }| j        }|4t          j
        ddgt          j                    t          j                  }t           j        j                            ||j        |j                  }t           j        j                            ||j        |j                  }t           j        j                            ||j        |j                  }t           j        j                            ||j        |j                  }t#          |||||||	|
|||          \  }}}t           j        j                            ||j        |j                  }t           j        j                            ||j        |j                  }t           j        j                            ||j        |j                  }|
|
j        s|||d fS |||fS )Nr   r   )placer   )r-   r@   rA   r   r   saved_tensorr   r   r   r   	to_tensorCPUPlaceint64r   r   r   r   r   r   stop_gradient)r   out_gradrN   r$   r?   r   rY   rZ   r   r   r   r   r   r   r   rL   rM   r   
query_gradkey_grad
value_gradquery_grad_dtensorkey_grad_dtensorvalue_grad_dtensors                           r   backwardzRingFlashAttention.backward  s-   !**,,>''..8;8H8H8J8J5uc5#sI1+M	$ & 0Afo//v|! ! ! (,==5%u'7
 
 &*;;!3>
 
	 (,==5%u'7
 
 +/@@h+X-@
 
 ,N,
 ,
(
Hj "/3FF*E,<
 
  -1DDc&
 
 "/3FF*E,<
 
  )@ " "	  &'79KKKr   Nr   FNT)ri   rj   rk   staticmethodr   r   r   r   r   r   r     sd         4( 4( 4( \4(l 7L 7L \7L 7L 7Lr   r   )Fr   )r   FN)r   paddle.distributedr   r-   paddle.nn.functionalnn
functionalrs   r   r*   r6   r8   ro   r   r   r   r   autogradPyLayerr   r   r   r   <module>r      su     ! ! ! ! ! !                          61 1 1*N1 N1 N1 N1 N1 N1 N1 N1d ?D< < < <.* * *   r r r r| {A {A {A {A|pL pL pL pL pL0 pL pL pL pL pLr   