
    |-j                        d dl Z ddlmZ ddlmZ de j        j        de j        de j        de j        d	e j        dz  d
ede j        de j        ee	e j        f         z  de
de
ee	e
f         z  de j        dz  dee j        df         fdZe j        j        de j        j        de j        de j        de j        d
ede j        dee
e
f         de j        de j        fd            ZdS )    N   )PagedAttentionCache)!lazy_import_paged_flash_attentionmoduleqkvattention_maskcachecu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kblock_tablereturnc                    t          | j        j                  \  }}t          | dd          sdn| j        dz
  df}|dk    rdnd}t          |t                    r||         }|	|         }	|
*|                    ||| j        |d	         |d
                   \  }}d|v rd|	                    d          ini } ||
                    dd                              d                                          |                                |                                |                    t          j                  |                    t          j                                                  ||	f| j        d|d|}t          |t$                    r|d         }n%d|v r
d|d         ini }t'          | ||||||||
f	i |}|dfS )ap  Performs the forward pass of attention with paged key-value cache. This function handles the cache updates and
    performs the attention computation. For decode-only batches (when block_table is provided), uses
    `flash_attn_with_kvcache` for fused attention + cache update. Otherwise uses `flash_attn_varlen_func`.
    See the [paged attention guide](https://huggingface.co/docs/transformers/en/paged_attention) for more details.

    Args:
        q: (1, nheads, total_q, headdim), where total_q = total number of query tokens in the batch.
        k: (1, nheads_k, total_k, headdim), where total_k = total number of key tokens in the batch.
        v: (1, nheads_k, total_k, headdim), where total_k = total number of key tokens in the batch.
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        block_table: (num_groups, batch_size, max_blocks_per_seq), dtype int32. Block table for paged KV cache.
            If provided, uses flash_attn_with_kvcache for fused attention + cache update. For each request, the block
            table is a vector of size (max_blocks_per_seq,) with indices indicating the physical location of the cache
            to read from and write to. The kernel, using the cache_seqlens for that request, knows how much cache to
            read and dispatches the read using the block table. Same for the write. If a request has fewer than
            max_blocks_per_seq blocks, the block table is padded with -1s to indicate that the block is not allocated.
    sliding_windowF)r      r   full_attentionsliding_attentionN
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   s_auxr   T)softmax_scalecausalwindow_size)r   config_attn_implementationgetattrr   
isinstancedictupdater   get	transposesqueeze
contiguoustotorchint32clonescalingtuple_paged_decode_forward)r   r   r   r	   r
   r   r   r   r   r   r   kwargsflash_attn_varlen_funcflash_attn_with_kvcacher   
layer_typecustom_kwargsattn_outputflash_kwargss                      e/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/integrations/flash_paged.pypaged_attention_forwardr:      s   J 7X*7 733
 &-V5Eu%M%MqXXTZTilmTmopSqN%3x%?%?!!EXJ-&& 0%j1#J/ ||&l+}-  
 
1 ;BV:K:K&**W"5"566QS,,KK1%%a((3355LLNNLLNNU[))U[))//11
 !.&
 
 
 
 k5)) 	)%a.K 6=5F5F11B+Aq!UM>CZ\g
 
kw
 
     r   c	                 X   |j         | j                 \  }
}|j        |                             d|j        |j        |j                  }|j        |                             d|j        |j        |j                  }|                    dddd          	                                }|                    dddd          	                                }|                    dddd          	                                }|
                    d          }|d|dz            |d|         z
  dz
                      t          j                  }||
         |	|                    |          <    |d	||||||| j        d|d	|	}t!          |t"                    r|d         }|                    d          S )
zaDecode fast path using flash_attn_with_kvcache. Disabled because FA3 has issue with tracing this.r   r   r   r      NT)	r   k_cachev_cacher   r	   cache_seqlensr   r   r     )layer_index_to_group_indicesr   	key_cacheview
block_sizenum_key_value_headshead_dimvalue_cachepermuter*   sizer+   r,   r-   get_block_table_keyr/   r$   r0   r)   )r   r   r   r	   r   r   r   r4   r   r8   	group_idxlayer_idx_in_groupr>   r?   
batch_sizer@   r7   s                    r9   r1   r1   [   s    %*$FvGW$X!I!o0166r5;KUMfhmhvwwG 2388
Ee7 G 	
		!Q1((**A			!Q1((**A			!Q1((**A J"1zA~#56{
{9SSVWW[[\a\ghhMGRS\G]L**+BCCD)) 


#n"   K +u%% %!!nq!!!r;   )r,   generation.continuous_batchingr   modeling_flash_attention_utilsr   nnModuleTensorr%   strintr0   r:   compilerdisabler1   rA   r;   r9   <module>rX      s    @ @ @ @ @ @ N N N N N NQHOQ|Q |Q |	Q
 L4'Q Q <Q <$sEL'8"99Q Q S#X&Q $Q 5<Q Q Q Qh ."HO."|." |." |	."
 ." <." #s(O." ." \." ." ." ." ." ."r;   