
    |-j                        d dl Z d dlmZmZmZ d dlmZ d dlZddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ de	deeee                  ee         f         fdZ  G d d          Z! G d d          Z"dS )    N)floorgcdsqrt)Any   )PreTrainedConfig)ContinuousBatchingConfig)is_flash_attention_requested   )BlockManagerCacheAllocatorFullAttentionCacheAllocatorSlidingAttentionCacheAllocator)DistributedHelper)resolve_max_memory_percent)RequestStateRequestStatusget_device_and_memory_breakdownloggerconfigreturnc                 &  	 t          | dd          		5t          | dd          dndfdt          | j                  D             	i }t          	          D ]"\  }}|                    |g           |gz   ||<   #t          d |                                D              }g }|                                D ]F\  }}t          dt          |          |          D ]"}|	                    ||||z                       #G	fd	|D             }||fS )
a  
    Group layers depending on the attention mix, according to VLLM's hybrid allocator rules:
        - Layers in each group need to have the same type of attention
        - All groups have the same number of layers

    For a model with the following layer types: ["sliding", "full", "full", "sliding", "full", "full", "full", "full"]
    We would get four groups: [0, 3], [1, 2], [4,5] and [6,7].
    layer_typesNsliding_windowsliding_attentionfull_attentionc                     g | ]}S  r   ).0_	attn_types     q/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/generation/continuous_batching/cache.py
<listcomp>z-group_layers_by_attn_type.<locals>.<listcomp>*   s    JJJQyJJJ    c                 ,    g | ]}t          |          S r   )len)r   indicess     r"   r#   z-group_layers_by_attn_type.<locals>.<listcomp>2   s    IIIs7||IIIr$   r   c                 ,    g | ]}|d                   S )r   r   )r   lgr   s     r"   r#   z-group_layers_by_attn_type.<locals>.<listcomp>:   s"    ===";r!u%===r$   )
getattrrangenum_hidden_layers	enumerategetr   valuesitemsr&   append)
r   layer_countsi
layer_type
group_sizelayer_groupsr'   group_typesr!   r   s
           @@r"   group_layers_by_attn_typer8      sf    &-66K+26;KT+R+R+^''dt	JJJJ%0H*I*IJJJ L";// J J:#/#3#3J#C#Cqc#IZ   II<3F3F3H3HIIIJJ L+1133 = =
Gq#g,,
33 	= 	=AA
N(: ;<<<<	= >======K$$r$   c                      e Zd ZdZej        fdededej        e	z  de
dee	ef         dej        dd	fd
ZdededefdZdede	deded	z  fdZde	dd	fdZdefdZde	dededeee                  d	z  deee                  dd	fdZde	dededej        dd	f
dZdededee	ef         fdZdej        dej        dedeej                 deej                 deej        ej        f         fdZdede	fd Zde	d!ee         defd"Zd#ed$edd	fd%Zd&ee         d'ee         dd	fd(Z d)e	d*ee	         deee         ee         f         fd+Z!d-d,Z"d	S ).PagedAttentionCacheu  
    Manages the cache for a paged attention mechanism, inspired by VLLM's hybrid allocator. The cache relies on making
    groups of layers to reduce the complexity of cache management and fragmentation.

    The cache uses a three-level hierarchy:
    - Pages: The smallest unit of cache, a page has a size of [num_heads, head_size], which is the space needed to
        store the key or value states for one token and one layer. For a model with only full-attention layers, to store
        the KV cache of one token, we need `2 * num_layers` pages: key and values each take `num_layers` pages.
        Pages are grouped into blocks:
    - Blocks: A block is a collection of `block_size` pages, serving as the allocation unit to reduce management
        complexity and fragmentation. Cache is allocated and freed block by block, not page by page. One block is
        allocated to one layer group, which only has one attention type, like full-attention or sliding-attention.
        If all layers in the model have the same attention type, then all layers will be in the same group. There is
        more than one group if and only if the model has a mixed attention types, like layers with full-attention and
        layers with sliding-attention.
    - Cache tensors: The physical supports for the cache. There are as many cache tensors as there are layer in a
        layer group, and the shape of the cache tensor is `[num_blocks * block_size, num_heads, head_size]`.

    Grouping layers into groups is useful because when we allocate one block to a group N, the block allocated is the
        same for all layers in group N, equivalently it is allocated across all cache tensors. This allows us to
        efficiently allocate and free blocks, and to efficiently read and write key and value states.

    For instance, imagine we have 8 blocks of cache and a model with two layer groups: a full-attention group with 3
    layers and a sliding-attention group with 3 layers. At creation time, the physical cache tensors look like this:

    cache_tensor_0: □ □ □ □ □ □ □ □
    cache_tensor_1: □ □ □ □ □ □ □ □
    cache_tensor_2: □ □ □ □ □ □ □ □

    where □ means the blocks is not allocated to any layer group yet. We have 3 cache tensors because there are
    3 layers per group.
    We allocate 1 block to each group, after allocation, the cache tensors look like this:

    cache_tensor_0: ✖ ◉ □ □ □ □ □ □
    cache_tensor_1: ✖ ◉ □ □ □ □ □ □
    cache_tensor_2: ✖ ◉ □ □ □ □ □ □

    where ✖ means the block is allocated to the full-attention group, and ◉ means the block is allocated to the
    sliding-attention group.
    Now, if we continue to generate, and the sliding window has been reached, we only need to allocate a new block
    for the full-attention group, and the cache tensors look like this:

    cache_tensor_0: ✖ ◉ ✖ □ □ □ □ □
    cache_tensor_1: ✖ ◉ ✖ □ □ □ □ □
    cache_tensor_2: ✖ ◉ ✖ □ □ □ □ □

    And after further generation, when we need a new block allocated:

    cache_tensor_0: ✖ ◉ ✖ ✖ □ □ □ □
    cache_tensor_1: ✖ ◉ ✖ ✖ □ □ □ □
    cache_tensor_2: ✖ ◉ ✖ ✖ □ □ □ □

    This would not have been possible if all layers were in the same group: we would have had to allocate a new block
    for the sliding-attention group, although it is not needed.
    r   continuous_batching_configdevicedistributed_helpertp_plandtyper   Nc                    || _         || _        || _        t          |dd          }||n|j        | _        t          |dd          }||n|j        |j        z  | _        |j        | _        | j        dk    rt          d| j                   t          |          \  }	}
t          |	d                   }t          |	          | _        i | _        i | _        t          |	          D ]E\  }}|
|         dk    r|j        nd}t          |          D ]\  }}||f| j        |<   || j        |<   Fd}d	D ]}||v sd
|z   |v sd} n|j        }|dk    r;|r9| j        |z  dk    rt          d| j         d| d          | xj        |z  c_        | j        | j        z  }t%          | j                   rd}n	d|
v rd}nd}|j        | j        z  }d|j        d|j        z  z   f}d|z  |j        |z   d|z  z   f}t)          ||| j        |||g|          }|j        t-          |d           |                    |j        |j        |j        | j                  \  }}|dk    rt5          j        ||g| j        t4          j                  }|                    |           t=          |d                                                   t=          |d                                                   }}|| _        || _        | j        | j        z  | _         tC          j"        d| j        d| j        d|d| j        d|
           |j#        }||j$        }|| _#        g | _%        g | _&        |dz   | j        z  | j        | j        f| _'        | j'        d         dz
  | _(        | j(        dz
  | _)        tU          |          D ]}t5          j+        | j'        | j        | j                  }t5          j+        | j'        | j        | j                  } t4          j,        -                    |           t4          j,        -                    |            | j%        .                    |           | j&        .                    |            tC          j"        d| j'        d| j%        d         j/        d| j%        d         0                                           |j1        | _1        g | _2        d| _3        d| _4        d| _5        t          |
          D ]\  }}!|!dk    r-tm          || j        | j1                  }"| xj3        dz  c_3        n\|!dk    rDto          || j        |j        | j(        | j)                  }"| xj4        dz  c_4        |"j8        | _5        nt          d|!           | j2        .                    |"           | j1        o|
dgk    | _9        tu          || j        |dk               | _;        d| _<        d| _=        dS )!a/  Initialize a paged attention cache for efficient memory usage. Also turns in prefix sharing if the model has
        only full attention layers.

        Args:
            config: Model configuration
            continuous_batching_config: Continuous batching configuration containing cache parameters
            device: Device for the cache tensors
            distributed_helper: TP-aware helper. Used to dispatch attention heads and ensure coherent cache size
            tp_plan: Tensor parallelism plan
            dtype: Data type of the cache
        num_key_value_headsNhead_dimr   z%Block size must be positive, but got r   r   T)zlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzmodel.FzNumber of key value heads z+ must be divisible by tensor parallel size .   )r;   	page_size
num_groupsr5   activation_peaksnum_attention_masks)	cb_confighas_logit_processors)
num_blocksmax_batch_tokensmax_memory_percentcache_dtyper<   r?   z7PagedAttentionCache initialized with self.num_blocks = z, self.block_size = z, page_size = z, self.max_batch_tokens = z num_attention_masks = )r?   r<   zself.cache_shape = z self.key_cache[0].shape = z self.key_cache[0].numel() = r   )allow_block_sharingzInvalid group type: )tp_on)>r   r?   r<   r*   num_attention_headsrA   hidden_sizerB   
block_size
ValueErrorr8   r&   rF   sliding_windowslayer_index_to_group_indicesr-   r   tp_sizer
   
vocab_sizePagedAttentionMemoryHandlerrM   r   %infer_num_blocks_and_max_batch_tokensrK   rL   torchtensorint64tp_all_reduce_minintitem	num_pagesr   infomax_blocks_per_requestfallback_max_blocks_per_request	key_cachevalue_cachecache_shapesentinel_indextrash_indexr+   empty_dynamomark_static_addressr1   shapenumelrP   group_cache_managersnum_full_attention_groupsnum_sliding_attention_groups%max_sliding_window_blocks_per_requestr   r   _max_blocks_per_requestuse_prefix_sharingr   _block_manager_total_prefix_length_block_table_key)#selfr   r;   r<   r=   r>   r?   kv_headsrB   r6   r7   r5   r3   groupr   jlayerkv_is_tpkeyrX   rE   rH   q_bytes_per_tokenlm_head_peakattention_peakmemory_handlerrK   rL   syncrd   r    new_layer_key_cachenew_layer_value_cache
group_typecms#                                      r"   __init__zPagedAttentionCache.__init__w   s   ( 
 6#8$??4<4HfNh 6:t44)1)=XX6CUY_YsCs 5??aVT_VVWWW %>f$E$E!ka))
l++!,.)!,// 	= 	=HAu6A!nH[6[6[V22abN%e,, = =5<=q61%8.<$U++= M 	 	C7NNhn&?&?  %,Q;;8;''1Q66  A1I  A  Av}  A  A  A   $$0$$ MD$<<	'44 	$"# K//"#"# #6FV%6!66

 	M!22Q]B

 5'A!*N; 3
 
 
 &8@&1Kbfgggg'5'['[1<7H9L
	 (\ (
 (
$
$ Q;;<-= >t{Z_ZefffD00666+.tAw||~~+>+>DGLLNN@S@S(J % 04?:CDO C C$/ C C`i C C$C C*=C C	
 	
 	
 "<!R!)%?%_"&<# .0/1 (!^t>@XZ^Zgh".q1A5.2z"" 	; 	;A"'+d.>djY]Yd"e"e"e$)K0@
[_[f$g$g$g!M--.ABBBM--.CDDDN!!"5666##$9::::ht'hhT^A->-Dhh$.YZJ[JaJaJcJchhiii $>#Q :<!)*&,-)562&{33 	1 	1MAz---0DOY]Yqrrr..!3...2223t(=t?RTXTd  11Q611=?=W:: !D
!D!DEEE%,,R0000 #'":"`{O_N`?`*:tgXYkZZZ)*! !%r$   num_requested_blocksallocated_blocksc                     || j         z  }| j        r3t          | j        |z
  d          }|t	          ||          | j        z  z  }||                                 k    S )aN  Returns a boolean indicating if the allocation of (num_requested_blocks) blocks will be successful. The
        number of newly allocated blocks needed is predicted by the following rules:
        - for full attention groups: since there is no sliding window for full attention layers, one requested block is
            always equivalent to one newly allocated block for EACH full attention group
        - for sliding window groups: because of the sliding window, the number of blocks allocated to a request is
            capped. Using the number of already (allocated_blocks) we can compute the number of new blocks to actually
            allocate to the request, which can be lower than the number of requested blocks. That number is the same for
            all sliding window groups, as only one sliding window size is supported.
        r   )rq   rr   maxrs   minget_num_free_blocks)ry   r   r   needed_blocksblocks_lefts        r"   will_allocation_be_successfulz1PagedAttentionCache.will_allocation_be_successful'  sj     -t/MM, 	hdHK[[]^__KS.BCCdFgggM 8 8 : :::r$   n_blocks
request_idc                     |                      ||          sdS d}| j        D ]E}|                    ||| j                  }|t	          d| d|           t          ||          }F|S )zAllocate cache blocks across all layer groups for a given request. Actual allocation is done by the cache
        managers, and this method only returns the maximum number of blocks actually allocated across all managers.Nr   zFailed to allocate z blocks for request )r   rp   allocate_blocksrv   rU   r   )ry   r   r   r   max_allocatedr   num_allocated_blockss          r"   r   z#PagedAttentionCache.allocate_blocks9  s     11(<LMM 	4+ 	E 	EB#%#5#5h
DL_#`#` #+ !ax!a!aU_!a!abbb/CDDMMr$   c                 P    | j         D ]}|                    || j                   dS )zFree all allocated cache blocks for a given request across all layer groups. Actual deallocation is done
        by the cache managers.N)rp   free_blocksrv   )ry   r   r   s      r"   r   zPagedAttentionCache.free_blocksH  s:     + 	< 	<BNN:t':;;;;	< 	<r$   c                     | j         j        S )zHGet the current number of unallocated blocks available for new requests.)rv   num_free_blocks)ry   s    r"   r   z'PagedAttentionCache.get_num_free_blocksN  s    "22r$   past_lengthquery_length
read_indexwrite_indexc                 "   t          | j        |          D ]/\  }}|                    |                    |||                     0|Et          | j        |          D ]1\  }}|                    |                    |||                     0dS dS )aM  Retrieve physical cache indices for reading KV states in the cache across all layer groups. This method
        coordinates with all cache managers to build the complete set of read indices needed for attention computation.
        When read_index is None, the batch has no cache reads and we only compute the write indices.
        N)ziprp   extendget_write_indicesget_read_indices)	ry   r   r   r   r   r   r   write_indicesread_indicess	            r"   extend_read_and_write_indicesz1PagedAttentionCache.extend_read_and_write_indicesR  s     "%T%>!L!L 	^ 	^B  !5!5j+|!\!\]]]]!$'(A:$N$N ` ` L##B$7$7
KQ]$^$^____ "!` `r$   block_tablec                 v    t          | j                  D ]#\  }}|                    |||||                    $d S )N)r-   rp   fill_block_table)ry   r   r   r   r   r3   r   s          r"   r   z$PagedAttentionCache.fill_block_tablef  sR     t899 	W 	WEAr
K{ST~VVVV	W 	Wr$   c                     i }| j         dk    r||z   |d<   | j        dk    r#|t          || j        j        dz
            z   |d<   |S )zRetrieve the key sequence length for the given request_id across all layer types. Returns a dictionary of
        layer types to their corresponding key sequence lengths.r   r   r   r   )rq   rr   r   r   r   )ry   r   r   	seqlens_ks       r"   get_seqlens_kz!PagedAttentionCache.get_seqlens_kl  s`     	)A--*5*DI&',q00-9CT[MgjkMk<l<l-lI)*r$   
key_statesvalue_states	layer_idxc                     | j         |         \  }}||         }||         }	| j        |         }
| j        |         }|                    dd                              d          }|                    dd                              d          }|                                dk    r2|
                    d|	|           |                    d|	|           ||fS | j        |         }|dk    r[|
                    d|	|           |                    d|	|           t          j	        |
d|          }t          j	        |d|          }n|| j
        k                        d                              d          }t          j	        |
d|          }|                    ||           t          j	        |d|          }|                    ||           |
                    d|	|           |                    d|	|           ||fS )a\  Update the cache with new key-value states for a specific layer, and retrieves the relevant KV states from
        the cache for attention computation. The behavior differs based on the layer's attention type:

        - Full attention: New KV states are written to cache, then complete sequence is read from cache
        - Sliding window: Old KV is read from cache along with extra spaces for the new KV, then new KV is written to
            cache. This is because new KV might overwrite the old KV, so we need to read the old KV first.

        When the layer's read index is empty, the batch has no cache reads (all requests are non-chunked prefills): we
        only write to the cache and return the input KV states directly, skipping the index_select read-back.

        Returns the complete KV states (cached + new) for attention computation.
        r   rD   r   )rW   rf   rg   	transposesqueezero   index_copy_rV   r\   index_selectri   	unsqueezemasked_scatter_)ry   r   r   r   r   r   	group_idxlayer_idx_in_grouplayer_read_indexlayer_write_indexk_cachev_cacher   key_states_with_cachevalue_states_with_cachemasks                   r"   updatezPagedAttentionCache.updatew  s#   * )-(I)(T%	%%i0'	2.!34"#56))!Q//77::
#--a33;;A>> !!##q((#4jAAA#4lCCC|++ -i8Q#4jAAA#4lCCC$)$6wCS$T$T!&+&8!EU&V&V## %(;;FFrJJTTUWXXD$)$6wCS$T$T!!11$
CCC&+&8!EU&V&V##33D,GGG#4jAAA#4lCCC %&===r$   flash_attn_with_kvcache_fnc                     | j         gt          j        |          j                                        }d|v rd| _         n0d|v rd| _         n$t          dt          j        |                     | j         S )zA function to get the name of the block table key for the given flash_attn_with_kvcache_fn. The function's
        signature is only inspected once. This is necessary because different version of flash have different names for
        the block table key.Nr   
page_tablezOflash_attn_with_kvcache_fn does not have a block_table or page_table argument: )rx   inspect	signature
parameterskeysrU   )ry   r   kwarg_namess      r"   get_block_table_keyz'PagedAttentionCache.get_block_table_key  s      (!+,FGGRWWYYK++(5%%,,(4%%  Vfmfw  yS  gT  gT  V  V   $$r$   
prompt_idsc                 B   d}g }t          t          |          | j        z            D ]}||| j        z  |dz   | j        z           }| j                            ||d          }| j        j                            |          }|0|                    |           | j                            |            |r?t          j
        d| dt          |           d           | j        d         }||j        |<   t          |          | j        z  }	| xj        |	z  c_        |	S )a  Searches for a prefix match in the cache for the given (prompts_ids). If one is found, we reference the
        matching blocks in the (request_id), increase the reference count of the blocks and return the number of blocks
        that match. If no prefix match is found, we return 0.Nr   r   )group_idzFound prefix match for request z with z blocks)r+   r&   rT   rv   compute_hash_hash_to_idr.   r1   increase_ref_countr   debugrp   r   rw   )
ry   r   r   current_hashr   btokensblock_idr   prefix_lengths
             r"   search_prefix_matchz'PagedAttentionCache.search_prefix_match  s?    s:$/9:: 		 		ADO 3q1u6O OPF.;;L&[\;]]L*6::<HHH# ''111#66x@@@@ 	:Lk:kkSQaMbMbkkklll*1-B)9BN:&,--?!!]2!!r$   statenum_complete_blocksc                     |dk    s|j         t          j        k    rdS | j        D ]C}|j        r:| j                            ||j        |j                 |j	        |j
        z              DdS )a  Marks the blocks allocated to a request (state) as complete if they are shareable and they have been computed
        in the forward pass. A complete block is a block where the KV cache has been fully computed: if the block has
        enough space to hold the cache for N tokens, the block is marked as complete when the cache data is present for
        the N tokens. If block sharing is off, this is a no-op.r   N)r   r   r   )statusr   FINISHEDrp   uses_block_sharingrv   !mark_shareable_blocks_as_completer   r   initial_tokensgenerated_tokens)ry   r   r   r   s       r"   r   z5PagedAttentionCache.mark_shareable_blocks_as_complete  s     !##u|}7M'M'M4+ 	 	B$ #EE(;%'^E4D%E % 4u7M M F   	 	r$   list_source_blockslist_forked_blocksc                    t          j        || j        t           j                  }t          j        || j        t           j                  }t	          | j        | j                  D ]i\  }}|                    d| j        | j	        | j
                  }|                    d| j        | j	        | j
                  }||         ||<   ||         ||<   jdS )z;Copy the cache from the source blocks to the forked blocks.rO   r   N)r\   r]   r<   int32r   rf   rg   viewrT   rA   rB   )ry   r   r   source_blocksforked_blocksrf   rg   s          r"   
copy_cachezPagedAttentionCache.copy_cache  s    %7SXS^___%7SXS^___&)$.$:J&K&K 	D 	D"I{!r4?D<TVZVcddI%**2t@XZ^ZghhK'0'?Im$)4])CK&&		D 	Dr$   source_request_iddestination_request_idsc                     g g }}| j         D ]K}|                    ||| j                  \  }}|                    |           |                    |           L||fS )zhFork the cache of a request (state) into the one of a list of requests with the given (dst_request_ids).)rp   fork_blocksrv   r   )ry   r   r   r   destination_blocksr   
src_blocks
dst_blockss           r"   fork_requestz PagedAttentionCache.fork_request  sw     -/)+ 	2 	2B%'^^4EG^`d`s%t%t"J
  ,,,%%j1111000r$   c                     t                      }| j        D ].}|                    |j                                                   /|D ]}|                     |           dS )a  Free all blocks allocated to requests across all cache managers. This preserves prefix hashes in the block
        manager (blocks become initialized rather than uninitialized if they were complete), allowing prefix sharing
        to work across generation sessions.N)setrp   r   r   r   r   )ry   all_request_idsr   r   s       r"   free_all_requestsz%PagedAttentionCache.free_all_requests  st     %%+ 	: 	:B""2>#6#6#8#89999) 	) 	)JZ((((	) 	)r$   )r   N)#__name__
__module____qualname____doc__r\   float16r   r	   r<   strr   dictr   r?   r   r`   boolr   r   r   r   listr   Tensorr   r   tupler   r   r   r   r   r   r   r   r   r$   r"   r:   r:   >   s       6 6~ #]n% n% n% %=n% s"	n%
 .n% c3hn% {n% 
n% n% n% n%`;# ;Y\ ;ae ; ; ; ;$  PS X[^bXb    <c <d < < < <3S 3 3 3 3`` ` 	`
 cOd*` $s)_` 
` ` ` `(WW,/W?BWQVQ]W	W W W W	 	C 	DcN 	 	 	 	;>L;> l;> 	;>
 &;> %,';> 
u|U\)	*;> ;> ;> ;>z%c %c % % % % c tCy S    4| Z] bf    "DT#Y DDQTI DZ^ D D D D	1c 	1DQTI 	1Z_`deh`ikopskt`tZu 	1 	1 	1 	1) ) ) ) ) )r$   r:   c                      e Zd ZdZej        Zej        ZdZ	dZ
dededededeeeef                  d	ed
dfdZedded
efd            Zdeeef         dej        d
eeeeef         fdZedededed
efd            Zdeeef         dededz  dedz  dej        d
eeef         fdZdddej        fdedz  dedz  dedej        d
eeef         f
dZdededej        d
efdZdS )rZ   u  Determines the optimal number of pages (N) and max batch tokens (M) for the paged attention cache, given
    available GPU memory. The relation between N and number of blocks is: num_blocks = N // block_size.

    The memory footprint is a polynomial in N and M, where each term maps to a tensor allocated in
    ``ContinuousBatchingIOs._setup_static_tensors`` or ``PagedAttentionCache.__init__``:

        memory(N, M)  =  coeff_n · N  +  coeff_m · M  +  coeff_nm · N·M  +  coeff_mm · M²

    See ``_equation_coefficients`` for the breakdown.  All three solving modes (auto, fixed-N, fixed-M) reduce to
    solving this equation, which is at most quadratic in one variable.
    i   i   r;   rE   rF   r5   rG   rH   r   Nc                     |j         | _         || _        || _        || _        || _        || _        |j        | _        | j        |j        | _        |j        rdnd| _	        |j
        rdnd| _        dS )uN  Initialize the memory handler. `activation_peaks` is a list of `(Δcn, Δcm)` pairs giving the activation memory
        contributions proportional to N (pages) and M (batch tokens) for each peak. Memory must satisfy the constraint
        at every peak, so we solve each polynomial independently and take the most restrictive result.NrD   r   )rT   rE   rF   r5   rG   rH   rd   re   return_logprobsnum_output_rowsuse_async_batchingio_multiplier)ry   r;   rE   rF   r5   rG   rH   s          r"   r   z$PagedAttentionMemoryHandler.__init__$  s     5?"$$ 0#6 &@&W#&.*D*dD'$>$NUqqTU"<"OVQQUVr$         ?rM   c                 v    t                      \  }}}}|t          ||          z
  }t          || z            }|S )z^Calculate available GPU memory for cache allocation, accounting for already allocated tensors.)r   r   r`   )rM   r    totalreserved	allocatedavailable_memorys         r"   get_available_memoryz0PagedAttentionMemoryHandler.get_available_memory>  sI     )H(I(I%5(I 3y(#;#;;/2DDEEr$   peakrN   c                    | j         j        }| j        j        }|j        }| j        }|\  }}d| j        z  | j        z  |z  || j        z  dz  z   ||z  z   }	||z  |dz  |z  z   || j        z  |z  z   || j        z  | j        z  |z  z   || j        z  dz  z   || j        z  dz  z   }
|| j	        z  |z  }|| j	        z  |z  }|	|
||fS )u  Returns `(coeff_n, coeff_m, coeff_nm, coeff_mm)` for the memory polynomial of a single activation peak.
        `peak = (Δcn, Δcm)` is the peak-specific activation contribution; the rest of the coefficients are shared
        across peaks. Each addend is annotated with the tensor it corresponds to in
        `ContinuousBatchingIOs._setup_static_tensors` (or the forward pass, for activation terms).
        rD         )
_input_dtypeitemsize_activation_dtyper  r5   rE   rF   r  rd   rH   )ry   r  rN   r3   ackdelta_ndelta_mcoeff_ncoeff_mcoeff_nmcoeff_mms                r"   _equation_coefficientsz2PagedAttentionMemoryHandler._equation_coefficientsH  s)    &"+  $.014$/!A%&k 	 aK!eai$&&*+ $/!)*,-..
 $/!A%& $/!A%& 	 t//!3t//!3833r$   r  r   r  c                     | dk    r| |z  S |dz  d| z  |z  z
  }|dk     rt          d| d          | t          |          z   d| z  z  }|dk     rt          d| d          |S )uQ   Largest positive root of a·x² + b·x + c = 0. Falls back to linear when a == 0.r   rD      z!No real solution (discriminant = )zNo positive solution (root = )rU   r   )r  r   r  discriminantroots        r"   _solve_quadraticz,PagedAttentionMemoryHandler._solve_quadraticn  s     6626M!ta!eai'!PPPPQQQT,'''AE2!88DTDDDEEEr$   	availablerK   rL   c                 v   |                      ||          \  }}}}	||}d}
|                     ||
z  |	|
dz  z  z   |||
z  z   |           }t          ||
z            }|| j        k    r
| j        }d}n*t	          t          |          | j        z  | j                  }|G|}t          |||z  z
  |	|dz  z  z
  |||z  z   z            }t	          || j        z  | j                  }nQ|O|| j        z  }|                     |	|||z  z   ||z  |z
            }t	          t          |          | j                  }||fS )zSolve for `(num_blocks, max_batch_tokens)` against one activation peak's memory polynomial. Clamps to upper
        bounds. Either input may be None; whichever is None is solved for.Ng{Gz?rD   )r  r%  r`   _upper_bound_max_batch_tokensr   r   rT   _upper_bound_num_blocks)ry   r  r&  rK   rL   rN   cnr   cnmcmmmrb   MNs                 r"   _solve_for_peakz+PagedAttentionMemoryHandler._solve_for_peak{  sm     66t[IIBS"2":A--cAgad
.BBaKR[Q[\\I"9q=11$"DDD#'#E !

 y!1!1T_!DdFbcc
 Ay261C!Q$J>2a<PQQIY$/94;WXXJJ%T_,A%%c2a<a)9KLLA"588T-OPP+++r$   g?c                    |                      |          }t          j        d|            t          d          }t          d          }| j        D ]>}|                     |||||          \  }	}
t          ||	          }t          ||
          }?||}}|                     |||          }||k    rt          d| d|           ||fS )u  Solve for the missing variable(s) in the memory polynomial (see ``_equation_coefficients``). There is one
        polynomial per activation peak; we solve each independently and take the most restrictive (smallest) result.
        When both `N` and `M` are unknown, assumes `M = m·N` (m = 0.01, i.e. one batch fills ~1 % of the cache) and
        solves the resulting quadratic in N.
        zCache memory: infzMemory footprint z is more than available memory )	r  r   rc   floatrG   r0  r   compute_memory_footprintMemoryError)ry   rK   rL   rM   rN   r&  acc_num_blocksacc_max_batch_tokensr  r   m_batch_tokensmemory_footprints               r"   r[   zAPagedAttentionMemoryHandler.infer_num_blocks_and_max_batch_tokens  s     --.@AA	0Y00111u$U||) 	M 	MD'+';';D)ZYikv'w'w$Hn ::N#&';^#L#L  '57K$
88EUWbcci''n2Bnnclnnooo+++r$   c                     || j         z  }|}d}| j        D ]J}|                     ||          \  }}	}
}||z  |	|z  z   |
|z  |z  z   ||z  |z  z   }t          ||          }K|S )zaEvaluate the memory polynomial at concrete (N, M) values, taking the max across activation peaks.r   )rT   rG   r  r   )ry   rK   rL   rN   r/  r.  max_memory_footprintr  r*  r   r+  r,  r9  s                r"   r4  z4PagedAttentionMemoryHandler.compute_memory_footprint  s    ( ) 	O 	OD#::4MMBC!AvQq1<sQw{J#&';=M#N#N  ##r$   )r  )r   r   r   r   r\   bfloat16r  r   r  r(  r)  r	   r`   r   r  r   staticmethodr3  r  r?   r  r%  r0  r   r[   r4  r   r$   r"   rZ   rZ     sh       
 
 ;L$(!"W$<W W 	W
 W uS#X/W !W 
W W W W4             \ #4#s(O#427+#4	sCc!	"#4 #4 #4 #4L 
E 
e 
 
% 
 
 
 \
#,CHo#, #, $J	#,
 *#, [#, 
sCx#, #, #, #,N "&'+$'#(=, ,$J, *, "	,
 [, 
sCx, , , ,:
$3 
$# 
$\a\g 
$lo 
$ 
$ 
$ 
$ 
$ 
$r$   rZ   )#r   mathr   r   r   typingr   r\   configuration_utilsr   generation.configuration_utilsr	   utils.genericr
   cache_managerr   r   r   r   distributedr   initializationr   requestsr   r   r   r   r  r   r`   r   r8   r:   rZ   r   r$   r"   <module>rG     s    ! ! ! ! ! ! ! ! ! !        3 3 3 3 3 3 F F F F F F 9 9 9 9 9 9 t t t t t t t t t t t t * * * * * * 6 6 6 6 6 6 Z Z Z Z Z Z Z Z Z Z Z Z%&6 %5d3iRVWZR[A[;\ % % % %BP) P) P) P) P) P) P) P)hu$ u$ u$ u$ u$ u$ u$ u$ u$ u$r$   