
    |-jx%                        d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlZd dlmZ d dlmZ d	d
lmZmZmZmZ  G d d          Ze G d d                      ZdedefdZdedededefdZd,dedededefdZdedededefdZ	 d-dej        dee         dee         d eddf
d!Z d"ed#ed$ed%ed&edee         fd'Z!d(e j"        dee         fd)Z#d* Z$ed+             Z%dS ).    N)OrderedDict)contextmanager)	dataclass)ceillog2)Any)PretrainedConfig)is_torch_greater_or_equal   )FutureRequestStateRequestStateRequestStatusloggerc                       e Zd ZdZdeddfdZddZdeedf         dej	        j
        dz  fd	ZddeddfdZdeedf         dej	        j
        ddfdZdS )CudaGraphBufferz>A fixed-size dict for CUDA graphs with LRU eviction when full.max_sizereturnNc                 j    |dk    rt          d|           || _        t                      | _        d S )Nr   z#max_size must be positive, but got )
ValueErrorr   r   _storage)selfr   s     q/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/generation/continuous_batching/utils.py__init__zCudaGraphBuffer.__init__    s8    q==M8MMNNN LWMM    c                 \    | j         }d| _         |                     d           || _         d S )Nr   T)silent)r   plan_for_new_graph)r   original_max_sizes     r   __del__zCudaGraphBuffer.__del__&   s3     Mt,,,)r   key.c                 r    | j                             |          }|| j                             |           |S N)r   getmove_to_endr   r    graphs      r   	get_graphzCudaGraphBuffer.get_graph,   s8    !!#&&M%%c***r   Fr   c                    t          | j                  | j        k    rj| j                            d          \  }}|st	          j        d|           |                                 t          | j                  | j        k    hd S d S )NF)lastz!Evicting graph for evicted_key = )lenr   r   popitemr   inforeset)r   r   evicted_keyevicted_graphs       r   r   z"CudaGraphBuffer.plan_for_new_graph2   s    $-  DM11)-)>)>E)>)J)J&K DB+BBCCC!!!	 $-  DM111111r   r&   c                 B    |                                   || j        |<   d S r"   )r   r   r%   s      r   	set_graphzCudaGraphBuffer.set_graph9   s&    !!!"cr   )r   N)F)__name__
__module____qualname____doc__intr   r   tupletorchcuda	CUDAGraphr'   boolr   r1    r   r   r   r      s        HHZ Z Z Z Z Z* * * *U38_ 1E1L    " " "$ " " " "#U38_ #UZ5I #d # # # # # #r   r   c                   0    e Zd ZU dZdZeed<   dZeed<   dS )WorkloadHintszRA tiny dataclass containing hints to help choose good continuous batching defaultsr   max_prompt_lengthmax_generated_lengthN)r2   r3   r4   r5   r?   r6   __annotations__r@   r<   r   r   r>   r>   ?   s:         \\s !#!!!!!r   r>   configr   c                     | j         dv S )z:Checks if attention mask is needed for the given (config).)zpaged|eagerz
paged|sdpa)_attn_implementation)rB   s    r   attn_mask_is_neededrE   G   s    &*GGGr   sizeinterval_size	max_valuec                 l    |dk    r|S | dk    rt          | |z            |z  n|}t          ||          S )zQReturn the smallest multiple of (interval_size) >= (size), capped at (max_value).r   )r   min)rF   rG   rH   paddeds       r   pad_to_intervalrL   L   sG    ;?!88T$&''-77Fvy!!!r   value	min_valuec                     t          | t          d|                    } dt          t          t          |                               z  }t	          ||          S )zReturn the smallest power of 2 >= (value), capped at (max_value). If a minimum value is provided, the value is at
    least padded to that value.r      )maxr6   r   r   rJ   )rM   rH   rN   rK   s       r   pad_to_pow2rR   T   sM     s1i(())E#d4;;''(((Fvy!!!r   x	divide_byalign_toc                 d    t          t          | |z                      } | |z  r| || |z  z
  z  } | S r"   )r6   r   )rS   rT   rU   s      r   aligned_dividerW   \   s?    DY  A8| '	XX&&Hr   attention_maskcumulative_seqlens_qcumulative_seqlens_ksliding_windowc                 j   t          j        | j                  j        }t	          t          |          dz
            D ]}||dz            ||         z
  }||dz            ||         z
  }||k     r|dk    r	||z
  dz   }nd}t          ||         ||dz                      }	t          ||         ||dz                      }
t          j        | d|	|
f         j        || j        | j	                  }t          j
        ||          }|dk    r!||z
  |z
  }|t          j        ||          z  }|| d|	|
f<   dS )u  Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
    will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
    equivalent) so it's more of an attention score bias tensor.
    The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
    Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask.

    An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:

    CAUSAL MASK:

           █ █ █ █ █ ░ ░ ░
           █ █ █ █ █ █ ░ ░
           █ █ █ █ █ █ █ ░
           █ █ █ █ █ █ █ █

    SLIDING WINDOW MASK:
         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the left
       <─┴─>
     ░ █ | █ █ █ █ █ █ █ █
     ░ ░ | █ █ █ █ █ █ █ █
     ░ ░ | ░ █ █ █ █ █ █ █
     ░ ░ | ░ ░ █ █ █ █ █ █

    ATTENTION MASK (sum of causal and sliding window masks):

           █ █ █ █ █ ░ ░ ░
           █ █ █ █ █ █ ░ ░
           ░ █ █ █ █ █ █ ░
           ░ ░ █ █ █ █ █ █

    Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:

    CAUSAL MASK:

           █ █ █ ░ ░
           █ █ █ █ ░
           █ █ █ █ █

    SLIDING WINDOW MASK:
         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the left
        <┴>
         | ░ █ █ █ █
         | ░ ░ █ █ █
         | ░ ░ ░ █ █

    ATTENTION MASK (sum of causal and sliding window masks):

           ░ █ █ ░ ░
           ░ ░ █ █ ░
           ░ ░ ░ █ █

    r   .)dtypedevice)diagonalN)r8   finfor]   rJ   ranger*   slicefullshaper^   triutril)rX   rY   rZ   r[   rN   iseqlen_qseqlen_kcausal_diagonalquery_range	key_range	minus_infmaskedsliding_diagonals                 r   build_attention_maskrp   c   sx   t N0115I3+,,q011 = ='A.1Ea1HH'A.1Ea1HHh8q==&1A5OOO035I!a%5PQQ.q13GA3NOO	J3Y67= &!(	
 
 
	 I@@@A'(2^Cej5EFFFFF6<sK233-= =r   numstatusnum_q_tokensmax_kv_readcachec           	      r   fdt          |           D             }||z   }t          ||j        z            }g }|D ]y}	t          |	dg|z  d          }
|
_        dg|z  |
_        ||
_        |                    ||
j        d          }||c S |	                    t          |
dd|                     z|S )zPA utility function to create a list of FutureRequestStates for the warmup of CB.c                 ,    g | ]}d j          d| dS )	__warmup____)name).0rg   rr   s     r   
<listcomp>z/create_warmup_future_states.<locals>.<listcomp>   s/    GGGq2v{22Q222GGGr   r   r   )
request_idinitial_tokensmax_new_tokensNT)has_new_tokencomplete_blocksquery_length)ra   r   
block_sizer   _statustokens_to_processposition_offsetallocate_blocksr~   appendr   )rq   rr   rs   rt   ru   request_idstotal_tokensblocks_neededfuture_statesreq_idstate	allocateds    `          r   create_warmup_future_statesr      s     HGGGE#JJGGGK+-L(8899MM 
 
s\?Qbcddd#$#"4 +))-9I1MM	    uD!Zfggg	
 	
 	
 	
 r   request_queuec                     g }|                                  sV	 |                                 }||                    |           n# t          j        $ r Y nw xY w|                                  V|S )z3Drains a queue and returns a list of RequestStates.)empty
get_nowaitr   queueEmpty)r   
new_statesr   s      r   drain_queuer      s    %'J!!## 	!,,..E !!%((({ 	 	 	E	 !!##  s   +A AAc                      t          d          r)t          j                                        } | j        }| |fS d} t          j                                        }| |fS )zReturns a tuple of (mem_pool, graph_pool_id) for CUDA graphs. Since the MemPool object is only available in torch
    2.5+, we only return a graph_pool_id for older versions.z2.5.0N)r
   r8   r9   MemPoolidgraph_pool_handle)mem_poolgraph_pool_ids     r   get_cuda_poolsr      s\     !)) ':%%'' &&
4466&&r   c              #      K   | >t           j                            |           5  dV  ddd           dS # 1 swxY w Y   dS dV  dS )zA context manager to use a CUDA mem pool. If the mem pool is None, it is a no-op. No type hint because it would
    make torch 2.4 or below crash.N)r8   r9   use_mem_pool)r   s    r   mem_pool_ctxr      s       Z$$X.. 	 	EEE	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   6::)r   )r   )&r   collectionsr   
contextlibr   dataclassesr   mathr   r   typingr   r8    transformers.configuration_utilsr	   transformers.utilsr
   requestsr   r   r   r   r   r>   r;   rE   r6   rL   rR   rW   Tensorlistrp   r   Queuer   r   r   r<   r   r   <module>r      s    # # # # # # % % % % % % ! ! ! ! ! !                = = = = = = 8 8 8 8 8 8 M M M M M M M M M M M M# # # # # # # #D " " " " " " " "H 0 HT H H H H
"# "c "c "c " " " "" "s "s "s "3 " " " "c c S S     	Q= Q=LQ=s)Q= s)Q= 	Q=
 
Q= Q= Q= Q=h	  	
  

   :
u{ 
tL/A 
 
 
 

' 
' 
'     r   