
    |-j74              
       <   d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ d
dlmZ d
dlmZ dede
dedz  dede
f
dZde
dedz  ddfdZde
ddfdZdede
deddfdZde
de	dz  dededdf
dZde
dededdfdZde
deddfdZde
deddfdZdS ) zResolves a `ContinuousBatchingConfig` into a fully-specified config ready for cache and runner creation. Each
helper mutates the config in place; `resolve_continuous_batching_config` orchestrates them in the required order.    )deepcopy)ceilN   )PretrainedConfig)CompileConfigContinuousBatchingConfig)!lazy_import_paged_flash_attention)is_flash_attention_requested   )logger)WorkloadHintsconfig	cb_configworkload_hintshas_logit_processorsreturnc                    t          |          }|j        du}t          |j        |j        |j        g          }t          ||           t          |           t          | ||           t          |t          | dd          t          |           |j        dk               t          |            }t          |||           t          ||           t          ||           |S )zmReturns a deep-copied and fully-resolved `ContinuousBatchingConfig`. The original `cb_config` is not mutated.Ncompile_configr   )r   fallback_compile_configis_flash_attndecode_fast_path_available)r   is_attn_mask_neededcuda_graph_requested)r   r   r   r   )r   max_blocks_per_requestanyq_padding_interval_sizekv_padding_interval_sizemax_cached_graphsresolve_using_hintsresolve_without_hints$ensure_decode_fast_path_is_availableresolve_compile_configsgetattrr
   decide_use_cuda_graphsdecide_use_async_batchingresolve_max_memory_percent)r   r   r   r   user_requested_decode_pathr   r   s          z/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/generation/continuous_batching/initialization.py"resolve_continuous_batching_configr*      s#    ##I "+!A!M		*I,NPYPkl 
 	>222 )$$$ )<VWWW  '0@$ G G26::#,#Ca#G	    ;6BBB1D[o   
 	GZ[[[[ I]^^^^    c                     | j         M|M|j        |j        z   }|dk    r:t          t	          || j        z                      dz   }||dz  z   | _         dS dS dS dS )z`Fills `max_blocks_per_request` from the workload hints, when the user did not set it explicitly.Nr   r      )r   max_prompt_lengthmax_generated_lengthintr   
block_size)r   r   max_sequence_lengthblocks_per_requests       r)   r    r    M   s     '/N4N,>Add""!$T*=	@T*T%U%U!V!VYZ!Z/AEWZ[E[/\I,,,	 0/4N4N""r+   c                     | j         d| _         | j        dk    rd| _        | j        dk    rd| _        | j        dk    r	d| _        dS dS )zEFills any remaining unset/sentinel attribute with a fallback default.N    r   @   i @  )r   r   r   r   )r   s    r)   r!   r!   W   sc    '/+-	((A--,.	))Q..-5	*"a''&(	### ('r+   user_requestedc                    |j         dk    rt          | d          rxt          | j                  d         }t          j                                        |dug}t          |          s+|r t          j	        d|j         d| d           d|_         dS dS |r%t          j	        d|j         d	| j        d           d|_         dS dS )
zEnsures the decode fast path is available. If it is not, set the max blocks per request to 0. If it is
    available, and no user-provided max blocks per request, set it to the fallback default.r   r   )versionr   Nz,Although cb_config.max_blocks_per_request = zS, the decode fast path is not available because at least one condition is not met: .z{, the decode fast path is not available because the attention implementation is not FA3. Got config._attn_implementation = )
r   r
   r	   _attn_implementationtorchcudais_availableallr   warning)r   r   r7   flash_attn_with_kvcache
conditionss        r)   r"   r"   c   s5    '1,,'::: 	1&GHc&d&def&g#
''))'t3J
 z?? 5! NTI$D T TFPT T T   45	0005 5  n	 @ n nLRLgn n n   01I,,,/ -,r+   r   r   r   c                    | j         '| j        r|rd}n"t          ddd          }n||}n
d}n| j         }| j        "| j        rt          ddd          }n||}n
d}n| j        }|s| j        d}t	          j        d           |)t	          j        d|                                            |)t	          j        d|                                            || _         || _        dS )	zResolve if the compile configs for varlen and decode paths, modifying these attributes in place if needed.
    Default config use full compile over regional compile, because the throughput is significantly higher (~15%)Nzmax-autotune-no-cudagraphsT)mode	fullgraphdynamicFzSA decode_compile_config was set but fast decode path is not available. Ignoring it.z"Varlen path will be compiled with z"Decode path will be compiled with )varlen_compile_configuse_default_compile_configsr   decode_compile_configr   r@   infoto_dict)r   r   r   r   varlen_configdecode_configs         r)   r#   r#      s=    &.0 		! o $ -3O[_im n n n$03MM MM!7&.0 	!)/KW\fklllMM$03MM MM!7 & n)*I*Ulmmm  R9N9N9P9PRRSSS R9N9N9P9PRRSSS&3I#&3I###r+   r   r   c                 B   t           j                                        s[t          | j                  }|r<t          j        d| j        dt           j                                        d           d| _        n	| j        .t          | j        t                    r| j        | j        f| _        n|rd| _        ng }| j
        | j        fD ]}||                    |            t           j                                                            |j        |j                  }|                    dd          }|rt          j        d	|j        d
           |                    | o|            t%          |          | _        t          j        d| j                    dS )a  Decides whether or not to use cuda graphs for continuous batching. If the user specified this in the config
    or if they specified a parameter related to cuda graphs, they are turned on. Otherwise, we use a heuristic
    based on the attention implementation: we turn on cuda graphs if and only if no attention mask is needed.

    This function modifies the `use_cuda_graph` attribute of the config in place, to a tuple of booleans.
    zcb_config.use_cuda_graph = z! but torch.cuda.is_available() = z: turning off cuda graphs)FFN)TTztriton.cudagraphsFz%Compile config compile_config.mode = z uses cudagraphs, which usually does not work well with continuous batching. We recommend using mode 'default' or 'max-autotune-no-cudagraphs' instead.z.Using cuda graphs for (varlen, decode) paths: )r<   r=   r>   r   cuda_graph_booleansr   r@   use_cuda_graph
isinstanceboolrG   rI   append	_inductorlist_mode_optionsgetrD   optionstuplerJ   )r   r   r   intended_use_cuda_graphrP   r   rW   compile_uses_cudagraphss           r)   r%   r%      s    :""$$ $9"%i&C"D"D" 	Nl9+llUZ5L5L5N5Nlll   $2	   
	!	-i.55 	\(1(@)BZ'[I$ 
 9#/	   (>	@_` 	[ 	[N%%%*=&=>>>o7799==n>QSaSijjG&-kk2Eu&M&M#& vn&9 v v v   !!&="="YFYBYZZZZ#(#8#8	 
K[AY[[\\\\\r+   c           	          | j         Ct          | j                  }|o| | _         t          j        d| j         d|d|d           dS dS )ah  Returns whether or not to use asynchronous batching for continuous batching. If the user specified this in
    the config, we follow their choice. Otherwise, we turn on asynchronous batching if and only if CUDA graphs are
    turned on and no attention mask is needed.

    This function modifies the `use_async_batching` attribute of the config in place.
    NzVNo behavior specified for use_async_batching, choosing cb_config.use_async_batching = z because use_cuda_graphs = z and is_attn_mask_needed = zd. If you want to save memory, you can disable asynchronous batching but it will degrade performance.)use_async_batchingr   rO   r   rJ   )r   r   use_cuda_graphss      r)   r&   r&      s     #+i;<<'6'R?R;R	$MiFb M MM M(;M M M	
 	
 	
 	
 	
 ,+r+   c                 .    | j         |rdnd| _         d S d S )Ng?g?)max_memory_percentr   s     r)   r'   r'      s*    #+.B'Kss	$$$ ,+r+   )__doc__copyr   mathr   r<   configuration_utilsr   generation.configuration_utilsr   r   modeling_flash_attention_utilsr	   utils.genericr
   requestsr   utilsr   rR   r*   r    r!   r"   r#   r%   r&   r'    r+   r)   <module>rj      sz  u u              3 3 3 3 3 3 U U U U U U U U O O O O O O 9 9 9 9 9 9                  ,,', "D(, 	,
 , , , ,^]#; ]]]aMa ]fj ] ] ] ]	)%= 	)$ 	) 	) 	) 	)11)A1SW1	1 1 1 1@.4'.4*T1.4 .4 !%	.4
 
.4 .4 .4 .4b0]'0]>B0]Z^0]	0] 0] 0] 0]f
)A 
X\ 
ae 
 
 
 
$L*B LZ^ Lcg L L L L L Lr+   