§
    |-j74  ã            
       ó<  — d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ d
dlmZ d
dlmZ dede
dedz  dede
f
d„Zde
dedz  ddfd„Zde
ddfd„Zdede
deddfd„Zde
de	dz  dededdf
d„Zde
dededdfd„Zde
deddfd„Zde
deddfd„ZdS ) zßResolves a `ContinuousBatchingConfig` into a fully-specified config ready for cache and runner creation. Each
helper mutates the config in place; `resolve_continuous_batching_config` orchestrates them in the required order.é    )Údeepcopy)ÚceilNé   )ÚPretrainedConfig)ÚCompileConfigÚContinuousBatchingConfig)Ú!lazy_import_paged_flash_attention)Úis_flash_attention_requestedé   )Úlogger)ÚWorkloadHintsÚconfigÚ	cb_configÚworkload_hintsÚhas_logit_processorsÚreturnc                 óÐ  — t          |¦  «        }|j        du}t          |j        |j        |j        g¦  «        }t          ||¦  «         t          |¦  «         t          | ||¦  «         t          |t          | dd¦  «        t          | ¦  «        |j        dk    ¬¦  «         t          | ¦  «         }t          |||¬¦  «         t          ||¬¦  «         t          ||¬¦  «         |S )zmReturns a deep-copied and fully-resolved `ContinuousBatchingConfig`. The original `cb_config` is not mutated.NÚcompile_configr   )r   Úfallback_compile_configÚis_flash_attnÚdecode_fast_path_available)r   Úis_attn_mask_neededÚcuda_graph_requested)r   r   ©r   r   )r   Úmax_blocks_per_requestÚanyÚq_padding_interval_sizeÚkv_padding_interval_sizeÚmax_cached_graphsÚresolve_using_hintsÚresolve_without_hintsÚ$ensure_decode_fast_path_is_availableÚresolve_compile_configsÚgetattrr
   Údecide_use_cuda_graphsÚdecide_use_async_batchingÚresolve_max_memory_percent)r   r   r   r   Úuser_requested_decode_pathr   r   s          úz/var/www/html/banglarbhumi/venv/lib/python3.11/site-packages/transformers/generation/continuous_batching/initialization.pyÚ"resolve_continuous_batching_configr*      s#  € õ ˜Ñ#Ô#€Ið "+Ô!AÈÐ!MÐåØ	Ô	*¨IÔ,NÐPYÔPkÐlñô Ðõ
 ˜	 >Ñ2Ô2Ð2õ ˜)Ñ$Ô$Ð$õ )¨°Ð<VÑWÔWÐWõ ØÝ '¨Ð0@À$Ñ GÔ GÝ2°6Ñ:Ô:Ø#,Ô#CÀaÒ#Gð	ñ ô ð õ ;¸6ÑBÔBÐBÐÝØÐ1DÐ[oðñ ô ð õ
 ¨	ÐGZÐ[Ñ[Ô[Ð[õ ¨ÐI]Ð^Ñ^Ô^Ð^ØÐó    c                 ó¶   — | j         €M|M|j        |j        z   }|dk    r:t          t	          || j        z  ¦  «        ¦  «        dz   }||dz  z   | _         dS dS dS dS )z`Fills `max_blocks_per_request` from the workload hints, when the user did not set it explicitly.Nr   r   é   )r   Úmax_prompt_lengthÚmax_generated_lengthÚintr   Ú
block_size)r   r   Úmax_sequence_lengthÚblocks_per_requests       r)   r    r    M   s   € ð Ô'Ð/°NÐ4NØ,Ô>ÀÔAdÑdÐØ Ò"Ð"Ý!$¥TÐ*=À	Ô@TÑ*TÑ%UÔ%UÑ!VÔ!VÐYZÑ!ZÐØ/AÐEWÐZ[ÑE[Ñ/\ˆIÔ,Ð,Ð,ð	 0Ð/Ð4NÐ4Nà"Ð"r+   c                 ó’   — | j         €d| _         | j        dk    rd| _        | j        dk    rd| _        | j        dk    r	d| _        dS dS )zEFills any remaining unset/sentinel attribute with a fallback default.Né    r   é@   i @  )r   r   r   r   )r   s    r)   r!   r!   W   sc   € àÔ'Ð/Ø+-ˆ	Ô(ØÔ(¨AÒ-Ð-Ø,.ˆ	Ô)ØÔ)¨QÒ.Ð.Ø-5ˆ	Ô*ØÔ" aÒ'Ð'Ø&(ˆ	Ô#Ð#Ð#ð (Ð'r+   Úuser_requestedc                 óŽ  — |j         dk    r¹t          | d¬¦  «        rxt          | j        ¦  «        d         }t          j                             ¦   «         |dug}t          |¦  «        s+|r t          j	        d|j         ›d|› d¦  «         d|_         dS dS |r%t          j	        d|j         ›d	| j        ›d¦  «         d|_         dS dS )
zÂEnsures the decode fast path is available. If it is not, set the max blocks per request to 0. If it is
    available, and no user-provided max blocks per request, set it to the fallback default.r   r   )Úversionr   Nz,Although cb_config.max_blocks_per_request = zS, the decode fast path is not available because at least one condition is not met: ú.z{, the decode fast path is not available because the attention implementation is not FA3. Got config._attn_implementation = )
r   r
   r	   Ú_attn_implementationÚtorchÚcudaÚis_availableÚallr   Úwarning)r   r   r7   Úflash_attn_with_kvcacheÚ
conditionss        r)   r"   r"   c   s5  € ð Ô'¨1Ò,Ð,å'¨¸Ð:Ñ:Ô:ð 	1Ý&GÈÔHcÑ&dÔ&dÐefÔ&gÐ#å”
×'Ò'Ñ)Ô)Ø'¨tÐ3ðˆJõ
 z‘?”?ð 5Ø!ð Ý”NðT IÔ$Dð Tð TØFPðTð Tð Tñô ð ð 45	Ô0Ð0Ð0ð5ð 5ð ð Ý”ðn 	Ô @ð nð nØLRÔLgðnð nð nñô ð ð 01ˆIÔ,Ð,Ð,ð/ -Ð,r+   r   r   r   c                 óÖ  — | j         €'| j        r|rd}n"t          ddd¬¦  «        }n||}n
d}n| j         }| j        €"| j        rt          ddd¬¦  «        }n||}n
d}n| j        }|s| j        d}t	          j        d¦  «         |)t	          j        d|                     ¦   «         › ¦  «         |)t	          j        d|                     ¦   «         › ¦  «         || _         || _        dS )	zÛResolve if the compile configs for varlen and decode paths, modifying these attributes in place if needed.
    Default config use full compile over regional compile, because the throughput is significantly higher (~15%)Nzmax-autotune-no-cudagraphsT)ÚmodeÚ	fullgraphÚdynamicFzSA decode_compile_config was set but fast decode path is not available. Ignoring it.z"Varlen path will be compiled with z"Decode path will be compiled with )Úvarlen_compile_configÚuse_default_compile_configsr   Údecode_compile_configr   r@   ÚinfoÚto_dict)r   r   r   r   Úvarlen_configÚdecode_configs         r)   r#   r#   ƒ   s=  € ð Ô&Ð.ØÔ0ð 		!àð oØ $å -Ð3OÐ[_ÐimÐ nÑ nÔ nØ$Ð0Ø3ˆMˆMà ˆMˆMà!Ô7ˆàÔ&Ð.ØÔ0ð 	!å)Ð/KÐW\ÐfkÐlÑlÔlˆMˆMØ$Ð0Ø3ˆMˆMà ˆMˆMà!Ô7ˆð &ð n¨)Ô*IÐ*UØˆÝŒÐlÑmÔmÐmð Ð ÝŒÐR¸×9NÒ9NÑ9PÔ9PÐRÐRÑSÔSÐSØÐ ÝŒÐR¸×9NÒ9NÑ9PÔ9PÐRÐRÑSÔSÐSà&3€IÔ#Ø&3€IÔ#Ð#Ð#r+   r   r   c                 óB  — t           j                             ¦   «         s[t          | j        ¦  «        }|r<t          j        d| j        ›dt           j                             ¦   «         ›d¦  «         d| _        n	| j        .t          | j        t          ¦  «        r| j        | j        f| _        nÔ|rd| _        nÊg }| j
        | j        fD ]¥}|€|                     | ¦  «         Œt           j                             ¦   «                              |j        |j        ¦  «        }|                     dd¦  «        }|rt          j        d	|j        ›d
¦  «         |                     | o| ¦  «         Œ¦t%          |¦  «        | _        t          j        d| j        › ¦  «         dS )a¸  Decides whether or not to use cuda graphs for continuous batching. If the user specified this in the config
    or if they specified a parameter related to cuda graphs, they are turned on. Otherwise, we use a heuristic
    based on the attention implementation: we turn on cuda graphs if and only if no attention mask is needed.

    This function modifies the `use_cuda_graph` attribute of the config in place, to a tuple of booleans.
    zcb_config.use_cuda_graph = z! but torch.cuda.is_available() = z: turning off cuda graphs)FFN)TTztriton.cudagraphsFz%Compile config compile_config.mode = z— uses cudagraphs, which usually does not work well with continuous batching. We recommend using mode 'default' or 'max-autotune-no-cudagraphs' instead.z.Using cuda graphs for (varlen, decode) paths: )r<   r=   r>   r   Úcuda_graph_booleansr   r@   Úuse_cuda_graphÚ
isinstanceÚboolrG   rI   ÚappendÚ	_inductorÚlist_mode_optionsÚgetrD   ÚoptionsÚtuplerJ   )r   r   r   Úintended_use_cuda_graphrP   r   rW   Úcompile_uses_cudagraphss           r)   r%   r%   ´   sÙ  € õ Œ:×"Ò"Ñ$Ô$ð $9Ý"% iÔ&CÑ"DÔ"DÐØ"ð 	ÝŒNØl9Ô+ÐlÐlµU´Z×5LÒ5LÑ5NÔ5NÐlÐlÐlñô ð ð $2ˆ	Ô Ñ ð 
Ô	!Ð	-ÝiÔ.µÑ5Ô5ð 	\Ø(1Ô(@À)ÔBZÐ'[ˆIÔ$øð 
ð 9Ø#/ˆ	Ô Ð ð ˆØ(Ô>À	Ô@_Ð`ð 	[ð 	[ˆNàÐ%Ø×%Ò%Ð*=Ð&=Ñ>Ô>Ð>Øå”o×7Ò7Ñ9Ô9×=Ò=¸nÔ>QÐSaÔSiÑjÔjˆGØ&-§k¢kÐ2EÀuÑ&MÔ&MÐ#Ø&ð Ý”ðv nÔ&9ð vð vð vñô ð ð ×!Ò!Ð&=Ð"=Ð"YÐFYÐBYÑZÔZÐZÐZÝ#(¨Ñ#8Ô#8ˆ	Ô å
„KÐ[ÀÔAYÐ[Ð[Ñ\Ô\Ð\Ð\Ð\r+   c           	      óš   — | j         €Ct          | j        ¦  «        }|o| | _         t          j        d| j         ›d|›d|›d¦  «         dS dS )ah  Returns whether or not to use asynchronous batching for continuous batching. If the user specified this in
    the config, we follow their choice. Otherwise, we turn on asynchronous batching if and only if CUDA graphs are
    turned on and no attention mask is needed.

    This function modifies the `use_async_batching` attribute of the config in place.
    NzVNo behavior specified for use_async_batching, choosing cb_config.use_async_batching = z because use_cuda_graphs = z and is_attn_mask_needed = zd. If you want to save memory, you can disable asynchronous batching but it will degrade performance.)Úuse_async_batchingr   rO   r   rJ   )r   r   Úuse_cuda_graphss      r)   r&   r&   ç   s   € ð Ô#Ð+Ý˜iÔ;Ñ<Ô<ˆØ'6Ð'RÐ?RÐ;Rˆ	Ô$ÝŒðMÀiÔFbð Mð MØðMð MØ(;ðMð Mð Mñ	
ô 	
ð 	
ð 	
ð 	
ð ,Ð+r+   c                 ó.   — | j         €|rdnd| _         d S d S )Ngš™™™™™é?gÍÌÌÌÌÌì?)Úmax_memory_percentr   s     r)   r'   r'   ù   s*   € ØÔ#Ð+Ø.BÐ'K s sÈˆ	Ô$Ð$Ð$ð ,Ð+r+   )Ú__doc__Úcopyr   Úmathr   r<   Úconfiguration_utilsr   Úgeneration.configuration_utilsr   r   Úmodeling_flash_attention_utilsr	   Úutils.genericr
   Úrequestsr   Úutilsr   rR   r*   r    r!   r"   r#   r%   r&   r'   © r+   r)   ú<module>rj      sz  ððuð uð Ð Ð Ð Ð Ð Ø Ð Ð Ð Ð Ð à €€€à 3Ð 3Ð 3Ð 3Ð 3Ð 3Ø UÐ UÐ UÐ UÐ UÐ UÐ UÐ UØ OÐ OÐ OÐ OÐ OÐ OØ 9Ð 9Ð 9Ð 9Ð 9Ð 9Ø Ð Ð Ð Ð Ð Ø  Ð  Ð  Ð  Ð  Ð  ð,Øð,à'ð,ð " DÑ(ð,ð ð	,ð
 ð,ð ,ð ,ð ,ð^]Ð#;ð ]È]Ð]aÑMað ]Ðfjð ]ð ]ð ]ð ]ð	)Ð%=ð 	)À$ð 	)ð 	)ð 	)ð 	)ð1Øð1Ø)Að1ØSWð1à	ð1ð 1ð 1ð 1ð@.4Ø'ð.4à*¨TÑ1ð.4ð ð.4ð !%ð	.4ð
 
ð.4ð .4ð .4ð .4ðb0]Ø'ð0]Ø>Bð0]ØZ^ð0]à	ð0]ð 0]ð 0]ð 0]ðf
Ð)Að 
ÐX\ð 
Ðaeð 
ð 
ð 
ð 
ð$LÐ*Bð LÐZ^ð LÐcgð Lð Lð Lð Lð Lð Lr+   