
    Ϧi>                        S SK JrJr  S SKJrJrJrJr  S SKJ	r	  S SK
r
S SKJrJrJrJr  S SKJr  S SKrS SKrS SKrS SKrS SKrS SKrS SKJr  S\4S	 jr\
R6                  " 5       S
\4S j5       r\
R6                  " 5       S 5       r\
R6                  " 5       S\4S j5       r S r!\
R6                  " 5       S 5       r"\
R6                  " S5      S 5       r#\	" SS9 " S S5      5       r$ " S S\5      r%g)    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                     S $ )Nc                 4    U R                  5       (       a  S$ S$ )N)       r   )r   r   r   )is_int8)lhsTyperhsTypes     ^/var/www/html/ai-image-ml/venv/lib/python3.13/site-packages/triton/backends/nvidia/compiler.py<lambda>min_dot_size.<locals>.<lambda>   s    GOO4E4EL$W<$W     r   s    r   min_dot_sizer      s    WWr   binaryc                    [         R                  R                  SU R                  5        S3S5      [         R                  R                  [         R                  R                  [        5      SU 5      /nU H  n[         R                  R                  U5      (       d  M)  [         R                  R                  U5      (       d  MO  [        R                  " US/[        R                  S9nUc  My  [        R                  " SUR                  S5      [        R                   S	9nUc  M  X$R#                  S
5      4s  $    [%        SU  35      e)NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr$   resultversions        r   _path_to_binaryrA      s     	

 06;
RWW__X.v>E
 77>>#277>>##6#6,,c;-?
HYHYZF!))$=v}}W?U]_]i]ij&a 000  fX.
//r   c                  l    [         R                  " [        S5      S   S/5      R                  S5      n U $ )Nptxasr   r%   r'   )r5   r6   rA   r:   )r@   s    r   get_ptxas_versionrD   &   s2    %%w'?'BK&PQXXY`aGNr   returnc                     [        U [        5      (       d   e[        [        U R	                  S5      5      u  pUS:X  a  US:  a  SU-   $ US:X  a  gUS:X  a  SU-   $ US:X  a  S	U-   $ [        S
U -   5      e)zC
Get the highest PTX version supported by the current CUDA driver.
.      P   U      F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapintsplitr=   )cuda_versionmajorminors      r   ptx_get_versionrX   ,   s    
 lC((((sL..s34LE{19:aZ{Ez{Ez
X[gg
hhr   c                 T    U R                   nUc  [        S5      u  p#[        U5      nU$ )NrC   )ptx_versionrA   rX   )optionsrZ   _rU   s       r   get_ptx_version_from_optionsr]   ?   s/    %%K)'2%l3r   c                 >    [        U 5      n[        SU5      nSU 3nU$ )NS   z+ptx)r]   min)r[   rZ   llvm_ptx_versionfeaturess       r   get_featuresrc   G   s.    .w7K 2{+&'(HOr   c                     [        U S5       n[        R                  " UR                  5       5      R	                  5       sS S S 5        $ ! , (       d  f       g = f)Nrb)openhashlibsha256read	hexdigest)r/   fs     r   	file_hashrl   U   s5    	dD	Q~~affh'113 
		s   2A		
AT)frozenc                   R   \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   Sr	\\S	'   Sr
\\S
'   Sr\\S'   Sr\\S'   Sr\\   \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\   \S'   Sr\\   \S'   Sr\\S'   Sr\\   \S'   Sr\\S'   Sr\\S'   Sr\\S'   S r\\S!'   Sr\\S"'   S# r S$ r!Sr"g)%CUDAOptions[      	num_warpsr*   num_ctas   
num_stagesr   num_buffers_warp_specnum_consumer_groupsreg_dec_producerreg_inc_consumerNmaxnreg)r*   r*   r*   cluster_dimsrZ   Tenable_fp_fusion)fp8e5fp8e4b15supported_fp8_dtypesr   deprecated_fp8_dtypestf32default_dot_input_precision)r   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsFdebugcudabackend_namesanitize_overflowc                    [        [        5      R                  S-  nU R                  c  0 O[	        U R                  5      nUR                  SS 5      (       d&  [        R                  " S[        US-  5      5      US'   [        R                  U S[        UR                  5       5      5        U R                  S:  a   U R                  U R                  S-
  -  S:X  d   S5       eg )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcr   r   r*   znum_warps must be a power of 2)r   r2   parentr   dictr-   r+   getenvrQ   object__setattr__tupleitemsrr   )selfdefault_libdirr   s      r   __post_init__CUDAOptions.__post_init__t   s    h..6 ,,4b$t?O?O:P{D11')yy1H#n_pNpJq'rK$4k6G6G6I0JK~~!t~~!9K'LQR&R 	0/	0R&Rr   c           	      d   [        U R                  5      n[        S [        US   5       5       5      US'   SR	                  [        UR                  5       5       VVs/ s H  u  p#U SU 3PM     snn5      n[        R                  " UR                  S5      5      R                  5       $ s  snnf )Nc              3   @   #    U  H  u  pU[        U5      4v   M     g 7fN)rl   ).0kvs      r   	<genexpr>#CUDAOptions.hash.<locals>.<genexpr>   s     (hGgtq!Yq\):Ggs   r   r\   -r'   )
r   __dict__r   sortedr0   r   rg   rh   encoderj   )r   	hash_dictnamevalkeys        r   hashCUDAOptions.hash}   s    '	#((hviXeNfGg(h#h	- hh	@Q9RS9RID4&#9RST~~cjj12<<>> Ts   B,
)#__name__
__module____qualname____firstlineno__rr   rS   __annotations__rs   ru   rv   rw   rx   ry   rz   r   r{   r   rZ   r|   boolr   r   rQ   r   r   r   r   r   r   r   r   r   r   r   __static_attributes__r   r   r   ro   ro   [   s    IsHcJ!"3"  cc "GXc]!#L%#K!d!'<%*<(*5:*'--/I %*I*.!4.KE4L#"t"0?r   ro   c                     ^  \ rS rSr\S\4S j5       rS\SS4U 4S jjrS\4S jr	S r
S	 rS\\\4   4S
 jrS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rS r\R0                  " 5       S 5       rSrU =r$ )CUDABackend   r   c                      U R                   S:H  $ )Nr   )backendr   s    r   supports_targetCUDABackend.supports_target   s    ~~''r   rE   Nc                    > [         TU ]  U5        UR                  U l        [	        U R                  [
        5      (       d   eSU l        g )Ncubin)super__init__arch
capabilityrP   rS   
binary_ext)r   r   	__class__s     r   r   CUDABackend.__init__   s9      ++$//3////!r   c                    [         R                  R                  5        Vs0 s H  o"U;   d  M
  X!U   _M     nnSU;  aQ  [        [         R                  5      nU R
                  S:  a  UR                  S5        [        [        U5      5      US'   SU;  a  U R
                  S:  a  SUS'   SU;  a  [        R                  " SS	5      S	:H  US'   U R
                  S:X  a  S
OSUS'   [        S0 UD6$ s  snf )Nr   Y   fp8e4nvr   Z   )r~   r|   TRITON_DEFAULT_FP_FUSION1i   @r   r   r   )ro   __dataclass_fields__keyssetr   r   addr   r   r+   r   )r   optsr   argsr   s        r   parse_optionsCUDABackend.parse_options   s    $/$D$D$I$I$KY$KqTXy
7
$KY!-#&{'G'G#H "$$((3+08L1M+ND'("$."$0>,-T)')yy1KS'QUX'XD#$9=B9NTU,-"T"" Zs
   	C0	C0c                     UR                   UR                  UR                  UR                  S   UR                  S   UR                  S   4$ )Nr   r*      )rr   rs   sharedr{   )r   metadatas     r   pack_metadataCUDABackend.pack_metadata   sO    OO!!!$!!!$!!!$
 	
r   c                     SS K Js  Js  Jn  U R                  S:  a  UR
                  OUR                  [        U R                  5      S.nU$ )Nr   rJ   )convert_custom_typesr   )	triton.language.extra.cudalanguageextrar   r   convert_custom_float8_sm80convert_custom_float8_sm70r   r   )r   r   codegen_fnss      r   get_codegen_implementation&CUDABackend.get_codegen_implementation   sA    11 04"/DD++$JiJi(5

 r   c                     SSK Jn  SU0$ )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r   get_module_mapCUDABackend.get_module_map   s    819==r   c                 0    [         R                  " U5        g r   )r   load_dialects)r   ctxs     r   r   CUDABackend.load_dialects   s    S!r   c                    [         R                  " U R                  5      nUR                  5         [        R
                  R                  U5        [        R                  R                  U5        [        R                  R                  U5        [        R
                  R                  U5        [        R                  R                  U5        [        R
                  R                  U5        [        R
                  R                  U5        [        R
                  R                  U5        [        R                  R                  U5        UR!                  U 5        U $ r   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dceadd_loop_unrollrun)modr   optpms       r   	make_ttirCUDABackend.make_ttir   s    __S[[)
!!"%..r2#''+))"-b!r"$$R(##B'
s
r   c                 
   [         R                  " 5       nUR                  b<  UR                  S   Ul        UR                  S   Ul        UR                  S   Ul        [        R                  R                  SS5      S:X  aP  [        R                  " 5       n[        R                  " XPR                  5      nU R                  R                  S5        [        R                  " U R                  5      nUR!                  5         ["        R$                  R'                  USU 3UR(                  S	UR*                  5        ["        R,                  R/                  U5        US
-  S:  a  ["        R,                  R1                  U5        [         R"                  R2                  R5                  Xt5        ["        R,                  R7                  U5        ["        R,                  R9                  U5        ["        R,                  R;                  U5        ["        R,                  R7                  U5        ["        R,                  R=                  XsS:  5        ["        R>                  RA                  U5        US
-  S:  GaU  ["        R,                  RC                  U5        ["        R,                  RE                  U5        ["        R,                  RG                  XrRH                  5        ["        R,                  RK                  XrRH                  5        ["        R,                  RM                  XrRH                  5        ["        R,                  RO                  XrRP                  URH                  URR                  URT                  5        ["        R,                  RW                  XrRX                  5        ["        R,                  R[                  XrRH                  5        ["        R,                  R]                  U5        ["        R,                  R=                  XsS:  5        ["        R,                  R7                  U5        ["        R,                  R_                  U5        ["        R,                  Ra                  U5        ["        R>                  RA                  U5        ["        R>                  Rc                  U5        US
-  S:  aR  [         R"                  R2                  Re                  U5        [         R"                  R2                  Rg                  U5        ["        R>                  Ri                  U5        URk                  U 5        UR                  UR                  UR
                  4US'   U $ )Nr   r*   r   MLIR_ENABLE_REMARK0r   Tzcuda:r   rN      rJ   	   r{   )6r   ClusterInfor{   clusterDimXclusterDimYclusterDimZr+   r,   r-   r   
source_mgrr   source_mgr_diagr   printOpOnDiagnosticr   r   r   r   add_convert_to_ttgpuirrr   rs   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r   add_optimize_accumulator_init add_combine_tensor_select_and_ifadd_ws_task_partitionrw   add_taskid_propagateadd_ws_data_partitionadd_ws_code_partitionrv   rx   ry   add_pipelineru   add_ws_loweringadd_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infosrcMgrdiagr   s           r   
make_ttgirCUDABackend.make_ttgir   s   ))+''*'7'7':L$'*'7'7':L$'*'7'7':L$::>>.4;__&F%%fkk:DKK++D1__S[[)
**2zl/CS]]TVX[XdXde##B'q NN))"-,,R>44R833B7,,R044R80025EFb!q NN88<NN;;B?NN005L5LMNN//4K4KLNN005L5LMNN005N5NPSPgPg141E1EsG[G[]NN''NN;NN**2/F/FG##B'0025EF44R82226//3b!$$R(q MM##77;MM##44R8''+
s$0$<$<l>V>VXdXpXp#q 
r   c                 L   [        U5      nU R                  S5      nUb  US==   U-  ss'   U n[        R                  " UR                  5      nUR                  5         [        R                  R                  SS5      S:X  aP  [        R                  " 5       n[        R                  " XR                  5      n	UR                  R                  S5        [        R                  R                  R!                  U5        [        R                  R#                  U5        [        R$                  R'                  U5        [        R$                  R)                  U5        [        R                  R+                  U5        [        R                  R                  R-                  XsU5        [        R                  R.                  R1                  U5        [        R$                  R3                  U5        [        R4                  R7                  U5        [        R4                  R9                  U5        [        R4                  R;                  U5        [        R                  R                  SS5      S:X  a  [        R<                  R?                  U5        URA                  U5        [        RB                  " 5         [        R                  " 5       n
[        RD                  " Xj5      nUS:X  a  S	OS
U 3n[G        U5      nSn[        RH                  " XX5        [        RJ                  " U5        URL                  b`  URO                  5        HL  nURQ                  5       (       a  M  URS                  5       (       d  M1  URU                  URL                  5        MN     URV                  (       a8  URV                   VVs/ s H  u  nnUPM
     nnn[        RX                  " UU5        [        RZ                  " U[        R\                  5        U R                  S5      US'   [_        U5      nAA
U$ s  snnf )Nz"triton_gpu.num-warp-groups-per-ctarr   r   r   r   TTRITON_DISABLE_LINE_INFOr   sm_90asm_nvptx64-nvidia-cudaztriton_gpu.sharedr   )0r]   get_int_attrr   r   r   r   r+   r,   r-   r   r  r  r	  r   r   r  %add_decompose_unsupported_conversionsr  convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr  add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   llvmiradd_di_scoper   init_targets	to_modulerc   attach_datalayoutset_nvvm_reflect_ftzrz   get_functionsis_declarationis_external_linkageset_nvvm_maxnregr   link_extern_libsoptimize_moduleOPTIMIZE_O3rQ   )srcr   r[   r   rZ   num_warp_groupsr   r   r"  r#  r   llvm_modprocrb   tripler   r   r/   r>   rets                       r   	make_llirCUDABackend.make_llir   s   27; **+OP&[!_4!__S[[)
::>>.4;__&F%%fkk:DKK++D1CCBG77;$$R(**2.11"5++BKH11"5**2.''+b!$$R(::>>4c:cAMM&&r*
s,,.>>#/%+x3zl1C(&x@##H- ??&++-''))a.C.C.E.E&&w7 . .5.A.AB.AltTT.AEB!!(E2Xt'7'78 !--.AB(m
 Cs   /P c           	         [        U5      nSnUS:X  a  SOSU 3n[        U5      n[        R                  " XXgS/UR                  S5      n[
        R                  " SU5      n	[        U	5      S:X  d   eU	S	   US
'   US-   SUS-   3n[
        R                  " SSU 3U[
        R                  S9n[
        R                  " SSU5      n[        R                  R                  SS5      S:X  a  [        S5        [        U5        U$ )Nr*  r   r(  r)  znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r*   r   r   rN   rG   z\.version \d+\.\d+z	.version r(   z,\s*debug|debug,\s*r#   NVPTX_ENABLE_DUMPr   r   z // -----// NVPTX Dump //----- //)r]   rc   r   translate_to_asmr|   r8   findalllensubr;   r+   r,   r-   print)
rA  r   r   r   rZ   rE  rD  rb   rF  namess
             r   make_ptxCUDABackend.make_ptx=  s    237&%+x3zl1C$##CBSATVYVjVjlqr

FL5zQ 8$b);r>*:;ff*i},EsRTR^R^_ff+R5::>>-s3s:45#J
r   c                    [        S5      u  pE[        R                  " SSSS9 n[        R                  " SSSS9 nUR                  U 5        UR	                  5         UR
                  S-   n[        R                  R                  S	5      (       a  / OS
/n	UR                  (       a  / OS/n
US:X  a  SOSn[        R                  R                  SS5      S:X  a  SS/O/ nU/U	QU
QSPUQSU U 3PUR
                  PSPUPn [        R                  " USSUS9  [        R                  R                  UR
                  5      (       a   [        R                  " UR
                  5        [        R                  R                  UR
                  5      (       a   [        R                  " UR
                  5        [!        US!5       nUR#                  5       nS S S 5        [        R                  R                  U5      (       a  [        R                  " U5        S S S 5        S S S 5        W$ ! [        R                   a  n[!        UR
                  5       nUR#                  5       nS S S 5        O! , (       d  f       O= f[        R                  R                  UR
                  5      (       a   [        R                  " UR
                  5        UR$                  S:X  a  SnO3UR$                  S[&        R(                  -   :X  a  SnOSUR$                   3n[+        U SW SSR-                  U5       S 35      eS nAff = f! , (       d  f       GNn= f! , (       d  f       GN>= f! , (       d  f       W$ = f)"NrC   Fwz.ptx)deletemodesuffixrz.logz.or'  z	-lineinfoz--fmad=falser   ar#   DISABLE_PTXAS_OPTr   r   z--opt-levelz-vz--gpu-name=sm_z-oT)check	close_fdsr&      z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command:  
re   )rA   tempfileNamedTemporaryFilewriteflushr   r+   r,   r-   r|   r5   r   r/   r3   removeCalledProcessErrorrf   ri   
returncodesignalSIGSEGVr=   r0   )rA  r   r   r   rC   r\   fsrcflogfbin	line_infofmadrW  	opt_level	ptxas_cmdelog_filelogerrorrk   r   s                       r   
make_cubinCUDABackend.make_cubinS  s   "7+((COSW''u3vNRVJJsOJJL99t#D jjnn-GHH{mI--2N3CD&",S"F02

?RTW0X\_0_,egI!$(*.1:>LZLY_X`<acgclclnrtxINydS77>>$)),,IIdii(77>>$)),,IIdii($ dD!Q "ww~~d##		$M O PP + 00 N$))_"--/C %__77>>$)),,IIdii(<<3&?E\\S6>>%994E=all^LE"eW -77:e <558XXi5H4I$M N NN" "!G ON POP sz   MCMB3H&5ML9AMM&L6:L1I)	 	L1)
I73B>L11L66M9
MM
M	M
M,c                 l   ^ ^ UU 4S jUS'   UU 4S jUS'   UU 4S jUS'   UU 4S jUS'   UU 4S	 jUS
'   g )Nc                 (   > TR                  XT5      $ r   )r   rA  r   r[   r   s     r   r   (CUDABackend.add_stages.<locals>.<lambda>  s    t~~cW/Ur   r   c                 >   > TR                  XTTR                  5      $ r   )r$  r   ry  s     r   r   rz        wX\XgXg0hr   ttgirc                 >   > TR                  XTTR                  5      $ r   )rG  r   ry  s     r   r   rz    s    t~~cWVZVeVe/fr   llirc                 >   > TR                  XTTR                  5      $ r   )rQ  r   ry  s     r   r   rz    s    dmmC7TXTcTc.dr   ptxc                 >   > TR                  XTTR                  5      $ r   )ru  r   ry  s     r   r   rz    r|  r   r   r   )r   stagesr[   s   ` `r   
add_stagesCUDABackend.add_stages  s0    Uvhwfvduhwr   c                 8    [        5       nU SU R                   3$ )Nr   )rD   r   )r   r@   s     r   r   CUDABackend.hash  s     #%!DOO,--r   )r   r   )r   r   r   r   staticmethodr   r   r   r
   r   r   r   r   rQ   r   r   r   r   r$  rG  rQ  ru  r  	functools	lru_cacher   r   __classcell__)r   s   @r   r   r      s    (	 ( ("y "T "#S #"
>S*_ 5 >"   2 2h : :x  * * *Xi . .r   r   )&triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr	   r  typingr
   r   r   r   typesr   rg   r8   ra  rh  r+   r5   pathlibr   r   r  rQ   rA   rD   rS   rX   r]   rc   rl   ro   r   r   r   r   <module>r     s   ; 8 8 !  - -   	   	  X X 0C 0 0   
 iS i i$ 
 
 T4 4
 $%? %? %?PF.+ F.r   