
    Bj_                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZ dd	lmZm Z  erd d
l!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( ddl)m*Z* d dl+Z+d dl,Z,d dl-Z,d dl.m/c m0Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZF d dlGmHZH ddlImJZJmKZKmLZLmMZMmZmNZN ddlOmPZP ddlQmRZRmSZSmTZT ddlUmVZVmWZW ddlMmXZXmYZYmZZZm[Z[ ddl\m]Z]m^Z^ ddl_m`Z` ddlmaZambZbmcZcmdZdmeZemfZf ddlgmhZh dd limjZjmkZk dd!llmmZmmnZn dd"lompZpmqZq dd#lrmsZs dd$l/mtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZ dd%lmZ  ej        e          Ze,j                            ed&          Ze,j                            ed'          Ze,j                            ed(          Ze,j                            ed)          Zed*         Zd+ed,<    ed-          Z ed.          Zej         G d/ d0                      Zej         G d1 d2                      Z G d3 d4          Zej         G d5 d6                      Zej         G d7 d8e                      Z G d9 d*          Zej        dd<            Zdd?ZddAZddCZ ej        dDE           G dF dG                      ZddJZ G dK dL          ZddSZ G dT dUe          Z G dV dWe          Z G dX dYe          Zdd\ZddaZ G db dce          Z G dd dee          Z G df dge          Z G dh die          Z G dj dke          Z	 dddtZddyZdd{ZddZddZej         G d d                      Z ej                    ZddZddZddZddZddZddZddZddZddZ G d d^          Z G d d          ZdS )    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericTYPE_CHECKING	TypeAliasTypeVar)	ParamSpec
OrderedSet   )ComputedBuffer	Pointwise)CallableIteratorSequence)
ModuleType)EnterCudaStreamContextLine)PythonWrapperCodegen)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)get_stream_name)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)DevicePropertiesReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder   PartitionType_T_Pc                  t    e Zd ZU dZded<   dZded<   dZded<   d Zedd
            Z	e	 ddd            Z
dS )FusionResultNzbool | Noneshould_fusezCallable[[], bool] | Nonecallable_fnLambdaFuture | Nonefuturec                @    | j         d u| j        d uz  s
J d            d S )NzLFusion result should contain either fusion decision or callable_fn, not both)re   rf   selfs    \/var/www/html/Carbon-Document/venv/lib/python3.11/site-packages/torch/_inductor/scheduler.py__post_init__zFusionResult.__post_init__s   s>     ,1A1MN 	
 	
Z	
 	
N 	
 	
    boolc                "    t          |          S )N)re   rd   )clsre   s     rl   fusezFusionResult.fusex   s    4444rn   Callable[[], bool]c                $    t          ||          S )Nrf   rh   rq   )rr   rf   rh   s      rl   from_callablezFusionResult.from_callable|   s     FCCCCrn   )re   ro   N)rf   rt   rh   rg   )__name__
__module____qualname__re   __annotations__rf   rh   rm   classmethodrs   rw    rn   rl   rd   rd   m   s         #K####-1K1111"&F&&&&
 
 

 5 5 5 [5 LPD D D D [D D Drn   rd   c                  D    e Zd ZU ded<   ded<   ded<   dZded<   ddZdS )PendingFusionrt   rf   r_   node1node2Nrg   rh   return+tuple[BaseSchedulerNode, BaseSchedulerNode]c                    | j         | j        fS rx   r   r   rj   s    rl   get_fusion_nodeszPendingFusion.get_fusion_nodes   s    
DJ''rn   )r   r   )ry   rz   r{   r|   rh   r   r~   rn   rl   r   r      s_         ####"&F&&&&( ( ( ( ( (rn   r   c                  2   e Zd ZdZedd            Zedd            Zedd            Zedd            Z	edd            Z
edd            Zed d            Zed!d            Zedd            Zedd            Zedd            Zed"d            ZdS )#MixOrderReductionz
    This class contains utility functions to decide if we should fuse reductions
    reducing across different dimensions of the same input tensor.
    noder_   r   ro   c                    |                                  o*t          d |                                 D                       S )Nc              3     K   | ]U}t          |t                    |                                +t          |j        t                    E|j        j        d uV  Vd S rx   )
isinstanceSchedulerNodeis_reductionr   r   _split_size.0subnodes     rl   	<genexpr>z7MixOrderReduction.is_split_reduction.<locals>.<genexpr>   s|       +
 +
'=11+
 $$&&	+

 7<88+
L$D0+
 +
 +
 +
 +
 +
rn   )r   all	get_nodesr   s    rl   is_split_reductionz$MixOrderReduction.is_split_reduction   sM      "" 
s +
 +
>>+++
 +
 +
 (
 (
 	
rn   tuple[sympy.Expr, sympy.Expr]c                   |                      |          rjd }d }|                                D ]H}t          |t                    r.|                                rt          |j        t                    sG|j        j        J t          j	        j
                            t          |j        j                            }|j        j        J t          j	        j
                            t          |j        j                            }||}|}t          j	        j
                            ||          sJ | d|             t          j	        j
                            ||          sJ | d|             J|J ||fS |j        d         S )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrZ   graphsizevarssimplifyrY   _original_reduction_rangesstatically_known_equalsgroup)rr   r   xnumelrnumelr   	curxnumel	currnumels          rl   get_numel_rnumelz"MixOrderReduction.get_numel_rnumel   s   !!$'' "	!FF>>++ 4 4w66,,.. #7<@@
 |4@@@G,55!',"?@@ 	 |>JJJG,55!',"IJJ 	 >&F&FF7+CC	  4 4 33	334 4  7+CC	  4 4 33	334 4  4 %%%F##:a= rn   r   r   c                   |                      |          }|                      |          }t          |          dk    st          |          dk    s||k    rdS t          |          t          t          |                    k    S )N   F)r   lentuplereversed)rr   r   r   g1g2s        rl   has_mix_reduction_ordersz*MixOrderReduction.has_mix_reduction_orders   ss     !!%((!!%((r77a<<3r77a<<2885RyyE(2,,////rn   bufstrc                .   d}|j         j        D ]&}t          |t                    r|j        |k    r|} n'|sdS |j        }|j         j        }|sDt          |t                    sJ t          |                       |j	        d         j         j        }|sJ t          |          t          |j                  z
  sdS t          j        j                            t!          |j                  t!          |                                                    rdS dS )z@
        The access to 'buf' is not a broadcast access.
        NFr   T)read_writesreadsr   r4   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r"   rZ   r   r   r   rY   sizevalues)rr   r   r   	found_depdepr   r   s          rl   _is_full_accessz!MixOrderReduction._is_full_access   s"   
 	#) 	 	C#y)) ch#oo	 	5%0
 	?d$677HHDJJHH7Q3>Jz:&&E4F)G)GG 	4
 733).))=9J9J9L9L+M+M
 
 	 4urn   	list[str]c                    g }|                                 |                                 z  }|D ]C}|                     ||          r+|                     ||          r|                    |           D|S rx   )used_buffer_namesr   append)rr   r   r   outcommon_readsr   s         rl   get_common_readz!MixOrderReduction.get_common_read   s~     ..0053J3J3L3LL 	  	 C""3..  33F3FsE3R3R  

3
rn   c                P    t          |                     ||                    dk    S Nr   )r   r   rr   r   r   s      rl   has_common_readz!MixOrderReduction.has_common_read   s'     3&&ue445599rn   intc                    |                      |          }t          j        j                            |d         |d         z  d          S )Nr   r   fallback)r   rZ   r   r   optimization_hint)rr   r   r   s      rl   	get_numelzMixOrderReduction.get_numel  s?    !!$''w11"Q%"Q%-!1LLLrn   c                ,    |                      |          S rx   )r   r   s      rl   get_fusion_scorez"MixOrderReduction.get_fusion_score	  s    
 }}U###rn   c                   t           j        j        sdS t          j        j        rdS |                                r|                                sdS |                                j        }|dvst          |          dk    rdS |
                                r|
                                sdS |j        |                                z  s|j        |                                z  rdS |                     ||          sdS t                              ||          }t!          |          dk    rdS |                     |          r||}}n|                     |          r||}}ndS |                     |          }|\  }}	t           j        j        sd}
t          j        j                            t-          j        ||	z  |
                    sdS t          j        j                            t-          j        ||	dz                      sdS t          j        j                            t-          j        |d                    sdS t1          d |                                D                       rdS t          j        j                            |	d	          sdS t                              |          rdS t9          d
 |                                D                       }|S )zP
        Check whether we can fuse two reductions with mix loop orders.
        F)cudaxputritonr   i  P r   i   c              3     K   | ]A}|                                 |j        j        j        t          j        t          j        fvV  Bd S rx   )r   r   datareduction_hintrD   INNERDEFAULTr   s     rl   r   z-MixOrderReduction.can_fuse.<locals>.<genexpr>[  sh       
 
 ##%%
L,#%
 
 
 
 
 
rn   i @  c              3  t   K   | ]3}|                                 |j                                        d v V  4dS )>   sumprodN)r   r   get_reduction_typer   s     rl   r   z-MixOrderReduction.can_fuse.<locals>.<genexpr>q  sb       
 
 ##%%
L++--
 
 
 
 
 
rn   )r(   r   mix_order_reductionrZ   r   cpp_wrapperrU   
get_devicer   rL   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   r   )rr   r   r   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   s               rl   can_fusezMixOrderReduction.can_fuse  s   
 }0 	5 7 	5||~~ 	U\\^^ 	5&&((-..";//8;;5!!## 	5+=+=+?+? 	5Oe77999 	Oe77999	 5 ++E599 	5 )88FF|!!5!!%(( 	*/ZOO##E** 	*/ZOO5!!/22
d }@ 	 #J
 7#11%(4$;
2S2STT u
 7#11%(42J2JKK u
 7#11%(42F2FGG u  
 
 +4466
 
 
 
 
 		 5
 w44T9EE 	5//@@ 	5  
 
 &//11
 
 
 
 
 
rn   c                .    |                      ||          S rx   )r   r   s      rl   are_mix_order_reductionsz*MixOrderReduction.are_mix_order_reductions|  s     ||E5)))rn   c                Z     t           fdj        j        D                       sdS dS )Nc              3  N   K   | ]}                     |j                  V   d S rx   )is_contiguous_loadr   )r   r   rr   r   s     rl   r   z7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>  sF       
 
7:C""38T22
 
 
 
 
 
rn   FT)r   r   r   )rr   r   s   ``rl   r   z$MixOrderReduction.is_contiguous_node  sS     
 
 
 
 
>B>N>T
 
 
 
 
 	 5trn   parent_nodec                   ddl m} |                                D ]}t          |t                    sJ |j        }|j        |j                 }fd|D             }t          |          dk    rT|D ]y}|j	        |         }	|j
        }
t          |
                                          }t          j        j                            |	||          }|d         dk    s|d         dk    s  dS zdS )Nr   )MemoryUsageTypec                4    g | ]}|j         k    |j        S r~   )buffer_name
index_name)r   er   s     rl   
<listcomp>z8MixOrderReduction.is_contiguous_load.<locals>.<listcomp>  s'    QQQAAMS<P<P1<<P<P<Prn   r   FT)torch._inductor.loop_bodyr   r   r   r   _bodymemory_usageLOADr   indexing_exprsr   listkeysrZ   r   r   stride_vars)rr   r   r   r   r   	loop_bodyentriesindex_namesr   
index_exprr   var_symbolsr  s    `           rl   r   z$MixOrderReduction.is_contiguous_load  s#   ======))++ 	! 	!DdM22222
I,_-ABGQQQQQQQK;1$$ * ! !
&5jA
&1
 #:??#4#455g.::  $B1,,B10D0D 555! trn   Nr   r_   r   ro   )r   r_   r   r   r   r_   r   r_   r   ro   )r   r   r   r_   r   ro   )r   r_   r   r_   r   r   )r   r_   r   r   r   r_   r   r_   r   r   )r   r   r   r_   r   ro   )ry   rz   r{   __doc__staticmethodr   r}   r   r   r   r   r   r   r   r   r   r   r   r~   rn   rl   r   r      s        
 
 
 
 \
 #! #! #! [#!J 	0 	0 	0 [	0    [B 	 	 	 [	 : : : [:
 M M M [M $ $ $ [$ h h h [hT * * * [*
    [    [  rn   r   c                      e Zd ZU ded<   ded<   ded<    ej        e          Zded	<    ej        e          Z	d
ed<   ddZ
ddZddZddZd dZd!dZd"dZd#dZd#dZd$dZdS )%SchedulerBuffer	Scheduler	schedulerz	ir.Bufferr   BaseSchedulerNode | Nonedefining_op)default_factorylist[NodeUser]usersrA   
mpi_bufferr   r   c                @    | j         }|J |                                S rx   )r  get_name)rk   ops     rl   defining_op_namez SchedulerBuffer.defining_op_name  s!    ~~~{{}}rn   r   c                4    t          | j        j                  S rx   )hashr   r   rj   s    rl   __hash__zSchedulerBuffer.__hash__  s    DIN###rn   c                   t                      }|                                 }|                    | dt          | j                  j                    |                    | d| j        j                    |                                 r9|                    | dt          |                                                       | 	                                r9|                    | dt          | 	                                                      t          | j                  dk    r |                    | d| j                    n}|                    | d           |                    d          5  | j        D ]}|                    | d           	 d d d            n# 1 swxY w Y   |                    d	           |                                S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rR   r  	writeliner   r   ry   layoutget_aliasespformatget_mutationsr   r  indentgetrawvalue)rk   resultr   users       rl   	debug_strzSchedulerBuffer.debug_str  s   !!}}D>>DOO$<>>???D>>DI,<>>??? 	PNN9I9I9K9K1L1LNNOOO 	TRR74;M;M;O;O3P3PRRSSStz??a;;tz;;<<<<000111q!! 1 1 J 1 1D$$ZZZ000011 1 1 1 1 1 1 1 1 1 1 1 1 1 1 S!!!!!###s   7#F''F+.F+c                4    | j                                         S rx   r   r  rj   s    rl   r  zSchedulerBuffer.get_name      y!!###rn   Nonec                ^   | j         J | j                                         sd S | j                                         sJ| j                                         s1t	          | j                                         t          j                  r+t          j	        j
                            | j                    d S t          t          j        d          r|                                 t          j        j        v rt          j        j        |                                          }|| j        j        v r| j        j        |         j         }n| j        j        |         j         }t          j	        j
                            || j                    d S t          j	        j
                            | j                    d S )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr+   CommBufferLayoutrZ   r   wrapper_codecodegen_allocationhasattrkernelr  inplace_update_buffersr  name_to_donated_buffername_to_bufcodegen_inplace_reuse)rk   input_buffer_nameinput_buffers      rl   allocatezSchedulerBuffer.allocate  sy   y$$$y((** 	F I2244	y++--	 $)3355r7JKK	
 G 33DI>>>F AHf%%	?18#BBB !" ? P DN$III#~D%    $~9:KLQG 66	    
 G 33DI>>>>>rn   ro   c                    | j         J t          | j         j        t          j                  st          | j                   rdS | j        D ]}t          |j         t                    r dS  dS NFT)r   r   r)  r+   r?   rV   r  
OutputNode)rk   uses     rl   can_freezSchedulerBuffer.can_free  s}    y$$$di&66 	:SI;
 ;
 	 5: 	 	C#(J// uutrn   c                @   i }|D ]r}t          |j                  |v rC|                    |t          |j                                     |t          |j                  <   [||t          |j                  <   st          |                                          | _        d S rx   )idr   merger  r   r  )rk   r  r/  rK  s       rl   	set_userszSchedulerBuffer.set_users   s    &( 	+ 	+C#(||v%%'*yy381E'F'Fr#(||$$'*r#(||$$&--//**


rn   Sequence[str]c                F    | j         J | j                                         S rx   )r   r9  rj   s    rl   r*  zSchedulerBuffer.get_aliases
  s$    y$$$y55777rn   c                F    | j         J | j                                         S rx   )r   r:  rj   s    rl   r,  zSchedulerBuffer.get_mutations  $    y$$$y++---rn   torch.device | Nonec                X    | j                                                                         S rx   )r   r;  r   rj   s    rl   r   zSchedulerBuffer.get_device  s"    y((**55777rn   Nr   r   r   r   r   r5  r   ro   )r  r  r   r5  r   rQ  r   rU  )ry   rz   r{   r|   dataclassesfieldr  r  rA   r  r   r#  r1  r  rG  rL  rP  r*  r,  r   r~   rn   rl   r  r    sQ        OOO))))-K-dCCCECCCC.?k.?3/ / /J       
$ $ $ $$ $ $ $($ $ $ $? ? ? ?B
 
 
 
+ + + +8 8 8 8. . . .8 8 8 8 8 8rn   r  c                      e Zd ZU dZded<   dS )SchedulerDonatedBufferNr  r  )ry   rz   r{   r  r|   r~   rn   rl   r`  r`    s#         ,0K000000rn   r`  c                     e Zd ZU ded<   ded<   ded<   ded<   ded<   ded	<   ded
<   ded<   ded<   dZded<   ded<   ded<   dZded<   ded<   ded<   dZded<   d{d#Zd|d%Zd}d'Z	d}d(Z
d}d)Zd~d+Zd}d,Zdd-Zdd1Zdd3Zdd6Zdd7Zdd9Zdd<Zdd=Zdd>Zdd?Zdd@ZddAZddDZd}dEZd}dFZeddG            ZeddH            ZeddI            Z eddJ            Z!ddLZ"ddNZ#ddQZ$ddSZ%ddTZ&ddUZ'ddVZ(ddWZ)ddXZ*ddYZ+ddZZ,dd[Z-dd^Z.dd_Z/dd`Z0	 dddeZ1eddf            Z2eddg            Z3eddh            Z4ddkZ5ddmZ6eddo            Z7ddqZ8eddr            Z9ddtZ:ddvZ;e<ddz            Z=dS )r_   OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager   min_input_distancemax_input_distance	min_order	max_orderrB   mpi_nodedict[str, str]mutation_renamesNir.Operation | Noner   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_namefloat | Noneoverride_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFro   writtenr  r  r   r5  c                $    || _         d | _        d S )Nc                     g S rx   r~   )r7  kwargss     rl   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>5  s    B rn   )r  debug_device_str)rk   r  s     rl   __init__zBaseSchedulerNode.__init__2  s    $-&& 	rn   ir.Operationc                &    | _         t                       _        d _        d _        t          t
                                _        d _         fd|                                D              _	        d  j	        D              _
        i  _        d S )Nr   Fc                >    g | ]}t          j        |           S ))r  r   r  )r  r  )r   outputrk   s     rl   r   z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>A  sE     
 
 
  .   
 
 
rn   c                8    i | ]}|                                 |S r~   r  r   r   s     rl   
<dictcomp>z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>I  s"    LLLLLLrn   )r   r   r   rd  re  r   rc  ru  get_outputsrm  ro  rj  rk   r   s   ` rl   _init_from_nodez!BaseSchedulerNode._init_from_node8  s    	#"#"#$
   
 
 
 
 **,,
 
 
  MLt|LLL !#rn   r   c                Z    t          |           j         d|                                 dS )Nz(name=)r   ry   r  rj   s    rl   __repr__zBaseSchedulerNode.__repr__R  s*    t**%AAT]]__AAAArn   c                   |                                  }t                      }|                    | dt          |           j         dt          t          | dd                    j         d| dt          | j        j                   d| dt          | j	                   d| d	t          | j        j
        | j	        z
             d| d
| j         d| d| j         d| d           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t$          $ r t&                              dd           Y nw xY w|                                                                S )#Longer form printout for trace logsr%  (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = .min_input_distance = .max_input_distance = z.outputs = [
        r'  Ignoring error in debug_str()Texc_info)r  rR   splicer   ry   getattrr+  r   writesrt  r   rd  re  r-  r  r1  r(  debug_str_extra	Exceptionlogwarningr.  rstrip)rk   r   r   r   s       rl   r1  zBaseSchedulerNode.debug_strU  s   }}

 	d	 #GD&$$?$?@@I  )011    %T%<==  	  #4#3#9D<S#STT	 
  
 "4    "4    
	
 
	
 
	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   ;?EEE('F &F98F9c                    dS )N r~   rj   s    rl   r  z!BaseSchedulerNode.debug_str_extrap      rrn   r   c                ,    |                      |           S rx   )rz  rj   s    rl   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_devices  s    $$T***rn   c                   t          | j        dd           }d}t          |t          j        j        j                  r/d|                    |                                gdd          z   }net          |t          j        j        j	                  rAd|                    |
                                |                                gdd          z   }|  | S )Nr   r  , F)shorten	multiline)r  r   r   torch	_inductorr+   r   
str_helperget_size	Reductionget_reduction_sizer   )rk   
maybe_datadata_strs      rl   debug_str_shortz!BaseSchedulerNode.debug_str_shortv  s    TY55
j%/"4">?? 		j33$$&&'% 4   HH 
EO$6$@AA 	j33..00*2O2O2Q2QR 4   H
 """"rn   c                ^    t                               d| | j        | j        j                   d S )Nz(%s: unmet_dependencies = %s, writes = %s)r  infort  r   r  rj   s    rl   log_detailszBaseSchedulerNode.log_details  s7    6##		
 	
 	
 	
 	
rn   self_depr4   	other_depc                    dS NFr~   )rk   r  r  s      rl   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair  	     urn   renamesc                    fdd | j                                         D             D             | _        |                     | j                             | j                             d S )Nc                *    i | ]}|v ||         S r~   r~   )r   r   r  s     rl   r  z:BaseSchedulerNode.update_mutated_names.<locals>.<dictcomp>  s0     !
 !
 !
w '$-rn   c              3  $   K   | ]}|j         V  d S rx   r   r   r   s     rl   r   z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>  s$      QQcQQQQQQrn   )r   reads_and_writesrj  set_read_writesrenamerk   r  s    `rl   update_mutated_namesz&BaseSchedulerNode.update_mutated_names  s{    !
 !
 !
 !
QQT-=-N-N-P-PQQQ!
 !
 !

 	T-44T5JKKLLLLLrn   r   r3   c                `    |                      | j                            |                     d S rx   )r  r   	with_readrk   r   s     rl   add_fake_depzBaseSchedulerNode.add_fake_dep  s-    T-77<<=====rn   c                X    t          d |                                 D                       S )Nc              3  f   K   | ],}|                                 p|                                V  -d S rx   )r*  r,  r  s     rl   r   z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  sN       
 
9<COO4!2!2!4!4
 
 
 
 
 
rn   )r   r  rj   s    rl   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation  s<     
 
@D@P@P@R@R
 
 
 
 
 	
rn   rwc                ^    || _         | j         j        | _        |                                  d S rx   )r   r   rt  
prune_deps)rk   r  s     rl   r  z!BaseSchedulerNode.set_read_writes  s.    "&"2"8rn   future_used_buffersmutation_real_namec                z    |                                  }t          fd|D                       }||z
  | _        d S )Nc              3  D   K   | ]}                     ||          V  d S rx   )get)r   kr  s     rl   r   z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s4      !U!U1"4"8"8A">">!U!U!U!U!U!Urn   )used_or_aliased_buffer_namesr   rc  )rk   r  r  used_bufferss     ` rl   set_last_usagez BaseSchedulerNode.set_last_usage  sI     88::!!U!U!U!U!U!U!UUU&)<<rn   c                B    | j         D ]}|                                 d S rx   )rm  rG  )rk   r   s     rl   mark_runzBaseSchedulerNode.mark_run  s,    < 	 	CLLNNNN	 	rn   c                    t          d t          j        | j        j        | j        j                  D                       S )Nc              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s8       
 
 H
 
 
 
 
 
rn   )r   	itertoolschainr   r   r  rj   s    rl   r   z#BaseSchedulerNode.used_buffer_names  sH     
 
 t'7'=t?O?VWW
 
 
 
 
 	
rn   c                    t                      d t          j        | j        j        | j        j                  D             }t          |          dk    r|                                }                    |           t          j
        j                            |          rH|                    fdt          j
        j        |                                         D                        t          |          dk    S )z
        Returns buffer names used by this node, including aliases.

        Note: is_fake WeakDeps are excluded since they are purely for ordering
        and should not affect buffer lifetime.
        c                T    g | ]%}t          |t                    r|j        |j        &S r~   )r   r6   is_faker   r  s     rl   r   zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>  sF     
 
 
sG,,
 25
H
 
 
rn   r   c              3  $   K   | ]
}|v|V  d S rx   r~   )r   alias
used_namess     rl   r   zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s>         J..	  /... rn   )r   r  r  r   r   r  r   popaddrZ   r   name_to_bufferr  extendr9  )rk   depsr   r  s      @rl   r  z.BaseSchedulerNode.used_or_aliased_buffer_names  s     '1ll

 
 t'7'=t?O?VWW
 
 

 $ii!mm((**CNN3w%))#..     !"!7"2244	     	 $ii!mm rn   c                R     t           fd j        D                        _        d S )Nc              3  B   K   | ]}|j         j        j        v|V  d S rx   )r   r  available_buffer_namesr   r   rk   s     rl   r   z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>  sA       -
 -
xt~DDD DDDD-
 -
rn   r   rt  rj   s   `rl   r  zBaseSchedulerNode.prune_deps  sD    ", -
 -
 -
 -
.-
 -
 -
 #
 #
rn   c                     d fdt          fd j        j        D                       }                      j                            |                     d S )Nr   r3   r   ro   c                    t          | t                    sdS | j        j        j        vrdS j        j        | j                                                 }|t          j        j        v S r  )	r   r6   r   r  rC  r   rZ   r   removed_operations)r   op_namerk   s     rl   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  s^    c7++ uxt~999un0:KKMMGag888rn   c              3  2   K   | ]} |          |V  d S rx   r~   r   r   r  s     rl   r   z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  sF       
 
\\#5F5F

 
 
 
 
 
rn   r   r3   r   ro   )r   r   r   r  remove_reads)rk   	to_remover  s   ` @rl   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps  s    	9 	9 	9 	9 	9 	9  
 
 
 
+1
 
 
 
 
	 	T-::9EEFFFFFrn   name_to_fused_nodedict[str, BaseSchedulerNode]c                <    t          | || j        j                   d S rx   )_prune_redundant_depsr  rC  )rk   r  s     rl   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps  s"     	d$68RSSSSSrn   c                F    | j         J | j                                         S rx   )r   get_operation_namerj   s    rl   r  zBaseSchedulerNode.get_name  rT  rn   c                *    |                                  S rx   r  rj   s    rl   get_first_namez BaseSchedulerNode.get_first_name  s    }}rn   c                X    t          d |                                 D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   r  r   r   s     rl   r   z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s*      GGd$--//GGGGGGrn   )r   r   rj   s    rl   r   z%BaseSchedulerNode.get_operation_names  s)    GGdnn6F6FGGGGGGrn   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   r  r   r   s     rl   r   z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s*      AAS#,,..AAAAAArn   )r   rm  rj   s    rl   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s!    AADLAAAAAArn   c                X    t          d |                                 D                       S )Nc              3  b   K   | ]*}t          |t                    ot          |d           V  +dS )T)disallow_fp32_opsNr   r   r-   r   ns     rl   r   zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>   sX       
 
  q-(( G+AFFF
 
 
 
 
 
rn   r   r   rj   s    rl   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s<     
 
 ^^%%
 
 
 
 
 	
rn   c                X    t          d |                                 D                       S )Nc              3  ^   K   | ](}t          |t                    ot          |          V  )d S rx   r  r  s     rl   r   z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  sN       
 
 q-((K-H-K-K
 
 
 
 
 
rn   r  rj   s    rl   r-   z-BaseSchedulerNode.can_codegen_without_upcasts  s:     
 
^^%%
 
 
 
 
 	
rn   Sequence[BaseSchedulerNode]c                    | gS rx   r~   rj   s    rl   r   zBaseSchedulerNode.get_nodes  s	    vrn   Sequence[SchedulerBuffer]c                    | j         S rx   )rm  rj   s    rl   r  zBaseSchedulerNode.get_outputs  s
    |rn   buf_namer  c                    | j         |         S rx   )ro  )rk   r  s     rl   
get_outputzBaseSchedulerNode.get_output  s    #H--rn   rU  c                F    | j         J | j                                         S rx   )r   r   rj   s    rl   r   zBaseSchedulerNode.get_device  s$    y$$$y##%%%rn   c                H    |                                  }|d uo
|j        dk    S Ncpu)r   r   rk   devices     rl   is_cpuzBaseSchedulerNode.is_cpu  s(    ""T!:fkU&::rn   c                Z    |                                  }|d uot          |j                  S rx   )r   rU   r   r  s     rl   rU   zBaseSchedulerNode.is_gpu  s+    ""T!9fV[&9&99rn   c                    dS r  r~   rj   s    rl   r   zBaseSchedulerNode.is_reduction"      urn   c                    dS r  r~   rj   s    rl   is_native_matmulz"BaseSchedulerNode.is_native_matmul%  r$  rn   c                    dS r  r~   rj   s    rl   is_split_scanzBaseSchedulerNode.is_split_scan(  r$  rn   c                    dS r  r~   rj   s    rl   is_templatezBaseSchedulerNode.is_template+  r$  rn   c                    dS r  r~   rj   s    rl   	is_externzBaseSchedulerNode.is_extern.  r$  rn   c                    dS r  r~   rj   s    rl   
is_foreachzBaseSchedulerNode.is_foreach1  r$  rn   read_depdependencies.Depc                    dS r  r~   rk   r/  s     rl   can_inplacezBaseSchedulerNode.can_inplace4  r$  rn   c                    dS r  r~   rj   s    rl   has_side_effectsz"BaseSchedulerNode.has_side_effects7  r$  rn   c           	     	   	 ddl m} t           t                    rt          j        rt          j                             	                                t          j                  rht          t          j        t          j        j        j        j                  rt%          t          j        dd          t'          t          j        d          sdS  j        t          j        j        z   j        j        z  	d fd
}                                 D ]}|j        }|J |                                ry|                                se|                                sQ|                                t          j        j        v s,t          |                                t@          j!                  r j"        j#        D ]
}|j$         j        j%        v r j        j%        |j$                 }n$ j        j&        '                    |j$                  }|rt          j        j(        )                    |           rt          |j*        tV                    st|j,        J 	fd|j,        D             } j        -                    |j$                   }|s5t]          |          dk    r!|d         j/        r|d         j         u r|j        t          |j                                        t@          j0        t@          j1        t@          j2        t@          j!        f          s|j*        r[t          |j*        j        t@          j3        t@          j4        f          r+t]          |j                                                  dk    sE ||j        |j                  r. ||          r"t          j        j5        6                    |                                |                                           t          t          j        t          j        j        j        j                  rlt          j        j7        8                    |                                           t          j        j7        8                    |                                           |                                t          j        j9        |                                <    ndS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr7  buf_to_be_inplacedr  r   ro   c                   | j                                       }|                                 t                      }| j        D ]}|j        }t          |t                    s|                                | j         j	        vs| j                             |          |ur\|fd|j
                                        D             z  }t          |          dk    r dS dS )Nc              3  2   K   | ]}|j         k    |V  d S rx   r  )r   or  s     rl   r   z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>m  s<        v)) )))) rn   r   FT)r  get_fused_noder  r   r  r   r   r_   r  r  r   r  r   )r9  
fused_noder  r0  	user_noder  rk   s        @rl   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodeU  s   
 ,5DDTJJJ)2244H %/LLD*0 ! ! I	!)->??  ,,..-7JK K)3BB9MM%& &     &2CCEE   
 t99q== 55 ! 4rn   c                J    g | ]}|j                                         v| S r~   r3  )r   xinconsequential_nodess     rl   r   z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>  s;     & & &6??,,4III IIIrn   r   )r9  r  r   ro   ):codegen.wrapperr7  r   r   r(   inplace_buffersrZ   r   has_featurer   r.   INPLACE_BUFFERSr@  r  r  codegensimd
SIMDKernelr  r?  r   r  r  completed_operationsr  r   r8  r9  r:  r  removed_buffersr;  r+   r<  r   r   r   rB  rC  r  r=  	can_reuser  NopKernelSchedulerNoder  has_cross_stream_hazardr   r3  r?   r>   MutationLayoutSHOULDREMOVEFallbackKernelr=   r7  make_inplacer8  r  rA  )
rk   r7  r@  r   buf_noderead	input_bufremaining_usesrO  rC  s
   `        @rl   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update:  s   
 	;::::: t]++	&	 ##DOO$5$5~7UVV	
 qx)@)E)PQQ	 18[$77C &)) D
 F Ng()n12 	 	  	  	  	  	  	D ##%% L	 L	CxH''',,..	88::	 ..00	 <<>>QW%<<< h6688":MNN =
 (. > >9 EEE $ Edi PII $ : > >ty I II 7,66y$GG7 'y'<>TUU7
 %?666& & & &!*& & &N
 /3n.T.T	4/ /+
 4(//144*1-9 5*1-2d::%N6 *%N::<< " " 4 " = " 3	! ! 7 &1 7 !+ ) 5 :!#!2BN C! ! 7  !$IN$O$O$Q$Q R RUV V V11).#(KK !W 76yAA !W 2293E3E3G3GXXX%Heo&=&B&M  C H.2293E3E3G3GHHHH.223<<>>BBB &..00 7G YL	 L	rn   TbufferrR   	only_oncec                n   t           j        sd S |r	| j        rd S | j        J | j                                        }g }|D ]B}|j        dk    r|                    d           |                    d           d|j         d|j         }d|j        v r|d|j        d          z   }|                    |           d|j        v r|j        d          }|	                    d	d
          d         }|                    d|
                    dd          
                    dd          
                    dd          
                    dd          z              |                    d           |                    d           Dt          |          dk    rd S |                    |           d| _        d S )Nr  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}r  \z\\z#pragma CMT END ORIGINr   T)r(   comment_originru  r   get_originsr  r   targetmetarsplitreplacer   
writelines)	rk   rX  rY  origins	out_linesr<  op_info_strr]  stack_trace_last_lines	            rl   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  s    $ 	F 	 	Fy$$$)''))	 	% 	%AtxR   2333:::::K16!!),Iqvh7G,I,II[)))&&!"!68(3(:(:3(:(K(KB(O%  "+33C>>WS$''WT4((Wf 	     !9:::  $$$y>>QF 	)$$$rn   c                0    |                      dd          S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrj   s    rl   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s$    55t 6 
 
 	
rn   c                0    |                      dd          S )NTFrp  rs  rj   s    rl   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizes  s$    55u 6 
 
 	
rn   c                0    |                      dd          S )NFTrp  rs  rj   s    rl   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizes   s$    55 6 
 
 	
rn   rq  rr  c                r    t          |                     ||                                          d          S )Nrp  r   )start)r   get_read_write_buffer_accessesr   )rk   rq  rr  s      rl   rt  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl  sD     //+N 0  fhh	
 
 
 	
rn   dict[str, int]c                    t           t                    ri S t           t                    rt           j        t                    ri S t           t                    rCt           j        t
          j                  r$ j        j        t          j	        j
        j        u ri S ddt           t                    rY t                                           d                   t                                           d                   z            nt          d          t!          j        t$                    }|r/ j        j        D ]"}||j                                     |           #|r/ j        j        D ]"}||j                                     |           #|r#t1          d	  j        j        D                       nt1                      }|r#t1          d
  j        j        D                       nt1                      }d fdt           t2                    r&t1           fd|D                       }||z
  }||z
  }i }||z  D ]}	t5          fd||	         D                       |	t6          j        j        v rt6          j        j        |	         }
n,|	t6          j        j        v rt6          j        j        |	         }
nzd fd |
          }|	|vr|||	<   ||	xx         |z  cc<   |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.Exprr   r   c                N    t           j        j                            | d          S )Nr   r   )rZ   r   r   r   )r  s    rl   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint:  s     7#55a!5DDDrn   r   r       eAc              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>O  s$      BBCsxBBBBBBrn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>T  $      CCCsxCCCCCCrn   r   r   r   r  ro   c                    j         j        |          j        }t          d |D                       }t	          |t          |          z
            dk    S )Nc              3  $   K   | ]}|j         V  d S rx   r   )r   r0  s     rl   r   z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>[  s$      !>!>$)!>!>!>!>!>!>rn   r   )r  rC  r  r   r   )r   r   r  buf_usesrk   s       rl   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedY  sQ    N.s39E!!>!>!>!>!>>>Hx*V"4"445599rn   c              3  >   K   | ]} |j                   |V  d S rx   r   )r   r   r  rk   s     rl   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>_  sJ       ) )__S$+-N-N)) ) ) ) ) )rn   c              3     K   | ]}V  d S rx   r~   )r   r   
node_numels     rl   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>h  s#      $R$RCZ$R$R$R$R$R$Rrn   4ir.Buffer | ir.TensorBox | ir.TorchBindObject | Nonec                `   | sdS t          | t          j                  r|                                 S t          | j        t
                    rj        j        |                                          j	        }d}|D ]}t          |j
        t                    rt          |j
        t                    sJ t          |j
        j
        t                    r0|j
                                        D ]}| |j
                  z  } dS |S t          | j        t          j                  r-t!          fd|                                 D                       S  	t%          |                                                     }t)          |                                           t-          |          z  S )Nr   c              3  h   K   | ],} t           j                            |                    V  -d S rx   )rZ   r   
get_buffer)r   mut_nameget_buf_bytess     rl   r   zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>  sQ        $ &ag&8&8&B&BCC     rn   )r   r+   TorchBindObjectr  r)  r>   r  rC  r  r  r   rJ  r_   r=   r  r?   r   r:  rY   r  rN   	get_dtypemin)
r   r  totr0  	sched_buf	buf_elemsbuf_accessed_elemsr  rk   r  s
         rl   r  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesq  s     1c2#566 ,,...
,=>>  !N6s||~~FLEC % % %%di<< %$)$)5FGGGGG%dinkBB %-1Y-B-B-D-D E E	 #}}Y^'D'D DE $%11J
BM:: 	    (+(>(>(@(@     
 !.mCLLNN.K.K L LI)#--//::S*I> >  rn   )r  r  r   r   )r   r   r   r  r   ro   )r   r  r   r   )r   rN  ExternKernelSchedulerNoder   r=   r+   rQ  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater   rY   
get_rangesr   collectionsr   r  r   r   r   r   r  r   r   r   rZ   r   r  graph_inputs)rk   rq  rr  buf_accessesr   r   r  rL  buf_byte_accessesr  r   	buf_bytesr  r  r  r  r  s   `           @@@@@rl   r|  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233 	Id566 	:I{<
 <
 	 It677	49b&788	 	%|%BC C I	E 	E 	E 	E dM** 	"&doo//233 1 1! 4556 JJ
 SJ".t44 	3'- 3 3SX&--c2222 	3'. 3 3SX&--c2222 JBB4+;+ABBBBBB 	 JCC4+;+BCCCCCC 		: 	: 	: 	: 	: 	:
 d.// 	,( ) ) ) ) )%) ) )  O o-FO+E,. 3	9 3	9H!$$R$R$R$R<;Q$R$R$R!R!R17111g,X6QW111g*84# # # # # # # # #J &c**I000.7!(++!(+++y8++++  rn   
int | Nonec                F   | j         d S | j                                         }|d S t          |          }|d S t          |t          j                  r|j         j        }t          j        j	        
                    |d          }t          d         dxx         |z  cc<   |S )Nr   r   inductor
flop_count)r   get_origin_noder9   r   r  SymIntexprrZ   r   r   r   r   )rk   fx_nodeflopsresolved_flopss       rl   estimate_flopsz BaseSchedulerNode.estimate_flops  s    94)++--?4w''=4eU\** 	$JOE);;EA;NN\***n<***rn   floatc                F    | j         | j         S |                                 S rx   )rq  _get_estimated_runtimerj   s    rl   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime  s&    *622**,,,rn   c                   |                                  d                                         d         }|j                                        }t	          t          |                    sdS t          | j                  r,t          | j        t          j	                  sJ 	 t          j        rt          |           }t                      }|                    |          }|t          |t                    sJ |S t!          |           }|t#          | j                  }|                    ||           |S t#          | j                  S # t&          $ r%}t(                              |           Y d}~dS d}~wt,          $ r%}t(                              |           Y d}~dS d}~ww xY wt/          | j                  rdS t1          |           }||S |j                                        }		 t5                      }
t7          |	          dz  }|
dk    rt9          d|
           |dk    rt9          d|           n# t:          $ r Y dS w xY w|                                 }|dk    s||                                 |
z  }|dz  }|S d}|                                 }|dn|}||z  |z  d	z  }||
z  }tA          ||          }|dz  }|S )
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!r   r  r   r;  rU   r;   rS   r   r+   IRNoder)   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupr  r2   r1   	set_value
ValueErrorr  r  	TypeErrorrX    maybe_estimate_runtime_benchmarkmaybe_get_dtyperO   rM   AssertionErrorr  r  ru  max)rk   r   r)  	cache_keycache	cache_valmsr   retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     rl   r  z(BaseSchedulerNode._get_estimated_runtime  s#   
 nnq!--//2))++of--.. 	1 ## "	di33333L  I$ O OI688E %Y 7 7I ,))U;;;;;((HNNBz=diHHOOIRO888I7	BBB    qqqqq   qqqqq
 TY 	
 1.t44?J((**	#4#6#6 )%0069I $q(($ZDXZZ   A~~$%U)%U%UVVV  	 	 	11	 ''))	>>Y.22447KKBcBI 99;;*2*Y6#=%(<< }--#X	sD   !AE :=E 8E 
F(E66F(F##F(.AH? ?
IIir.TemplateBuffer | Nonec                    d S rx   r~   rj   s    rl   get_template_nodez#BaseSchedulerNode.get_template_node      trn   ir.TemplateBufferc                6    |                                  }|J |S rx   r  )rk   templates     rl   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throw  s$    ))++###rn   nodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                    t          d t          |           D                       }| d|         }| |         }| |dz   d         }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]\  }}|                                 |V  d S rx   r*  r   ir  s      rl   r   zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s3      PPDAqPaPPPPPPrn   Nr   )next	enumerate)r  template_indexprologuetemplate_nodeepilogues        rl   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogue  sb     PPIe,<,<PPPPP.)n-!+--.00rn   )r  r  r   r5  )r   r|  r   r5  rW  )r   r   rY  r  r4   r  r4   r   ro   r  ri  r   r5  )r   r3   r   r5  rZ  )r  rr  r   r5  r  rb  r  ri  r   r5  r   rb  r  r  r   r5  r   r  )r   r  )r  r   r   r  r\  r/  r0  r   ro   T)rX  rR   rY  ro   r   r5  rX  )rq  ro   rr  ro   r   r   )rq  ro   rr  ro   r   r}  r   r  r   r  r   r  )r   r  )r  r  r   r  )>ry   rz   r{   r|   r   rq  ru  r{  r  r  r1  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  rI   r   r	  r  r-   r   r  r  r   r!  rU   r   r&  r(  r*  r,  r.  r3  r5  rW  rn  ru  rw  ry  rt  r|  r  r  r  r  r  r  r  r~   rn   rl   r_   r_     s        BBBB NNNNNN''''$$$$ $D$$$$""""/////33333((((''''G
 
 
 
# # # #4B B B B* * * *6   + + + +# # # #
 
 
 
   
M M M M> > > >
 
 
 

   
= = = =   
 
 
 
   6
 
 
 
G G G G T T T T
. . . .    H H H ]H B B B ]B 
 
 
 ]
 
 
 
 ]
      . . . .& & & &; ; ; ;: : : :                        I I I IX 9=- - - - -^ 
 
 
 ]

 
 
 
 ]

 
 
 
 ]


 
 
 
L! L! L! L!\    ]$- - - - U U U ]Un      
 1 1 1 \1 1 1rn   r   $torch._inductor.codecache.LocalCachec                 H    t           j        j                                        S rx   )r  r  	codecache
LocalCacher~   rn   rl   r  r  &  s    ?$//111rn   snoder   c                \   t          | j        dd          }| j        j        }| j                            g || j        j        | j        j                  }| j        j        }t          j        ||f          \  }}ddt          |ft          fd|D                       z             }|S )Npython_kernel_namer  r   ro   c                    t          | t          j                  o&t          | t          j        t          j        f           S rx   )r   r+   r  GeneratorStateOpaqueObjectStaterB  s    rl   _is_tensor_irz@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir5  s<    !RY'' 

!2#781
 1
 -
 	
rn   c              3  t   K   | ]2} |          r!t          |                                          nd V  3d S rx   )r   r  )r   ar  s     rl   r   z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr><  sG      UUa}}Q'7'7Aajjll###TUUUUUUrn   rZ  )
r  r   inputsfill_non_provided_argsconstant_argsrx  pytreetree_flattenr   r   )r  r  r7  rx  	flat_argsflat_args_pytree_specr  r  s          @rl   r  r  +  s     -A2FF:D:,,*$*)*
 D ZF'-':D&>'J'J$I$
 
 
 

 	
UUUU9UUU
U
U	V I rn   Callable[[Any], Any] | Nonec                >   t          | t                    sd S t          j        j        j        t          j        j        j        t          j        j        j        d}t          | j	        dd          }||vrd S t          | j	        t          j                  sd S ||         S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  r  )r   r  r  opsatenmmbmmaddmmr  r   r+   ExternKernel)r  mms_fnsr  s      rl   _get_mm_like_fnr  A  s    e677 t"Y^.#in0 %	 4 G
 !-A2FF((tej"/22 t%&&rn   rp  c                    d }d }t           j        rt                     }|d S |} fd}nd S t                     }t	                      }|                    |          }|t          |t                    sJ |S ddlm	  |            \  }}ddl
m}	 |	                    |||ddd          }
|                    ||
	           |
S )
Nc                                 S rx   r~   )r  snode_args_kwargss   rl   ry  z2maybe_estimate_runtime_benchmark.<locals>.<lambda>Z  s    !2!25!9!9 rn   r   )r  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationr  )r(   !runtime_estimations_mms_benchmarkr  r  r  r  r   r  utilsr  $torch._inductor.runtime.benchmarkingr  	benchmarkr  )r  bench_fnargs_kwargs_fnmm_fnr  r  r  r7  rx  r  r  r  s   `          @rl   r  r  Q  s   HN/ &&=499999t9%@@I&((EY''I)U+++++((((((!>##LD&@@@@@@			! 
 
 
B 
OOIRO(((Irn   T)slotsc                  P    e Zd ZU ded<   ded<   ded<   ded<   ddZddZddZdS )	WhyNoFuser   name1name2reasonztuple[Any, ...]r7  r   r_   r   r   r5  c                j    |                                 | _        |                                 | _        d S rx   )r  r(  r)  rk   r   r   s      rl   r{  zWhyNoFuse.__init__~  s(    ^^%%
^^%%


rn   r   c                V    || _         || _        t                              |            d S rx   )r*  r7  
fusion_logdebug)rk   r*  r7  s      rl   __call__zWhyNoFuse.__call__  s*    	rn   c                H    d| j          d| j         d| j        | j        z  z   S )Nzcannot fuse z with r%  )r(  r)  r*  r7  rj   s    rl   __str__zWhyNoFuse.__str__  s2    >dj>>
>>>K$)#
 	
rn   Nr   r_   r   r_   r   r5  )r*  r   r7  r   r   r5  rW  )ry   rz   r{   r|   r{  r0  r2  r~   rn   rl   r'  r'  w  sy         JJJJJJKKK& & & &   

 
 
 
 
 
rn   r'  objr   c                    t          | t          t          f          rt          | t                    } t          j        | d          }d|v rdt          j        |d           S |S )Nkey   )r-  r      )	r   r   setsortedr   pprintr+  textwrapr-  )r4  r/  s     rl   r+  r+    sg    #
C()) #Sc"""^C***Fv~~6HOFG44666Mrn   c                  2    e Zd ZddZddZdd	ZddZeZdS )rJ  r   r5   r   r5  c                0    t          |g          | _        d S rx   r  r  s     rl   r{  zOutputNode.__init__  s    ",cU"3"3rn   ro   c                    dS r  r~   rj   s    rl   r   zOutputNode.is_reduction  r$  rn   rQ  c                    dS )Nr~   r~   rj   s    rl   r9  z'OutputNode.get_inputs_that_alias_output  r  rn   r   c                    dS )NOUTPUTr~   rj   s    rl   r  zOutputNode.get_name  s    xrn   N)r   r5   r   r5  rZ  r[  rW  )ry   rz   r{   r{  r   r9  r  r  r~   rn   rl   rJ  rJ    se        4 4 4 4          HHHrn   rJ  r   r  r  rC  rn  r5  c                    t          j                     j        D ]^}t          |t                    sG|j                                                 }|                                         xx         dz  cc<   _d	 fdt          fd j        D                       }|r> j        |z
   _         	                     j
                            |                     dS dS )
am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r   r3   r   ro   c                   t          | t                    ru| j                                                 }|                                                  dk    o!j                            | |                   }|         k    }|p|S dS )Nr   F)r   r6   r   r   r  r  fusable_weak_dep)r   r  is_redundantis_self_deprC  name_to_dep_countr  r   s       rl   r  z+_prune_redundant_deps.<locals>.should_prune  s    c7## 	!#(+<<>>G,"7+4466 n55'0$   -W5=K.;.5rn   c              3  2   K   | ]} |          |V  d S rx   r~   r  s     rl   r   z(_prune_redundant_deps.<locals>.<genexpr>  sF        ,,s2C2C     rn   Nr  )r  r   rt  r   r6   r   r   r  r   r  r   r  )r   r  rC  r   r  deps_to_prunerI  r  s   ```   @@rl   r  r    s?    '2&9&;&;& K K#w'' 	K!#(+<<>>G09BBDDEEEJEEE              .    M  K"&"9M"IT-::=IIJJJJJK Krn   c                  H     e Zd Zd fdZdd	ZddZddZddZddZ xZ	S )r  r  r  r   r|  r   r5  c                   t                                          |           |                     |           |                     |                                           t          |t          j                  r[|                                rIt          j
        |j        d         j                  }d}|                                }|||ff| _        d S d S d S Nr   r   )superr{  r  r  get_read_writesr   r+   UserDefinedTritonKernelcan_fuse_epiloguemathr   mutable_argsshapeget_device_or_errorr   )rk   r  r   numelr   r   	__class__s         rl   r{  z"ExternKernelSchedulerNode.__init__  s    ###T"""T1133444dB677 	3D<R<R<T<T 	3Id/2899EF--//F 5&/2DJJJ	3 	3 	3 	3rn   r   c                \    |                                   dt          | j        dd            S )Nz.node.kernel = r  )r  r  r   rj   s    rl   r  z)ExternKernelSchedulerNode.debug_str_extra  s.    --//bb'$)EY[_2`2`bbbrn   ro   c                    dS NTr~   rj   s    rl   r,  z#ExternKernelSchedulerNode.is_extern  r  rn   c                p    | j         J t          | j         d          o| j                                         S )Nr5  )r   r?  r5  rj   s    rl   r5  z*ExternKernelSchedulerNode.has_side_effects  s6    y$$$ty"455V$):T:T:V:VVrn   Sequence[Sequence[sympy.Expr]]c                    t          | j        t          j                  rG| j                                        r.t          j        | j        j        d         j                  }|gg fS g g fS r   )	r   r   r+   rQ  rR  rS  r   rT  rU  )rk   rW  s     rl   r  z$ExternKernelSchedulerNode.get_ranges  sb    ty""<==	!	++--	! Idi4Q7=>>EGR= Bxrn   wrapperr   c                x    t          | j        t          j                  sJ | j                            |          S rx   )r   r   r+   r  rH  )rk   r_  s     rl   rH  z!ExternKernelSchedulerNode.codegen  s2    $)R_55555y  )))rn   r  r  r   r|  r   r5  rW  rZ  r   r]  r_  r   r   r5  )
ry   rz   r{   r{  r  r,  r5  r  rH  __classcell__rX  s   @rl   r  r    s        
3 
3 
3 
3 
3 
3c c c c   W W W W   * * * * * * * *rn   r  c                        e Zd Zd fdZ xZS )	rN  r  r  r   r|  r   r5  c                    t                                          |           |                     |           |                     |                                           d S rx   )rO  r{  r  r  rP  rk   r  r   rX  s      rl   r{  zNopKernelSchedulerNode.__init__  sU    ###T"""T113344444rn   ra  )ry   rz   r{   r{  rd  re  s   @rl   rN  rN    s=        5 5 5 5 5 5 5 5 5 5rn   rN  c                  ^    e Zd ZU dZded<   ded<   dE fdZ	 	 dFdGdZ	 	 dFdHdZdIdZdJdZ	dKdZ
dLdZdKdZdMd#ZdKd$ZdNd(ZdOd*ZdPd,ZdQd-ZdQd.ZdQd/ZdQd0ZdRd2ZdSd5ZdTd7ZdUd8Z	 dVdWd<ZedXd=            ZedXd>            ZdYdAZedZdC            ZedQ fdD            Z  xZ!S )[r   zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr@   r  r  r  r   %ir.ComputedBuffer | ir.TemplateBufferr   r5  c                    t                                          |           |                     |           |                                  d S rx   )rO  r{  r  _compute_attrsrh  s      rl   r{  zSchedulerNode.__init__	  sI    
 	###T"""rn   Nextra_indexing_constraints'tuple[dict[Any, Any], list[Any]] | Nonerecompute_sizes_body_funcCallable[_P, _T] | Nonec                   t          | j        t          j        t          j        f          sJ | j                            ||          \  | _        }|| _        | j                                        }| j	        
                    |          j        }| || j                  f| _        t          j         pt          |j                   }t          | j        t          j                  r0|                     | j                            |                     d S |                     t'          j        | j        g| j        R d|i           d S )Nrn  rp  )	normalizert  )r   r   r+   r   TemplateBuffersimplify_and_reorderrj  r  rV  r  get_backendgroup_fnr   r(   loop_ordering_after_fusionrU   r   r  extract_read_writesr*   )rk   rn  rp  bodyr   rx  should_normalizes          rl   rm  zSchedulerNode._compute_attrs  so   
 $)b&79J%KLLLLL I::'A&? ; 
 
T 
..00>--f55>hht{334
  &@@ 
KI
 I
 E
 di!233 		  	--8H-II       0J!%  8H     rn   Callable[..., Any] | Nonec                   t          d | j        j        D                       }|                     ||           |rG|                     | j                            |                              | j                             d S d S )Nc              3  R   K   | ]"}t          |t          t          f          |V  #d S rx   r   r6   r5   r  s     rl   r   z8SchedulerNode.recompute_size_and_body.<locals>.<genexpr>8  J       0
 0
ZgwEW5X5X0
0
 0
 0
 0
 0
 0
rn   rs  )r   r   r   rm  r  r  r  rj  )rk   rn  rp  	fake_depss       rl   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body3  s    
 &0 0
 0
+10
 0
 0
 &
 &
	 	'A&? 	 	
 	
 	
  	   **955<<T=RSS    	 	rn   rt  ro   need_clear_tiling_cachec                   t          d | j        j        D                       }|                     t	          j        | j        g| j        R d|i                    |          	                    | j
                             | j                            |            |r!ddlm} |j                                         d S d S )Nc              3  R   K   | ]"}t          |t          t          f          |V  #d S rx   r  r  s     rl   r   z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>I  r  rn   rt  r   SIMDScheduling)r   r   r   r  r*   rz  r  rj  r  r  rj  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rk   rt  r  r  r  s        rl   refresh_dependenciesz"SchedulerNode.refresh_dependenciesD  s    
 &0 0
 0
+10
 0
 0
 &
 &
	 	,
![  4=  Yy!!VD)**	
 	
 	
 	"..t444" 	;444444 ,88:::::	; 	;rn   	new_orderSequence[int]c                    | j                             |          | _         | j         j        | _        |                     dd           d S )NFTrt  r  )r  reorder_iter_loopssizesrj  r  )rk   r  s     rl   apply_new_loop_orderz"SchedulerNode.apply_new_loop_orderb  sK    Z22
 

 j&!!E4!PPPPPrn   c                   | j                                         }t          | j         j                  |z
  }t	          t          |                    }t	          t          |||z                       }|                     ||z              t          | j        d                   dk    sJ | j        d         | j        d         d         | j        d         d         ff| _        d S )Nr   r   r   )r  get_original_num_rdimsr   	iter_varsr   ranger  r   )rk   	num_rdims
num_pwdimspwdimsrdimss        rl   swap_pw_red_dimensionz#SchedulerNode.swap_pw_red_dimensionj  s    J5577	-..:
uZ(())eJ
Y(>??@@!!%&.1114:a=!!Q&&&&Z]TZ]1%5tz!}Q7G$HH


rn   r_   c                B    | j                                         | _         | S rx   )r  extract_pw_from_reductionrj   s    rl   r  z'SchedulerNode.extract_pw_from_reductiont  s    Z99;;
rn   c                   t                               |           sd S t          | j        t          j                  sJ | j                                        5  |                                  d d d            d S # 1 swxY w Y   d S rx   )r   r   r   r   r+   r   with_original_inner_fnrm  rj   s    rl   cancel_reduction_splitz$SchedulerNode.cancel_reduction_splitx  s     33D99 	F$)R%677777Y--// 	" 	"!!!	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"s   A99A= A=	dimensionr   	new_rangec                   t          | j        t          j        t          j        f          sJ | j                            ||          | _        | j        j        | _        | j        	                                }| j
                            |          j        }| || j                  f| _        |                     dd           d S )NTr  )r   r   r+   r   ru  r  #expand_dimension_for_pointwise_noder  rj  rV  r  rw  rx  r   r  )rk   r  r  r   rx  s        rl   r  z1SchedulerNode.expand_dimension_for_pointwise_node  s     $)b&79J%KLLLLLZCCy
 

 j&..00>--f55>hht{334
 	!!D$!OOOOOrn   c                    | j                                         | _         | j         j        | _        |                     dd           d S )NTFr  )r  merge_loopsr  rj  r  rj   s    rl   r  zSchedulerNode.merge_loops  sD    Z++--
j& 	!!D%!PPPPPrn   r  r4   r  c                   d }| j         d         }t          |          |j        cxk    r|j        k    rn n|                    |          }|rZt          xj        dz  c_        t                              d|                                 |           | 	                    |           dS t                              d|                                            dS )Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
rj  r   num_varsdecide_loop_order_to_matchr,   num_loop_reorderingloop_ordering_logr/  r  r  )rk   r  r  r  
self_sizess        rl   r  z'SchedulerNode.reorder_loops_by_dep_pair  s     	[^
z??h/EEEE93EEEEEE ;;IFFI 	''1,''##4dmmooy   %%i0004##W   5rn   r   c                0   |                                  }| d| j        d          | d| j        d          | d| j         g}| j                                        D ]}t          |t                    sl|j        }t          j	        
                    |          }t          |t          j                  s,|                    | dt          |j                              t          | j        t"                    rX|                    d| d           |                    t%          j        | j                                        d	                     | j        J |                    |                                            d
                    |          S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r9  r  )r  r   rj  r   r  r   r6   r   rZ   r   r  r+   r  r   r+  r)  r  r@   r=  r-  r1  r   r  r  join)rk   r   linesr   r  r   s         rl   r  zSchedulerNode.debug_str_extra  s   }}44TZ]4477
177++dk++

 #4466 	O 	OCc7++ O8g((22!#r'9:: OLLH!M!M
8K8K!M!MNNNdj(++ 	JLL3$333444LL)=)=)?)?HHIIIy$$$T//11222yyrn   r]  c                    | j         S rx   )rj  rj   s    rl   r  zSchedulerNode.get_ranges  
    {rn   c                   t          | j        t          j        t          j        f          sJ dt          | j                              t          | j                                                  o| j        d u p| j        j	         S Ntype(self.node)=)
r   r   r+   r   ru  r   ro   r   r  has_partial_accumulaterj   s    rl   r   zSchedulerNode.is_reduction  s    $)b&79J%KLL 	
 	
!tDI!!	
 	
L DI002233 
J$Gdj&G"G	
rn   c                    t          | j        t          j                  sJ dt	          | j                              | j                                        dk    S )Nr  dot)r   r   r+   r   r   r   rj   s    rl   r&  zSchedulerNode.is_native_matmul  sO    $)R%677NN9NDOO9N9NNN7y++--66rn   c                   t          | j        t          j        t          j        f          sJ dt          | j                              t          | j        t          j                  o#t          | j        j        t          j                  S r  )r   r   r+   r   ru  r   r   	SplitScanrj   s    rl   r(  zSchedulerNode.is_split_scan  s{    $)b&79J%KLL 	
 	
!tDI!!	
 	
L $)R%677 
JINBL=
 =
 	
rn   c                @    t          | j        t          j                  S rx   r   r   r+   ru  rj   s    rl   r*  zSchedulerNode.is_template  s    $)R%6777rn   r  c                R    t          | j        t          j                  r| j        nd S rx   r  rj   s    rl   r  zSchedulerNode.get_template_node  s"    &ty"2CDDNtyy$Nrn   
index_varsSequence[sympy.Expr]c                    |                                   |                                  |                     |           d S rx   )rW  r  rH  )rk   r  s     rl   runzSchedulerNode.run  s9    ""$$$Z     rn   dict[sympy.Expr, sympy.Expr]c                R   | j         }t          t          t          |                    t          t          t          |                    k    sJ t	          t          t          j                            |          t          j                            |                              }|S rx   )	rj  r   mapr   dictzipr  r  from_iterable)rk   r  r  r   s       rl   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  s     3sE??##s3sJ+?+?'@'@@@@@--j99--e44 
 

 rn   c                   |                      |          }	 t          j        t          t          j                    |                    5  t          j                            |           5   | j        |  ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS # t          $ r" t          
                    d| j                    w xY w)a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r  rZ   set_ops_handlerrG   get_ops_handlerr@  set_current_noder  r  r  fatalr   )rk   r  r   s      rl   rH  zSchedulerNode.codegen  sX    00<<
	!"213D3F3F
"S"STT( ())$//( ( 
J''	( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( (
  	 	 	II/;;;	sS   3B& 
 B*B5BB	BB		BB& BB&  B!B& &,CT	pointwiserr  c                    |r| j         nt          | j                   \  }}t          j        | j        |t
          j        j        gt          |          z  g          S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	rj  r   r*   rz  r  r   SZeror   )rk   r  
keep_sizesignore_sizess       rl   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writes  s\     3<#V4;;$+AVAV 
L/J
%',#lBSBS1S0T
 
 
 	
rn   c                .    |                      d          S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  rj   s    rl   r  z#SchedulerNode.pointwise_read_writes  s    
 666FFFrn   c                .    |                      d          S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  rj   s    rl   reduction_read_writesz#SchedulerNode.reduction_read_writes&  s    
 666GGGrn   r/  r0  c                   |                                  rdS t          d |                                 D                       rdS t          | j        j                  dk    rt          |t          j                  rzt          t          | j        j                            }t          |t          j                  sJ dt          |                      |j        |j        k    o|j        |j        k    S dS )NFc              3  >   K   | ]}|                                 V  d S rx   )r*  r  s     rl   r   z,SchedulerNode.can_inplace.<locals>.<genexpr>0  s,      ??Ss  ??????rn   r   ztype(write_dep)=)r*  r   r  r   r   r  r   r*   r4   r  iterr   r   r   )rk   r/  	write_deps      rl   r3  zSchedulerNode.can_inplace-  s     	5??D,<,<,>,>????? 	5t&''1,,l,2
 2
, T$"2"9::;;Ii)?@@WWBWT)__BWBWWW@>Y_4X).9XXurn   rb  c                   t                      }t          | j        t                    r| j                                        D ]}|j        dk    r|j        dk    rd|j        v r|j        d         dk    s)t          |j	                  dk    ra|j	        d         dk    rP|
                    d|j        v r|j        d         n&t          |j	                  dk    r|j	        d	         nd
           |S )Ncall_methodstoremode
atomic_addr  r8  r   r   r   r  )r   r   r  r@   r   r  re  rx  r   r7  r  )rk   buffers_store_as_atomic_addr   s      rl   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers:  s    7A||#dj(++ 	
,,..  G},,w..4;..4;v3F,3V3V	NNa//DIaLL4P4P 033!T[00 F++.1$)nn.A.Adillr  
 +*rn   c                    | j         | j                             d          rdS t                                                      S )Ndevice_assert_asyncT)r  has_oprO  r5  rk   rX  s    rl   r5  zSchedulerNode.has_side_effectsN  s>     :!dj&7&78M&N&N!4ww'')))rn   )r  r  r   rk  r   r5  NN)rn  ro  rp  rq  r   r5  )rn  ro  rp  r}  r   r5  )rt  ro   r  ro   r   r5  )r  r  r   r5  rY  r   r_   )r  r   r  r   r   r5  r  rW  rb  rZ  r  )r  r  r   r5  )r  r]  r   r  )r  r]  r   r5  r  )r  ro   r   rr  )r   rr  r  r  )"ry   rz   r{   r  r|   r{  rm  r  r  r  r  r  r  r  r  r  r  r  r   r&  r(  r*  r  r  r  rH  r  rI   r  r  r3  r  r5  rd  re  s   @rl   r   r      s         
 -,,,OOO      OS=A    F OS?C    "; ; ; ;<Q Q Q QI I I I   " " " "P P P P"
Q 
Q 
Q 
Q   .       ,   
 
 
 
7 7 7 7
 
 
 
8 8 8 8O O O O! ! ! !
      0 !%	
 	
 	
 	
 	
 G G G ]G H H H ]H    + + + ]+& * * * * * ]* * * * *rn   r   group_snode)FusedSchedulerNode | GroupedSchedulerNodec                     j         }                     t          j                            d |D                                  t           fdt          j        d |D              D                        j        j        z
   _	        d S )Nc                    g | ]	}|j         
S r~   r   r   rB  s     rl   r   z3refresh_group_node_dependencies.<locals>.<listcomp>[  s    +J+J+JaAM+J+J+Jrn   c              3  R   K   | ]!}|j                                         v|V  "d S rx   r   r	  )r   r   r  s     rl   r   z2refresh_group_node_dependencies.<locals>.<genexpr>_  sH       
 
x{;;==== ====
 
rn   c                    g | ]	}|j         
S r~   )rt  r  s     rl   r   z3refresh_group_node_dependencies.<locals>.<listcomp>a  s    )O)O)O1!*>)O)O)Orn   )
r   r  r*   
ReadWrites
merge_listr   unionr   r  rt  )r  r   s   ` rl   refresh_group_node_dependenciesr  V  s     F**+J+J6+J+J+JKK  
 	 
 
 
 
!')O)O)O)O)OP
 
 
 	
 	

 
!
(	) """rn   r  r  r   r  c                "   t          | t          t          f          sJ || _        || _        d | _        t          j        d |D              | _        t          |            t          d | j        D                       | _        t          d | j        D                       | _        t          d | j        D                       | _        t          d | j        D                       | _        d |                                 D             | _        d S )Nc                *    g | ]}|j         	|j         S rx   r   r  s     rl   r   z#init_group_node.<locals>.<listcomp>r  s!    	A	A	A!)@!+)@)@)@rn   c              3  $   K   | ]}|j         V  d S rx   rf  r  s     rl   r   z"init_group_node.<locals>.<genexpr>w  $      HHHHHHHHrn   c              3  $   K   | ]}|j         V  d S rx   )rg  r  s     rl   r   z"init_group_node.<locals>.<genexpr>x  r   rn   c              3  $   K   | ]}|j         V  d S rx   )rd  r  s     rl   r   z"init_group_node.<locals>.<genexpr>y  6       ) )!") ) ) ) ) )rn   c              3  $   K   | ]}|j         V  d S rx   )re  r  s     rl   r   z"init_group_node.<locals>.<genexpr>|  r  rn   c                8    i | ]}|                                 |S r~   r  r  s     rl   r  z#init_group_node.<locals>.<dictcomp>  s/     # # # ## # #rn   )r   r   GroupedSchedulerNoder   r  r   r   r  r   r  r  rf  r  rg  rd  re  r  ro  )r  r  r   s      rl   init_group_noder  h  sE   
 k$68L#MNNNNNK%KK&,	A	Av	A	A	AK $K000HH[5GHHHHHKHH[5GHHHHHK%( ) )&1&8) ) ) & &K" &) ) )&1&8) ) ) & &K"# #'2'>'>'@'@# # #Krn   c                      e Zd ZU dZded<   ed;d            Zd<d	Zd=dZe	d>d            Z
d?dZd@ fdZe	dAd            ZdAdZe	dBd            ZdCdZdAdZdAdZdD fd"Ze	dBd#            Ze	dBd$            ZdEd&ZdAd'Ze	dFd(            Ze	dFd)            Ze	dFd*            Ze	dFd+            Ze	dGd-            ZdHd/Ze	dFd0            ZdId2ZdJd5Z dKd8Z!dAd9Z"e	dF fd:            Z# xZ$S )Lr   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r   r   r_   r   r   c                   |j         |j         u sJ t          |t          t          f          sJ |                                r7t          |t
                    r"t          |j        t          j                  sJ nt          |t          t          f          sJ t          t          j        |                                |                                                    } | |j         |          S rx   )r  r   r   r   r*  r  r   r+   r=   r  r  r  r   )rr   r   r   r  s       rl   rs   zFusedSchedulerNode.fuse  s     %/1111%-1C!DEEEEE 	J:e5N#O#O 	Jej".999999em5G%HIIIIIY_U__%6%68I8IJJKKs5?E***rn   c                    | j         D ]C}t          |t                    sJ |                                sJ |                                 D| S rx   )r   r   r   r   r  rk   r   s     rl   r  z,FusedSchedulerNode.extract_pw_from_reduction  s[    { 	0 	0Gg}55555'')))))--////rn   r5  c                p    | j         D ]-}t          |t                    sJ |                                 .d S rx   )r   r   r   r  r  s     rl   r  z(FusedSchedulerNode.swap_pw_red_dimension  sH    { 	, 	,Gg}55555))++++	, 	,rn   r  c                    t          t          d d |                                 D                                 }t          |          dk    rd S t	          |          }|S )Nc              3     K   | ]@}|                                 s|                                *|                                V  Ad S rx   r*  r,  r  r  s     rl   r   z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  h        '')) .2^^-=-=''))     rn   r   r  filterr   r   r   rk   fpsr  s      rl   r  z!FusedSchedulerNode.estimate_flops  o       $ 0 0   	
 	
 s88q==4#hh
rn   r  r4   r  ro   c                   |                                  rdS d}| j        D ]q}t          |t                    s dS |Ht	          |          t	          |j        d                   k    rt                              d            dS |j        d         }rd}|J t          |          |j	        cxk    r|j	        k    rn n|
                    |          }|s/t                              d|                                            dS t          xj        dz  c_        t                              d|                                 |           | j        D ].}t          |t                    sJ |                    |           /t          |            dS )	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)r*  r   r   r   r   rj  r  r/  r   r  r  r  r,   r  r  r  )rk   r  r  r  r  r  s         rl   r  z,FusedSchedulerNode.reorder_loops_by_dep_pair  s     	5
[ 	) 	)Ee]33 uu%%
*;*;uU\RS_?U?U*U*U!''G   uuaJJ	%%%z??h/EEEE93EEEEEE ;;IFFI 	##a   5##q(##;T]]__i	
 	
 	
 [ 	2 	2Ee]33333&&y1111'---trn   r  r  c                    t                                          |           t          | ||           g | _        t	          |d           j        | _        d S )Nc                D    t          |                                           S rx   )r   r   r   s    rl   ry  z-FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C3C/D/D rn   r6  )rO  r{  r  r  r  r   )rk   r  r   rX  s      rl   r{  zFusedSchedulerNode.__init__  sS    ###i000%'
%D%DEEEK


rn   r   c                J    d                     d | j        D                       S )N_c                6    g | ]}|                                 S r~   r  r  s     rl   r   z/FusedSchedulerNode.get_name.<locals>.<listcomp>       ;;;!;;;rn   r  r   rj   s    rl   r  zFusedSchedulerNode.get_name  %    xx;;t{;;;<<<rn   c                @    | j         d                                         S r   r   r  rj   s    rl   r  z!FusedSchedulerNode.get_first_name      {1~&&(((rn   rb  c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r~   r	  r  s     rl   r   z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>  $    !L!L!L1!"4"4"6"6!L!L!Lrn   r   r  r   rj   s    rl   r	  z#FusedSchedulerNode.get_buffer_names  !    !L!L!L!L!LMMrn   rl  c                l    g }| j         D ])}|                    |                                           *|S rx   r   r  r  rk   r/  r   s      rl   r  zFusedSchedulerNode.get_outputs  >    (*K 	. 	.DMM$**,,----rn   c                .     fdt           j                  D             } j        d         j        }|'|                                                                t          j        d                    |                                          d          S )Nc                r    g | ]3\  }}                                  d | d|                                 4S )z.snodes[z] =
)r  r1  )r   r  r   rk   s      rl   r   z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>  sU     
 
 
4 }}BBBB0@0@BB
 
 
rn   r   r  r9  )	r  r   r   r  r  r=  r-  r  r  )rk   r  r   s   `  rl   r  z"FusedSchedulerNode.debug_str_extra  s    
 
 
 
$T[11
 
 
 {1~"LL3355666tyy//6688&AAArn   c                2    d | j         D             }|  d| S )Nc                6    g | ]}|                                 S r~   )r  r  s     rl   r   z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>  s$    EEEd**,,EEErn   z
, snodes: r  )rk   
snodes_strs     rl   r  z"FusedSchedulerNode.debug_str_short  s+    EEEEE
..*...rn   r  r  ri  c                    t                                          ||           t                      }t          | j                  D ]2}|                    ||           |                    |j                   3d S rx   )rO  r  r   r   r   updaterc  )rk   r  r  r   rX  s       rl   r  z!FusedSchedulerNode.set_last_usage	  s    
 	24FGGG 0:||T[)) 	8 	8D 35GHHH&&t7777	8 	8rn   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r~   r   r  s     rl   r   z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>  s$    !M!M!MA!"5"5"7"7!M!M!Mrn   r&  rj   s    rl   r   z$FusedSchedulerNode.used_buffer_names  s!    !M!M!M!M!MNNrn   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r~   )r  r  s     rl   r   zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>  s$    DDD1a,,..DDDrn   r&  rj   s    rl   r  z/FusedSchedulerNode.used_or_aliased_buffer_names  s%    DDDDD
 	
rn   r  c                    | j         S rx   r  rj   s    rl   r   zFusedSchedulerNode.get_nodes   r  rn   c                Z    t          |           j         d|                                  dS )Nz(nodes=r  r  rj   s    rl   r  zFusedSchedulerNode.__repr__#  s*    t**%@@dmmoo@@@@rn   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   r   r  s     rl   r   z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>(  s,      991>>##999999rn   r   r   rj   s    rl   r   zFusedSchedulerNode.is_reduction&  s!    99T[999999rn   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   )r&  r  s     rl   r   z6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>,  s.      ==A1%%''======rn   r=  rj   s    rl   r&  z#FusedSchedulerNode.is_native_matmul*  s!    ========rn   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   )r(  r  s     rl   r   z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>0  s,      ::1??$$::::::rn   r=  rj   s    rl   r(  z FusedSchedulerNode.is_split_scan.  s!    ::dk::::::rn   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   r  r  s     rl   r   z1FusedSchedulerNode.is_template.<locals>.<genexpr>4  s*      88q1==??888888rn   r=  rj   s    rl   r*  zFusedSchedulerNode.is_template2  s!    88DK888888rn   r  c                n    | j         D ],}|                                r|                                c S -d S rx   )r   r*  r  r  s     rl   r  z$FusedSchedulerNode.get_template_node6  sI    K 	0 	0D!! 0--/////0trn   torch.devicec                    | j         d         S r   )r   rj   s    rl   r   zFusedSchedulerNode.get_device=  s    z!}rn   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rx   )r  r  s     rl   r   z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>B  s.      EEA1--//EEEEEErn   r=  rj   s    rl   r  z+FusedSchedulerNode.has_aliasing_or_mutation@  s!    EEEEEEEErn   r  c                    t           rx   NotImplementedErrorr  s     rl   r  z'FusedSchedulerNode.update_mutated_namesF      !!rn   r   r3   c                    t           rx   rJ  )rk   r   s     rl   r  zFusedSchedulerNode.add_fake_depI  rL  rn   r/  r0  c                    t           rx   rJ  r2  s     rl   r3  zFusedSchedulerNode.can_inplaceL  rL  rn   c                   |                                  }d                    d | j        D                       }t                      }|                    | dt          |           j         d| d| dt          | j        j	                   d| dt          | j
                   d| d	t          | j        j        | j
        z
             d| d
| j         d| d| j         d| d           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t&          $ r t(                              dd           Y nw xY w|                                                                S )r  r&  c              3  >   K   | ]}t          |          j        V  d S rx   )r   ry   r  s     rl   r   z/FusedSchedulerNode.debug_str.<locals>.<genexpr>R  s+      FFQQ 0FFFFFFrn   r%  r  r  r  r  r  r  r  r  z.outputs = [
            Nr'  r  Tr  )r  r  r   rR   r  r   ry   r+  r   r  rt  r   rd  re  r-  r  r1  r(  r  r  r  r  r.  r  )rk   r   node_typestrr   r   s        rl   r1  zFusedSchedulerNode.debug_strO  s   }}xxFF$+FFFFF

 	d	 +  )011    %T%<==  	  #4#3#9D<S#STT	 
  
 "4    "4    
	
 
	
 
	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   >?E

EE+'F &F<;F<c                    | j         t          d | j         D                       S t                                                      S )Nc              3  >   K   | ]}|                                 V  d S rx   )r5  r  s     rl   r   z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>n  s.      GG4t,,..GGGGGGrn   )r   r   rO  r5  r  s    rl   r5  z#FusedSchedulerNode.has_side_effectsk  s@    ;"GG4;GGGGGGww'')))rn   r   r_   r   r_   r   r   r  rY  r  r  )r  r  r   r  r   r5  rW  r  r   rl  r  r  rZ  r  )r   rE  r  )r   r3   r   r5  r  )%ry   rz   r{   r  r|   r}   rs   r  r  rI   r  r  r{  r  r  r	  r  r  r  r  r   r  r   r  r   r&  r(  r*  r  r   r  r  r  r3  r1  r5  rd  re  s   @rl   r   r     sP          $###
+ 
+ 
+ [
+   , , , ,
    ]") ) ) )VL L L L L L = = = ]=) ) ) ) N N N ]N   	B 	B 	B 	B/ / / /8 8 8 8 8 8 O O O ]O 
 
 
 ]

   A A A A : : : ]: > > > ]> ; ; ; ]; 9 9 9 ]9    ]    F F F ]F
" " " "" " " "" " " "* * * *8 * * * * * ]* * * * *rn   r   c                  8     e Zd Zd fdZdd	ZddZddZ xZS )FusedMixOrderReductionsr   r_   r   r   r5  c                   t                               |          s t                               |          sJ ||}}|| _        || _        t	                                          |j        t          |                                          t          |                                          z              t           	                    | j                  | _
        d S rx   )r   r   r   r   rO  r{  r  r  r   r   rW  )rk   r   r   rX  s      rl   r{  z FusedMixOrderReductions.__init__s  s     33E:: 	($77>>>>> %5E

OT%//"3"344tEOO<M<M7N7NN	
 	
 	
 '00<<


rn   other_nodestuple[BaseSchedulerNode, ...]c                *   t          |t                    rJ t          |t                    rJ | j                            ||d          sdS t                              |          rt                              |          sdS d
d}d
d}|r4 |||f           ||          z  s ||           |||f          z  rdS |                                 p=t          j        t          | j        
                    ||d	                    | j        k    S )a  
        node1 is from the current mix order reduction; node2 is another node we want to fuse in.

        other_nodes are passed in to check if fusion will introduce producer/consumer relationship
        between the inner and outer reduction. If yes, we don't fuse.
        Fallow_mix_order_reductionr  rZ  r   rb  c                F    t                      } |j        d | D              S )Nc              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zTFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>  s$      ::qq{::::::rn   r   r  r  r   s     rl   _get_ancestorszAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors  s)    ,,C39::E:::;;rn   c                F    t                      } |j        d | D              S )Nc              3  >   K   | ]}|                                 V  d S rx   )r   r  s     rl   r   zZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>  s.      FF1q4466FFFFFFrn   r`  ra  s     rl   _get_operation_nameszGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names  s+     ,,C39FFFFFGGrn   )count_bytes)r  rZ  r   rb  )r   rW  r  r   r   r   r   typingcastr   score_fusion_memoryrW  )rk   r   r   rY  rb  re  s         rl   sub_node_can_fusez)FusedMixOrderReductions.sub_node_can_fuse  ss    e%<=====e%<=====
 ~&&ueu&UU 	5 //
 
 	#66u==	 5	< 	< 	< 	<	H 	H 	H 	H  	u~..1E1Ek1R1RR {++.B.BE5>.R.RR u ""$$$ {T^77uRW7XX  z	
rn   otherc                h   t          |t                    sD|                     | j        || j        f          p!|                     | j        || j        f          S |                     | j        |j        | j        |j        f          o,|                     | j        |j        t                                S rx   )r   rW  rj  r   r   r   rk   rk  s     rl   can_fuse_withz%FusedMixOrderReductions.can_fuse_with  s    %!899 		K))
EDJ=  J''
EDJ=IIJ ))
EK$*ek)B  K((U[%''JJKrn   c                6   | j                                         }| j                            |          }t	          |t
                    rP|                    | j         |j                   }|                    | j        |j                  }t          ||          S |                     | j         || j        f          r0|                    | j         |          }t          || j                  S |                    | j        |          }t          | j         |          S rx   )	r   r   r  rw  r   rW  rs   r   rj  )rk   rk  r   backendfused_node1fused_node2r>  s          rl   	fuse_withz!FusedMixOrderReductions.fuse_with  s    &&((.,,V44e455 
	G!,,tz5;??K!,,tz5;??K*;DDD%%dj%$*GG G$\\$*e<<
.z4:FFF$\\$*e<<
.tz:FFFrn   r3  )r   r_   r   r_   rY  rZ  )rk  r_   )ry   rz   r{   r{  rj  rn  rs  rd  re  s   @rl   rW  rW  r  s        
= 
= 
= 
= 
= 
=2
 2
 2
 2
h
K 
K 
K 
KG G G G G G G Grn   rW  c                  P     e Zd Zd fd	Zedd            ZddZddZddZ xZ	S )$FusedExternTritonKernelSchedulerNoder  r  kernel_noder  fused_epiloguer   r   r5  c                2   t          |j        t          j                  sJ t	          j        t          t                   ||g          }t                      	                    ||           || _
        || _        | j
        j        | _        |j        | _        d S rx   )r   r   r+   rQ  rg  rh  r  r_   rO  r{  rv  rw  rf  rm  )rk   r  rv  rw  r   rX  s        rl   r{  z-FusedExternTritonKernelSchedulerNode.__init__  s     +*B,FGGGGGT"34{N6STTF+++&,)3%-rn   r   r   r   c                   |j         }t          |j                  dk    sJ |j        t	          t          |j                            j                 }|j                            t          |                     t          |||          S Nr   )r  r   rt  rC  r  r  r   r  removeNodeUserru  )rr   r   r   r  original_mutated_buffers        rl   epilogue_fusez2FusedExternTritonKernelSchedulerNode.epilogue_fuse  s     O	 5+,,1111"+"7e.//005#
 	 %,,Xe__===3IueLLLrn   r_  r   c                V   t          | j        j        t          j                  sJ t          | j        j        t          j                  sJ | j        j                                        sJ t          j	        | j        j        j
        d         j                  }ddlm} |                    | j        g|          \  }}ddlm}  || j        g|          }ddlm}  ||||           }	|	                                }
| j        j                            || j        j        |
f          S )Nr   r  )SIMDKernelFeatures)FusedUserDefinedTritonKernel)r   rw  r   r+   r   rv  rQ  rR  rS  r   rT  rU  torch._inductor.codegen.simdr  get_tiling_and_scores,torch._inductor.codegen.simd_kernel_featuresr  torch._inductor.codegen.tritonr  rH  codegen_with_epilogue_fusion)rk   r_  rW  r  tilingr  r  kernel_featuresr  fused_user_kernelnew_kernel_srcs              rl   rH  z,FusedExternTritonKernelSchedulerNode.codegen  sA   $-2B4EFFFFF$*/1KLLLLL$6688888	$*/<Q?EFF??????"88$:M9NPUVV	SSSSSS,,d.A-BEJJOOOOOO88RVWW*2244$AAd).?
 
 	
rn   ro   c                    dS r[  r~   rj   s    rl   r,  z.FusedExternTritonKernelSchedulerNode.is_extern	  r  rn   r]  c                4    | j                                         S rx   )rv  r  rj   s    rl   r  z/FusedExternTritonKernelSchedulerNode.get_ranges		  s    **,,,rn   )r  r  rv  r  rw  r   r   r5  )r   r  r   r   r   r   rc  rZ  rb  )
ry   rz   r{   r{  r}   r~  rH  r,  r  rd  re  s   @rl   ru  ru    s        . . . . . . M M M [M 
 
 
 
,   - - - - - - - -rn   ru  c                      e Zd ZU dZd,dZd-dZed.d
            Zed/d            Z	 	 	 d0d1 fdZ	ed2d            Z
ed3d            ZeZded<   ed4d            Zed3d             Zd5d!Zd5d"Zd6d#Zd7d$Zd8d&Zd9d(Zd:d+Z xZS );ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerr_   r   r  c                    |                                 D ]>}|                                | j        v r!| j        |                                         c S ?d S rx   )r  r  read_to_node)rk   r  r   s      rl   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for	  s]     '')) 	9 	9C||~~!222(8888 3 trn   consumerc                   t          t                               }|j        j        D ]h}|j        | j        j        vr| j        j        |j                                                 }|| j        v r |	                    | j        |                    it          |          dk    rt          t          |                    S d S rz  )r   r_   r   r   r   r  rC  r   name_to_noder  r   r  r  )rk   r  	producersrd	node_names        rl   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for	  s     0133	&, 	< 	<Bwdn888227;LLNNID---d/	:;;; y>>QY(((4rn   ro   c                
   t          |          }                                r|                                rt          j        t                    t          j        t          |          }t          j                  t          |j                  k    }|s |d           |o2t          fdt          j        |j                  D                       S |                                rz	                                r |d           dS t          j        t          |          }|
                              }||j                            |          S  |d           dS                                 rz|	                                r |d           dS t          j        t                                        |          }|j                            ||          S  |d           dS t          d          )	Nzforeach do not have same lengthc              3  T   K   | ]"\  }}j                             ||          V  #d S rx   )r  r   )r   lrr  s      rl   r   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>8	  sN       ) )Aq "++Aq11) ) ) ) ) )rn   zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r'  r.  rg  rh  r  r   r   r   r  r   r  r  r   r  r  )rr   r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rl   r   z#ForeachKernelSchedulerNode.can_fuse/	  s<   (++   &	X%8%8%:%: &	{#=xHHH{#=xHHH00C4H4HHM  75666  S ) ) ) )AA) ) ) & &    "" 	$$&& n   u{#=xHHH'@@JJ+)228=MNNNCGHHH5  "" 	$$&& n   u{#=xHHH'@@JJ+)223CXNNNCGHHH5f
 
 	
rn   c                   |                                 s|                                 sJ |                                 r)t          j        t          |          }|j        }|j        }n(t          j        t          |          }|j        }|j        }d }d }|                                 rn|                                 rZt          j        t          |          }t          j        t          |          }d t          |j        |j                  D             }nO|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    |	|          }
|
}|                    |
           9|                    |	           On|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    ||	          }
|
}|                    |
           9|                    |	           Ont          d           | |j        |||||          S )Nc                J    g | ] \  }}t                               ||          !S r~   )r   rs   )r   r  r  s      rl   r   z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>q	  s<       Aq #''1--  rn   zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r.  rg  rh  r  r  r  r  r   r  r   rs   r   r  r  r  )rr   r  r  r  r  r  r  fused_nodesr  r   new_noder  s               rl   rs   zForeachKernelSchedulerNode.fuse^	  s    ""$$=(;(;(=(====   	7{#=xHHH(0(J%&6OO{#=xHHH(0(J%&6O   &	X%8%8%:%: &	{#=xHHH{#=xHHH AA  KK   "" 	{#=xHHH'@@JJK"KK  - -+++166tXFFH"*K&&x0000&&t,,,,-   "" 	{#=xHHH'@@JJK"KK  - -+++166xFFH"*K&&x0000&&t,,,,- !f   s&?##+
 
 
 	
rn   NFr  r  r   r  r  r  r  r  r5  c                    i  _         i  _        ||ht                                          ||           |D ]A}|j        j        D ]}| j         |j        <   |                                D ]}	| j        |	<   Bn| _        | _	        d  _
        g  _                             t          j                            |j        |j        g                     t!           fdt!          j        |j        |j                  D                        j        j        z
   _        t)          |j        |j        g           _        t-          |j        |j        g           _        t)          |j        |j                   _        t-          |j        |j                   _        |                                rt7          |t8                    sJ ||}}
nt7          |t8                    sJ ||}}
|
j         _         j                            |j                   |
j         _        |                                D ]}	| j        |	<   d  j	        D              _        | _         |d         !                                }|sJ |tE          j#        d          fff _$        t!          tJ          j&        j'                              _(        | _)        d S )Nc              3  R   K   | ]!}|j                                         v|V  "d S rx   r  r  s     rl   r   z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>	  sL         xt'<'<'>'>>>	  ?>>> rn   c                R    i | ]$}|j                                         D ]\  }}||	%S r~   )ro  items)r   r  r  vs       rl   r  z7ForeachKernelSchedulerNode.__init__.<locals>.<dictcomp>	  sW     @ @ @%:O:U:U:W:W@ @26!Q1@ @ @ @rn   r   combo_kernel)*r  r  rO  r{  r   r   r   r   r  r   r   r  r  r*   r  r  r   r  rt  r  r  rf  r  rg  rd  re  r.  r   r  r   r2  ro  r  r   r   Exprr   r  fxNoderj  r  )rk   r  r   r  r  r  r  r   rT  r   foreach_noder   r   rX  s   `            rl   r{  z#ForeachKernelSchedulerNode.__init__	  s    +"5GGY/// 3 3 ,2 8 8D37D%di00 4466 3 3D.2D%d++3	3 'DN DKDI)+DJ  '22 ,k.EF        )/#68V        ")* # !+"79N!OPPDN +"79N!OPPDN&).0N' 'D# '*.0N' 'D# %%'' D!+/IJJJJJ+6j!+/IJJJJJ+6j)3DNN!!*"6777 , 9D"6688 5 5*4!$''@ @"&+@ @ @D  *C&%%''v
> : :<>?
!%(-022.rn   r  c                $   d |D             }|r3t                               dt          |          d |D                        d |D             }|r(t                               dt          |                     d |D             }|r(t                               dt          |                     d |D             }d	 |D             }|r(t                               d
t          |                     d |D             }d |D             r)t                               dt                               fd|D             }t          j        rBd |D             }|r(t                               dt          |                     d |D             }|S )Nc                <    g | ]}t          |t                    |S r~   )r   r  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s(    OOOj4M&N&NO!OOOrn   z/ComboKernels: %d external nodes are filtered %sc                N    g | ]"}|j         	|j                                         #S rx   r   rd  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s-    UUUTty?T&&((?T?T?Trn   c                <    g | ]}t          |t                    |S r~   )r   r  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s(    KKKz!5I'J'JK1KKKrn   z+ComboKernels: %d grouped nodes are filteredc                <    g | ]}t          |t                    |S r~   )r   rW  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s(    PPP1A7N)O)OPQPPPrn   z;ComboKernels: %d FusedMixOrderReductions nodes are filteredc           	     b    g | ],}t          |t          t          t          t          f          *|-S r~   )r   rN  r  r  rW  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>
  sL     
 
 
*-(+	 

 
 
rn   c                <    g | ]}t          |t                    |S r~   r   r  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>
  s8     
 
 
A7Q)R)R

 
 
rn   z+ComboKernels: %d foreach nodes are filteredc                <    g | ]}t          |t                    |S r~   r  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>
  s8     
 
 
Z;U-V-V

 
 
rn   c                :    g | ]}|                                 |S r~   r  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>
  s%    GGGq}}G!GGGrn   z0ComboKernels: %d template nodes are filtered: %sc                    g | ]}|v|	S r~   r~   )r   rB  template_nodess     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>!
  s#    OOOq7N7N!7N7N7Nrn   c                :    g | ]}|                                 |S r~   r<  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>%
  s'    MMMQANN<L<LMqMMMrn   zCComboKernels: %d reduction nodes are filtered (pointwise_only mode)c                :    g | ]}|                                 |S r~   r<  r  s     rl   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>+
  s'    PPPAq~~?O?OPaPPPrn   )r  r/  r   r(   combo_kernels_pointwise_only)	rr   r  externgrouped	mix_orderfiltered_nodesforeach_nodesreduction_nodesr  s	           @rl   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes	  s     POUOOO 	IIAFUUVUUU  
 LKeKKK 	II=G   QPPPP	 	IIMI  

 

 
 

 
%
 
 
  	YIICSEWEWXXX
 
%
 
 
 HG^GGG 	IIBN##  
 POOO^OOO . 	QMM.MMMO 		Y((   QPPPPNrn   list[list[BaseSchedulerNode]]c                  
 |                                  }g }d
t          d |D                       }|D ]5}t          t                    }|D ]b}|                                }|r|j        dk    s|j        dk    r/|                                |z  rG||                             |           c|                                D ]}t          t                    }	|D ]6}|	| j	        
                    |d                                       |           7|	                                D ]@|                    
fdt          dt                    
          D                        A7|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                t    g | ]5}|D ]0}t          |t                    |                                D ]}|16S r~   )r   rW  r	  )r   r   r   r  s       rl   r   zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>;
  sw       !  d$;<<	
 !% 5 5 7 7 
 	     rn   mpsr  r   c                *    g | ]}||z            S r~   r~   )r   r  max_num_nodesstream_nodess     rl   r   zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>[
  s8        ! )Q->)>?  rn   )_topological_sort_nodesr   r   r  r   r   r   r   r   node_to_streamr  r  r  r   )r  sorted_nodesgrouped_nodesexcluded_buffer_namesr  device_groupsr   r   device_nodesstream_groupsr  r  s             @@rl   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels/
  s    !88::1; )  2
 2
 " 	 	E D!!   3 3** v{e33v{e7K7K ))++.CC f%,,T2222
 !. 4 4 6 6 
 
DOPTDUDU( V VD!)":">">tQ"G"GHOOPTUUUU$1$8$8$:$:  L!((    %*1c,.?.?%O%O     	
 rn   4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                    | t           _        d S rx   r  r  )r  s    rl   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsf
  s    
 # 	#DDDrn   c                6    t                               |           S rx   r  r  s    rl   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsn
  s     *KKIVVVrn   c                    t           rx   rJ  rj   s    rl   r  z#ForeachKernelSchedulerNode.mark_runt
  rL  rn   c                    t           rx   rJ  rj   s    rl   rH  z"ForeachKernelSchedulerNode.codegenw
  rL  rn   c                    dS r[  r~   rj   s    rl   r.  z%ForeachKernelSchedulerNode.is_foreachz
  r  rn   c                *    t          | j                  S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r  r   rj   s    rl   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes}
  s     DK   rn   r  c                x    t          t          j                            d | j        D                                 S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  >   K   | ]}|                                 V  d S rx   )r   r  s     rl   r   z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>
  s*      1U1UA!++--1U1U1U1U1U1Urn   )r  r  r  r  r   rj   s    rl   r   z$ForeachKernelSchedulerNode.get_nodes
  s3     IO111U1U1U1U1UUUVVVrn   r   c                @    | j         d                                         S r   )r   r  rj   s    rl   r  z)ForeachKernelSchedulerNode.get_first_name
  s    {1~,,...rn   r  r  c                z    t          | || j        j                   | j        D ]}|                    |           d S rx   )r  r  rC  r   r  )rk   r  r   s      rl   r  z/ForeachKernelSchedulerNode.prune_redundant_deps
  sO     	d$68RSSSK 	: 	:D%%&89999	: 	:rn   )r  r_   r   r  )r  r_   r   r  r  r_   r  r_   r   ro   )r  r_   r  r_   r   r  )NNF)r  r  r   r  r  ro   r  r  r  r  r  ro   r   r5  r  r  r   r  )r  r  r   r  )r  r  r   r5  rY  rZ  r   r  r  rW  r  )ry   rz   r{   r  r  r  r}   r   rs   r{  r  r  r  r  r|   r  r  r  rH  r.  r  r   r  r  rd  re  s   @rl   r  r  	  s         
      & ,
 ,
 ,
 [,
\ >
 >
 >
 [>
J 1504 %L/ L/ L/ L/ L/ L/ L/\ ? ? ? [?B 0 0 0 \0h 	/ & / / / / 
 
 
 \
 W W W \W
" " " "" " " "   ! ! ! !
W W W W
/ / / /: : : : : : : :rn   r  c                       e Zd ZU dZded<   ed"d            Z	 d#d$ fdZd%dZd&dZ	e
d'd            Zd'dZe
d(d            Zd)dZe
d*d            Zd+dZd,dZed-d!            Z xZS ).r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r   r   c                    |d         j         t          fd|D                       sJ  | |          }|D ]}|j        |                                <   |j        |                                <   |S )Nr   c              3  *   K   | ]}|j         u V  d S rx   r  )r   r   r  s     rl   r   z.GroupedSchedulerNode.create.<locals>.<genexpr>
  s*      BB44>Y.BBBBBBrn   )r  r   r  r  )rr   r   grouped_snoder  r  s       @rl   createzGroupedSchedulerNode.create
  s    1I'	BBBB6BBBBBBBBIv.. 	K 	KE=JI()9)9::AN	$]%;%;%=%=>rn   Fr  r  temp_groupingro   r5  c                z    t                                          |           t          | ||           || _        d S rx   )rO  r{  r  r  )rk   r  r   r  rX  s       rl   r{  zGroupedSchedulerNode.__init__
  s?     	###i000 +rn   c                    | j         r| j        S | j        D ]#}|| j        j        |                                <   $| j        j        |                                 = | j                            | j                  S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r   r  r  r  
fuse_nodes)rk   r  s     rl   unpackzGroupedSchedulerNode.unpack
  ss    
  	;[ 	H 	HEBGDN-enn.>.>??N-dmmoo>~((555rn   fake_depr3   c                    |                      | j                            |                     | j                            |           d S rx   )r  r   r  rt  r  )rk   r  s     rl   r  z!GroupedSchedulerNode.add_fake_dep
  sD    T-77AABBB##H-----rn   r   c                J    d                     d | j        D                       S )Nr  c                6    g | ]}|                                 S r~   r  r  s     rl   r   z1GroupedSchedulerNode.get_name.<locals>.<listcomp>
  r  rn   r  rj   s    rl   r  zGroupedSchedulerNode.get_name
  r  rn   c                @    | j         d                                         S r   r   rj   s    rl   r  z#GroupedSchedulerNode.get_first_name
  r!  rn   rb  c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r~   r$  r  s     rl   r   z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>
  r%  rn   r&  rj   s    rl   r	  z%GroupedSchedulerNode.get_buffer_names
  r'  rn   rl  c                l    g }| j         D ])}|                    |                                           *|S rx   r)  r*  s      rl   r  z GroupedSchedulerNode.get_outputs
  r+  rn   r  c                    t          t          d d |                                 D                                 }t          |          dk    rd S t	          |          }|S )Nc              3     K   | ]@}|                                 s|                                *|                                V  Ad S rx   r  r  s     rl   r   z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>
  r  rn   r   r  r  s      rl   r  z#GroupedSchedulerNode.estimate_flops
  r  rn   r  c                    | j         S rx   r  rj   s    rl   r   zGroupedSchedulerNode.get_nodes
  r  rn   rU  c                R    | j         r| j         d                                         nd S r   )r   r   rj   s    rl   r   zGroupedSchedulerNode.get_device
  s&    .2kCt{1~((***tCrn   r  r_   r  c                    dS r  r~   )rr   r  r  s      rl   r   zGroupedSchedulerNode.can_fuse
  r  rn   )r   r  r   r  )F)r  r  r   r  r  ro   r   r5  r  )r  r3   r   r5  rW  r  rU  r  r  r\  r  )ry   rz   r{   r  r|   r}   r  r{  r  r  rI   r  r  r	  r  r  r   r   r   rd  re  s   @rl   r  r  
  s          $###   [ $	+ + + + + + +6 6 6 6. . . . = = = ]=) ) ) ) N N N ]N       ]"   D D D D    [    rn   r  r~   stride_lengthslist[list[int]]r  r  priority_idxr  	list[int]c           
     :    t           j        d	 fd            }t          t          t	          t           d                                                 }t          |          dk    r fd|D              t          j        r|                    |           |S )
z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    r  r   br   c                              dk    s         dk    r$t                    dk             dk              S  fdD             }fdD             }t          d t          ||          D                       }t          d t          ||          D                       }||k    rdS ||k    rdS t                     S )Nr   c                :    g | ]}t          |                   S r~   abs)r   slr  s     rl   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>
  #    <<<rBqE

<<<rn   c                :    g | ]}t          |                   S r~   r  )r   r  r  s     rl   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>  r  rn   c              3  4   K   | ]\  }}|d k    p||k     V  dS r   Nr~   r   sl_asl_bs      rl   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  D       
 
)3tDAI$
 
 
 
 
 
rn   c              3  4   K   | ]\  }}|d k    p||k     V  dS r  r~   r  s      rl   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  r  rn   r   )rJ   r   r  )r  r  stride_len_astride_len_ba_firstb_firstr  r  s   ``    rl   	index_cmpz"pick_loop_order.<locals>.index_cmp  s   8q==E!HMMuQx1}eAh!m444 =<<<^<<<<<<<^<<<  
 
7:<7V7V
 
 
 
 
  
 
7:<7V7V
 
 
 
 
 W2W1 1ayyrn   r   c                     g | ]
}|         S r~   r~   )r   pir  s     rl   r   z#pick_loop_order.<locals>.<listcomp>   s    DDD.,DDDrn   r6  )r  r   r  r   r   r   )		functools
cmp_to_keyr  r   r  r   r(   pick_loop_orderssort)r  r  r
  r   orders   ``   rl   pick_loop_orderr(  
  s           4 %N1$5 6 6778899E
<1DDDD|DDD "

y
!!!Lrn   	orig_nodeir.MultiTemplateBufferr  ir.OperationBufferc                z   |                                 }|                                  }t          |t                    rt          |t                    sJ |                                }|                                 }t          |t                    rt          |t                    sJ t          j        j        |= ||_        t          j        j        |= ||_	        t          j        j
                            |           }t          j        j
                            |           |t          j        j
        |<   |t          j        j        |<   t          j        j                            |           }t          j        j                            |           |t          j        j        |<   |t          j        j        |<   d S rx   )r  r   r   r   rZ   r   r  r   
name_to_opoperation_namebuffersr   r{  
operations)r)  r  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rl   _replace_operation_bufferr6  &  sb    !))++&&((MmS))Pj9JC.P.PPPP2244//11LlC((NZ8H#-N-NNNN	01!HM	+,*H7?  ++DGO8$$$$AGOD,4AG=)7##I..DGh''''AGt'/AG|$$$rn   r  c                    |                                  }|                                }||z
  }||z  }|d|z   z  }||z  S rz  )ry  rw  )r   r   epilogue_runtimetemplate_write_bytesepilogue_read_bytesextra_bytesextra_bytes_ratioextra_memory_ratios           rl    _estimate_fused_epilogue_runtimer>  B  s^     77995577%(<<K $&:: +a2C.CD 000rn   unfused_n_regsr   fused_n_regsfused_n_spills	num_warpsdevice_propsrC   tuple[int, int]c                x    |dk    rdS |j         }|dS |sJ ||j        pdz  }| |z  }||z  }||z  }	||z  }
|	|
fS )Nr  )r   r   )r   r       )regs_per_multiprocessor	warp_size)r?  r@  rA  rB  rC  regs_per_smthreads_per_blockregs_per_block_unfusedregs_per_block_fusedblocks_unfusedblocks_fuseds              rl   "_occupancy_before_and_after_fusionrO  O  s     u 6Kt9!\%;%ArB+.??'*;; $::N"66L<''rn   ms1ms2ro   c                    d}d}t          |||||          \  }	}
|d| z  k    o|
dk    }|
dk    o|
|k    p
|
|	z  |k    p|S )zE
    Determine whether to fuse an epilogue into a GEMM template.
    r8  g      ?r   r   r   )rO  )rP  rQ  r?  r@  rA  rB  rC  MIN_ACCEPTED_OCCUPANCYREGRESSED_OCCUPANCY_RATIOrM  rN  ,epilogue_dominated_with_sufficient_occupancys               rl   _fuse_epiloguerV  j  s      # $Fni$ $ NL 47S=3U\TUEU0
 2 .. 	8.(+DD	87rn   c                  V    e Zd ZU ded<   dZded<   dZded<   dd	ZddZddZddZ	dS )r|  BaseSchedulerNode | OutputNoder   Fro   r3  is_weakr   r   c                h    t          | j                                        | j        | j        f          S rx   )r"  r   r  r3  rY  rj   s    rl   r#  zNodeUser.__hash__  s*    TY''))4+;T\JKKKrn   rk  objectc                    t          |t                    oI|                                 |                                k    o| j        |j        k    o| j        |j        k    S rx   )r   r|  r  r3  rY  rm  s     rl   __eq__zNodeUser.__eq__  sY    uh'' .5>>#3#33. E$55. -		
rn   r   c                4    | j                                         S rx   r3  rj   s    rl   r  zNodeUser.get_name  r4  rn   c                ~    | j         |j         u sJ t          | j         | j        o|j        | j        o|j                  S rx   )r   r|  r3  rY  rm  s     rl   rO  zNodeUser.merge  sH    yEJ&&&&I2!2L*U]
 
 	
rn   NrX  )rk  r[  r   ro   rW  )rk  r|  r   r|  )
ry   rz   r{   r|   r3  rY  r#  r]  r  rO  r~   rn   rl   r|  r|    s         ((((K GL L L L
 
 
 
$ $ $ $
 
 
 
 
 
rn   r|  c                     t           j        S rx   )r(   r  r~   rn   rl   *used_non_deterministic_runtime_estimationsra    s    33rn   	ir.IRNodeOrderedSet[sympy.Symbol]c                   t                      }|                                 }t          |t          j                  r|                    t          |j                  t          |j                  z  t          |j	                  z             t          |t          j
                  r'|                    t          |j                             n|J d|             |S )z=Get free symbols from a node's layout (size, stride, offset).Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r+   Layoutr2  r"   r   strideoffsetrP  get_layout_symintsre  )r   free_symbol_usesr)  s      rl   ri  ri    s    1;""$$F&")$$ 
U%%6=))*6=))*	
 	
 	

 fb;<< 	G##$6v}$E$EFFF~~TFTT~~~rn   c                   t          | t                    r% t                      j        d | j        D              S | j        J | j                                        } |j        d | j                                        D               |S )z
    Gets symbols used in a scheduler node, including free symbols from
    the node's operations and layout symints from outputs.
    c              3  4   K   | ]}t          |          V  d S rx   get_scheduler_node_symbol_uses)r   r  s     rl   r   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>  s+      MM,U33MMMMMMrn   Nc              3  4   K   | ]}t          |          V  d S rx   )ri  )r   ir_nodes     rl   r   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>  s+      	M	M'
W
%
%	M	M	M	M	M	Mrn   )	r   r   r   r  r   r   get_free_symbol_usesr2  r  )r   rj  s     rl   rn  rn    s     $*++ 
!z||!MMMMM
 	
 9   y5577	M	MTY5J5J5L5L	M	M	M  rn   r  c                b    |                                  }||j        |j        S t          j        S z4Check per-template flag, fall back to global config.)r  allow_epilogue_fusionr(   epilogue_fusionr  tbs     rl   _is_epilogue_fusion_enabledrx    2    		(	(	*	*B	~"2>''!!rn   c                b    |                                  }||j        |j        S t          j        S rs  )r  allow_prologue_fusionr(   prologue_fusionrv  s     rl   _is_prologue_fusion_enabledr}    ry  rn   r   r   c                r    |                                  o#|                                  ot          |           S rx   )r*  rx  r   s     rl   is_epilogue_fusionr    =     	/!!###	/'..rn   c                r    |                                 o#|                                   ot          |          S rx   )r*  r}  r   s     rl   is_prologue_fusionr    r  rn   c                B    t          | |          pt          | |          S rx   )r  r  r   s     rl   is_template_fusionr    s"    eU++O/A%/O/OOrn   c                *    t          | |          r|n| S rx   r  r   s     rl   template_fusion_pw_noder    s    &ue44?55%?rn   c                      e Zd ZdZddZd҈ fdZdd	Zdd
ZddZddZ	ddZ
edd            Zej        dd            ZddZddZddZddZddZddZdd Zdd#Zdd%Zdd&Zdd(Zdd*Zdd+Zdd,Zdd-Zdd.Zdd/Zdd2Z	 ddd7Z dd;Z!dd>Z"dd?Z#ddDZ$ddFZ%	 dddHZ&ddLZ'ddMZ(ddPZ)ddSZ*ddVZ+dd]Z,dd^Z-ddaZ.ddbZ/ddddZ0ddeZ1ddfZ2ddgZ3ddhZ4ddjZ5ddkZ6ddnZ7ddoZ8ddpZ9ddqZ:ddvZ;ddxZ<	 	 ddd}Z=dd~Z>ddZ?ddZ@ddZAd ddZB	 	 	 dddZCddZDddZEddZFddZGddZHddZIddZJddZKddZLddZMd	dZNd
dZOddZPeQdd            ZRddZSddZTddZUddZVddZWddZXddZYddZZddZ[ddZ\ddZ]d	dZ^ddZ_ddZ`ddÄZaddĄZbddńZcddƄZdddȄZeddɄZfdd˄Zgdd̄Zhedd̈́            Ziedd΄            Zjd	dτZkddЄZld	dфZm xZnS (  r  z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    r  list[ir.Operation]r   r5  c                    t          d          5  |                     |           d d d            d S # 1 swxY w Y   d S )NzScheduler.__init__)r   _initrk   r  s     rl   r{  zScheduler.__init__  s    .// 	 	JJu	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   377c           
         t                                                        t          j        _        i  _        t          t                     _        t          j
                     _        t                       _        t          g t          j        j                                        t          j        j                                        t          j        j                                                   _         fd|D              _        d  _        d  _                                           j                            t          j        j                                                    j        D ]}|                                 d  _                                          _        d  j        D              _        d  j        D              _         j                                         _        i  _         i  _!        t                       _"        tG          j$         j         j         j                   _         %                                  &                     j                   _         '                                 d  j        D              _         (                                  )                                 tT          xj+        tY           j                  z  c_+        ddl-m.}m/}  | j                   tY           j                   _0         1                                  &                     j                   _        t          td          tf          tf          f                               _4        tj          j6        tk          j6         j                   _        tj          j7        r/ddl8m9} |:                                 (                                 i  _;        i  _<        d	 _=        i  _>         ?                                  @                     j                   _        tj          jA        tk          jA         j                   _        t          d
  j        D                       r '                                  C                                  D                                 tj          jE        stj          jF        r6t                      r(t          jI        jJ        jK        L                                 tj          jM        r@t          ddd          5   O                    d            d d d            n# 1 swxY w Y    P                                 tj          jQ        rddlRmQ}  | j         j         j        t          t          j        j                                                  t          t          j        S                                                     _        tj          jT        stj          jU        rtj          jQ        sddlRmV}  | j         j                   t                      r`t          jY        rTtj          jZ        st          j[        r<d	} j        D ]}t          |j]                  rd} n|rddl#m^}	  |	 j                   t          j_        rddl`ma}
  |
dd  fd           tG          jb         j                   _         c                                 tj          jd        r`tj          je        jf        rOtj          je        jg        r> h                     j                   _         i                     j                   _         j                                 t          jI        j5        jk        jl        r m                                  | j                   t          jn        o                     j                    p                                 t                       _q        i  _r        d  _s        t          d          u                     fd           t                       _v        d S )Nc                :    g | ]}                     |          S r~   )create_scheduler_noder   r  rk   s     rl   r   z#Scheduler._init.<locals>.<listcomp>  s'    CCCd0033CCCrn   c                8    i | ]}|                                 |S r~   r  r  s     rl   r  z#Scheduler._init.<locals>.<dictcomp>&  s/     ;
 ;
 ;
 !AJJLL!;
 ;
 ;
rn   c                f    i | ].}|                                 D ]}|                                |/S r~   )r  r  )r   r   r   s      rl   r  z#Scheduler._init.<locals>.<dictcomp>*  sO     8
 8
 8
$($BRBRBTBT8
 8
;>CLLNNC8
 8
 8
 8
rn   c                8    i | ]}|                                 |S r~   r  r  s     rl   r  z#Scheduler._init.<locals>.<dictcomp>N  s"    "G"G"Gq1::<<"G"G"Grn   r   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotuneFc              3  @   K   | ]}t          |t                    V  d S rx   )r   ru  r  s     rl   r   z"Scheduler._init.<locals>.<genexpr>q  sA       
 
 tABB
 
 
 
 
 
rn   z#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     dddS )N#scheduler_nodes_before_comm_overlapstring)r   encodingr~   r~   rn   rl   ry  z!Scheduler._init.<locals>.<lambda>  s     E$,) ) rn   c                 f    d                     d t           j                  D                       S )Nz

c                z    g | ]8\  }}d | d|                                 z   d|                                 z   9S )zsnode[r'  z buffer_names:)r1  r	  r  s      rl   r   z5Scheduler._init.<locals>.<lambda>.<locals>.<listcomp>  sc        !%1 *QMMMkkmm,Eq/A/A/C/CEEF  rn   )r  r  r  rj   s   rl   ry  z!Scheduler._init.<locals>.<lambda>  s=    v{{  )2$*(=(=	  ( ( rn   )metadata_fn
payload_fngraph_statsc                 H     j          j        t           j                  dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  rj   s   rl   ry  z!Scheduler._init.<locals>.<lambda>  s&     3+/+>*-dj//  rn   )wrO  r{  rZ   r   r  backendsr  _post_grad_graph_counterr  r  count_graph_partition_counterr   rK  r  r  	constantstorchbind_constantsr  r  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr2  r  default_device_contextget_donated_buffersrB  r  rC  copyr  r  rj  seen_template_fusionsr'   decide_global_ordering_of_commsr]   topological_sort_scheduledead_node_eliminationcompute_ancestorscompute_input_distancesr,   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r  create_foreach_nodesr   r   logged_slow_fusionr(   _pre_fusion_custom_passdistributed_max_autotune_gemmr  r  scheduler  buff_to_stream_multi_stream_nodesstream_idx_to_user_obj_idx_populate_stream_assignmentsr  _post_fusion_custom_passr   r  finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodes_enforce_conditional_orderingr  memoryget_output_namesdeterministic reorder_for_compute_comm_overlapr  ra  r)   6runtime_estimations_align_across_all_distributed_ranksr  r  rS   r   r  reorder_sink_verbose_loggingtorch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   r^   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr/  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_index_current_stream_ctxr   add_rowremoved_ops)rk   r  r   r  r  r  r  r  has_collectivesr  r  rX  s   `          rl   r  zScheduler._init  s    <>"&'?"@"@(1(9(9%5?\\!&0%**,,"'')) ,1133'
 '
# DCCCUCCC
7;6:'')))#**17+<+A+A+C+CDDDJ 	 	DOO <@# $$&& 	#;
 ;
%)Z;
 ;
 ;
8
 8
,0J8
 8
 8
 AE@Q@V@V@X@X 35 13 LL 	"
 :J#
 

 	!!###33DJ??
""$$$"G"GDJ"G"G"G   $$&&& 	##s4:6##OOOOOOOO$*%%%!$*oo!!###33DJ??
",U38_"="?"?)57
CCDJ/ 	%...... ))$///""$$$ =?.0). :<'))+++__TZ00
*68DDDJ 
 


 
 
 
 
 	) &&(((,,...$	V(.(;	V&((	V O,ASSUUU 	B5&* $   B B
 ..D.AAAB B B B B B B B B B B B B B B 	**,,, ) 		77777700
 '17/44667717335566 DJ # /	P(O /	P1 UUUUUUAAJ 0  
 ;<<W WW <	W
 $PW #( J  D$TY// *. # W      KJ4:VVV 8 ;;;;;;  ! !           CDJOODJ""$$$ "		W (			W C		W DDTZPPDJJJ4:VVDJ!!!?!.E 	-**,,,4:&&&	dj))) 6@\\! :< GK ''//   	
 	
 	
 -7LLs   #UU
U
!dict[str, SchedulerDonatedBuffer]c                    i }t           j        j        D ][}t          t           j        j        |         t          j                  r*t          | t           j        j        |         d           ||<   \|S )N)r  )rZ   r   graph_inputs_originalr   r+   DonatedBufferr`  )rk   name_to_donated_bufr   s      rl   r  zScheduler.get_donated_buffers  sq     G1 	 	D!'7=r?OPP ,BG1$7 $- - -#D)
 #"rn   c                J  
 ddl m
 i }t          j        d          }| j        D ]{}
}|j        E|j                                        }|*||vrt          |          }|||<   || j        |<   ||         }|| j	        |<   |
                                D ]}|| j        |<   |t          
fd| j	                                        D                       rt          d | j        D             d          }|x| j        D ]p}|j        }	|                                St          |	t           j                  r9t          |	j        t           j                  rt!          j        |          |	_        qt          
fd| j	                                        D                       | _        dS )a=  Populate node_to_stream and buff_to_stream from IR node stream_idx.

        Reads the stream_idx field set on IR nodes during lowering to determine
        which stream each scheduler node should run on. This field is propagated
        from 'custom.stream' FX node metadata via IRNode.current_stream_idx().
        r   )DEFAULT_STREAM_IDXNc              3  $   K   | ]
}|k    V  d S rx   r~   )r   r  r  s     rl   r   z9Scheduler._populate_stream_assignments.<locals>.<genexpr>'  s)      MM1q&&MMMMMMrn   c              3  f   K   | ],}|                                 |                                 V  -d S rx   r   r  s     rl   r   z9Scheduler._populate_stream_assignments.<locals>.<genexpr>)  s7      RRAq||~~7Q7Q7Q7Q7QRRrn   r   c              3  $   K   | ]
}|k    V  d S rx   r~   )r   
stream_idxr  s     rl   r   z9Scheduler._populate_stream_assignments.<locals>.<genexpr>7  s=       '
 '
 ,,'
 '
 '
 '
 '
 '
rn   )stream_constantsr  r  r  r  r   get_stream_idxr  r  r  r	  r  r   r   r   r   r+   Bufferr)  r?   r  )rk   user_obj_to_stream_idxstream_idx_counterr   r  user_obj_idxnew_stream_idxr   r   rp  r  s             @rl   r  z&Scheduler._populate_stream_assignments  s    	988888 24&_Q//J 	6 	6D+Jy$#y7799+#+AAA)-.@)A)A?M.|<JV7G!7!EJ(2D% ,,.. 6 6+5#C((6 MMMM0C0J0J0L0LMMMMM 	FRRRRRTX F ! J F FD"iG))1&w	:: 2&w~r}EE 2
 *,f)E)E)E $' '
 '
 '
 '
"188::'
 '
 '
 $
 $
   rn   ro   c                    | j         S )z7Check if any nodes are assigned to non-default streams.)r  rj   s    rl   _has_multi_stream_nodesz!Scheduler._has_multi_stream_nodes<  s    ''rn   r  r   r   c                    | j                             ||          }| j                            || j                            |d                    S )zAReturn the stream index for a buffer, resolving mutation renames.r   )rj  r  r  )rk   r  reals      rl   get_buf_streamzScheduler.get_buf_stream@  sF    $((8<<"&&tT-@-D-DXq-Q-QRRRrn   r   r_   c                    |                                  sdS |                     |          | j                            |d          k    S )zTrue if buf_name was produced on a different stream than node.

        Resolves mutation renames so that mutated buffers inherit the
        stream of their original definition.
        Fr   )r  r  r  r  )rk   r  r   s      rl   rO  z!Scheduler.has_cross_stream_hazardE  sI     ++-- 	5""8,,0C0G0Ga0P0PPPrn   rU  c                $    t           j        j        S rx   rZ   r   current_devicerj   s    rl   r  zScheduler.current_deviceO  s    w%%rn   r   c                (    |t           j        _        d S rx   r  r  s     rl   r  zScheduler.current_deviceS  s    !'rn   c                    t           j                            dd          dk    rddlm}  || j        d           dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r/  r  r  )rk   r  s     rl   r  zScheduler.debug_draw_graphW  sV    :>>:DAASHH++++++L666666 IHrn   labelc                    t                               t          j                  r9t                               d|           | j        D ]}|                                 d S d S )Nz%s:)r  isEnabledForloggingINFOr  r  r  )rk   r  r   s      rl   debug_print_nodeszScheduler.debug_print_nodes^  sh    GL)) 	#HHUE"""
 # #  """"	# 	## #rn   r|  c                d   |                                 
J d            |                                rt          | |          S t          |t          j        t          j        f          rt          | |          S t          |t          j                  rt          | |          S t          |          )Nz2All nodes passed to scheduling must have an origin)rd  is_no_oprN  r   r+   r   ru  r   r  r  rK  r  s     rl   r  zScheduler.create_scheduler_noded  s    !!--@ .-- ==?? 	,)$555r0"2CDEE 	, t,,,bo.. 	,,T4888%d+++rn   c                    t                      g } j                                        t          j        j                                        D ]~} fd|D             }|s                    |            fd|D             }t          j	        dk    }t           |d|          }|                    |           |D ]}| j        |<   fd j        D             t          |          z    _        d S )Nc                \    g | ](}|v t          j        |         t                    &|)S r~   )r   r  rN  )r   r   kept_node_namesrk   s     rl   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>w  sI       ?**"4#4T#:<RSS + ***rn   c                *    g | ]}j         |         S r~   )r  r   r   rk   s     rl   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>  s!    @@@$d'-@@@rn   r   Fr  r  c                @    g | ]}|                                 v|S r~   r  )r   r   removed_node_namess     rl   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>  s3     
 
 
4==??BT+T+TD+T+T+Trn   )r   r  r  rZ   r   listsr   r2  r(   combo_kernels_autotuner  r   r  r  )	rk   fe_nodesnamesr   r  fe_noder   r  r$  s	   `      @@rl   r  zScheduler.create_foreach_nodesq  sQ   .8ll16688W]))++ 	8 	8E    !  E  %%e,,,@@@@%@@@F$;a?O0*/ /	  G OOG$$$ 8 807'--8
 
 
 
!Z
 
 
NN


rn   c                B   $%&'  G $fddt           t                             $t          j        $          & j        D ]}|                                D ]}|                                }t          |j        j	        t          j                  r&t          |                                          dk    r`|                                D ]Y}|&v r8|&v r4&|         }&|         }||z   }&D ]}&|         |u s
&|         |u r|&|<   >|&v r&|         &|<   N&|         &|<   Zόd*' fd'	 	 d+d,&'fd}	i }
t          j        j                                        D ]x}t          |t$          j                  r|j        D ]}d|
|<   ,t          |t          j                  r2d |                                D             }|D ]}|j        D ]}d|
|<   yd} j        D ]r}|j        J t/          |j                                        d           }|D ];}t          |t$          j                  sJ d}||
vr|                                |
|<   <s j        D ]}t4                              d|j                   |r|j        J t/          |j                            d          d           }|D ]u}||
v sJ | d|
             |
|         x}V j        |                                         D ]6}|                    t?          |                                                     7vt          |j         j!                  dk    rEtE          tG          |j         j!                            x}rt          |tH                    r|j%        }nd}|                                D ]l}t          |&                                          dk    sJ |&                                D ],} '|          } |	||           |                    t?          ||                     &|         j'        D ]}|                                |                                k    r-t          |j        tP                    sJ |j                                        D ]}}|                                } '|          }||                                v }|                    tS          ||                                |                       |	||d           ~.nt          j        j*        |                                         D ]G} |	||d           |                    tS          ||                                d                     Ht          j        j+        |                                         D ]2} |	||d           |                    t?          |                     3|j         j,        D ]<}t          |tR                    s% |	|j-        ||.                    |                     =|/                     j0                   |                                D ]}|&                                D ]x}|                                 j0         '|          <   |                                 j0        |<    j1        2                    ||           j1        |                                <   yt          j        3                                D ]C}t4                              d|            |	|ti          t?          |                               D|rt          j        j5        D ]}|                    d          D ]}||
v s!J | d|
6                                             |
|         x}rd j        |         7                                D ]D}t4                              d||            |	|ti          t?          |                               E j0        D ]}|t          j        j        v rK |	|ti          t?          |                               t          j        j8        9                    |           `|t          j        j:        v r& |	|ti          t?          |                               d  tw          t          j        j        6                                          D             %%fd!t          j        j8        D             t          j        _<         j        D ]K}|                                D ]4}|=                    &|                                         j'                   5L j>        D ]-} j>        |         =                    &|         j'                   .t                      } | @                    d"           &'                                D ]^\  }}!| A                                5  d# |!j'        D             }"| @                    d$| d%|" d&           ddd           n# 1 swxY w Y   _| @                    d'           | B                                C                                }#t                              d(           t                              d)|#           dS )-zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                  6    e Zd ZdZ	 	 ddd	ZddZd fdZdS )1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nr  list[_T] | None
membershipOrderedSet[_T] | Noner   r5  c                B    |pg | _         |pt                      | _        d S rx   )r  r   r.  )rk   r  r.  s      rl   r{  z:Scheduler.compute_dependencies.<locals>.DedupList.__init__  s#    
 #[b
","<
rn   	node_userra   c                    || j         v rd S | j                            |           | j                             |           d S rx   )r.  r  r   r  )rk   r1  s     rl   r   z8Scheduler.compute_dependencies.<locals>.DedupList.append  sF    //F
!!),,,##I.....rn   rk  DedupList[_T]c                     t          j         j        |j                  } j         fd|j        D             z   } ||          S )Nc                &    g | ]}|j         v|S r~   )r.  )r   rB  rk   s     rl   r   zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>  s,     * * *at.F.FA.F.F.Frn   )r   r  r.  r  )rk   rk  new_membership	new_items	DedupLists   `   rl   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1$/5CS!T!T J * * * *${* * * 	 !yN;;;rn   r  )r  r-  r.  r/  r   r5  )r1  ra   r   r5  )rk  r3  r   r3  )ry   rz   r{   r  r{  r   r9  )r8  s   rl   r8  r,    sr          *.48= = = = =/ / / /< < < < < < < <rn   r8  r   r  r   r   c                F    | j         v r j         |                    S | S rx   )rj  )r  r  rk   s    rl   r  z.Scheduler.compute_dependencies.<locals>.rename  s.    D)))vd3A6777Hrn   Fused_by_namer?  rX  r3  ro   rY  r5  c                n     |                                         t          |||                     d S rx   )r   r|  )r;  r?  r3  rY  name_to_usersr  s       rl   add_userz0Scheduler.compute_dependencies.<locals>.add_user  sE     &&../66K99    rn   Nc                F    g | ]}t          |t          j                  |S r~   )r   r   r  r   r  s     rl   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>  s)    SSS!Auz9R9RSASSSrn   c                    | j         S rx   r  r   s    rl   ry  z0Scheduler.compute_dependencies.<locals>.<lambda>  s    AF rn   r6  Tzscheduling %s)unbacked_onlyc                    | j         S rx   r  r   s    rl   ry  z0Scheduler.compute_dependencies.<locals>.<lambda>  s    !& rn   z not in )r  mutating_bufr  )rY  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sc                    i | ]\  }}||	S r~   r~   )r   r   r   s      rl   r  z2Scheduler.compute_dependencies.<locals>.<dictcomp>  s+     
 
 
'E4D%
 
 
rn   c                     g | ]
}|         S r~   r~   )r   r   	inp_namess     rl   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>  s*     &
 &
 &
 $IdO&
 &
 &
rn   r`  c                6    g | ]}|                                 S r~   r  )r   r  s     rl   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>  r  rn   'z': r&  ra  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r   r   r   )FF)
r;  r   r?  rX  r3  ro   rY  ro   r   r5  )Er	   ra   r  r   r  r  r  r   r   r)  r+   r?   r   r*  rZ   r   r  r   r   r  r"   	TensorBoxr  r;  get_unbacked_symbol_defsSymbolr  r/  rq  r  r  r5   r   r  r  r  r4   r  r,  r  r_   r6   additional_buffer_depsadditional_star_depsr   r   r3  r  rj  r  r  r  rJ  graph_outputsr  r	  mutated_inputsr  r  r  mutated_input_idxsrP  rB  rR   r  r-  r.  r  compute_dependencies_log)(rk   r   buf1	buf1_name	buf2_namelist1list2combinedr7  r>  unbacked_symbol_to_origin_nodevalfssym_sizer  has_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r   	node_modealt_namer0  out_buf
other_nameis_aliasadd_deprT  r  r   r   logbufr  r  r   r8  rH  r=  r  s(   `                                   @@@@rl   r]   zScheduler.compute_dependencies  s   	< 	< 	< 	< 	< 	< 	< 	< 	< 	<@ @K?V@
 @
 J 	L 	LD((** L L MMOO	 ty/??D,,..//!33!%!1!1!3!3 L LI M11i=6P6P -i 8 -i 8#(5=#0 > >C -c 2e ; ;#0#5#>#>5=c 2> #m333@3Ki003@3Ki00LL:	 	 	 	 	 	 	 !&!		 		 		 		 		 		 		 		 JL&
 7'..00 
	B 
	BC#uz** 	B* > >B9=2266>C.. B TSs||~~SSS! B BAn B B=A6r::B ',#J 	H 	HD9((( $*	2244:J:J$ $ $  * H H!!U\22222 /3+:::8<215H J V	 V	DIIoty111* Gy,,,'-I222FF((( ( ($
 . G GA >>>>FF&DFF ?>> <A>>K#'#4Q#7#C#C#E#E G GC --gcllnn.E.EFFFF D$+,,11 d&6&=!>!>???S 2sI.. 2  H		 	 '')) E E3,,..//14444 # 1 1 3 3 E EH%vh//HHXt,,,%%ghY&G&G&GHHH -h 7 = E E==??dmmoo==$)$)5FGGGGG'+y'<'<'>'> E EG)0)9)9););J)/
););J (073F3F3H3H'HH -- '$.1408L!" !" !"   %HZtDDDDD%EEE< 79$--//J S S$5555 !!''4==??D"Q"Q"QRRRR77H 4 4$6666!!''"2"23333 (. F F!$00 FHTYd.>.>t.D.DEEE%%d&;<<< ''))   # 1 1 3 3  H>AllnnD)&&*:*:;69llnnD)(3/33HhGG +CLLNN;; 0022 	> 	>HII,h777HXz'(*;*;<<==== ' 	Nw, N N111EE N NA >>>>MM&D&I&I&K&KMM ?>> ;1==q N(,(9!(<(M(M(O(O N NHII M ( !  
 %HXz'(:K:K/L/LMMMMN ) 	: 	:Dqw+++z'$--88999&**40000***z'$--88999
 
+4QW5I5N5N5P5P+Q+Q
 
 
	&
 &
 &
 &
()(>&
 &
 &
"
 J 	C 	CD'')) C CmCLLNN;ABBBBC / 	S 	SD'-77d8K8QRRRR  !!c'--// 	4 	4JC 4 4;;u{;;;2#22%2223334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 	c  ""))++ &&';<<< &&'I3OOOOOs   ).h##h'	*h'	c           
         ddl m}m}m}m} t          t          j        j        	                                          } | j
        |          }t          j        j        j        s | j
         j                   t          t          j                                                  } | j
        ||          \  }}	}	d t#          t%           j
                            D             |D ]~}
|
j        dk    r|
j        dk    r|
j                                        }|
j                 d                             |           |
j                 d                             |           ddlm}  |             d fd}g }t9           j
                  D ]S\  }}|                    |           |                     |||t%           j
                  dz
  k                         T| _
        d S )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufc                    g | ]}g g fS r~   r~   )r   r  s     rl   r   z7Scheduler.insert_memory_check_nodes.<locals>.<listcomp>  s/     C
 C
 C
RHC
 C
 C
rn   r   )register_check_mem_opstep_idxr   is_final_stepro   r   r  c                Z   |          d         }|          d         }|||g}t          j        t          t          j        d                    t          j        j        j        j        g |d           }dj	        |          
                                 |_        t          |          S )Nr   r   r  r  c                6    | |d         |d         |d         dfS )Nr   r   r   )alivedeadro  r~   )tensor_argsr  s     rl   ry  zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>  s/    !.q!1 -a 0)6q)9 C rn   )r)  r@  rt  nontensor_argsunflatten_args
mem_check_)r+   MemoryCheckKernelr?   r  r   r  _inductor_debugcheck_memory_stepdefaultr  r  r.  r  )rn  ro  expected_newly_aliveexpected_newly_deadru  r   rk   step_allocs_deallocss         rl   construct_mem_check_nodezEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node  s     $8#A!#D "6x"@"C24GWN'!e)<)<===y0BJ-     D #Qtz(/C/L/L/N/N"P"PD,T4888rn   )ro  )rn  r   ro  ro   r   r  )r  r  ri  rj  rk  r   rZ   r   r  r  r  r  r  r(   r  rC  r  r  r   
size_alloc	size_freerX  r  
start_stepr   end_step#torch._inductor.runtime.debug_utilsrm  r  )rk   r  ri  rj  rk  r  name_to_freeable_input_bufrP  buf_info_listr  buf_infor  rm  r  	new_nodesr  r   r~  s   `                @rl   r  z#Scheduler.insert_memory_check_nodes  sr   	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 )3173G3L3L3N3N(O(O""4:|<< 	# %= 	==
D,   *4AG4L4L4N4N)O)O55J&
 
q!C
 C
#C
OO44C
 C
 C
 & 	H 	HH"a''H,>!,C,C//11H !45a8??III !23A6==hGGGGMMMMMM	9 	9 	9 	9 	9 	9 	92 	 ,, 	 	GAtT"""((1DJRS@S;SUUU    


rn   c                   t           j        sdS g }t          | j                  D ]ddd}                                D ]}t          fd|j        D                       }|rdt                              d	|	                                           t          j        j                            |	                                           d
}                                 o| }|s|                               t                              d	                                           t          j        j                            	                                           j        j        D ]J}|j        | j        v r:| j        |j                 j        }fd|D             | j        |j                 _        Kt+          t          |                    | _        | j        D ]                                 dS )z0
        Remove any nodes without users
        Nr0  r|  r   ro   c                Z    | j         p$|                                 t          j        j        v S rx   )rY  r  rZ   r   r  )r0  s    rl   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s!    |Tt}}!':T'TTrn   Fc              3  .   K   | ]} |          V  d S rx   r~   )r   ur  s     rl   r   z2Scheduler.dead_node_elimination.<locals>.<genexpr>  s/      #M#Ma$6$6q$9$9#M#M#M#M#M#Mrn   zremoved dead buffer: %sTzremoved dead operation: %sc                r    g | ]3}|j                                                                         k    1|4S r~   r3  )r   r  r   s     rl   r   z3Scheduler.dead_node_elimination.<locals>.<listcomp>  s>     = = ="#0A0AT]]__0T0TA0T0T0Trn   )r0  r|  r   ro   )r(   use_dcer   r  r  r   r  r  r/  r  rZ   r   rL  r  r5  r   r  r   r   r   rC  r  r  )	rk   updated_nodesactive_buffersr   can_eliminaterT  r  r  r   s	          @@rl   r  zScheduler.dead_node_elimination  s    ~ 	F
 TZ(( 	 	DU U U U #N'')) * * ##M#M#M#M39#M#M#M M M  *II7HHHG+//????%)NN $ 5 5 7 77N<NM  $$T**** 		6HHH*..t}}??? ,2  DyD$444 $ 0 ; A= = = =',= = =(39 (=1122
 J 	# 	#D  """"	# 	#rn   r  
str | Nonec                
    |duS )z:Check if store mode requires cross-thread synchronization.Nr~   )rk   r  s     rl   mode_requires_synchronizationz'Scheduler.mode_requires_synchronization  s    4rn   r  c                    t          t                               t                      g dfd|D ]}|                                D ]}||<   |D ]} |           S )z?
        Ensure nodes is in topologically sorted order
        r  r_   r   r5  c                    | vrf                     |            t          | j        d           D ]"}|j        vr |j                            #                    |            d S d S )Nc                    | j         S rx   r  )ds    rl   ry  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>/  s    af rn   r6  )r  r;  rt  r   r   )r  r   r  r/  seenvisits     rl   r  z2Scheduler.topological_sort_schedule.<locals>.visit,  s    }}!!"6<L<LMMM 2 2Cx|33 E,sx01111a      }rn   )r  r_   r   r5  )r   r_   r  r	  )rk   r  r   r   r  r/  r  r  s       @@@@rl   r  z#Scheduler.topological_sort_schedule"  s     +,..59VV*,	! 	! 	! 	! 	! 	! 	! 	! 	!  	* 	*D--// * *%)T""* 	 	DE$KKKKrn   c                   d | j         D             }t          dt          |                    D ]}t          t	          ||                                                             }t          t	          ||dz
                                                               }||                             t          ||d                     d S )Nc                P    g | ]#}t          |j        t          j                  !|$S r~   )r   r   r+   Conditionalr  s     rl   r   z;Scheduler._enforce_conditional_ordering.<locals>.<listcomp>>  s;     
 
 
Z%G%G

 
 
rn   r   TrD  )r  r  r   r  r  r	  r  r6   )rk   conditional_nodesr  rE  prev_bufs        rl   r  z'Scheduler._enforce_conditional_ordering=  s    
 
z
 
 
 q#/0011 	 	A%6q%9%J%J%L%L M MNNLD!21q5!9!J!J!L!LMMNNHa --|TJJJ   	 	rn   r  c                r    t                      }t          |t          t          t          t
          t          f          r%|j        D ]}|                    |j	                   n t          dt          |           d           fd|D             }t          t           fd|D                                 S )Nz+get_unmet_dep_nodes is not implemented for .c              3  V   K   | ]#}j         |                                         V  $d S rx   )rC  r   r  s     rl   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>Z  s7      XXc)#.??AAXXXXXXrn   c              3  2   K   | ]}j         |         V  d S rx   r  r  s     rl   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>[  s+      QQat6q9QQQQQQrn   )r   r   r   r  rN  r   r  rt  r  r   RuntimeErrorr   r  )rk   r  
unmet_depsr   unmet_dep_opss   `    rl   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodesH  s    &0ll
)&"$	
 	
 	 / ) )sx(((() Ld5kkLLL   YXXXZXXXJQQQQ=QQQQQRRRrn   r  c                b   g }t                               | j        d          }i }| j        D ]^}|                     |          }t	          |          ||<   |D ]2}|                    |g           }|                    |           |||<   3_d |                                D             }|rx|                    |           |D ]@}	|                    |	g           D ]}
||
xx         dz  cc<   |                    |	           Ad |                                D             }|x|r
J d            |S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                $    g | ]\  }}|d k    |S r   r~   r   r  r  s      rl   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>l  s!    @@@1a!rn   r   c                $    g | ]\  }}|d k    |S r  r~   r  s      rl   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>s  s!    DDDDAqQ!VVaVVVrn   zTopological sort failed!)	r  fromkeysr  r  r   r  r   r  r  )rk   r'  r  childrenr   r  r   czero_deg_nodesr  r0  s              rl   r  z!Scheduler._topological_sort_nodes]  sf    dj!,,#%J 	" 	"D,,T22Dd))E$K " "LLb)) !"
 A@@@@ 	ELL(((#  $LLB// % %D$KKK1$KKKK		!DDEKKMMDDDN  	E 44444yrn   c                b   i }| j         D ]|}t                      }|j        D ]F}| j        |j                                                 }|                    |           |||         z  }G|||                                <   ||_        }t          | j                   D ]\  }}||_
        ||_        dS )z.
        Populate each node.ancestors
        N)r  r   rt  rC  r   r   r  r  r   r  rf  rg  )rk   name_to_ancestorsr   r   r   dep_node_namer'  s          rl   r  zScheduler.compute_ancestorsw  s    
 9;J 	' 	'D)3I. > > $ 0 : K K M Mm,,,.}==		1:dmmoo.&DNN$TZ00 	# 	#KE4"DN"DNN	# 	#rn   c                D    i i  j         D ]}|j        sd}d}nF fd|j        D             } fd|j        D             }t          |          }t          |          }||                                <   ||                                <   ||_        ||_        dS )z
        Populate each node's min/max_input_distance with the depth from graph
        inputs, measured as dependency hops before fusion. Nodes whose
        dependencies are all satisfied by graph inputs/constants have depth 0.
        r   c                j    g | ]/}j         |j                                                          d z   0S r   rC  r   r   )r   r   name_to_min_distancerk   s     rl   r   z5Scheduler.compute_input_distances.<locals>.<listcomp>  N     ! ! !  ))9#()C)T)T)V)VW! ! !rn   c                j    g | ]/}j         |j                                                          d z   0S r  r  )r   r   name_to_max_distancerk   s     rl   r   z5Scheduler.compute_input_distances.<locals>.<listcomp>  r  rn   N)r  rt  r  r  r  rd  re  )rk   r   min_distmax_distdep_min_distsdep_max_distsr  r  s   `     @@rl   r  z!Scheduler.compute_input_distances  s     02/1J 	/ 	/D* .! ! ! ! !  $6! ! !
! ! ! ! !  $6! ! !
 }--}--4< 14< 1&.D#&.D##)	/ 	/rn   c                b   t           j        sd S | j        D ]}t          |t          t
          f          r$|                                st           j        dk    rC|                                D ]@}t          |t                    r|	                                r,|
                                 Ad S )Nhalide)r(   ry  r  r   r   r   rU   cpu_backendr   r*  r  )rk   r   r  s      rl   r  zScheduler.merge_loops  s    0 	FJ 	$ 	$D d]4F$GHH KKMM&,&8H&D&D)) $ $!%77 5;L;L;N;N !!####$	$ 	$rn   c                    t          ddd          5  t          d          D ]}t          |          }t                              d|dz   |           |                     |d          }t          |          }t                              d	|dz   ||           ||k    s|dk    r t                              d
|dz               nt          j        st          j        r|                     |d          }|cddd           S # 1 swxY w Y   dS )zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTr  r  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r  r   r.  r/  fuse_nodes_oncer(   ry  loop_index_inversion_in_fusion)rk   r  r  old_lennew_lens        rl   r  zScheduler.fuse_nodes  s    #4QU
 
 
 	 	 2YY  e**  EE  
 ,,UU,KKe**  TE	   g%%A$$Eq1u   E	 *6 1K8K ,,UT,JJ;	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   C#DD
Dc                    g }| j         D ]A}|                    t          |t                    r|                                n|g           B|| _         dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r  r   r  r  )rk   r  r   s      rl   r  zScheduler.process_grouped_nodes  se     .0	J 	 	D!+D2F!G!GSdV    


rn   r  tuple[float, str]c                   t          |          dk    sJ |d                                         }|| _        |                     |          }t	          ddd          5  |                    |          cddd           S # 1 swxY w Y   dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r   r   r  rw  r   r  )rk   r  r   rp  s       rl   r  zScheduler.benchmark_fused_nodes  s     5zzA~~~~q$$&&$""6**#"&%D
 
 
 	8 	8
 0077	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8s   B  BBNbenchmark_kernelhint_overrider  c                   t          |          dk    sJ |d                                         }|| _        |                     |          }t	          d          5  |                    |||          cddd           S # 1 swxY w Y   dS )r  r   generate_kernel_code_from_nodesr  N)r   r   r  rw  r   r  )rk   r  r  r  r   rp  s         rl   r  z)Scheduler.generate_kernel_code_from_nodes  s     5zzA~~~~q$$&&$""6**;<< 	 	::'} ;  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B  BBmoduler   rE  c                    || _         |                     |          }t          d          5  |                    |          cddd           S # 1 swxY w Y   dS )r  benchmark_codegened_moduleN)r  rw  r   r  )rk   r  r   rp  s       rl   r  z$Scheduler.benchmark_codegened_module  s     %""6**677 	> 	>55f==	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   AAA
multi_noder*  c                   t           j        j        }|sdS t                              d||           |j        D ]}|                                }t          |dd          r||vst          |t          j
                  rF|j        }||         }t          |t          j                  r!|                    |j                   |j        }t          |t          j                  r&||k    r t                              d|||            dS dS )z
        Check if selecting a Triton template would cause layout conflicts.
        Returns True if there's a conflict and we should fall back to ATen.
        FzNode %s has constraints %sr)  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rZ   r   buffer_layout_constraintsr  r/  r  r  r  r   r+   ReinterpretViewr)  FlexibleLayout freeze_layout_with_exact_stridesrg  FixedLayoutr  )rk   r  constraintsinpinp_namer)  expected_layouts          rl   !_has_layout_conflict_for_templatez+Scheduler._has_layout_conflict_for_template"  s!    g7 	5		.
KHHH$ 	 	C||~~H C400;..c2#566 / ZF)(3O&""344 $ 44_5KLLL&".11 o6O6Oe#	   tturn   c                   t          | j                  D ]\  }}t          |t                    rt          |j        t
          j                  r|j        }t          j        j	        s|
                                \  }}n+t          d |                                D                       }t          |t          j        j        j                  r|                     |          rm|                                D ]*}t          |t          j        j        j                  r|} n+t          |t          j        j        j                  s
J d            t          |t          j        j        j                  rt          j        ri }||d<   t          j        D ]e}|                    |          }	d |	                                D             }
t+          |
                                d           d         }|||<   f|j                            |           n|j                            |           t
          j                            |j                  5  |                                }ddd           n# 1 swxY w Y   |j        }t          |t
          j                  sJ |j        }t          |t
          j                  sJ |j        rtA          ||j                   |j!        |_!        | "                    ||||           dS )	a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c              3  b   K   | ]*}t          |t          j        j        j                  &|V  +d S rx   )r   r  r  r  ExternKernelCaller)r   timings     rl   r   z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>`  sT         &) & % @ S   "     rn   zZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNr  c                D    i | ]\  }}t          |t                    ||S r~   r   r   )r   r  r  s      rl   r  z=Scheduler.finalize_multi_template_buffers.<locals>.<dictcomp>  s?     . . .$(Aq#-a1I#J#J. !1. . .rn   c                    | d         S rz  r~   r   s    rl   ry  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>  s    qQRt rn   r6  r   )#r  r  r   r   r   r+   MultiTemplateBufferr(   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr  r  r   r  r  r  multi_kernel_hintsr  r  finalize_as_triton_callersfinalize_as_triton_callerr  current_originsrj  output_noder   
StorageBoxOperationBufferorigin_noder:   r)  _replace_node)rk   r  r   r  min_node_unfusedr  choicecallershinttimingstriton_timingsout_tensorboxout_storage
out_buffers                 rl   r  z)Scheduler.finalize_multi_template_buffersM  s    !,, L	D L	DGAt$.. KD:	214 4 KD "Y
*P *4*C*C*E*E'$aa'+ *4*C*C*E*E  	( 	($ $O&?  
 ==jII &0&?&?&A&A & &F) & % @ S    & 4: 0 %&  *"EO$D$W      y    $O&?  
 0 NNP(8 %+$= 3 3D&0&?&?d&?&S&SG. .,3MMOO. . .N
 &))=)=)?)?^^%T%T%TUV%WF,2GDMM	<<WEEEE	;;<LMMMY..z/ABB C C$4$@$@$B$BMC C C C C C C C C C C C C C C+0!+r}=====(-
!*b.@AAAAA) N&}j6LMMM$.$5
!"":z1dCCCYL	D L	Ds   I77I;	>I;	r  r+  r  r   c                   t          ||           |                     |          }|| j        |<   || j        |                                <   || j        |                                <   i t          j        |j        j	        |j
                  D ].}| j                            |j        d           x}r
|j        |<   /dfd} ||j
                  |_
         ||j        j	                  |j        _	        t          |                                |                                          D ]-\  }	}
|	| j        |
                                <   |
j        |	_        .|j        |_        |j        |_        |j        |_        |j        |_        d S )Nr  rs  r   c                :    t          fd| D                       S )Nc              3  B   K   | ]}|                               V  d S rx   )r  )r   r   rj  s     rl   r   z?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>  s0      KKscjj)9::KKKKKKrn   r   )r  rj  s    rl   rename_depsz,Scheduler._replace_node.<locals>.rename_deps  s&    KKKKdKKKKKKrn   )r  rs  r   rs  )r6  r  r  r  r  r  r  r  r   r   rt  r  r  r   r  r  rC  r  rf  rg  r   rc  )rk   r  r  r  r   new_scheduler_noder   	real_namer  new_outold_outrj  s              @rl   r  zScheduler._replace_node  s    	"*j999!77
CC*
1-?$--//*3E0 ?4#3#94;RSS 	7 	7C 377$GGGy 7.1h +	L 	L 	L 	L 	L 	L 1<11
 1
- 0;{*00
 0
&, !$**,,d.>.>.@.@!
 !
 	* 	*GW 4;DW--//0#MGMM'+~$'+~$'+~$(,%%%rn   	node_listc                4    t          d |D                       S )Nc              3     K   | ]Q}t          |j        d           o7|j        duo.t          |j        j        d          o|j        j        j        dk    V  RdS )r   Nscatter_moder  )r?  r   r   r  r  s     rl   r   z,Scheduler._any_atomic_add.<locals>.<genexpr>  s       
 

 	 AFF## 9d"9^449 (L8	
 
 
 
 
 
rn   )r   )rk   r  s     rl   _any_atomic_addzScheduler._any_atomic_add  s2     
 

 
 
 
 
 
 	
rn   &tuple[LambdaFuture | None, ModuleType]c                2   |                      |d|          }t          j        |          }t          j        j                                        }|                                sd }n.|                    d|          }t          |t                    sJ ||fS )NT)r  r  triton_)kernel_namesource_code)r  r   loadr  r  async_compileAsyncCompileuse_process_poolr   r   r   )rk   r  r  src_codemodr  futs          rl   compile_kernelzScheduler.compile_kernel  s     77D 8 
 
 x((5BBDD--// 	1CC&&9(&SSCc<00000Szrn   r   r   rd   c                    !"#$%&'()* t          d fD                       }t          j        s|st                              d          S                                 r,t                                          t          j	                  r(
                                s
                                rt                              d          S                                 }|d                                         sJ j        dk    r*t          j        dk    rt                              d          S                                 }t          t!          j        ||                    }                     |          rt                              d          S ddlm t+                    *|d                                         J d!fd$|r%t          d fD                       r	                                dur                                n                                )t          )t          j                  sJ                      )          rt                              d          S i #g !t          j        D ]})                    |          t5                                          d           D ]\  }}	t          |t8          j        j        j                  s*)                     |          5  !!                    |g "                    ||j#                  R            ddd           n# 1 swxY w Y   tI          d          }
d}i }!D ]\  }}}	 ||%                                 n[# tL          $ rN}tN          (                    tR          j*                  r tN          +                    dsdnd|           Y d}~qd}~ww xY w)                     |          5   ,                    |          \  }}|||<   ||
k     r|}
|}ddd           n# 1 swxY w Y   |)j-        |<   t          |t\                    sJ |#|<   t          j/        ta          d )j1        D                       }te                      o o|t          j3        k    "tI          d          tI          d          c&'d%"sa)                                )4                                \  %&t5                                          tk          j6        d                    }nd )j1        D             }r0r 7                    |          n 7                    |          \  '}nAst                              d          S 8                                'ts          '          (ddl:m;} g !d}|D ]\  }}t          |t\                    ss!ty          |d          r|j=        )j=        k    r>r|&'z   k    r n|dz  }|t          j3        k    r ns)                     |          5  	 !!                    |g "                    |          R            n# |$ r Y ddd           w xY w	 ddd           n# 1 swxY w Y   t}          !          dk    rt                              d          S d"!"#$%&'() fd}t          ?                    |!d         d                   S  "                    |           "                    |            "                    |          d" $ *fd}t          ?                    |d                    S )#
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]D}|                                 o+t          |                                t          j                  V  Ed S rx   )r*  r   r  r+   r  r  s     rl   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sb        
  
  MMOO J1..00"2HII 
  
  
  
  
  
rn   Tr   r  r   CompilationErrorNms_fusedr  rP  rQ  r   r5  c           
        t                               t          j                  r| ||z   k     rXt                               d                                                                t          ||z   | z  d                     d S t                               d                                                                t          | ||z   z  d                     d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r.  r  r  DEBUGr/  r	  rE   rF   )r"  rP  rQ  r   r   s      rl   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}55 cCi''$$S..00..00"sSyH&<#B#BCC	     $$W..00..00 Hc	$:!@!@AA	     rn   c              3  B   K   | ]}|                                 d uV  d S rx   r  r  s     rl   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>)  sD       %
 %
23A!!-%
 %
 %
 %
 %
 %
rn   Fc                    | d         S rz  r~   r   s    rl   ry  z-Scheduler.speedup_by_fusion.<locals>.<lambda>=  s    aPQd rn   r6  r  infException in compiling %s: %sr  r  c              3  @   K   | ]}t          |t                    V  d S rx   r  r   r  s     rl   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>h  s>       % %<=
1677% % % % % %rn   r   c                    g | ]}|d fS r  r~   r,  s     rl   r   z/Scheduler.speedup_by_fusion.<locals>.<listcomp>~  s    &J&J&J!1v&J&J&Jrn   )	CantSplitallowed_prologue_inpsro   c                    t          d          } d }i }rkrt          t          j                  sJ                                                                 \  fdD             t          fd          D ]\  }}}	 ||                                }n s|j        }|	                                 nd }n[# t          $ rN}t                              t          j                  r t                              dsdnd|           Y d }~d }~ww xY wrV                    |          5                      |          \  }}	|||<   || k     r|} |}d d d            n# 1 swxY w Y   |k    pz   |         z   k    }
|r}|
r{|	                                 |j        r|j        sJ |j        d         }|j        }|j        }t+          |j        |||j        j        t1          j                            }|r|} nr |            r	| z   k     rL|Jt4          j        r|d <                                  n                    |           r
|j        d <   d	S d
S )Nr)  c                (    g | ]}|d          v |S r  r~   )r   
fut_choicer  s     rl   r   zMScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<listcomp>  s2     & & &&%a=N:: #:::rn   c                     | d                  S r   r~   )rB  r  s    rl   ry  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s    nQqT&: rn   r6  r*  r  r  r   TF)r  r   r+   r  r  r  r;  r/  r  
precompiler  r.  r  r  r%  r/  swap_as_triton_callerr  	launchersn_regsn_spillsrV  bmreqrB  rC   r  r(   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr  rh   	mod_fusedresr   r"  pathfusible_choicecompiled_kernelr@  rA  should_fuse_epiloguebench_epiloguer  r   ru  future_choicesget_choice_timings_async hint_override_best_fusion_choicer&  
min_choicerP  rQ  	ms2_fusedr  rk   s                  rl   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   $U||"& + %X*ZAW*X*XXXX%/%>%>%@%@N&0&?&?&A&AOJ& & & &*8& & &N &,&::::& & &N
 2@ <& <&-FFI!!-"(--//CC!/ '"+"3CNN,,,,"&C % ! ! !%227=AA &,, ?2A Q

z !  
 !! & '&'==fEE 	9 	9-1-L-L ) &. .NHd
 3;K/',66/728	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 '&0 N"Sy>&+AI+MM '
  &> &"--///#&=BV]BB#B.1mA.>O+:+AL-<-EN3A # # & , . & 6 0 7 ? ?4 40  4 &28 %! 7J|S#666 '!*6#)*D*D%10 NAP8>"==<   
 #<<_MMM% G;F
2484 5s+   7C
DADD6)E++E/	2E/	c                 T   ddl m}  	 d         d         d         fD ]}||                                                     d         
          \  t	          j                  r d           dS                     d         
          \  t	          j                  r d           dS                     d         
          \  t	          j                  r d           dS             t          d          rZz   k    rQfj        vrFj                            f           t          d          
                    fd	           z   k     S # | $ r Y dS 	$ r}d
t          |          v rY d }~dS  d }~ww xY w)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior~   )rP  rQ  r"  path1path2
path_fuseds   rl   ry  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>`  s-    053605365?8@3;sSy3I% % rn   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsrL  r/  r  rS  isinfr    r  r  r   r  r   )rL  r  r   rP  rQ  r"  rV  rW  rX  r!  r   future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r&  rk   r  s      @@@@@@rl   rJ  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready-  sm        A *!,)!,/2  ) )
 ?JJLLL!%!@!@)!," "JC
 z# %CDDD$u!%!@!@)!," "JC
 z# %DEEE$u+/+J+J/2, ,(Hj
 z(++ %CDDD$uJxc222 0>>$c	11"EN$2III/33UENCCC(77??        
 
 
 $cCi//+ ! ! ! 55'   .#a&&88#ttttts7   A.E> ?>E> ?>E> ?A>E> >F'F'
F"!F""F'rv   )r"  r  rP  r  rQ  r  r   r5  rZ  )@r   r(   benchmark_fusionrd   rs   r*  r   r  r+   TritonTemplateBufferr.  r   r   r   r  r  r  r  r  triton.compiler.errorsr!  r'  r  r  r  r  r;  r  r  r  r  TritonTemplateCallerr5  r   r  r  r  r/  r  r.  r  r  r%  r/  r  r:  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesr  operator
itemgetterr  r  r>  r  r.  r?  r/  r   rw   )+rk   r   r   is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r;  r<  r=  rh   r>  r   r"  r@  num_triton_callerschoice_timings_iterrW  r.  triton_choicesunfused_timerJ  r!  rD  r  r   ru  r\  r]  r^  rE  rF  rG  r&  rH  rP  rQ  rI  r  r  s+   ```                      @@@@@@@@@@@@@@@@@@rl   speedup_by_fusionzScheduler.speedup_by_fusion  sE
       
  
 U^ 
  
  
 
 

 & 	+/@ 	+$$T*** 	+u6688":QRR	+ !!	+ !!		+  $$T***oo''Q**,,v ;%F$6($B$B$$T***oo''y{KHHII
 00 	+$$T***;;;;;;u%% #..00!!!	 	 	 	 	 	 	"  N	 %
 %
8=u~%
 %
 %
 "
 "
 N	 $5577tCO #/''))),,.. 
 j"*@AAAAA55jAA 0#((///  - QSN!'!: *R *R!+!:!:=!I!I!'(<(<(>(>NN!S!S!S  IFA% @ U  ! !#99&AA  &-- &!%!4!4$36CW "5 "" ""                   %U||CG 1? 5 5-FFI
!!-"MMOOO$ ! ! !%227=AA &,, ?2A Q

z !  
 !! $99&AA 5 5)-)H)H%v* *$ /7F+#l22+3L.4O5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 =H
*=9!/3KLLLLLBQ0??#=N!$ % %AKAS% % % " "
 )** R&&R&&*QQ % U||U5\\HC15J+ 
K!+!:!:!<!<",";";"="=
C&,"((**0CA0F0F' ' '## 'K&Jz7I&J&J&J# P 'AD..{;;;33K@@ UU ' 4',,U3332244<UE3OO	 ?>>>>>PRNN(; ! !$!&*BCC  ((?@@ 4
8XXX! lcCi&?&?E!#!F$KKKE55f== ! !!&--#Kd&9&9/&J&JKK    % ! ! ! !! ! ! ! ! ! !
!! ! ! ! ! ! ! ! ! ! ! ! ! ! ! >""a''#((///j! j! j! j! j! j! j! j! j! j! j! j! j! j! j! j! j! j! j!X  --$nQ&7&:   !% 3 3K @ @ $ 3 3K @ @&*&9&9/&J&J#F F F F F F F F F F F F FP  --09PQR9S .   ss   4MMM?N
O. AO))O.)P;;P?P?Z,Y43Z4Z9ZZZZ	Z	c                @    | j         |                                         S )z0Look up the node in Scheduler name_to_fused_node)r  r  r  s     rl   r=  zScheduler.get_fused_nodey  s    &t':':'<'<==rn   r  OrderedSet[BaseSchedulerNode]c                p   t                               d|                                |                                           |                                }|                                |k    sJ |                     |                              ||          |                    |           |                    |           |                               | j        	                    fd
                                D                        | j                            |          }|
|| j        <   S )Nzfusing %s with %sc                :    i | ]}|                                 S r~   r  )r   r  node3s     rl   r  z,Scheduler.fuse_two_nodes.<locals>.<dictcomp>  s#    'W'W'W

e'W'W'Wrn   )r.  r/  r  r   rw  rs   r{  r  r  r2  r   r  r  )rk   r   r   r  r   stream1ru  s         @rl   fuse_two_nodeszScheduler.fuse_two_nodes}  s#    	,enn.>.>@P@PQQQ!!##!!V++++  ((--eU;;5!!!5!!!&&'W'W'W'WU__EVEV'W'W'WXXX %))%00)0D&rn   
speedup_fnrt   c                    |                      ||          r9|                     ||          s# |            r|                     |||           dS dS NTF)r   will_fusion_create_cyclerw  )rk   r   r   rx  r  s        rl   fuse_if_speedupzScheduler.fuse_if_speedup  sf     MM%''	11%??	 
	
 uk:::4urn   template_fusion_candidates,dict[BaseSchedulerNode, list[PendingFusion]]c                   |rg }i }t                      }|D ]8}||v rt          ||                   dk    sJ ||                             d          }t          ||                   dk    r|                    |           |                                \  }}	|	|k    rt          ||	          sJ |}
n||k    sJ t          ||	          sJ |	}
|                     |
          |
ur|j        r.|j        j        }|J |	                    |           ||f||<   | 
                    ||	|j        |          r|                    |           :t          |          D ]o}||         \  }}| 
                    |                     |j                  |                     |j                  |j        |          r|                    |           p|D ]}|                    |           |dS dS )z
        Evaluate pending template fusions for a set of fusion candidate nodes.
        The fusion candidate nodes are pointwise nodes as potential epilogue
        or prologue fusions
        r   r   N)r   r   r  r  r   r  r  r=  rh   r   r|  rf   r   r   r   )rk   r}  r  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionr   r   r  fcands                rl   "_evaluate_pending_template_fusionsz,Scheduler._evaluate_pending_template_fusions  s    ) 9	2-/  % @J||7 %9 %9	!;;;6yABBaGGGH "<I!F!J!J1!M!M1)<==BB%)))444->>@@uI%%-eU;;;;;$)MM I-----eU;;;;;$)M &&}55]JJ!( 
9&-4A===$++A...3A92M,Q// ++un&@+  9 *--i888 ""233 0 0'?'B$''''(<==''(<==".	  0 &))$///& 2 2*..q1111s ) 9	2 9	2 9	2 9	2 9	2rn   possible_fusion_pairs1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]pending_fusions&dict[BaseSchedulerNode, PendingFusion]template_fusion_nodesr  c                    d fd}|D ]Y\  }} |||                                 |          }                      |          }t          ||          r||f j        v rX                     |||          r                     ||          sԉ                     ||          }	|	j        t          |	j        |||	j                  }
t          ||          r^||f j        vsJ  j        	                    ||f           t          ||          }||vrg ||<   ||                             |
           n
|
|<   |
|<   9|	j        sB                     ||           [d S )	Nr   r_   r   r   r5  c                                        |           v s                     |          v r:                                         |                                                    |                              }|J |                                \  }}|j        }                    |d                                |d                                 |          |u sJ                      |          |u sJ  |            r                    | |          r                     ||                                |           v                       |          v 8d S d S rx   )r=  r  r   rf   r  r{  rw  )	r   r   r  	node_key1	node_key2
is_speedupr  r  rk   s	         rl   resolve_pending_fusionsz<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions  s   
 ##E**o==&&u--@@!0!4!4''..#''(;(;E(B(BCC" " &111'5'F'F'H'H$	9+7
##It444##It444**955BBBB**955BBBB!z|| t'D'DUE'R'R ##Iy+FFF+ ##E**o==&&u--@@@@@@rn   )rf   r   r   rh   r3  )r=  r  r  r   r{  rp  rf   r   rh   r  r  r   re   rw  )rk   r  r  r  r  r  r  r   r   
fusion_resr  template_pw_nodes   ` ` `       rl   _try_fusion_pairszScheduler._try_fusion_pairs  s    	G 	G 	G 	G 	G 	G 	G 	G8 2 +	? +	?LE5 $#E5111''..E''..E #5%00ENd&@@@}}u.  ?33E5AA? "33E5AA
)5%2$.$:##)0	& & &N *%77 
@ %u~T5OOOOO266u~FFF+B5%+P+P(+3HHHFH12BC-.>?FF~VVVV1?.1?.!- ##E5+>>>W+	? +	?rn   c                t   t                      }|                                D ]}|                                \  }}|j        }||v st	          ||          r5|                    |           |                     |          |u sJ |                     |          |u sJ |                     ||||           d S rx   )r   r   r   rf   r  r  r=  r|  )rk   r  r  seen_pair_speedup_fnr  r  r  is_speedup_fns           rl   _finish_pending_fusionsz!Scheduler._finish_pending_fusions=  s    
 @J|| .4466 	S 	SN#1#B#B#D#D Iy*6M 4448J99 94  $$]333&&y11Y>>>>&&y11Y>>>>  I}kRRRR	S 	Srn   possible_fusionsdeferred_prologue_fusionsc                    t          d |D                       }g }|D ]H\  }}t          ||          r||v r|                    ||f           1|                    ||f           I|S )Nc                8    g | ]\  }}t          ||          |S r~   r  )r   n1n2s      rl   r   z6Scheduler._handle_template_overlap.<locals>.<listcomp>^  s,    MMMFB2DR2L2LMRMMMrn   )r   r  r   )rk   r  r  epilogue_template_nodesnew_possible_fusionsr  r  s          rl   _handle_template_overlapz"Scheduler._handle_template_overlapU  s     #-MM.MMM#
 #
  "& 	6 	6FB!"b)) 6b4K.K.K)00"b::::$++RH5555##rn   c                $   |                      |           t          |          }t                              t          j                  rLt                              d           |D ]/}t                              d|                                           0i }i }g }|                     ||          }t          j
        st          j        r|                     ||          }|                     |||||           |                     ||           |                     ||           |                                 |r/|                     |||||           |                     ||           t#          |d           }|                     |          }|S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                    | j         S rx   r  r   s    rl   ry  z+Scheduler.fuse_nodes_once.<locals>.<lambda>  s    !+ rn   r6  )r  r   r.  r  r  r%  r/  r  get_possible_fusionsr(   r  r  r  r  r  r  clearr;  r  )	rk   r  r  r  r   r  r  r  r  s	            rl   r  zScheduler.fuse_nodes_oncei  s    	!!%((( ''""7=11 	A;<<<# A A  )=)=)?)?@@@@  	
 OQ  	"  44
 

 # 	v': 	#<< ";    	!	
 	
 	
 	$$[/BBB//0E{SSS##%%%$ 	X"")%    334I;WWW{(=(=>>>..u55rn   r  c                  
 t          | j                  }d}t          | j                  }t                              d|           t          t                              |                     D ]_\  }}t                              |          }t          |          dk     r4|	||k    r n"| 	                    |          st                              d|           p|dz  }t          j        dk    }t          |d         j        |d|          
t                              d	t          |          |           |D ]}|                    |           |                    
           | j                            
fd

                                D                        | j                            |d                   }	|	
|	| j        
<   at+          |d           | _        |                     | j                  | _        t                              d||t          | j                             |                     | j                   dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr"  z0ComboKernels: Combining %d nodes for %d-th groupc                :    i | ]}|                                 S r~   r  )r   r  r  s     rl   r  z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>  s#    LLLq{LLLrn   c                    | j         S rx   r  r   s    rl   ry  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>  s    q{ rn   r6  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   r  r/  r  r  r  r  speedup_by_combo_kernelr(   r&  r  r  r{  r  r  r2  r   r  r  r;  r  r  )rk   r  r  r  num_nodes_orignumr  r  r   streamr  s             @rl   r  z#Scheduler.create_combo_kernel_nodes  sX    !,,TZ		FUUU'&DDTJJ
 
 "	: "	:NC 3CCINNI9~~!!'EL,@,@//	:: 		EsKKKQJE$;a?O4!&*. /	  K HHBI  
 " ) )""4((((OOK(((#**LLLLK4I4I4K4KLLL  
 (,,Yq\::F!39#K0K-B-BCCC
33DJ??
R
OO		
 	
 	
 	!!$*-----rn   c                D    |D ]}|                     | j                   d S rx   )r  r  )rk   r  r   s      rl   r  zScheduler.prune_redundant_deps  s5     	? 	?D%%d&=>>>>	? 	?rn   c                0   
 g 
t          t          t          t          f                              d
 fd}t          j        t
                    }|D ]J}                     |          r|                                D ]}||                             |           K|	                                D ]} ||           t          j        rnt          j        t
                    }|D ]0}t          |dd          }	|	r||	                             |           1|	                                D ]} ||                                
          

                     j        d	           t                               d
t%          
                     
S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  r   r5  c                   t          |           D ]\  }}| |dz   |dz   t          j        z            D ]}||f}|v r                    |                               ||          r                    |           M|                                s|                                r.                    ||          r                    ||f           d S rz  )r  r(   )max_fusion_buffer_group_pairwise_attemptsr  r   r   r*  r.  )	r  node1_indexr   r   r7  r  r  r  rk   s	        rl   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s'   &/&6&6 @ @"U"!Ok'F'G G @ @E
 !%.Cd{{ HHSMMM}}UE3CDD @(//4444++-- @1A1A1C1C @u&6J J @ )//???!@@ @rn   r   NT)r7  reversezfound %d possible fusionsr  r  r   r5  )r   r   r_   r  r   r  unfusable_noder   r   r   r(   aggressive_fusionr  *get_possible_fusions_with_highest_priorityr&  score_fusion_keyr.  r/  r   )rk   r  r  r  buffer_names_groupingr   r   node_groupinggroup_groupingr   r  r  s   ` `       @@rl   r  zScheduler.get_possible_fusions  s    % 13D DEFHH	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@( !, 7 = = 	8 	8D""4(( --// 8 8%c*11$77778299;; 	+ 	+MOM****# 	/(4T::N 7 7gt44 7"5)00666!/!6!6!8!8 / /....JJ
 
 	$"7FFF4c:J6K6KLLLrn   c                    t          t                               d fd|                                j                                        |                                j                                        z  |j        j                                        |j        j                                        z  z
  t           fdD                       }|r t          ||          d           |S )	z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        r   r_   r   ro   c                ,   t          | t                    r}| vry                    |            |                                                               rdS t          | j        z            p#t          fd| j        z
  D                       S dS )NFc              3  D   K   | ]} j         |                   V  d S rx   r  r   r  
found_pathrk   s     rl   r   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>:  sQ       H H #
4#:1#=>>H H H H H Hrn   )r   r   r  r   issubsetro   r   r   )r   combined_ancestorscombined_namesr  rk   visiteds    rl   r  z6Scheduler.will_fusion_create_cycle.<locals>.found_path)  s    $ 233 G8K8KD!!!++--667IJJ  !5   ?@@ C H H H H H!%2D!DH H H E E  5rn   c              3  D   K   | ]} j         |                   V  d S rx   r  r  s     rl   r   z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>H  s5      WWqJJt6q9::WWWWWWrn   zwill create cycler  )r   r   r   _dictr  r   r   r'  )rk   r   r   cycler  r  r  r  s   `   @@@@rl   r{  z"Scheduler.will_fusion_create_cycle  s    /022	 	 	 	 	 	 	 	 	 	2 %%''-2244''))/44667 	
 O!&&((5?+@+E+E+G+GG WWWWWDVWWWWW 	9#IeU##$7888rn   c                    ddl m d fd} ||          } ||          }t          fd|D                       }t          fd	|D                       }|                    |          }d
}	|D ]-}
	 |	t	          |
d                   z  }	# t
          $ r Y  dS w xY w                     ||          }t          j        j	        
                    |	d|z            rdS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   )buffer_reuse_keyr   r_   r   list[ir.Buffer]c                   g }| j         j        D ]n}j                            |j                  }|rKt          |j                  dk    r3|j                                        r|	                    |j                   o|S rz  )
r   r   rC  r  r   r   r  r   has_tensor_outputr   )r   r  r  r   rk   s       rl   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsd  s~     F&, , ,&**2733 ,3sy>>Q..383M3M3O3O.MM#(+++Mrn   c              3  .   K   | ]} |          V  d S rx   r~   r   r   r  s     rl   r   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>r  /      #S#Sc$4$4S$9$9#S#S#S#S#S#Srn   c              3  .   K   | ]} |          V  d S rx   r~   r  s     rl   r   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>s  r  rn   r   r   FrF  T)r   r_   r   r  )rD  r  r   intersectionr   r  ri  rZ   r   r   statically_known_gt)rk   r   r   r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr7  	bw_savingr  s   `           @rl   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memoryM  sG   * 	655555	 	 	 	 	 	 10770077##S#S#S#S]#S#S#SSS##S#S#S#S]#S#S#SSS*77GG$ 	 	C3s1v;;.   uuu ,,UE::	 7//iPP 	4us   7B
BB	thresholdc                0   t          d |                                D             d |                                D             z             }t          d |j        j        D                       }t          d |j        j        D                       }||z  }t                      }|j        j        D ]7}	|                     |	j        |          r|                    |	j                   8t          d |j        j        D                       t          d |j        j        D                       z  }
t          d |j        j        D                       t          d |j        j        D                       z  }|
|z
  }||z
  }||z  }t          |          |k    S )	Nc                6    g | ]}|                                 S r~   r  r  s     rl   r   zFScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<listcomp>  s     ;;;T]]__;;;rn   c                6    g | ]}|                                 S r~   r  r  s     rl   r   zFScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<listcomp>  s     ===4t}}===rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s$      &T&TCsx&T&T&T&T&T&Trn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  $      %R%R3ch%R%R%R%R%R%Rrn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s5       $
 $
CH$
 $
 $
 $
 $
 $
rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  r  rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s5       %
 %
CH%
 %
 %
 %
 %
 %
rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s$      DDCsxDDDDDDrn   )	r   r   r   r  r   $can_buffer_be_removed_through_fusionr   r  r   )rk   r   r   r  fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionr  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                  rl   (fusion_prevent_too_many_reads_and_writesz2Scheduler.fusion_prevent_too_many_reads_and_writes  s    &;;):):;;;==5??+<+<===>
 
 '&T&T5;L;S&T&T&TTT%%R%R%:K:Q%R%R%RRR'7:K'K$ :D%*1 	B 	BI88 0  B .11).AAA $ $
 $
 % 1 7$
 $
 $
 
 
CC5+<+BCCCCCD
 % %
 %
 % 1 8%
 %
 %
 
 
DD5+<+CDDDDDE
 &(DD (*GG )=8$%%	11rn   c                    t          t          |j        |j        z
            t          |j        |j        z
                      }|dk    S )aA  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heuristic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  rf  rg  )rk   r   r   proximity_scores       rl   are_long_distant_nodesz Scheduler.are_long_distant_nodes  sH    * %/122%/122
 
 ##rn   common_buf_names!tuple[str, ...] | OrderedSet[str]c                   i }d |j                                         D             }d |j                                         D             }|D ]}t          j                            |          }||         }	||         }
t          |	t                    rt          |
t                    s&dt          |	           dt          |
           ||<   |	                                |
                                k    r0d|	                                 d|
                                 ||<   t          |	j
                  t          |
j
                  k    rd||<   |	                                }|
                                }||k    rd| d| ||<   H|	                                |
                                k    rd|	 d|
 ||<   d	}t          |t          j                  s
d
|j         }d|	 d|
 d| ||<   t!          |          S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                    i | ]
}|j         |S r~   r  r  s     rl   r  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>      XXXC#(CXXXrn   c                    i | ]
}|j         |S r~   r  r  s     rl   r  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>  r  rn   znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  rZ   r   r  r   r4   r   r   rY   r   
get_offsetnormalize_with_stride_orderr+   r  r)  r   )rk   r   r   r  reasonsnode1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                 rl   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason  sY    XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX( ,	 ,	H'$$X..C$X.G$X.Ggy11 GY9W9W Jd7mmJJ4==JJ !   ""g&7&7&9&999X(9(9(;(;XX7CTCTCVCVXX !  W\**mGL.I.III$/!((**G((**G'!! %R$Q$Q$Q$Q! 335566889 9 %VW$U$UG$U$U! Jc2#566 54
44
I7II'IIZII H 7||rn   c                0   t           j        sdS t          d ||fD                       rdS |j                                        }|j                                        }||z  }|sdS t          d |j        D                       }||z
  rdS t          |          dk    rdS t          |j        j                  dk    st          |j        j	                  dk    rdS t          t          |j        j                            }t          t          |j        j	                            }t          |t                    rt          |t                    sdS d |j        j	        D             }	|j        |	vrdS |	|j                 }
t          |
t                    sdS |
                                }
|
j        |j        k    r|
j        |j        k    rdS |j        |j        k    st          |j                  dk    rdS t          |j        j                  dk    rdS |j        j        rdS d|j        j        v rd|j        j        v sJ t          d	 |j                                        D                       }t          |          dk    rdS t          t          |                    }||j        j        d         k    rd}d}n||j        j        d         k    sJ d}d}d
dlm} |j        j        d
         }t          |          dk    rdS g }t4          j                            |          D ]9}|                    t<          j        j         !                    |                     :tE          |          } |||d
                   }|dS |j        j        |         |j        j        |<   ||j        j        |<   |#                    dd           | $                    ||          }t          |tJ                    sJ tL          '                    d|           |S )aW  
        Attempts to enable fusion between two nodes by inverting indexing patterns.

        This optimization targets cases where node1 has a contiguous write and
        node2 has a contiguous write but discontiguous read. By inverting the
        indexing in node2's read and write operations, we can make them compatible
        with node1 for potential fusion.

        Args:
            node1: First scheduler node (source)
            node2: Second scheduler node (target for inversion)

        Returns:
            int: Fusion score if successful, 0 if optimization not applicable
        r   c              3  >   K   | ]}|                                 V  d S rx   r!  r  s     rl   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>%  s*      22aqxxzz222222rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>1  s5       .
 .
CH.
 .
 .
 .
 .
 .
rn   r   c                    i | ]
}|j         |S r~   r  r  s     rl   r  zBScheduler.shared_data_after_inverting_indexing.<locals>.<dictcomp>F  s    JJJ##JJJrn   r   index0index1c              3     K   | ]}|V  d S rx   r~   )r   r  s     rl   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>l  s"      %T%Ttd%T%T%T%T%T%Trn   r   )generate_inverse_formulaNTFz!Shared memory after inversion: %d)(r(   r  r   r   buffer_namesr   rt  r   r   r  r  r  r   r4   r   rt  r   r   	var_namesr  r  	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr  varsr   Add	make_argsr   rZ   r   r   combine_modular_indexing_pairsr   r  ri  r   r.  r  )rk   r   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writenode1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr  r  simplified_termstermsimplified_read_exprinverse_formulascores                         rl   $shared_data_after_inverting_indexingz.Scheduler.shared_data_after_inverting_indexing  s   & 4 	222E5>22222 	2 #.;;==".;;==03EE" 	2 $. .
 .
 % 8.
 .
 .
 $
 $
  $&88 	2'((1,,2 u &''!++s53D3K/L/Lq/P/P2$u067788
4 1 899::*i00 	
9
 9
 	 2JJ1B1IJJJ?,..2":?3+y11 	2 "++-- !222 K$4442?k...#j6J2K2Kq2P2P2 u{)**a//2 ;  	2 222EK66667
 &%T%Tu{7Q7Q7S7S%T%T%TTT  A%%2.//00	 28<<<&O' :8 DDDDD&O'QQQQQQ[%a(
z??a2I''	22 	 	D## ??EE     ##344223GTUWW "2
 7<k6P7
"?3 8G"#34 	""4///((66%%%%%%;UCCCrn   c                   t           j        rt          d ||fD                       rdS |                                s|                                rdS |j                                        }|j                                        }||z  }|sdS d |j                                        D             }d |j                                        D             }g }|D ]}	||	         }
||	         }|
                                |                                k    rN|                    t          j
        j                            |
                                d          |
|f           t          |          dk    rdS t          |t!          j        d                    \  }}
}t%          |
t&                    rt%          |t&                    sdS |
j        |j        k    rA|
                                |                                k    r|                     |
          S dS d}|                                s|                    |
|          }nk|                                s|                    ||
          }n@t2                              d	|                                |                                           |r.t9          j        t<          |                     ||                    ndS )
a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c              3  >   K   | ]}|                                 V  d S rx   r  r  s     rl   r   z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>  s;       8
 8
AHHJJ8
 8
 8
 8
 8
 8
rn   r   c                    i | ]
}|j         |S r~   r  r  s     rl   r  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>  r  rn   c                    i | ]
}|j         |S r~   r  r  s     rl   r  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>  r  rn   r   r   r6  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s) r(   ry  r   r*  r   r  r  r   r   rZ   r   r   r   r   r   r  rf  rg  r   r4   r  rt  dep_size_hintr   r  r  r/  r  rg  rh  r   ri  )rk   r   r   r  r  r  r  r  
candidatesr   r  r  _numel	reordereds                 rl   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loop  s     0 	C 8
 8
!&8
 8
 8
 5
 5
 	 2
  	%"3"3"5"5 	2".;;==".;;==03EE" 	2XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX 
. 	 	K$[1G$[1G335566889 9 !!(::#--//! ;       z??a2 $'zx7J17M7M#N#N#N '9-- 	Z5S5S 	2w///
   ""g&7&7&9&999))'2222	!!## 		77IIII##%% 	77IIII##Q       FKT55eUCCDDD	
rn   c                t   t          |t                    r*|                                 ot          |j                   S t          |t
                    rct          |j        t          j                  r|j                                         S |                                 ot          |j                   S dS )z>
        Is this node unfusable under any conditions.
        F)	r   rN  r*  rW   r   r  r+   rQ  rR  r  s     rl   r  zScheduler.unfusable_node  s     d233 	''))) 2U	3 3 /  d566 	$)R%?@@ 99668888''))) 2U	3 3 /  urn   prologue_noder  r  r'  c                B   |                                 t          j        j        k    rdS |                                }|                                }d}|||z  k    r |d           dS t          d |                                D                       }|t          j	        j
        j        j        fk    r |d           dS dd}|                                }	|	                                s1 ||	j                  r!|                                s |d           dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]9}|j         0|j                                         D ]}|j        dk    |j        V  :d S )Ncall_function)r   rd  r  re  )r   r  r   s      rl   r   zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>+  sf       
 
v!V'')) "!t&&	 H '&&&&
 
rn   z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsr  torch.dtyper   ro   c                &    | j         dk    o| j        S )Nr   )itemsizeis_floating_point)r  s    rl   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp8  s    >Q&B5+BBrn   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  r=  r   ro   )r   rZ   r   invoke_quant_opsrw  ry  r   r   r  r  r  constant_pad_ndr{  r  rV   r  r  )
rk   r8  r  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERrj  rA  template_bufs
             rl   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  sv    ,,..!'2JJJ4"88::
#::<< &)"'AABBCRSSS5  
 
",,..
 
 
 
 
 uy~5=???Cn   5	C 	C 	C 	C %??AA6688	L.//	 ">>@@	
 Ch   5trn   ,tuple[int, SchedulerNode, sympy.Expr] | Nonec                    t          |t                    rt          |t                    sdS t          |j        t          j                  rt          |j        t          j                  sdS |                                s|                                rdS t          j        dk    rdS |j        |j        }}|\  }}|\  }}|	                                s:|	                                s&||k    s t          |          t          |          k    rdS t          |j        j                  dk    st          |j        j                  dk    rdS                      t          t          |j        j                                      }	                     t          t          |j        j                                      }
t!          |	|
          t          j        k    rdS d
 fd} ||          s ||          rdS g }t%          t'          ||                    D ]#\  }\  }}||k    r|                    |           $t          |          dk    rdS |d	         }||         ||         }}t*          j        j                            ||          r|||fS t*          j        j                            ||          r|||fS dS )ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        Nr  r   r   r_   r   ro   c                .   | j         j        D ]}|j        j        v rj        |j                 }nj                            |j                  }|rBt          j        j        	                    ||           rt          |j        t                    s dS dS rz  )r   r   r   rB  rC  r  rZ   r   r=  rM  r   r  rN  )r   rT  rU  rk   s      rl   has_reusable_bufferzIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    (.    9 ;;; $ ;DI FII $ 0 4 4TY ? ?I  ,66y$GG  'y'<>TUU 
  445rn   r   r  )r   r   r   r+   r   r  r(   r  rj  r   r   r   r  r2  r  r  r  small_memory_access_thresholdr  r  r   rZ   r   r   statically_known_lt)rk   r   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryrL  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  rl   "get_expand_dim_for_pointwise_nodesz,Scheduler.get_expand_dim_for_pointwise_nodesH  s    %// 	z%7W7W 	4 uz2#455	5:r'899	 4 ))++ 	u/M/M/O/O 	4 ))4 #\5<()1&)1&  	!!##	 /11=!!S%7%7774 u '((1,,E4E4L0M0MPQ0Q0Q4 "//T%:K:R5S5S0T0TUU!//T%:K:R5S5S0T0TUU"$67723 3 4	 	 	 	 	 	  u%% 	)<)<U)C)C 	4 !'0]M1R1R'S'S 	0 	0#C#'7'!!#**3///"##q((4*1-,',' ' 7//OO 	66W11..QQ 	664rn   FTcan_reorderr]  c                   u rdS |                                  r@| j                                      }| j                                      }|
|||k    rdS t          t                    r                              S t          t                    rdS t                    }                                r=|                     	                                          
                              rdS t          t                    st          t                    r |d           dS t          t                    r!                                s |d           dS t          t                    rt          j        t          j                  s |d           dS j                                        s |d           dS t          t$                    s |d           dS t          j        t&                    s |d	           dS t          j        j        t*                    s |d
           dS t-          j        j                  dk    sJ j        j        d         j        t3          fdj        j        D                       r |d           dS j        j                                        }|D ]=}	j        j                            |	          }
t3          d |
D                       r dS >t-          j        j                  dk    sJ j        j        d         j        j        j        k    r |d           dS d(fdt3          fd| j         D                       rdS t          t          t          f          r!                                s |d           dS !                                j"        z  r |d           dS                                 rCtG                    s |d           dS $                                s                                r |d           dS %                                }|&                                }|s |d           dS tO          d |j(        D                       |z
  })                                |z  r |d           dS *                                s*                                r |d           dS +                                dd         D ]J}|,                                }|D ]1}t[          fd|j.        D                       s |d             dS 2Kt          t^                    sgnd  j0        D             }t-          |          dk    sJ |d         }t-          d         j1                  dk    rNt-          d         j1        d         j.                  dk    r%d         j1        d         j.        d         j        |u s |d!           dS | 2                    |          sdS                                 r*                                s#$                                stg                    s |d"           dS 4                                }|J |5                                r,t          j        t          j                  s |d#           dS )                                tl          j7        j8        z  s&)                                tl          j7        j8        z  r |d$           dS 	                                }	                                }||k    r |d%||           dS ~| 9                    |&          }t          |tt                    sJ |r:|tv          j<        k     r*tv          j=        r| >                              }|dk    r|}tv          j?        ra| @                              x}rI|\  }}}|A                    ||           | 9                              }t          |tt                    sJ tv          jB        r.|tv          j<        k     r| C                              }|dk    r|}t          E                    t          jG                  rAt          H                    d'I                                I                                |           tl          jJ        K                    | |          sdS !                                j"        z  ra| L                              oJtl          jJ        L                    | |          o(|                     |          L                              S tl          jJ        M                    | |          o(|                     |          M                              S ))zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FNTz/grouped node must not be fused with other nodesznode1 is nopz'node1 is extern but not a triton kernelz5node1's triton kernel doesn't support epilogue fusionz.node1 is extern but node2 is not SchedulerNodez3node1 is extern but node2.node is not SchedulerNodez4node1 is extern but node2.node.data is not Pointwiser   r   c              3  .   K   | ]}|j         k    V  d S rx   r  )r   r   written_buffer_names     rl   r   z%Scheduler.can_fuse.<locals>.<genexpr>  s+      VVs3822VVVVVVrn   z9epilogue reads from buffers other than the mutated outputc              3  "   K   | ]
}|d k    V  dS )r  Nr~   )r   usages     rl   r   z%Scheduler.can_fuse.<locals>.<genexpr>  s&      ;;5u;;;;;;rn   z*node1 and node2 uses different buf layoutsr   r_   c                @    | uo| uo|                                  v S rx   r5  )r   r   r   rb  s    rl   ._is_other_node_that_references_mutation_bufferzJScheduler.can_fuse.<locals>._is_other_node_that_references_mutation_buffer   s:      u, N#50N+z/K/K/M/MMrn   c              3  .   K   | ]} |          V  d S rx   r~   )r   r   rf  s     rl   r   z%Scheduler.can_fuse.<locals>.<genexpr>	  sB         ?>tDD     rn   znode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz'template has no allowed prologue inputsc              3  >   K   | ]}|                                 V  d S rx   r  )r   r  s     rl   r   z%Scheduler.can_fuse.<locals>.<genexpr>*  s*      EEc3<<>>EEEEEErn   z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesr   c              3  *   K   | ]}|j         v V  d S rx   r   )r   r0  prologue_nodess     rl   r   z%Scheduler.can_fuse.<locals>.<genexpr>:  s*      QQttyN:QQQQQQrn   z7template prologue can only fuse nodes with a single usec                :    g | ]}|                                 |S r~   r  r  s     rl   r   z&Scheduler.can_fuse.<locals>.<listcomp>A  s%    AAAAAaAAArn   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz6multi-output template epilogue requires ComputedBufferz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r\  z%s and %s has %s shared data)r   r_   )Nr  r  r  r   rW  rn  r'  r*  rw  r   can_fuse_multi_outputs_templater  rN  r  r   r+   rQ  rR  r   r   r   r   r   mutation_outputsr   r   r   r   inner_fn_free_symbolscollect_inner_fn_symbol_usagerT  r)  r  r   r   r}  r   r  get_allowed_prologue_inpsr   r  r	  r  r   r  r   r  r   r   rm  rH  rx  r  rV   rZ   r   no_fuse_buffer_namesri  r   r(   score_fusion_memory_thresholdry  r6  $expand_dimension_for_pointwise_nodesr^  r  r  r-  r  r  r  r%  r/  r  rd  r   can_fuse_verticalcan_fuse_horizontal)rk   r   r   r_  r]  rv  stream2r  node2_inner_fn_free_symbolssymbolusagesr  r/  unsupported_prologue_argsr   	node_outsr   template_snodestemplate_snoderG  r   device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerf  rj  rb  s    ``                         @@@rl   r   zScheduler.can_fuse  sc    E>>5 '')) 	)--e44G)--e44G"w':w'?Q?Que455 	.&&u---e455 	 5u%% 	4#3#3$
 $

)
)%
7
7	 4e122 	j'7
 7
 	 CABBB5e344 	U=N=N=P=P 	C5e677 7	ej"*DEE =>>>u://11 KLLLue]33 DEEEuej.99 IJJJuejoy99 JKKKuuz233q8888"'*"=a"@"E VVVVe>O>UVVVVV OPPPu +0*/*O*O*Q*Q'5 ! !FFvNN;;F;;;;; ! 55! uz.//14444z&q)0EJ4EEE@AAAu             J      u u8:PQRR	%%''	 C()))5$$&&8 	C,---5 7	.u55 0111u!!## u'8'8':': HIIIu7799H$,$F$F$H$H!( =>>>u EEX_EEEEE'( &
 %%''*CC QRRRu--// 53Q3Q3S3S PQQQu"__..N&ss+ % % ,,..	$ % %CQQQQsyQQQQQ %UVVV$uuu%% "%);<<BAAAAA 
 ''1,,,,,Q/N N2&.//144r*215;<<AA"2&.q17:?>QQ[   u@@sSS u 	..00%%'' 3599
 5666u 2244L+++5577 

B-A A  LMMMu""$$qw'CC 	""$$qw'CC	 C56665!!##""$$WC,fg>>>5 4454M 5 
 
 +S11111 	:!F$HHH1 I %)$J$J5RW$X$X!$))$9!6 	6#FFueTTTO	6 7F3Z{<<ZUUU $ 8 8 F F/55555 1	:!F$HHH$($M$Mu% %! %))$9!))'-88 	##.    !	   y!!$u6GHH 	5$$&&8 
	M &&ue44 MI//eUDUVVM$$V,,>>ueLL 900eU$5  M""6**>>ueLLMrn   c                   |                                 }t          ||          }t          t                    }|j        D ]o}| j                            |j        |j                  }t          |t                    r| 
                    |||          rT||                             |           p|j        j        D ]}t          |t                    st          |t                    s-|                    | j                            |j        |j                            }	|	r|	D ]}
t          |t                    r,|                     |
|          r|	                    |
           Ct          |t                    r1|                     |
||j                  r|	                    |
           t)          d t*          j                            |                                          D                       }||z  r |d           dS |                                }|D ]D}| j        |                                         }|| j        |         j        z  r |d            dS EdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s8       $
 $
 H$
 $
 $
 $
 $
 $
rn   zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r	  r'  r   r  rt  rj  r  r   r   r6   rF  r   r   r  r4   r5   fusable_read_and_writer{  .fusable_stardep_write_and_read_on_empty_tensorr   r   r  r  r  r   r   rC  r   r  r   )rk   r   r   node1_buf_namesr  remaining_deps_by_namer   r   cd	remainingr  remaining_depsnode1_op_namesr  s                 rl   rt  zScheduler.can_fuse_vertical  s     0022u%%7B47H7H+ 	5 	5C(,,SXsx@@D#w'' D,A,A#ue,T,T "4(//4444#* 	- 	-Bb),, ZG5L5L .22%))"'27;; I  -# 
- 
-B!"i00 	-T5P5PB6 6 	- "((,,,,#G  -MMB
 -
 "((,,,# $
 $
 445K5R5R5T5TUU$
 $
 $
 
 

 O+ 	
 C+,,,52244" 	 	D&t,==??G 7 @ JJ >???uu trn   weak_depr6   c                  	
 j         |                                vrdS fd|j        j        D             }t	          |          dk    rdS |d         
t          
t                    rdS t          
t                    sJ t          
j	        t          j                  rdS t          
j                  
j	        j        k    sdS | j        j                 	|g}t          |t"                    r|j        }d}|D ]@}	fd|j        j        D             }|s|dz  }t)          
fd|D                       s dS A|dk    S )NFc                4    g | ]}|j         j        k    |S r~   )r   rE  )r   writer  s     rl   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>  s3     
 
 
zX222 222rn   r   r   c                *    g | ]}|j         k    |S r~   r  )r   rT  r  s     rl   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>  s0       9	)) )))rn   c              3     K   | ]Y}t          |t                    o?t          |j        t          j                   o|j        j        k    o|j        j        k    V  Zd S rx   )r   r4   r#   r   r%   TMPr   )r   rT  r  s     rl   r   z-Scheduler.fusable_weak_dep.<locals>.<genexpr>  s        
 	 4++ ,+DJAAA,J%+-, I+	     rn   )r   r	  r   r  r   r   r5   r4   r#   r   r%   r  r   r  r"   r  rE  r  r   r   r   )rk   r  r   r   mutating_writesrelevant_reading_nodesnum_concurrent_readsreading_noderelevant_readsr  r  s    `       @@rl   rF  zScheduler.fusable_weak_dep  s    = 6 6 8 8885
 
 
 
*1
 
 

 1$$5"eW%% 	5%+++++u{DH55 	5
 %/**ek.FFF5+H,AB	"'e788 	2%*\" 2 	 	L   (4:  N
 "  A%     
 +      uu $q((rn   rT  r3   r  r4   c                   t          |t                    r4| j                            |j        |j                  }||j        k    s>t          |j        t          j                  st          |j        t          j                  rdS t          j
        r8|j        |j        k    r(|                                }|                                }|                     |j                  rdS |j        |j        k    oSt          |j                  t          |j                  k    o)|j        d t          |j                           |j        k    S t          |t"                    ri| j                            |j        |j                  }| j                            |j        |j                  }|j        |j        k    r|j        ||k    rdS dS rI  )r   r4   rj  r  r   r#   r   r%   r  r(   ry  r  rt  r  r  r   r   r5   )rk   rT  r  	read_name
write_names        rl   r  z Scheduler.fusable_read_and_write  s   dI&& #	-11$)TYGGI UZ''&tz48<< (&u{DH== ( u0 *T]en5T5T ~~'')) 11%*== u 
ek) ?	NNc%*oo5?I/EJ/0EJ>
 g&& 	-11$)TYGGI.225:uzJJJ	UZ''J*++turn   r5   writing_noderk  c                8   t          |t          j                  sdS |                                sdS | j                            |j        |j                  }| j                            |j        |j                  }t          |t                    r||k    rdS dS rI  )r   r+   rQ  rR  rj  r  r   r5   )rk   rT  r  r  r  r  s         rl   r  z8Scheduler.fusable_stardep_write_and_read_on_empty_tensorF  s     ,(BCC 	5--// 	5)--diCC	*..uz5:FF
eW%% 	)z*A*A4urn   r   rf  c                B    t           j                            ||          S rx   )rZ   r   get_dep_size_hint)rk   r   rf  s      rl   r2  zScheduler.dep_size_hintS  s    w((k:::rn   return_is_mix_order_reductionint | tuple[int, int, bool]c           	     h    fd}|rCt                               |          r(t                               |          } ||dd          S t          |j        t
          j                  r|j                                        s(|                                s                                r|j	        j
        |j	        j        z  }j	        j
        j	        j        z  }	dd}
d}|D ]L}|	D ]G} |
||          r9|t                               |                               |                    z  }HM ||dd          S t          |j	        j
                  t          |j	        j                  z   }t          j	        j
                  t          j	        j                  z   }t          ||          d	z  t          ||          k     rW||k    r|c}fd
|j	        j
        |j	        j        z  D             } |t!           fd|D                       dd          S |j	        j
        |j	        j        z  j	        j
        j	        j        z  z  }t!           fd|D                       }d}|dk    r,                     |          r                     |          } |||d          S )a4  
        The first term in our fusion score that estimates number of saved
        memory operations.

        This function scores fusion candidates based on shared memory access patterns.
        Higher scores indicate better fusion candidates.

        Scoring strategy:
        1. If nodes share exact memory deps (same buffer + same indexing), return
           the sum of shared dep sizes (original behavior).
        2. If no exact matches (score == 0), check for same-buffer reads with
           different indexing (e.g., split operations reading different slices).
           - Give bonus if nodes read from exactly the same set of buffers
           - Score based on overlap ratio: common_buffer_size / total_read_size
           - High overlap (>50%) suggests good cache locality benefit from fusion
        c                    r| ||fS | |z   S rx   r~   )r,  buffer_overlap_scoreis_mix_order_reductionr  s      rl   _construct_return_valuez>Scheduler.score_fusion_memory.<locals>._construct_return_valueo  s)     - M35KLL///rn   r   Tdep1r3   dep2c                    | |k    rdS t          | t          t          f          r,t          |t          t          f          r| j        |j        k    S dS rz  )r   r5   r4   r   )r  r  s     rl   _matchz-Scheduler.score_fusion_memory.<locals>._match  sX    4<<4dWi$899 2j7I.? ? 2  9	11urn   Fr8  c                L    g | ] }|j         j        v s|j         j        v |!S r~   )r   r   r  )r   r   r   s     rl   r   z1Scheduler.score_fusion_memory.<locals>.<listcomp>  sD       %+111SE<M<T5T5T 5T5T5Trn   c              3  D   K   | ]}                     |          V  d S rx   r2  )r   r   rf  rk   s     rl   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s3      IISD&&sK88IIIIIIrn   c              3  B   K   | ]}                     |          V  d S rx   r  r  s     rl   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s1      JJD&&s++JJJJJJrn   )r  r3   r  r3   )r   r   r   r   r   r+   rQ  rR  r*  r   r   r  r  r2  r   r  r   _can_use_buffer_overlap_scoring&_score_fusion_memory_by_buffer_overlap)rk   r   r   rf  r  r]  r  r,  
node1_deps
node2_depsr  	node1_dep	node2_depnode1_dep_lennode2_dep_lenr  common_memory_depsr  s   ` ```             rl   ri  zScheduler.score_fusion_memoryV  sG   2	0 	0 	0 	0 	0 % 	;):)C)CE5)Q)Q 	;
 &66ueDDE**5!T::: 5:r'ABB	< J0022	<
   ""	<   ""	< *053D3KKJ*053D3KKJ    E'  	!+  Ivi33  ..y994;M;Mi;X;X" "  +*5!U;;;E-344s5;L;S7T7TTE-344s5;L;S7T7TT }m,,q03}m3T3TTT},,$eu    ,2U5F5MM  D +*IIIIIDIIIII1e   $/58I8PP#e&7&>>
 JJJJ7IJJJJJ
  !A::$>>ueLL:#'#N#Nu$ $  '&u.BEJJJrn   c                r   |                                 s|                                 rdS |                                s|                                rdS t          j        st          j        rI|                                }|                                }|r|sdS t          d |D                       }t          d |D                       }t                      }|D ]}|j        D ]}	t          |	j	        t                    r|	j	                                        rt          |	j	                  r|	j	                                        }
|
Nt          |
t          j                  r4|
                                }||z  r|                    |	j	                   |                    |	j	                   ͌|r|D ]}|j        D ]}	t          |	j	        t                    ry|	j	                                        r`|	j	        |v rW|	j	                                        }
|
8t          |
t          j                  r|
                                }||z  r  dS   dS ||fD ]Y}|j        j        D ]J}| j                            |j                  }|'|                                rt-          |          r  dS KZdS )a@  
        Check if buffer overlap scoring should be used for this node pair.

        Buffer overlap scoring handles split/cat patterns where nodes read from
        the same buffer at different indices. We skip it when:
        - Either node is a reduction (different memory access patterns)
        - Either node is a template
        - Both nodes are prologue/epilogue candidates for the same template,
          because horizontal fusion would prevent them from being absorbed
          into the template kernel. For example, in:
            q = a[:64, :]; k = a[64:, :]
            return mm(q + 2, k - 2)
          "q + 2" and "k - 2" both read from `a` and would get a high overlap
          score, but fusing them horizontally prevents prologue fusion into mm
          (resulting in 2 kernels instead of 1).

        We allow buffer overlap scoring when:
        - The node outputs are not actually in the template's allowed_prologue_inps,
          meaning they can't be prologue-fused anyway, so horizontal fusion doesn't
          prevent any optimization opportunity.
        FTc              3  >   K   | ]}|                                 V  d S rx   r  r  s     rl   r   z<Scheduler._can_use_buffer_overlap_scoring.<locals>.<genexpr>  *      +T+TsCLLNN+T+T+T+T+T+Trn   c              3  >   K   | ]}|                                 V  d S rx   r  r  s     rl   r   z<Scheduler._can_use_buffer_overlap_scoring.<locals>.<genexpr>  r  rn   )r   r*  r(   r  r  r  r   r  r   r   r_   r}  r  r+   r`  rp  r  r   r   r  r  r   rx  )rk   r   r   node1_outputsnode2_outputsnode1_output_namesnode2_output_names&node1_prologue_eligible_template_usersr   r0  r  allowed_inpsr   r   r  s                  rl   r  z)Scheduler._can_use_buffer_overlap_scoring  s?   4  	5#5#5#7#7 	5 	%"3"3"5"5 	5 C	%&": C	%!--//M!--//M !  t!++T+Tm+T+T+T!T!T!++T+Tm+T+T+T!T!T
  3 % R RI R RD"49.?@@R I1133R 8	BBR )-	(C(C(E(E(4)2+B: :4 ,9+R+R+T+TL1L@ V F J J49 U U U CFFtyQQQ%R, 6 -( - -C #	 - -&ty2CDD- $	 5 5 7 7- !%	-S S S -1I,G,G,I,IM,8Z -r/F> >8 0=/V/V/X/X#5#D !1+0555!1 (-uuu!-$  % %+1 % %C#6::38DDH ,$0022 -7AA -  %uuu% trn   c                h    dd fdt          d |j        j        D                       }t          d |j        j        D                       }||z  sd	S t          fd
|j        j        D                       }t          fd|j        j        D                       }t	          ||          }|d	k    rd	S t          fd|j        j        D                       }t          fd|j        j        D                       }	t	          ||	          }
|
|z  }|t
          j        k    r|
nd	S )a8  
        Score fusion based on buffer name overlap when exact dep matching fails.

        This handles the split/cat fusion case where nodes read from the same buffer
        but at different indices (e.g., different slices from a split operation).

        Scoring logic:
        - If nodes read from exactly the same buffers: high bonus (encourages fusion)
        - For common buffers: score based on overlap ratio
          - overlap_ratio = common_buffer_size /
            max(node1_total_reads, node2_total_reads)
          - If overlap_ratio > threshold (e.g., 0.5): give proportional score
          - If overlap_ratio < threshold: minimal/no score (not worth fusing)

        Note on dynamic shapes:
        - When deps have unbacked symbols (dynamic shapes), dep_size_hint returns 0
        - In this case, we use count * 10 as a proxy for size
        - This ensures fusion still works for models with dynamic batch sizes

        Note on multiple deps from same buffer:
        - A node may have multiple MemoryDep entries for the same buffer name
          (e.g., 4 split reads from arg0_1 at different indices)
        - We sum ALL dep sizes for each buffer, not just take max
        - This ensures overlap ratio is calculated correctly when nodes read
          multiple slices from the same underlying buffer
        r  r   r3   r   r   c                B                         |           }|dk    r|nS r   r  )r   r   FALLBACK_DEP_SIZErk   s     rl   get_dep_sizezFScheduler._score_fusion_memory_by_buffer_overlap.<locals>.get_dep_sizeK  s)    %%c**D!8844)::rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>O  r  rn   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>P  r  rn   r   c              3  .   K   | ]} |          V  d S rx   r~   r   r   r  s     rl   r   zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>Y  >       $
 $
"%LL$
 $
 $
 $
 $
 $
rn   c              3  .   K   | ]} |          V  d S rx   r~   r  s     rl   r   zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>\  r  rn   c              3  @   K   | ]}|j         v  |          V  d S rx   r  r   r   common_namesr  s     rl   r   zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>g  F       %
 %
x<'' L''''%
 %
rn   c              3  @   K   | ]}|j         v  |          V  d S rx   r  r  s     rl   r   zCScheduler._score_fusion_memory_by_buffer_overlap.<locals>.<genexpr>l  r  rn   )r   r3   r   r   )r   r   r   r   r  r(   min_overlap_ratio)rk   r   r   node1_read_namesr  node1_total_read_sizenode2_total_read_sizemax_total_read_sizenode1_common_read_sizenode2_common_read_sizecommon_read_buffer_sizeoverlap_ratior  r  r  s   `           @@@rl   r  z0Scheduler._score_fusion_memory_by_buffer_overlap+  s   < 	; 	; 	; 	; 	; 	; 	; &%R%R%:K:Q%R%R%RRR%%R%R%:K:Q%R%R%RRR (*:: 	1 !$ $
 $
 $
 $
).):)@$
 $
 $
 !
 !
 !$ $
 $
 $
 $
).):)@$
 $
 $
 !
 !
 ""79NOO!##1
 "% %
 %
 %
 %
 %
(.%
 %
 %
 "
 "

 "% %
 %
 %
 %
 %
(.%
 %
 %
 "
 "
 #&&<>T"U"U 02EE
 (58P'P'P##VW	
rn   c                $   t          |          dk    r|S i }|D ]\  }}|                                |                                k    sJ |                                }t          |                     |                              ||                    }||vr	||fg||<   ||                             ||f           t          |                                t          j	        d                    d         }t          |          dk    sJ |S )Nr   r6  r   )
r   r   r   rw  get_fusion_pair_priorityr   r  r  rf  rg  )rk   r  "possible_fusions_group_by_priorityr   r   r   fusion_pair_priority&possible_fusions_with_highest_prioritys           rl   r  z4Scheduler.get_possible_fusions_with_highest_priority  sY   
   A%%##  	+ - 	 	LE5##%%)9)9););;;;;%%''F#&  ((AA%OO$ $  $+MMMENL23GHH 33GHOOEN    25.4466H<OPQ<R<R2
 2
 2

2. 9::Q>>>>55rn   r   r   c                0    t          j        j        | g|R  S )z-
        Shim for list.sort(key=...)
        )rZ   rd  score_fusionr  s     rl   r  zScheduler.score_fusion_key  s     y%d3U3333rn   c                    t          t          j                                                  }t	          | j                  D ]7}|                    || j                   |                    |j	                   8dS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rZ   r   r  r   r  r  r  r2  rc  )rk   r  r   s      rl   r  zScheduler.compute_last_usage  sv    
 ))A)A)C)CDDTZ(( 	8 	8D 3T5LMMM&&t7777	8 	8rn   c                D   t          | j        t          j        j        z
  t          j        j        j        z
            D ]G}|| j        v rK| j        |         }|                                r)t          j        j        	                    |j
                   W|t          j        j        v rt          j        j        |         }t          |t          j                  r%t          j        j        	                    |           t          |t          j        t          j        f          r|j        }t          |t          j                  r|                                sJ t          j        j        	                    |j                   I| j                                         dS )z*Free any buffers that are no longer neededN)r;  r  rZ   r   rL  r=  freedrC  rL  codegen_freer   r  r   r+   r  r  r  r   r  is_input_bufferr  )rk   r   r   r  storages        rl   free_bufferszScheduler.free_buffers  sj   %g%&g"()
 
 	D 	DD
 t'''&t,<<>> @G(55ch???---g*40c2#566 	DG(55c::::b&79M%NOO D!hG"7BM::?F?V?V?X?X XG(55glCCC!'')))))rn   c                    | j                                         D ]}|                                 |                                  d S rx   )r  r   flushr  )rk   rp  s     rl   r  zScheduler.flush  sF    }++-- 	 	GMMOOOOrn   scheduler_nodec                   t          |t          t          f          sJ t          d         dxx         dz  cc<   t	          j        t          d                    5  |                                 |                                 d d d            n# 1 swxY w Y   |	                    t          j
        j                   |                                  d S )Nr  extern_callsr   F)increase_kernel_count)r   r  ru  r   rZ   set_kernel_handlerr0   rW  r  rH  r   r=  r  )rk   r  s     rl   codegen_extern_callzScheduler.codegen_extern_call  s    &(LM
 
 	
 	
 
 	^,,,1,,,!&u"E"E"EFF 	& 	&00222##%%%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	qw3444s   )BBBBaseSchedulingc                F   t          |j                  r|j        J | d            t          j                            |           t          |j                  }|t          d|j                   t                      s|j        dk    rKt          j
                            |          x}j        dk     r!t          |t          j                              t          |j                  r+|j        dk    s t!          t          j                               ||           S )Nz( should have been normalized in loweringzUnsupported device type: r      r  )rU   r   r   rZ   r   add_device_infor/   r  r&   r  r   get_device_propertiesmajorr7   inspectcurrentframer8   )rk   r   device_schedulingrC  s       rl   create_backendzScheduler.create_backend  s   &+&& 	
&,*B*B??? +C*BB 	
'''5fkBB$H6;HHIII|| 	<v%%%*Z%E%Ef%M%MM\TWXXX(w7K7M7MNNN$$ <V[E-A-A#G$8$:$:;;;  &&&rn   c                p    |J || j         vr|                     |          | j         |<   | j         |         S rx   )r  r  r  s     rl   rw  zScheduler.get_backend  sB    !!!&&$($7$7$?$?DM&!}V$$rn   c                4    d	 fdfd|                                 D             }t          |                                          }|rLt          |t	          j        d                    \  }}t          j        j        	                    |           d S d S )
Nr  torch.fx.Noder   r   c                    | j         vr;j                             d t          | j        j                  D                        j         |          S )Nc                    i | ]\  }}||	S r~   r~   r  s      rl   r  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>  s    ,W,W,WdaQ,W,W,Wrn   )r  r2  r  r   r  )r  rk   s    rl   	get_orderz*Scheduler.enter_context.<locals>.get_order   sQ    ,,,$++,W,Wi>V>V,W,W,WXXX'**rn   c                r    i | ]3}|j         	|j                                         D ]} |          |fd 4S rx   r  )r   r  r   r	  s      rl   r  z+Scheduler.enter_context.<locals>.<dictcomp>  sY     
 
 
v!V'')) "! Yq\\1t!!!!rn   r   r6  )r  r  r   r   )
r   r  r  r  rf  rg  rZ   r   r=  enter_context)rk   r   rj  r  lastr	  s   `    @rl   r	  zScheduler.enter_context  s    	+ 	+ 	+ 	+ 	+ 	+
 
 
 
^^%%
 
 
 w||~~&& 	5'x':1'='=>>>GAtG ..t44444	5 	5rn   r   r  rb  c                    	 | j         |         j        }n# t          $ r Y dS w xY wt          fd|D                       o|| j        vo|| j        vS )NFc              3  R   K   | ]!}|j         p|                                v V  "d S rx   )rY  r  )r   r0  r  s     rl   r   zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s8      VVC3C CVVVVVVrn   )rC  r  KeyErrorr   rj  r  )rk   r   r  r  s     ` rl   r  z.Scheduler.can_buffer_be_removed_through_fusion  s    	$T*0EE 	 	 	55	 VVVVPUVVVVV 4D114D33	
s    
$$c                   |j         }t          |t          j        j        j                  r]|j        x}rTt          |          \  }}|t          j	        v s|t          j	        v r&t          |t          j
        j                  sJ d| S t          j        j        j        j        st          j        dS t          |t                     r'|j        D ]}|                     |          }|r|c S dS |j         J |                                s|                                 dS t          |j         t          j                  rdS t          |j         t          j                  rdS t/          |j         dd          rdS t1          |j                   rd	S |                     |          x}r|S t          j        j        rt7          |          rd
S dS )z
        Return the reason why we should partition the inductor graph on this node,
        or None if the node is cudagraphable.
        zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r  r  r+   rQ  r  rP   r(   custom_should_partition_ops_ops
OpOverloadr   r^   rH   r_  r   r   should_partitionrU   r   
DeviceCopyr  r  rT   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsrn  )rk   r   rp  r  op_overload_packet_nameop_overload_namer  r*  s           rl   r	  zScheduler.should_partition  s    )gu1@AA 		B%%B		B 9ER8H8H5#%5'6+MMM#v'III!"ej&;<<<<<A/?AAA &-8	L6>FKKd.// 	 " "..u55 "!MMM"4y$$${{}} 	.oo''----di// 	$##di00 	%$$491488 	*))!$),, 	100@@FFF6 	M =6 	+-d33 +**trn   rc  c                   t                      }t          j        s|S | j        D ]}|j        }|t          |t          j        j        j	                  s1|j
        }|;t          |          \  }}|t          j        vr|t          j        vrj|                                D ]a}t          j        j                            |          }t#          |t$          j        t$          j        f          r|                    |           b|S )zc
        Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
        )r   r(   cudagraph_unsafe_unbacked_opsr  r   r   r  r  r+   rQ  r  rP   rL  rZ   r   r   r   r$   r%   UNBACKED_INTUNBACKED_FLOATr  )rk   unsafe_symintsr   rp  r  r	  r	  syms           rl   &_get_cudagraph_unsafe_unbacked_symintsz0Scheduler._get_cudagraph_unsafe_unbacked_symints]  s   
 4><<3 	"!!J 	, 	,DiGgu'9'HII $Bz8DR8H8H5#%5'v/SSS$F,PPP7799 , ,g&//44!#(94;N'OPP ,"&&s+++,
 rn   c                    |                                  }|sd S t          |          }|D ]=}t          j        j                            |          }|j        D ]}||v r	d| c c S >d S )Nz'uses cudagraph-unsafe unbacked symint: )r	  rn  rZ   r   r   r   r"   )rk   r   r	  node_symbolsr	  simplified_symfree_syms          rl   r	  z0Scheduler._uses_cudagraph_unsafe_unbacked_symint  s     DDFF 	45d;; 	P 	PCW-66s;;N*7 P P~--OXOOOOOOO .P trn   6dict[str, ir.IRNode | ir.TorchBindObject | sympy.Expr]c                    i }|                     t          j        j                   | j        D ]+}|j                                        D ]\  }}|j        ||<   ,|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r2  rZ   r   r  r  ro  r  r   )rk   r  r   r   scheduler_buffers        rl   get_name_to_nodeszScheduler.get_name_to_nodes  sv     PRAG0111J 	; 	;D*.*>*D*D*F*F ; ;&&%5%:T""; rn   
signatureslist[GraphPartitionSignature]c           	        d t          t          j        j                  D             }d t          t          j                                                  D             }g t          j        _        t          |          D ]\  }}|j        rg }|j        D ]*}|                    |	                    |                     +g }|j
        D ]<}	|                    |	                    |	                                                     =t          j        j                            t          ||||j                             dS )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        c                    i | ]\  }}||	S r~   r~   r   rX  r   s      rl   r  z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>  s+     %
 %
 %
##tD#%
 %
 %
rn   c                    i | ]\  }}||	S r~   r~   r&	  s      rl   r  z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>  s+     &
 &
 &
##tD#&
 &
 &
rn   N)r  rZ   r   r  r  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr  rQ   constant_names)
rk   r"	  name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingr   output_mappingr   s
             rl   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_maps  si   %
 %
'01E'F'F%
 %
 %
!&
 &
'01I1I1K1K'L'L&
 &
 &
" "$'0'<'< 	 	#L)' 
 M!- J J$$%>%B%B4%H%HIIIIN!. W W%%&@&D&DT]]__&U&UVVVVG"))! !",	    !	 	rn   	partitionr`   r*	  c                   dddd} t                      j        d |D              } |j        fd	|                                D                ||          }t                      }|D ]@}t          j        j                            |          }|                    |j                   At          t          |t          j        d
                              S )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        r   +ir.IRNode | sympy.Expr | ir.TorchBindObjectr   rc  c                    t          | t          j                  rt                      S t          | t          j                  rt          |           S t          dt          |                      )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r+   r  r   r  ri  rK  r   r   s    rl   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sf     $ 233 X!||#D"),, X)$/// **V$t***V*VWWWrn   symbolsc                4    t          d | D                       S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]B}t          |t          j        t          j        t          j        t          j        f          >|V  Cd S rx   )r$   r%   SIZEFLOATr	  r	  r@  s     rl   r   zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sd        !	
)+	      rn   r   )r9	  s    rl   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s2             rn   c              3  4   K   | ]}t          |          V  d S rx   rm  r  s     rl   r   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s+      IIt,T22IIIIIIrn   c              3  4   K   | ]\  }} |          V  d S rx   r~   )r   r  r   r8	  s      rl   r   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s3      NNwq$$$T**NNNNNNrn   r   r6  )r   r6	  r   rc  )r9	  rc  r   rc  )r   r  r2  r  rZ   r   r   r   r"   r;  rf  
attrgetter)	rk   r4	  r*	  r>	  candidate_symbolsr?  r  symplified_sr8	  s	           @rl   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputs  s   	X 	X 	X 	X 	 	 	 	, 7Ijll6HIIyIII7
 	! NNNN+:K:K:M:MNNN	
 	
 +N+<==(2" 	2 	2A7+44Q77LJJ|01111&(*=f*E*EFFFGGGrn   
partitionslist[PartitionType]skip_cudagraphs
list[bool]c           	         g }t          t          j                                                  }                                 d fdt          t          |          t          |                    D ]\  }}t                      }|D ].}|                    |j        	                                           /|
                    |          }	t          j                            d |D                       }
t          d |
j        |
j        z  D                       |z
  }t           fd|D                       }t                      |D ]}                    |j                   fd	|z
  D             }|                    |           fd
|D             }fd|D             }fd|D             }|	                    |           t           fd|	D                       }	fd|	D             }d |D             }                     ||          }t%          ||||||          }|                    |           |                    ||	z
            }|ddd         S )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        r  r   r   ro   c                    j                             | d          }|dS t          |j        j        t
                    r*j                            | d          x}r |          S dS dS )z
            Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
            Buffers with NoneLayout are not allocated so graph partition should not
            take them as inputs or outputs.
            NFT)rC  r  r   r   r)  r?   r  )r  r   r  is_unallocated_bufferrk   s      rl   rK	  zFScheduler.get_graph_partition_signature.<locals>.is_unallocated_buffer  sz     "&&x66C{u#(/:66  !% 7 ; ;Hd K KK9 <00;;;t5rn   c                    g | ]	}|j         
S r~   r  r  s     rl   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>>  s    888d!888rn   c                F    g | ]}t          |t                    |j        S r~   )r   r6   r   r  s     rl   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>E  s:       )!W55  rn   c              3  N   K   | ]}j                             ||          V   d S rx   r  r  r!  s     rl   r   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>N  J       / / '++D$77/ / / / / /rn   c                    g | ]}|v |	S r~   r~   r   r   r  s     rl   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>Z  s.     ! ! !<'' '''rn   c                *    i | ]}|v ||         S r~   r~   rR	  s     rl   r  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>a  s5       <'' l4('''rn   c                "    i | ]}|v ||v S r~   r~   r   r   r  r  s     rl   r  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>f  s6     " " "<'' d22'''rn   c                $    g | ]}|v |v
|S r~   r~   rU	  s     rl   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>p  s9     " " "<''D8L,L,L ,L,L,Lrn   c              3  N   K   | ]}j                             ||          V   d S rx   rO	  r!  s     rl   r   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>x  rP	  rn   c                6    g | ]} |          |         S r~   r~   )r   r   rK	  r  s     rl   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>}  s?       ,,T22T"  rn   c                8    g | ]}|t           j        j        v |S r~   )rZ   r   r  r   r   s     rl   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>  s.       $!'BS:S:S:S:S:Srn   Nr   )r  r   r   ro   )r   rZ   r   r  r!	  r  r   r2  ro  r  r  r*   r  r  r   r  rc  rD	  r<   r   r  )rk   rE	  rG	  r"	  unmet_output_namesr4	  r)	  output_namesr   returned_output_namesr   partition_input_namesextra_input_namesr*	  input_deallocationextra_output_namesr+	  r,	  symbol_inputspartition_signaturer  rK	  r  s   `                   @@@rl   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature  s    
'(@(@(B(BCC--//	 	 	 	 	 	 	, *-Z  (?";";*
 *
 g	 g	%I~ -7LLL! A A##D$8$=$=$?$?@@@@$0$=$=>P$Q$Q! '1<<88i888 K  !,!2[5G!G     " %/ / / / /1/ / / % %!
 5?LL ! = =$++DO<<<<
! ! ! !1L@! ! !
 "(():;;;   1  K
" " " " "1" " "" " " " "1" " " "(();<<<$. / / / /1/ / / % %!
    1  L !6  N !BB; M #:"# # 1222!6!<!<"%::" " $$B$rn   r0	  r<   c                   d |j                                         D             }d |j                                        D             }d |j        D             }d |j        D             }t          |j        ||||j        |          S )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        c                @    i | ]\  }}|t           j        j        v||S r~   rZ   r   rL  )r   r   rX  s      rl   r  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>  s8     
 
 
f17222 &222rn   c                @    i | ]\  }}|t           j        j        v||S r~   rg	  )r   r   r[  s      rl   r  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>  s8     
 
 
c17222 #222rn   c                \    g | ])}|                                 t          j        j        v'|*S r~   )maybe_get_namerZ   r   rL  r  s     rl   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>  s>     
 
 
""$$AG,CCC CCCrn   c                8    g | ]}|t           j        j        v|S r~   rg	  rZ	  s     rl   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>  s2     
 
 
17222 222rn   )r*	  r  r`	  r+	  r,	  r<   rb	  r)	  )rk   r0	  r*	  r`	  r+	  r,	  s         rl   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures  s    
 
 ) 5 ; ; = =
 
 


 
&9??AA
 
 


 
!.
 
 


 
!0
 
 

 '#$
 
 	
rn   c                   	
 ddl t                      
g g d t          |          D             d fd	d	
fd	}|D ]5}t          |j        j                  
|<   
|         dk    r 	|           6g }d}|t          |          k     rsrr:                              \  }}|                    |            ||           :r:                              \  }}|                    |            ||           :|d
z  }|t          |          k     r|t          |          k    rt          d          |S )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    i | ]\  }}||	S r~   r~   )r   rX  r   s      rl   r  z>Scheduler.reorder_for_minimizing_partition.<locals>.<dictcomp>  s    EEEysDsEEErn   r   r_   r   r5  c                    |          | f}                     |           r                    |           d S                     |           d S rx   )r	  heappush)r   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrk   s     rl   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  s_    ,T2D9O$$T** E6HHHHH2ODDDDDrn   c                    | j         j        D ]7}|         dk    sJ |xx         dz  cc<   |         dk    r |           8d S rN  )rh  
succ_nodes)r   	succ_noderv	  node_to_indegrees     rl   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  su    !]5 4 4	'	2Q6666 +++q0+++#I.!33((333	4 4rn   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r_   r   r5  )	rs	  r  r  r   rh  
pred_nodesheappopr   r  )rk   r  r{	  r   r  	num_itersr  rr	  rs	  rv	  rz	  rt	  ru	  s   `      @@@@@@rl    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partition  s    	9=CEGIEEIe4D4DEEE	E 	E 	E 	E 	E 	E 	E 	E 	E 	E	4 	4 	4 	4 	4 	4 	4  	+ 	+D%()A%B%BT"%**$$T***,.	#e**$$# %': % * &--(?@@4%%%%%% * &
 & &--(;<<4%%%%%% & &
 NI #e**$$# %': % s5zz!!   rn   c           	     `   ddl m}m} t          t          j                                                  } ||| j        | j        t          t          j        j	        
                                          |          \  }}|                     |          } ||||          \  }}	||dz  k     r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_infor:  )r  r	  r	  r   rZ   r   r  rC  r  r  r  r	  )
rk   r  r	  r	  rP  default_peak_memoryr  reordered_nodesreorder_peak_memoryr  s
             rl   r  z0Scheduler.maybe_reorder_for_minimizing_partition  s     	HGGGGGGG"17#;#;#=#=>>:O:O#qw+002233;
 ;
77 ??FF!5!57"
 "
Q
 !4s!:::""rn   c                .   g }g }g }dd}|D ]}|                      |          du}|r.t          |j                  dk    r|                    |           I|r! ||          r|                    |           l|                    |           ||z   |z   S )	a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        r   r_   r   ro   c                    |                                  D ]*}|j        D ] }t          |j        t                    s  dS !+dS rI  )r  r  r   r   rJ  )r   r   rK  s      rl   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user.  s\    '')) % %9 % %C%ch
;; %$uuu%% 4rn   Nr   r  )r	  r   rt  r   )rk   r  frontmiddlebackr	  r   r	  s           rl   r  z6Scheduler.reorder_for_partition_with_simple_dependency   s     *,*,(*	 	 	 	  	$ 	$D#44T::$F $C(?$@$@A$E$ET""""! $&6&6t&<&< $D!!!!d####v~$$rn   9tuple[list[PartitionType], list[GraphPartitionSignature]]c                   g }d}g }g }| j         D ]d}|                     |          du}|r2||k    r,|                    |           |                    |           g }|}|                    |           e|r*|                    |           |                    |           t          j        j        }|dk    rit          t          ||                    D ]K\  }\  }	}
|
sAt          d |	D                       }||k     r"d||<   t          
                    d|||           L|                     ||          }|                     |           |                     ||           ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        TNr   c              3  D   K   | ]}t          |t                    d V  dS r   N)r   rN  r  s     rl   r   z,Scheduler.graph_partition.<locals>.<genexpr>`  sG       ' ')!-CDD'' ' ' ' ' 'rn   zFPartition %d has %d kernels, below minimum size %d, skipping cudagraph)rE	  rG	  )r  r	  r   r(   r   cudagraph_min_partition_sizer  r  r   cudagraphs_logr/  rd	  r3	  _log_graph_partitions)rk   rE	  r)	  cur_partitionrG	  r   node_should_partitionmin_sizer  r4	  skipkernel_countr"	  s                rl   r  zScheduler.graph_partition@  s    +-
')J 	' 	'D$($9$9$$?$?t$K! #3H!H!H!!-000&&~666 "2N  &&&& 	3m,,,"">222 ==a<<(1#j/2R2R(S(S  $$It #& ' '!*' ' ' $ $L
 $h..-1*&,,d($	   77!? 8 
 

 	))*555"":z::::%%rn   c                   t                               t          j                  sd S t	          d t
          j        j        D                       }|sd S t          d |D                       }t          |          |z
  }t           
                    dt          |          ||           t          t          ||                    D ]\  }\  }}t           
                    d|t          |          |j        rdndt          |j                  t          |j                             |j        r|D ]}	|                     |	           d S )Nc              3  4   K   | ]}t          |          V  d S rx   )rU   )r   r   s     rl   r   z2Scheduler._log_graph_partitions.<locals>.<genexpr>  s(      OOVF^^OOOOOOrn   c              3  (   K   | ]}|j         	d V  dS r	  )r)	  r@  s     rl   r   z2Scheduler._log_graph_partitions.<locals>.<genexpr>  s*      !P!Pq?O!P!!P!P!P!P!P!Prn   zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)r	  r  r  r%  r   rZ   r   device_typesr   r   r/  r  r  r)	  r*	  r+	  _log_non_cudagraphable_node)
rk   rE	  r"	  has_gpu_devicecudagraphable_countnon_cudagraphable_countr  r4	  r0	  r   s
             rl   r	  zScheduler._log_graph_partitionsw  si   
 **7=99 	F OO!':NOOOOO 	F!!P!PZ!P!P!PPP"%j//4G"GQ
OO#		
 	
 	
 *33z:3N3N)O)O 	; 	;%A%	9  EI'0'?T##_I)**I*++   ' ;% ; ;D44T::::	; 	;rn   c                   |                      |          }|sdS |                                }|j        |j                                        nd}d| g}t	          |j                  j        }|                    d|            |G|j         dd                    d |j	        D                        d}|                    d|            t                              d	|d                    |                     |b|j                            d
d          }|rG|                                                    d          D ]!}	t                              d|	           dS dS dS )z)Log details for a non-cudagraphable node.Nzreason=zir=r  r  c              3  4   K   | ]}t          |          V  d S rx   )r   )r   r  s     rl   r   z8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>  s(      2P2Pa3q662P2P2P2P2P2Prn   r  zfx=z
    %s: %sr]  r  z         %s)r	  r  r   r  r   ry   r   re  r  r7  r	  r/  rf  r  stripsplit)
rk   r   r*  r  r  partsir_typefx_strr]  lines
             rl   r	  z%Scheduler._log_non_cudagraphable_node  s   &&t,, 	FMMOO	151F$)++---D#6##$ty//*_7__%%%SS2P2P7<2P2P2P)P)PSSSFLLv(((\9dii6F6FGGG !,**=$??K >'--//55d;; > >D"((====	 > >> >rn   c                    t          d          5  t          j        j        j        r|                                 n|                     | j                  	 cd d d            S # 1 swxY w Y   d S )NzScheduler.codegen)r   r  r  r(   r  _codegen_partitions_codegenr  rj   s    rl   rH  zScheduler.codegen  s    -.. 	 	 ?)9/((***]]4:..	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA""A&)A&c                   ddl m} t          j        j        }t          | j                  }t          j                                        5  t          j                            dd| ||           | 	                    |           t          t          j        j        |          sJ |                     |          }|t          j        j        _        t          j        j                                         t          j        j        }t          j        j                            t          j        j                  \  }}ddd           n# 1 swxY w Y   t          j        j                            ||           t          j        j                            ||           t          j        j        j                            d |j        D                        dS )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesNc                6    g | ]}|                                 S r~   r  r  s     rl   r   z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>  s     @@@T]]__@@@rn   )rD  r	  rZ   r   r=  r  r  set_current_wrapper_codeinit_wrapper_coder	  r   rl	  r	  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr2  r+	  )	rk   r4	  r0	  r	  r	  graph_partition_id
graph_namepartition_coder  s	            rl   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapper  s    	BAAAAAg2!$"?@@W--// 	T 	TG%% ?+=??$7%.	 &    MM)$$$ ag24PQQQQQKKIVVI8AAG 5G --///J ! 4 = =ag>R S SNA/	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T2 	
88^TTT	334F	RRR	&--@@)?@@@	
 	
 	
 	
 	
s   
C2EEE'contextlib.AbstractContextManager[None]c                P     t           j        d fd            } |            S )Nr   Iterator[None]c               3  <  K                                    j        r]t          j        j                  rDj        j        
J d            t
          j        j                            j        j                   	 d V  j        r<t          j        j                  r#t
          j        j        	                                 d _        d S # j        r<t          j        j                  r#t
          j        j        	                                 d _        w xY w)Ndevice should have an index)
%update_graph_partition_default_devicer  rK   r   r   rZ   r   r=  codegen_device_guard_entercodegen_device_guard_exit)rE	  rk   r"	  s   rl   ctxz1Scheduler.use_default_device_context.<locals>.ctx  s@     66z:NNN* /@+00 0  28DD1 EDD $??/5  3. E3D/44 4 E G(BBDDD.2+++	 . E3D/44 4 E G(BBDDD.2+2222s   ?C AD)r   r	  )
contextlibcontextmanager)rk   rE	  r"	  r	  s   ``` rl   use_default_device_contextz$Scheduler.use_default_device_context  sJ     
	"	3 	3 	3 	3 	3 	3 	3 
#	"	3* suurn   c                    t          |          dk    r|d         j        sd S dd}dd
}d }t          ||          D ]\  }}|j        s ||          } n|d S t          ||          D ]\  }}|j        r |||          s d S || _        d S )Nr   r   r4	  r`   r   rE  c                B    | d                                          }|J |S r   r  )r4	  partition_devices     rl   get_cudagraph_partition_devicezWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s*    (|6688#///##rn   target_devicero   c                J    | D ]}|                                 }||k    r dS  dS rI  r  )r4	  r	  r   r   s       rl   all_on_target_devicezMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s>     " ! !**]** 55 +4rn   )r4	  r`   r   rE  )r4	  r`   r	  rE  r   ro   )r   r)	  r  r  )rk   rE	  r"	  r	  r	  cudagraph_partition_devicer4	  r0	  s           rl   r	  z/Scheduler.update_graph_partition_default_device  s    z??a
1(D F	$ 	$ 	$ 	$
	 	 	 	 &*"$'
J$?$? 	 	 Iy+ -K-KI-V-V*
 &-F$'
J$?$? 	 	 Iy' 0D0D51 1  &@###rn   c                6   |                                  \  }}t          |          dk    r(t          d         dxx         t          |          z  cc<   |                     ||          5  t	          ||          D ]e\  }}t          |          dk    sJ dt          |                       |j        r|                     |           O|                     ||           f	 ddd           n# 1 swxY w Y   t          | j	                  }t          j        j                            |           |dk    rat          j        j        J |t          t          j        j                  k    s.J d| dt          t          j        j                               dS dS )	z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   r  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r   r   r	  r  r)	  r	  r	  r  r  rZ   r   r=  set_all_partition_namesr(	  )rk   rE	  r"	  r4	  r0	  num_partitionss         rl   r	  zScheduler._codegen_partitions0  s     "&!5!5!7!7
Jz??QZ !7888C
OOK888,,ZDD 		J 		J(+J
(C(C J J$	99~~***\CPYNN\\ +** + JMM),,,,33IyIIIIJ		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J d;<<	44^DDD A7)555!S)?%@%@@@@_.__#agF\B]B]__ A@@ @@s   )A7C--C14C1c                   t           j        rdd l}t          j                    }t                      }t          |          D ]k}|j        dk    r|j        |j	        j
        j        k    r nC|j        |j        f}||vsJ d|j         d|j         d            |                    |           l| j        | _        | j        J | j        r4t           j        j        r#t&          j        j                                         t&          j        j                                         |D ]0}t0                              t4          j                  r	 t0                              d|                                |                                           n=# t>          $ r0 t0                              d|                                           Y nw xY w|                      |           t           j!        r8t&          j        j        "                    d |j#        j$        D                        |%                                x}r_|| j        k    s(|&                                s|'                                r| (                                 || j        k    r| j        rWtS          | j        j*                  r>| j+        | ,                                 t&          j        j        -                                 || _        tS          |j*                  r|j.        
J d	            d
}	| /                                r<t          | j0        1                                          }
|
rte          |
          d
z   nd
}	t&          j        j        3                    |j.        |	| j4                   | /                                r| j        | 5                    |           t&          j        j        6                    d |j#        j$        D                        || _7        | j8        9                    |j:                   |'                                rd|;                    ty          |=                                                    \  }}}| >                    |          ?                    |||           ny|&                                r| @                    |           nN|A                                rt          jC        t          |          }| >                    |          }d
dlEmF} d
dlGmH} d
dlImJ} t          ||||f          r|}nt          dtU          |                     |M                    |           nt          |t                    r)| >                    |          O                    |           npt          |t          t          f          r)| >                    |          R                    |           n+t          |t                    sJ |T                                 t           j        jU        r'| >                    |          V                                 | jW        9                    |X                                           | jY        9                    |Z                                           t          |t                    s\|%                                }|F|j*        dk    r;| >                    |          [                                r| (                                 t          d |=                                D                       r	|| _        )d | _        2| j        | j        k    rE| j        J tS          | j        j*                  r#t&          j        j        -                                 d | _        | (                                 d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   z%Scheduler._codegen.<locals>.<genexpr>  s=       D D!$CHD D D D D Drn   r	  r   c              3  $   K   | ]}|j         V  d S rx   r  r  s     rl   r   z%Scheduler._codegen.<locals>.<genexpr>  s=       C C C C C C C Crn   )CUDACombinedSchedulingr  )XPUCombinedSchedulingztype(self)=rf  c              3  @   K   | ]}t          |t                    V  d S rx   )r   r   r  s     rl   r   z%Scheduler._codegen.<locals>.<genexpr>  s,      JJA:a//JJJJJJrn   )]r(   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  r  r  r  r   autotune_at_compile_timerZ   r   r=  write_get_raw_stream_headerregister_alignment_check_inputsr  r  r  r%  r/  r  r  r  r	  size_assertscodegen_deferred_input_assertsr   r   r   r,  r*  r  rK   r   current_stream_idxgenerate_stream_ctx_exitr	  r   r  r  r   r  r	  r  generate_stream_ctx_switching!codegen_deferred_alignment_copiesr  r  r2  rc  r  r  r   rw  codegen_templater  r.  rg  rh  r   codegen.cuda_combined_schedulingr	  r  r  #codegen.xpu.xpu_combined_schedulingr	  r   r  codegen_combo_kernelrW  codegen_mix_order_reductionr   r   codegen_noderN  r  debug_sync_kernelcodegen_syncr  r	  rK  r   ready_to_flushr   )rk   r  r  stackr  framer7  r   r   num_streamsunique_streamsr  r  r  backend_r	  r  r	  rp  s                      rl   r	  zScheduler._codegenP  s   4 	....+--E7A||D!%   J"222%-*E*NNNE~u|4$JU^ J Jel J J J '
 "9!))) & 	?6=+Q 	?G <<>>> 	
<<>>> B	* B	*D.. 
IIO2244   
 !   IIP     t$$$ " $CC D D(,(8(>D D D    ***v d111~~'' 2'')) 2 JJLLLT000* I/@+00 0 I  2> 99;;;,FFHHH*0D'(55 %|779V777&'7799 -78K8R8R8T8T-U-UN;I PN 3 3a 7 7q ( ,GG"L' ;   ++-- 9$2E2Q224888 G BB C C$($4$:C C C    !%D%,,T_===!! " 484W4W))**5 51-   ((99!8X    !!  ((....""  {#=tDD++F33TTTTTT888888VVVVVV#%;=RS  ; 'GG()9DJJ)9)9:::,,T2222D"9::    ((DDTJJJJD#5}"EFF    ((55d;;;;!$(>????? }. 8  ((55777'..t/D/D/F/FGGG%,,T-E-E-G-GHHHd$:;; !**&v--((00??AA . JJLLLJJ9I9IJJJJJ *%)""%)""$"=== &222 !4!9:: A $>>@@@!

s   
A F7GG%tuple[float, float, list[str | None]]c                    |d                                          }| t          j        _        || _        |J |                     |          }|                    ||          S )r  r   )r   rZ   r   r  r  rw  benchmark_combo_kernel)rk   r  node_benchmark_resultsr   rp  s        rl   r
  z Scheduler.benchmark_combo_kernel  sa     1((** $!!!""6**--i9OPPPrn   c                   |}|d                                          t          fd|D                       s
J d            t          j        sdS ddlm} dg }}i }t          |          D ]\  }}|                                }	|                     |	          rt          
                    d           	 |                     |	          \  }
}|
|f||<   t          j        |
          rt          
                    d|            d	S n@# |$ r8}d
t          |          v r!t          
                    d           Y d}~ dS  d}~ww xY w||
z  }|                    |           	 |                     ||          \  }}}n?# |$ r7}d
t          |          v r t          
                    d           Y d}~dS  d}~ww xY w||z
  dk     p|dk     }t                              t"          j                  rc||k    s|r.t          
                    dt'          ||z  d                     n-t          
                    dt)          ||z  d                     ||z
  |k     p|S )r  r   c              3  H   K   | ]}|                                 k    V  d S rx   r  )r   r   r   s     rl   r   z4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>  s2      KK44??$$.KKKKKKrn   z<All nodes in a combo kernel group must be on the same deviceTr   g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFrY  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr$  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r(   r
  ra  r!  r  r   r  r.  r/  r  rS  r[  r   r   r  r  r%  rE   rF   )rk   r  subkernel_nodesr!  rP  
path1_listr
  r  r  r  r  r@  r   rQ  	ms2_clone_path2_listsmall_kernelr   s                    @rl   r  z!Scheduler.speedup_by_combo_kernel  s      #..00KKKK?KKKKK 	
 	
J	
 	
K , 	4;;;;;;rZ!#!/22 	$ 	$HAu))I ##I..   R  55i@@D13T
&u-:b>> !$$U   !55! $   *c!ff44$$]    444444 2ICd####	*.*E*E!7+ +'CKK   	 	 	&#a&&00  Y   ttttt	 Y,9c	""7=11 
	SyyLy  E#)1122   
   Ic	//00  
 Y$44s=   5ADE+D?>D??E#E> >F:+F54F55F:	ir.Layoutc                `    | j         |         }|j        J |j                                        S rx   )rC  r   
get_layout)rk   r  r   s      rl   get_buffer_layoutzScheduler.get_buffer_layout_  s1    x(x###x""$$$rn   c                   | j         D ]}|                                r|j        j        D ]}t          j        j                            |j                  }|rut          |          dk    rbt          |j        t          t          f          sA|                                g k    r)t          j        j                            |j                   d S r  )r  rU   r   r   rZ   r   r  r  r   r;   r   r)  r?   r>   r  zero_dim_cpu_tensor_listr  )rk   r   rT  rX  s       rl   r  z$Scheduler.update_zero_dim_cpu_tensord  s    J 	H 	HD{{}} H ,2 
H 
HDW377	BBFH+F33u<< *"MJ8I+J! ! = #OO--338<<TYGGG	H 	Hrn   c                ,    | j         | j         j        S dS )z:CUDA Stream index that current scheduler node assigned to.N)r  r  rj   s    rl   r	  zScheduler.current_stream_idxs  s     #/+664rn   c                6    | j         x}t          |          S dS )z9CUDA Stream name that current scheduler node assigned to.N)r	  r!   )rk   r  s     rl   current_stream_namezScheduler.current_stream_name{  s%     11J>":...4rn   c                    t          |t                    rJ | j        |         }t          j        j                            |          | _        dS )z6Code-gen to enter the Stream context assigned to node.)r  N)r   rN  r  rZ   r   r=  codegen_cuda_stream_enterr  )rk   r   node_streams      rl   generate_stream_ctx_enterz#Scheduler.generate_stream_ctx_enter  sR    d$:;;;;;)$/#$7#7#Q#Q" $R $
 $
   rn   c                l    | j         J t          j        j                                         d| _         dS )z1Code-gen to exit from the current Stream context.N)r  rZ   r   r=  codegen_cuda_stream_exitrj   s    rl   r	  z"Scheduler.generate_stream_ctx_exit  s6    '333	55777#'   rn   c                &   || j         v sJ t          |t                    rdn| j         |         }| j        |k    rdS | j        |dS | j        ||                     |           dS |                                  |                     |           dS )am  Generate stream entering and exiting to properly run node in a multi-stream scenario.

        Stream context switching is only generated if ``node``'s assigned stream is different from
        the previous node's stream. NopKernelSchedulerNodes have stream=None and inherit the
        enclosing stream context (or do nothing if no context is active yet).
        N)r  r   rN  r	  r
  r	  )rk   r   r  s      rl   r	  z'Scheduler.generate_stream_ctx_switching  s     t***** $ 677+DD$T* 	
 "f,, F$0V^ F$,1C**400000 ))+++**400000rn   )r  r  r   r5  )r   r  rY  rZ  )r  r   r   r   )r  r   r   r_   r   ro   r\  )r   rU  r   r5  )r  r   r   r5  )r   r|  r   r_   )r  r  r   ro   r  )r  r_   r   r  )r   r  r  r  r   r  rx   r  r  r  ro   r  r  r   r   )r  r   r   rE  r   r  )r  r*  r   ro   )
r  r+  r  r*  r  r   r   r   r   r5  )r  r  r   ro   )r  r  r  r  r   r  )r   r_   r   r_   r   rd   )r   r_   r   r_   )r   r_   r   r_   r  rr  r   r_   )r   r_   r   r_   rx  rt   r  rr  )r}  r~  r  rr  r   r5  )
r  r  r  r  r  r~  r  rr  r  ro   )r  rr  r  r  )r  r  r  r  )r  r  r  ro   r   r  )r  r  r   r5  r  )r  r  r  ro   r   r  r  )r   r_   r   r_   r  r   r   ro   )r   r_   r   r_   r  r  r   r   r  r  )r8  r_   r  r_   r  r'  r   ro   )r   r_   r   r_   r   rI  )FT)
r   r_   r   r_   r_  ro   r]  ro   r   ro   )r  r6   r   r_   r   r_   r   ro   )rT  r3   r  r4   r   ro   )rT  r3   r  r5   r  rk  r   ro   r  )r   r3   rf  ro   r   r   )TFT)r   r_   r   r_   rf  ro   r  ro   r]  ro   r   r  )r  r  r   r  )r  r   r   r   )r  r_   r   r5  )r   rE  r   r  )r   rU  r   r  r|	  )r   r   r  rb  r   ro   )r   r_   r   r  )r   rc  )r   r	  )r"	  r#	  r   r5  )r4	  r`   r*	  r	  r   rc  )rE	  rF	  rG	  rH	  r   r#	  )r0	  r<   r   r<   )r   r	  )rE	  rF	  r"	  r#	  r   r5  )r4	  r`   r0	  r<   r   r5  )rE	  rF	  r"	  r#	  r   r	  r  r  r   r
  )r  r  r   ro   )r  r   r   r
  r  )r   r  )ory   rz   r{   r  r{  r  r  r  r  r  rO  propertyr  setterr  r  r  r  r]   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rp  r=  rw  r|  r  r  r  r  r  r  r  r  r{  r  r  r  r	  r-  r6  r  rH  r^  r   rt  rF  r  r  r2  ri  r  r  r  r  r  r  r  r  r  rw  r	  r  r	  rI   r	  r	  r!	  r3	  rD	  rd	  rl	  r	  r  r  r  r	  r	  rH  r	  r	  r	  r	  r	  r
  r  r
  r  r	  r
  r
  r	  r	  rd  re  s   @rl   r  r    s	        
   o9 o9 o9 o9 o9 o9b	# 	# 	# 	#6
 6
 6
 6
p( ( ( (S S S S
Q Q Q Q & & & X& ( ( ( (7 7 7 7# # # #, , , ," " " "HMP MP MP MP^K K K KZ+# +# +# +#Z          6	 	 	 	S S S S*   4# # # #&/ / / /<$ $ $ $6! ! ! !F	 	 	 	8 8 8 8, %)	    &
> 
> 
> 
>) ) ) )VVD VD VD VDp'8 '8 '8 '8R
 
 
 
 OS     P P P Pd> > > >   0   "D2 D2 D2 D2LO? O? O? O?bS S S S0$ $ $ $(C C C CJ3. 3. 3. 3. 3.j? ? ? ?6  6  6  6 p, , , ,\7 7 7 7r.2 .2 .2 .2`$ $ $ $6< < < <|M M M M^[
 [
 [
 [
z    ; ; ; ;z` ` ` `L "*.zM zM zM zM zMx; ; ; ;z3) 3) 3) 3)r% % % %T   ; ; ; ; ; !.3*.mK mK mK mK mK^d d d dLR
 R
 R
 R
h6 6 6 6@4 4 4 4	8 	8 	8 	8* * * *4   
   &' ' ' '*% % % %5 5 5 5$
 
 
 
= = = =~ ! ! ! ]!F   "    ' ' ' 'RBH BH BH BHHK  K  K  K Z"
 "
 "
 "
H? ? ? ?B   >% % % %@5& 5& 5& 5&n"; "; "; ";H> > > >0   )
 )
 )
 )
V   6-A -A -A -A^   @o o o obQ Q Q QN5 N5 N5 N5`% % % %
H H H H    X    X
 
 
 
( ( ( (1 1 1 1 1 1 1 1rn   c                       e Zd Zd: fdZd;dZd<d
Zd=dZd=dZd=dZd>dZ	d?dZ
d@dZ	 dAdBd$ZdCd'ZdDd)Zd;d*ZdEd+Zd;d,ZdFd.ZdGd1ZdHd3ZdId6Z	 dAdJd9Z xZS )Kr  r  Scheduler | Nonec                V    t                                                       || _        d S rx   )rO  r{  r  )rk   r  rX  s     rl   r{  zBaseScheduling.__init__  s$    "rn   r   r5  c                J    | j         r| j                                          d S d S rx   )r  r  rj   s    rl   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_scheduler  s0    > 	*N'')))))	* 	*rn   r   rE  OrderedSet[BackendFeature]c                    t                      S )z0Return a set of .codegen.common.BackendFeature()r   r  s     rl   get_backend_featuresz#BaseScheduling.get_backend_features  s    ||rn   r   r_   r   ro   c                    t           )zO
        Check whether node1 and node2 can be vertically fused or not.
        rJ  r,  s      rl   rt  z BaseScheduling.can_fuse_vertical  
     "!rn   c                    t           )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        rJ  r,  s      rl   ru  z"BaseScheduling.can_fuse_horizontal  r*
  rn   c                   |                                 }t          |t          j                  sdS |                                sdS t          |j        t          j                  rt          |j        j                  dk    oct          |j        j        d         t          j	                  o9|j        j        d         
                                |
                                k    S dS )av  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.

        Fr   r   )r  r   r+   ru  rV   r   r=   r   r  r  r  )rk   r   r   rG  s       rl   rl  z.BaseScheduling.can_fuse_multi_outputs_template  s     ..00,(9:: 	55577 	5ej".11 	EJ%&&!+ Ouz03RY??OJ%a(1133|7L7L7N7NN urn   r   c                4   |                                 s|                                 rt                              ||          S t                              ||          rt          ||          S t          |t
                    r|                    |          S t          |t                    rQt          |t                    r<t          |j
        t          j                  sJ t                              ||          S t                              ||          S )z 
        Fuse two nodes
        )r.  r  rs   r   r   rW  r   rs  r  r   r   r+   rQ  ru  r~  r   r,  s      rl   rs   zBaseScheduling.fuse  s      	9!1!1!3!3 	9-225%@@@77uEE 
	9*5%888677 	9??5)))899 	9j=?
 ?
 	9 ej"*DEEEEE7EEeUSSS%**5%888rn   r  r]  "tuple[tuple[sympy.Expr, ...], ...]c                    t           )z[
        Process the iteration sizes in case a transformation needs to be applied.
        rJ  )rk   r  s     rl   rx  zBaseScheduling.group_fn  r*
  rn   r  epilogue_nodesr  rj  r  c                    t           )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        rJ  )rk   r  r0
  rj  s       rl   r	  zBaseScheduling.codegen_template  s
     "!rn   Nr  r  r  r  r   c                    t           zD
        Generate a kernel given a list of pre-fused nodes.
        rJ  )rk   r  r  r  s       rl   r  z.BaseScheduling.generate_kernel_code_from_nodes  s
     "!rn   r   "FusedSchedulerNode | SchedulerNodec                    t           r3
  rJ  r  s     rl   r	  zBaseScheduling.codegen_node  
     "!rn   rW  c                    t           rx   rJ  r  s     rl   r	  z*BaseScheduling.codegen_mix_order_reduction!  rL  rn   c                    t           )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        rJ  rj   s    rl   r	  zBaseScheduling.codegen_sync$  r6
  rn   c                    dS )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr~   rj   s    rl   r	  zBaseScheduling.ready_to_flush*  s	    
 urn   c                    t           )z]
        Flush the generated kernel and python wrapper code to the source code file.
        rJ  rj   s    rl   r  zBaseScheduling.flush1  r6
  rn   r  c                    t           )r  rJ  r  s     rl   r  z$BaseScheduling.benchmark_fused_nodes7  
     "!rn   r  r   c                    t           )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        rJ  )rk   r  s     rl   r  z)BaseScheduling.benchmark_codegened_module@  s
    
 "!rn   r   c                    dS )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r~   r,  s      rl   r  z'BaseScheduling.get_fusion_pair_priorityG  s	     qrn   r  r
  c                    t           )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        rJ  )rk   r  r
  s      rl   r
  z%BaseScheduling.benchmark_combo_kernelP  r<
  rn   node_scheduler  c                |    |r9ddl m}  |||          }t          j        j                            ||           d S d S )Nr   )'set_kernel_post_grad_provenance_tracing)r  rB
  rZ   r   r=  write_provenance_debug_handle)rk   r@
  r  rB
  debug_handles        rl   codegen_commentzBaseScheduling.codegen_commentY  sn    
  		UUUUUUBB L G >>\    		 		rn   )r  r"
  rY  )r   rE  r   r&
  r  rT  )r  r]  r   r.
  )r  r_   r0
  r  rj  r  r   r  rx   r
  )r   r4
  r   r5  )r   rW  r   r5  rZ  r
  )r  r   r   r  r  r
  )r@
  r  r  r  r   r5  )ry   rz   r{   r{  r%
  r(
  rt  ru  rl  rs   rx  r	  r  r	  r	  r	  r	  r  r  r  r  r
  rE
  rd  re  s   @rl   r  r    s       # # # # # #* * * *   " " " "" " " "   49 9 9 9(" " " "" " " "$ %)		" 	" 	" 	" 	"" " " "" " " "" " " "   " " " "" " " "" " " "   " " " " #'        rn   r  )r   r  )r  r_   r   r   )r  r_   r   r  )r  r_   r   rp  )r4  r   r   r   )r   r_   r  r  rC  rn  r   r5  )r  r  r   r5  )r  r  r  r  r   r  r   r5  )r~   )r  r	  r  r  r
  r  r   r  )r)  r*  r  r+  r   r5  r  )r?  r   r@  r   rA  r   rB  r   rC  rC   r   rD  )rP  r  rQ  r  r?  r   r@  r   rA  r   rB  r   rC  rC   r   ro   rZ  )r   rb  r   rc  )r   r_   r   rc  )r  r_   r   ro   )r   r_   r   r_   )
__future__r   r  r	  r]  r#  r  r  r  rS  rf  r  r<  r=  r	  rg  r   r   concurrent.futuresr   r   r   r	   r
   r   r   typing_extensionsr   torch.utils._ordered_setr   r+   r   r   collections.abcr   r   r   typesr   torch._inductor.codegen.wrapperr   rD  r   r   r  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r    torch._inductor.stream_utilsr!   %torch.fx.experimental.symbolic_shapesr"   torch.utils._sympy.symbolr#   r$   r%   torch.utils._tritonr&   r  r'   r(   r)   r*   r,   analyze_preserves_zero_maskr-   codegen.commonr.   r/   r0   comm_analysisr1   r2   r3   r4   r5   r6   excr7   r8   fx_utilsr9   r:   r;   r<   r=   r>   r?   r	  r@   r  rA   rB   runtime.hintsrC   rD   runtime.runtime_utilsrE   rF   r   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   virtualizedrZ   	getLoggerry   r  _logginggetArtifactLoggerr.  r  rS  r	  r  r`   r|   ra   rb   	dataclassrd   r   r   r  r`  r_   r  r  r  r  r  r'  r+  rJ  r  r  rN  r   r  r  r   rW  ru  r  r  r(  r6  r>  rO  rV  r|  r  r  ra  ri  rn  rx  r}  r  r  r  r  r  r  r~   rn   rl   <module>re
     s   " " " " " " "                         				        , , , , , , , , 3 3 3 3 3 3 3 3 B B B B B B B B B B B B B B ' ' ' ' ' ' / / / / / / ) ) ) ) ) ) ) )  6<<<<<<<<<<      JJJJJJ555555   $ $ $ $ $ $ $ $ $ $ $ $ $ 6 6 6 6 6 6 6 6 E E E E E E ? ? ? ? ? ? ? ? 7 7 7 7 7 7 M M M M M M M M 8 8 8 8 8 8 > > > > > > O O O O O O O O O O * * * * * * D D D D D D D D D D D D D D D D D D D D D D M M M M M M M M M M        ; : : : : : : : : : : : 2 2 2 2 2 2 2 2 $ $ $ $ $ $                       J J J J J J J J : : : : : : : : 7 7 7 7 7 7 7 7 & & & & & &                                       (       g!!^--hAA
N44XOO  >;;$   11(LII 34 4 4 4 4WT]]Yt__ D D D D D D D D* ( ( ( ( ( ( ( (Y Y Y Y Y Y Y Yx h8 h8 h8 h8 h8 h8 h8 h8V 1 1 1 1 1_ 1 1 1H1 H1 H1 H1 H1 H1 H1 H1V 2 2 2 2   ,' ' ' ' # # # #L T"""
 
 
 
 
 
 
 #"
*           *K *K *K *KZ"* "* "* "* "* 1 "* "* "*J5 5 5 5 5. 5 5 5S* S* S* S* S*% S* S* S*l
   $   8k* k* k* k* k** k* k* k*\[G [G [G [G [G0 [G [G [G|:- :- :- :- :-+= :- :- :-zC: C: C: C: C:!3 C: C: C:Lb b b b b, b b bP #%+ + + + +\0 0 0 08
1 
1 
1 
1( ( ( (6   @ 
 
 
 
 
 
 
 
> +9?,, 4 4 4 4   $   &" " " "" " " "      P P P P@ @ @ @nJ1 nJ1 nJ1 nJ1 nJ1 nJ1 nJ1 nJ1bUx x x x x x x x x xrn   